<a href="https://colab.research.google.com/github/ShabnaIlmi/Data-Science-Group-Project/blob/recipe-risk-analyzer/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install transformers tensorflow

import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder





In [19]:
# Load dataset
file_path = "/content/drive/MyDrive/CW_ML/chemical_recipe_dataset.csv"
data = pd.read_csv(file_path)

# Combine 'Chemical Names' and 'Potential Reaction' as input text
data["text"] = data["Chemical Names"] + " [SEP] " + data["Potential Reaction"]

# Encode Target Labels
label_encoder = LabelEncoder()
data["Risk Level Encoded"] = label_encoder.fit_transform(data["Risk Level"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["Risk Level Encoded"], test_size=0.2, random_state=42
)

print("Sample Data:\n", data.head())


Sample Data:
   Recipe ID                                     Chemical Names  \
0     R0001  Acetone + Hydrogen Peroxide + Sulfuric Acid + ...   
1     R0002              Charcoal + Potassium Nitrate + Sulfur   
2     R0003                         Hydrogen Sulfide + Ammonia   
3     R0004                          Sulfur + Ammonium Nitrate   
4     R0005    Hydrogen Sulfide + Ammonia + Methane + Chlorine   

                  Quantities   Category  \
0  485g + 398g + 275g + 197g  Explosive   
1          465g + 134g + 72g  Explosive   
2                272g + 358g  Corrosive   
3                297g + 304g   Unstable   
4   74g + 376g + 285g + 199g  Corrosive   

                                 Potential Reaction Risk Level  \
0   Explosion risk when exposed to heat or friction        Low   
1   Explosion risk when exposed to heat or friction     Medium   
2                    Causes severe burns on contact     Medium   
3  May decompose violently under certain conditions       High   


In [20]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text into BERT input format
def tokenize_text(texts, tokenizer, max_len=128):
    return tokenizer(
        list(texts),
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )

# Convert text into tokenized format
X_train_tokens = tokenize_text(X_train, tokenizer)
X_test_tokens = tokenize_text(X_test, tokenizer)

print("Tokenization Complete. Sample tokens:\n", X_train_tokens["input_ids"][:2])


Tokenization Complete. Sample tokens:
 tf.Tensor(
[[  101 17864  2594  5648  1009  9078  5524  1009  9732  2566 28479  1009
   9152 12412  5648   102  2089 21933  8737  9232 14196  2104  3056  3785
    102     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  101 25874  1009  9732 21396 20740   102  7085 11704 11865  7834 17631
   2000  2740   102     0     0     0     0     0     0     0     0     0
      0    

In [22]:
# Define Input Layers (KerasTensors)
input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Load Pretrained BERT Model (Feature Extractor)
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Lambda layer ensures correct conversion for TensorFlow
# Specify output_shape for the Lambda layer
bert_output = Lambda(lambda x: bert_model(x)[0][:, 0, :], output_shape=(bert_model.config.hidden_size, ))([input_ids, attention_mask])  # Extract CLS token and specify output shape

# Add Custom Classification Layers
x = Dense(256, activation="relu")(bert_output)
x = Dropout(0.3)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.3)(x)
output = Dense(len(np.unique(y_train)), activation="softmax")(x)

# Build Model
bert_classifier = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile Model
bert_classifier.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Print Model Summary
bert_classifier.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [23]:
# Train Model
bert_classifier.fit(
    {"input_ids": X_train_tokens["input_ids"], "attention_mask": X_train_tokens["attention_mask"]},
    y_train,
    validation_data=(
        {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
        y_test
    ),
    epochs=5,
    batch_size=16
)


Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 11s/step - accuracy: 0.3619 - loss: 1.2269 - val_accuracy: 0.3050 - val_loss: 1.1097
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 10s/step - accuracy: 0.3257 - loss: 1.1356 - val_accuracy: 0.2850 - val_loss: 1.1063
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 11s/step - accuracy: 0.3419 - loss: 1.1052 - val_accuracy: 0.3100 - val_loss: 1.1142
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 10s/step - accuracy: 0.3178 - loss: 1.1168 - val_accuracy: 0.2850 - val_loss: 1.1088
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 11s/step - accuracy: 0.3329 - loss: 1.1129 - val_accuracy: 0.3150 - val_loss: 1.1143


<keras.src.callbacks.history.History at 0x7841933ad290>

In [24]:
# Evaluate on Test Data
loss, accuracy = bert_classifier.evaluate(
    {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
    y_test
)

print(f"\nBERT Model Accuracy: {accuracy:.4f}")


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 14s/step - accuracy: 0.3062 - loss: 1.1163

BERT Model Accuracy: 0.3150
