In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, SimpleRNN
from tensorflow.keras.layers import GlobalAveragePooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('TP_DS.csv')  # Replace with your file path
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# Split data
X = data['cleaned_text']
y = data['label_encoded']
X = data['cleaned_text'].astype(str)  # Convert to string to handle any float or NaN issues
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and Padding
vocab_size = 10000
max_length = 100
embedding_dim = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

# Define a function to build models
def build_model(model_type="RNN"):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(0.2))

    if model_type == "RNN":
        model.add(SimpleRNN(64, return_sequences=False))
    elif model_type == "LSTM":
        model.add(LSTM(64, return_sequences=False))
    elif model_type == "BiLSTM":
        model.add(Bidirectional(LSTM(64, return_sequences=False)))
        
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    model.compile(optimizer=Adam(learning_rate=0.001), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy', Precision(), Recall()])
    return model

# Training and Evaluation
def train_and_evaluate(model_type):
    model = build_model(model_type)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(X_train_pad, y_train, 
                        epochs=10, 
                        batch_size=64, 
                        validation_split=0.2,
                        callbacks=[early_stopping])
    
    # Evaluation
    y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
    print(f"Classification Report for {model_type}:")
    print(classification_report(y_test, y_pred, target_names=['CG', 'OR']))
    
# Train and evaluate RNN, LSTM, and BiLSTM models
for model_type in ["RNN", "LSTM", "BiLSTM"]:
    train_and_evaluate(model_type)




Epoch 1/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.6708 - loss: 0.6042 - precision: 0.6749 - recall: 0.6552 - val_accuracy: 0.7364 - val_loss: 0.5315 - val_precision: 0.8005 - val_recall: 0.6326
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 32ms/step - accuracy: 0.7485 - loss: 0.5327 - precision: 0.7403 - recall: 0.7631 - val_accuracy: 0.7806 - val_loss: 0.4932 - val_precision: 0.7660 - val_recall: 0.8106
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - accuracy: 0.7588 - loss: 0.5169 - precision: 0.7663 - recall: 0.7425 - val_accuracy: 0.5233 - val_loss: 0.7166 - val_precision: 0.5191 - val_recall: 0.6828
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 27ms/step - accuracy: 0.5943 - loss: 0.6639 - precision: 0.5783 - recall: 0.6870 - val_accuracy: 0.5560 - val_loss: 0.7231 - val_precision: 0.5600 - val_recall: 0.5387
Epoch 5/10
[1m4



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 72ms/step - accuracy: 0.5576 - loss: 0.6684 - precision_1: 0.5911 - recall_1: 0.3651 - val_accuracy: 0.6690 - val_loss: 0.6260 - val_precision_1: 0.7189 - val_recall_1: 0.5593
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 60ms/step - accuracy: 0.7683 - loss: 0.4872 - precision_1: 0.7933 - recall_1: 0.7238 - val_accuracy: 0.8766 - val_loss: 0.2881 - val_precision_1: 0.8995 - val_recall_1: 0.8491
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 54ms/step - accuracy: 0.9009 - loss: 0.2377 - precision_1: 0.9044 - recall_1: 0.8960 - val_accuracy: 0.9034 - val_loss: 0.2264 - val_precision_1: 0.9107 - val_recall_1: 0.8953
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 52ms/step - accuracy: 0.9316 - loss: 0.1695 - precision_1: 0.9349 - recall_1: 0.9274 - val_accuracy: 0.9017 - val_loss: 0.2304 - val_precision_1: 0.9310 - val_recall_1: 0.



[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 78ms/step - accuracy: 0.8453 - loss: 0.3305 - precision_2: 0.8659 - recall_2: 0.8160 - val_accuracy: 0.8941 - val_loss: 0.2366 - val_precision_2: 0.9360 - val_recall_2: 0.8469
Epoch 2/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 75ms/step - accuracy: 0.9241 - loss: 0.1864 - precision_2: 0.9229 - recall_2: 0.9250 - val_accuracy: 0.9141 - val_loss: 0.2100 - val_precision_2: 0.8970 - val_recall_2: 0.9362
Epoch 3/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 76ms/step - accuracy: 0.9461 - loss: 0.1323 - precision_2: 0.9458 - recall_2: 0.9461 - val_accuracy: 0.9127 - val_loss: 0.2108 - val_precision_2: 0.9244 - val_recall_2: 0.8996
Epoch 4/10
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 75ms/step - accuracy: 0.9610 - loss: 0.0985 - precision_2: 0.9601 - recall_2: 0.9616 - val_accuracy: 0.9110 - val_loss: 0.2547 - val_precision_2: 0.9141 - val_recall_2: 0.

Transformers and PreTrained Models

In [2]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import sys, inspect
import transformers
from transformers import TrainingArguments

print("Python executable:", sys.executable)
print("Python version:", sys.version.splitlines()[0])
print("Transformers version:", transformers.__version__)
print("Transformers file:", transformers.__file__)
print("TrainingArguments __init__ signature:", inspect.signature(TrainingArguments.__init__))
print("TrainingArguments defined in:", inspect.getsourcefile(TrainingArguments))


Python executable: c:\Users\Nithin Srinivaas\AppData\Local\Programs\Python\Python313\python.exe
Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
Transformers version: 4.57.1
Transformers file: c:\Users\Nithin Srinivaas\AppData\Local\Programs\Python\Python313\Lib\site-packages\transformers\__init__.py
TrainingArguments defined in: c:\Users\Nithin Srinivaas\AppData\Local\Programs\Python\Python313\Lib\site-packages\transformers\training_args.py


In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the device being used (CPU or GPU)
print(f"Using device: {device}")

# Load dataset and preprocess
data = pd.read_csv('TP_DS.csv')  # Replace with your file path
data['label_encoded'] = LabelEncoder().fit_transform(data['label'])

# Split dataset
X = data['cleaned_text'].astype(str)
y = data['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset class to handle text and labels
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten().to(device),
            'attention_mask': encoding['attention_mask'].flatten().to(device),
            'labels': torch.tensor(label, dtype=torch.long).to(device)
        }

# Create dataset and data loaders
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)




# Define Trainer for training BERT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train model
trainer.train()

# Evaluate on test set
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1).numpy()

from sklearn.metrics import classification_report
print(classification_report(y_test, pred_labels, target_names=['CG', 'OR']))



Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3741,0.375257
2,0.2742,0.316264
3,0.2046,0.287347


              precision    recall  f1-score   support

          CG       0.90      0.94      0.92      4016
          OR       0.94      0.90      0.92      4071

    accuracy                           0.92      8087
   macro avg       0.92      0.92      0.92      8087
weighted avg       0.92      0.92      0.92      8087



In [2]:
import os
import json

# Define a directory to save the model and tokenizer
model_dir = 'BERT./saved_model'
os.makedirs(model_dir, exist_ok=True)

# Save the trained BERT model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

# Save training arguments as a JSON file
with open(os.path.join(model_dir, 'training_args.json'), 'w') as f:
    json.dump(training_args.to_dict(), f)

print(f"Model, tokenizer, and training arguments saved in {model_dir}")


Model, tokenizer, and training arguments saved in BERT./saved_model


In [3]:
import torch
print("Is CUDA available:", torch.cuda.is_available())
print("Number of GPUs available:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


Is CUDA available: True
Number of GPUs available: 1
GPU Name: NVIDIA GeForce GTX 1650


In [4]:
'''This saved setup can be used directly for deployment, where you can reload the model and tokenizer using the from_pretrained method as follows:'''
from transformers import BertTokenizer, BertForSequenceClassification

# Load the model and tokenizer for deployment
loaded_model = BertForSequenceClassification.from_pretrained(model_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)
