In [1]:
import transformers
print(transformers.__version__)


4.51.3


In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"


In [3]:
pip uninstall keras keras-nightly keras-preprocessing tf-keras tensorflow tensorflow-gpu -y


Note: you may need to restart the kernel to use updated packages.




In [4]:
pip install transformers torch

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import BertTokenizer, BertForSequenceClassification


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

In [7]:
dataset_path = 'preprocessed_english_1.csv'

In [8]:
df = pd.read_csv(dataset_path, encoding='latin-1')

In [9]:
print(df.head()['cleaned_text'])

0    in other words katandandre, your food was crap...
1    why is aussietv so white? mkr theblock imacele...
2         a classy whore? or more red velvet cupcakes?
3    meh. p  thanks for the heads up, but not too c...
4    this is an isis account pretending to be a kur...
Name: cleaned_text, dtype: object


In [10]:
# Preprocessing
classes = df['label'].unique()
class_to_id = {cls: idx for idx, cls in enumerate(classes)}
id_to_class = {idx: cls for cls, idx in class_to_id.items()}

In [11]:
df = df.drop(columns=['comments'])

In [12]:
# Map class labels to IDs
df['label'] = df['label'].map(class_to_id)

In [13]:
print(df.head())

   label                                       cleaned_text
0      0  in other words katandandre, your food was crap...
1      0  why is aussietv so white? mkr theblock imacele...
2      0       a classy whore? or more red velvet cupcakes?
3      0  meh. p  thanks for the heads up, but not too c...
4      0  this is an isis account pretending to be a kur...


In [14]:
# Print unique classes
print("Unique Classes:", classes)

Unique Classes: ['not bully' 'religious' 'others' 'sexual']


In [15]:
# Print the mapping from class labels to IDs
print("Class to ID Mapping:", class_to_id)

Class to ID Mapping: {'not bully': 0, 'religious': 1, 'others': 2, 'sexual': 3}


In [16]:
# Print the mapping from IDs back to class labels
print("ID to Class Mapping:", id_to_class)

ID to Class Mapping: {0: 'not bully', 1: 'religious', 2: 'others', 3: 'sexual'}


In [17]:
# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_text'].values, df['label'].values, test_size=0.2, random_state=42
)

In [18]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
# Tokenize the text
def tokenize_function(texts):
    # Ensure all elements are strings
    texts = [str(text) for text in texts]  # Convert each element to string explicitly
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

In [20]:
train_texts.size

22597

In [21]:
# Example: Get tokenized text for the first row of train_texts
row_index = 0  # Index of the row you want to examine
input_ids = train_encodings['input_ids'][row_index]  # Get the token IDs for the row

# Decode the token IDs back to text
tokenized_text = tokenizer.decode(input_ids)

# Print the tokenized text
print(f"Original Text: {train_texts[row_index]}")
print(f"Tokenized Text: {tokenized_text}")

Original Text: rt    also the awesome  counted the  of women he follows amp wrote about it here
Tokenized Text: [CLS] rt also the awesome counted the of women he follows amp wrote about it here [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [22]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead.")

GPU is available and being used: NVIDIA GeForce RTX 3070


In [23]:
# Create a custom dataset class
class CyberbullyingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [24]:
train_dataset = CyberbullyingDataset(train_encodings, train_labels)
test_dataset = CyberbullyingDataset(test_encodings, test_labels)

In [25]:
# # Display a few samples from train_dataset
# print("Sample Data from train_dataset:")
# for i in range(5):  # Display first 5 samples
#     print(train_dataset[i])


In [26]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(classes))

model.config.hidden_dropout_prob = 0.3
model.config.attention_probs_dropout_prob = 0.3

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [28]:
import evaluate

metric = evaluate.load("accuracy")

In [29]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.05)

In [30]:
from transformers import get_scheduler

In [46]:
# Define the learning rate scheduler
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
lr_scheduler = get_scheduler(
    "linear",  # Options: 'linear', 'cosine', 'constant', etc.
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps
)

In [47]:
# Set up Trainer
training_args = TrainingArguments(
    output_dir="./resultsBERT0/T",  # Directory to save model checkpoints and logs
    num_train_epochs=15,  # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.05,  # Weight decay (L2 regularization)
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=200,  # Log metrics every 200 steps
    eval_strategy="epoch",  # Evaluate the model at the end of every epoch
    save_strategy="epoch",  # Save model at the end of every epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)
)


In [48]:
import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel

class BertWithExtraDropout(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels  # Fix: assign num_labels here
        self.init_weights()
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        if loss is not None:
            return loss, logits
        else:
            return logits

        

from transformers import BertConfig

num_labels = len(classes)  # your number of classes
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=4)

model = BertWithExtraDropout(config)


In [49]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.4279,1.407943,0.244779
2,1.4166,1.407943,0.244779
3,1.424,1.407943,0.244779
4,1.4204,1.407943,0.244779
5,1.4235,1.407943,0.244779


KeyboardInterrupt: 

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

In [None]:
from sklearn.metrics import accuracy_score

predictions = trainer.predict(test_dataset)

# Get predicted labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)

print(f"Accuracy: {accuracy}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Generate confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Get model predictions
predictions, labels, _ = trainer.predict(test_dataset)

# Convert logits to predicted labels
predicted_labels = np.argmax(predictions, axis=1)

# Compute metrics
accuracy = accuracy_score(labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average="weighted")

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


In [None]:
train_results = trainer.evaluate(train_dataset)
print("Training Set Metrics:", train_results)


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from collections import Counter
from transformers import BertForSequenceClassification

# Define the same model architecture
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Load the saved weights
bert_model.load_state_dict(torch.load("bert_model.pth", map_location=torch.device('cpu')))

# Set the model to evaluation mode
bert_model.eval()

print("BERT model loaded successfully!")



from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm
class CyberbullyingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
        
bert_test_dataset = torch.load("bert_test_dataset.pth")


def evaluate_model(model, test_dataset, batch_size=16):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)

            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, all_preds, all_labels


bert_accuracy, predictions, true_labels = evaluate_model(bert_model, bert_test_dataset)

print(f"BERT Model Accuracy: {bert_accuracy:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    precision_recall_fscore_support,
    roc_auc_score,
    cohen_kappa_score
)
from sklearn.preprocessing import label_binarize


class_names = ['not bully', 'religious', 'others', 'sexual']

cm = confusion_matrix(true_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues, values_format='d')
plt.title("Confusion Matrix - BERT Model")
plt.show()


In [None]:
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=class_names))


In [None]:
kappa = cohen_kappa_score(true_labels, predictions)
print(f"\nCohen's Kappa Score: {kappa:.4f}")

In [None]:
def evaluate_logits(model, test_dataset, batch_size=16):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating (for ROC-AUC)"):
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)

            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)

            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_probs), np.array(all_labels)


probs, y_true = evaluate_logits(bert_model, bert_test_dataset)
y_true_bin = label_binarize(y_true, classes=[0, 1, 2, 3])
roc_auc = roc_auc_score(y_true_bin, probs, average='macro')
print(f"\nMacro ROC-AUC Score: {roc_auc:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Your class names
class_names = ['not bully', 'religious', 'others', 'sexual']

# Binarize true labels
y_true_bin = label_binarize(y_true, classes=[0, 1, 2, 3])
n_classes = y_true_bin.shape[1]

# Compute ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Create single ROC plot
plt.figure(figsize=(6, 4))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"{class_names[i]} (AUC = {roc_auc[i]*100:.2f}%)")

# Plot the diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', label='Chance (AUC = 50%)')

# Formatting
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-Class ROC Curves - BERT Model')
plt.legend(loc="lower right", fontsize=8)
plt.grid(True)
plt.tight_layout()
plt.savefig("bert_roc_curve.png", dpi=300)
plt.show()


In [None]:
import os
import torch
model_size_kb = os.path.getsize("bert_model.pth") / 1024
print(f"Model size: {model_size_kb:.0f} KB")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Count trainable parameters
param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameter count: {param_count}")


In [None]:
from sklearn.metrics import matthews_corrcoef

mcc = matthews_corrcoef(true_labels, predictions)
print(f"MCC: {mcc * 100:.2f}%")


In [None]:
# def classify_text(text):
#     """Classifies a given text using the trained model.

#     Args:
#         text (str): The text to classify.

#     Returns:
#         str: The predicted class label.
#     """
#     # Tokenize the input text
#     inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

#     # Move inputs to the device (GPU if available, otherwise CPU)
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     # Get model predictions
#     with torch.no_grad():  # Disable gradient calculation during inference
#         outputs = model(**inputs)

#     # Get the predicted class label
#     predicted_class_id = torch.argmax(outputs.logits).item()
#     predicted_class_label = id_to_class[predicted_class_id]

#     return predicted_class_label

# # Hardcoded input text
# custom_text = "i hate to do this"  # Replace with your desired text

# # Classify the text
# predicted_label = classify_text(custom_text)

# print(f"Predicted Label for '{custom_text}': {predicted_label}")