<a href="https://colab.research.google.com/github/SyedSihabUsSakib/Thesis/blob/master/Multi_feature_Transformer_for_Multiclass_Cyberbullying_Detection_in_Bangla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from transformers import BertModel, BertTokenizer
from gensim.models import Word2Vec
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('punkt')

# Mount Google Drive (if your dataset is in Drive)
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


**# Load and preprocess your data**

In [2]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/dataset.csv')  # Adjust path as needed
df['tokenized'] = df['text'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [3]:
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

**# Train Word2Vec model**

In [4]:
word2vec_model = Word2Vec(sentences=df['tokenized'], vector_size=300, window=5, min_count=1, workers=4)

In [5]:
class CyberbullyingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class CyberbullyingDetector(nn.Module):
    def __init__(self, num_classes=5):
        super(CyberbullyingDetector, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.word2vec_dim = 300

        self.gating = nn.Linear(768 + self.word2vec_dim, 768 + self.word2vec_dim)
        self.classifier = nn.Linear(768 + self.word2vec_dim, num_classes)

    def forward(self, input_ids, attention_mask, word2vec_input):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1]
        word2vec_output = word2vec_input

        combined = torch.cat((bert_output, word2vec_output), dim=1)

        gate = torch.sigmoid(self.gating(combined))
        gated_output = gate * combined

        logits = self.classifier(gated_output)
        return logits

def get_word2vec_embedding(text):
    words = nltk.word_tokenize(text.lower())
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if not word_vectors:
        return torch.zeros(300)
    return torch.tensor(sum(word_vectors) / len(word_vectors))

**# Prepare data**

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = CyberbullyingDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = CyberbullyingDataset(X_test.tolist(), y_test.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

**# Initialize model and training components**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_encoded'], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = CyberbullyingDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = CyberbullyingDataset(X_test.tolist(), y_test.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

**Initialize training model**

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CyberbullyingDetector().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#Training loop


In [9]:
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        word2vec_embeddings = torch.stack([get_word2vec_embedding(text) for text in X_train[batch['input_ids'].cpu().numpy()[:, 0]]]).to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, word2vec_embeddings)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 1.4283236265182495
Epoch 10, Loss: 0.11129895597696304
Epoch 20, Loss: 0.012371429242193699
Epoch 30, Loss: 0.0071983798407018185
Epoch 40, Loss: 0.13607577979564667


In [14]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

# Evaluation
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Get word2vec embeddings for the current batch
        texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        word2vec_embeddings = torch.stack([get_word2vec_embedding(text) for text in texts]).to(device)

        outputs = model(input_ids, attention_mask, word2vec_embeddings)
        _, predicted = torch.max(outputs.data, 1)

        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert to numpy arrays
import numpy as np

all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Overall Accuracy: {accuracy * 100:.2f}%')

# Calculate precision, recall, and F1-score for each class
precision, recall, f1_score, _ = precision_recall_fscore_support(all_labels, all_predictions, average=None)

# Print metrics for each class
print("\nMetrics for each class:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"\nClass: {class_name}")
    print(f"Precision: {precision[i]:.4f}")
    print(f"Recall: {recall[i]:.4f}")
    print(f"F1-score: {f1_score[i]:.4f}")

# Print classification report (includes support for each class)
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions, target_names=label_encoder.classes_))

Overall Accuracy: 74.41%

Metrics for each class:

Class: Cy-Flaming
Precision: 0.6846
Recall: 0.7500
F1-score: 0.7158

Class: Cy-Pull-a-Pig
Precision: 0.7468
Recall: 0.6629
F1-score: 0.7024

Class: Cy-Racism
Precision: 0.8673
Recall: 0.7143
F1-score: 0.7834

Class: Cy-Threat
Precision: 0.6847
Recall: 0.7677
F1-score: 0.7238

Class: Not Bullying
Precision: 0.7719
Recall: 0.8148
F1-score: 0.7928

Classification Report:
               precision    recall  f1-score   support

   Cy-Flaming       0.68      0.75      0.72       136
Cy-Pull-a-Pig       0.75      0.66      0.70        89
    Cy-Racism       0.87      0.71      0.78       119
    Cy-Threat       0.68      0.77      0.72        99
 Not Bullying       0.77      0.81      0.79       108

     accuracy                           0.74       551
    macro avg       0.75      0.74      0.74       551
 weighted avg       0.75      0.74      0.74       551

