Step 1: Install dependencies

In [1]:
!pip install transformers torch scikit-learn emoji nltk tqdm pandas --quiet
!pip install matplotlib-venn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m31.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m


Step 2: Import libraries


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch.nn as nn
import tqdm, emoji, re, random
import io
from sklearn.preprocessing import LabelEncoder

Step 3: Set configuration

In [3]:
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
SEED = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7c4cfe33f070>

Step 4: Upload dataset

In [4]:
from google.colab import files

# uploaded variable
uploaded = files.upload()

Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv


In [33]:
# first uploaded file
file_name = list(uploaded.keys())[0]
print(f"Loading file: {file_name}")

# Load the file using the retrieved name
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

print("Data Loaded:", df.shape)
df.head()

Loading file: cyberbullying_tweets.csv
Data Loaded: (9996, 2)


Unnamed: 0,tweet_text,cyberbullying_type
0,Every single one is a girl that would have bul...,age
1,Weâve shown my kids a lot of #80smovies and ...,age
2,The only reason i didn't get bullied for these...,age
3,People who say that high school cis boys would...,age
4,I super relate to this story. I was bullied in...,age


Step 5: Preprocessing function

In [17]:


def clean_text(text):
    text = str(text).lower().strip()
    text = re.sub(r"http\S+", "", text)
    text = emoji.demojize(text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    text = re.sub(r"[^a-zA-Z\s:]", "", text)
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text



Step 6: Encode labels

In [18]:
labels = sorted(df['cyberbullying_type'].unique())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {v:k for k,v in label2id.items()}
df['label'] = df['cyberbullying_type'].map(label2id)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['cyberbullying_type'])





Step 7: Split dataset

In [19]:
from sklearn.model_selection import train_test_split

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet_text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 7996
Validation samples: 2000


Initialize tokenizer

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Correctly indented Dataset class

In [21]:
# Dataset class
class CyberBullyingDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len=128):

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len


    def __len__(self):
        return len(self.texts)


    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Create datasets

In [22]:
train_dataset = CyberBullyingDataset(train_texts, train_labels, tokenizer)
val_dataset = CyberBullyingDataset(val_texts, val_labels, tokenizer)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Initialize model

In [23]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Optimizer and scheduler

In [24]:

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3 # 3 epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)


loss_fn = torch.nn.CrossEntropyLoss().to(device)

Training loop

In [25]:
# Train epoch
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    # Training function
    model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm.tqdm(data_loader, desc='Training'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

Evaluation loop

In [26]:
# model evaluation function
def eval_model(model, data_loader, device, n_examples):
    model.eval()
    predictions, real_values = [], []
    with torch.no_grad():
        for d in tqdm.tqdm(data_loader, desc='Evaluating'):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            real_values.extend(labels.cpu().numpy())
    print(classification_report(real_values, predictions, target_names=label_encoder.classes_))

Training for few epochs

In [27]:
import numpy as np

# Set EPOCHS = 1 to train for one full pass
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, scheduler, len(train_dataset)
    )
    print(f'Train loss {train_loss}, accuracy {train_acc}')

# This will run after the training loop is finished
print('\nEvaluation:')
eval_model(model, val_loader, device, len(val_dataset))



Epoch 1/3


Training: 100%|██████████| 500/500 [02:49<00:00,  2.95it/s]


Train loss 0.6710292314589024, accuracy 0.7506253126563281

Epoch 2/3


Training: 100%|██████████| 500/500 [02:49<00:00,  2.95it/s]


Train loss 0.3540302021354437, accuracy 0.8664332166083041

Epoch 3/3


Training: 100%|██████████| 500/500 [02:49<00:00,  2.95it/s]


Train loss 0.2506668368168175, accuracy 0.9109554777388694

Evaluation:


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.63it/s]

                     precision    recall  f1-score   support

                age       0.98      0.97      0.98       351
          ethnicity       0.98      0.95      0.96       333
             gender       0.87      0.89      0.88       352
  not_cyberbullying       0.62      0.60      0.61       332
other_cyberbullying       0.66      0.68      0.67       320
           religion       0.95      0.97      0.96       312

           accuracy                           0.84      2000
          macro avg       0.84      0.84      0.84      2000
       weighted avg       0.84      0.84      0.84      2000






Save model and tokenizer

In [28]:
model.save_pretrained('/content/bert_cyberbullying_model')
tokenizer.save_pretrained('/content/bert_cyberbullying_model')
print('Model saved successfully.')

Model saved successfully.
