Step 1: Install dependencies

In [1]:
!pip install transformers torch scikit-learn emoji nltk tqdm pandas --quiet
!pip install matplotlib-venn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m


Step 2: Import libraries


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch.nn as nn
import tqdm, emoji, re, random
import io
from sklearn.preprocessing import LabelEncoder

Step 3: Set configuration

In [3]:
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
SEED = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f51c352afd0>

Step 4: Upload dataset

In [4]:
from google.colab import files

uploaded = files.upload()

Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv


In [5]:
file_name = list(uploaded.keys())[0]
print(f"Loading file: {file_name}")
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

print("Data Loaded:", df.shape)
df.head()

Loading file: cyberbullying_tweets.csv
Data Loaded: (9996, 2)


Unnamed: 0,tweet_text,cyberbullying_type
0,Every single one is a girl that would have bul...,age
1,Weâve shown my kids a lot of #80smovies and ...,age
2,The only reason i didn't get bullied for these...,age
3,People who say that high school cis boys would...,age
4,I super relate to this story. I was bullied in...,age


Step 5: Preprocessing function

In [6]:


def clean_text(text):
    text = str(text).lower().strip()
    text = re.sub(r"http\S+", "", text)
    text = emoji.demojize(text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    text = re.sub(r"[^a-zA-Z\s:]", "", text)
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text



Step 6: Encode labels

In [7]:
labels = sorted(df['cyberbullying_type'].unique())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {v:k for k,v in label2id.items()}
df['label'] = df['cyberbullying_type'].map(label2id)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['cyberbullying_type'])





Step 7: Split dataset

In [8]:
from sklearn.model_selection import train_test_split

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet_text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 7996
Validation samples: 2000


Initialize tokenizer

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Correctly indented Dataset class

In [10]:
# Dataset class
class CyberBullyingDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len=128):

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len


    def __len__(self):
        return len(self.texts)


    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Create datasets

In [11]:
train_dataset = CyberBullyingDataset(train_texts, train_labels, tokenizer)
val_dataset = CyberBullyingDataset(val_texts, val_labels, tokenizer)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Initialize model

In [12]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Optimizer and scheduler

In [13]:

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3 # 3 epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)


loss_fn = torch.nn.CrossEntropyLoss().to(device)

Training loop

In [14]:
# Train epoch
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    # Training function
    model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm.tqdm(data_loader, desc='Training'):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

Evaluation loop

In [15]:
# model evaluation function
def eval_model(model, data_loader, device, n_examples):
    model.eval()
    predictions, real_values = [], []
    with torch.no_grad():
        for d in tqdm.tqdm(data_loader, desc='Evaluating'):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            real_values.extend(labels.cpu().numpy())
    print(classification_report(real_values, predictions, target_names=label_encoder.classes_))

Training for few epochs

In [16]:
import numpy as np

# Set EPOCHS = 1 to train for one full pass
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(
        model, train_loader, loss_fn, optimizer, device, scheduler, len(train_dataset)
    )
    print(f'Train loss {train_loss}, accuracy {train_acc}')

# This will run after the training loop is finished
print('\nEvaluation:')
eval_model(model, val_loader, device, len(val_dataset))



Epoch 1/3


Training: 100%|██████████| 500/500 [02:54<00:00,  2.87it/s]


Train loss 0.6813089804202318, accuracy 0.7441220610305153

Epoch 2/3


Training: 100%|██████████| 500/500 [02:55<00:00,  2.84it/s]


Train loss 0.3604476069286466, accuracy 0.8621810905452726

Epoch 3/3


Training: 100%|██████████| 500/500 [02:55<00:00,  2.84it/s]


Train loss 0.25427728951722384, accuracy 0.9102051025512756

Evaluation:


Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.24it/s]

                     precision    recall  f1-score   support

                age       0.98      0.97      0.97       351
          ethnicity       0.97      0.96      0.96       333
             gender       0.89      0.88      0.88       352
  not_cyberbullying       0.64      0.61      0.63       332
other_cyberbullying       0.66      0.69      0.68       320
           religion       0.94      0.97      0.95       312

           accuracy                           0.85      2000
          macro avg       0.85      0.85      0.85      2000
       weighted avg       0.85      0.85      0.85      2000






Save model and tokenizer

In [17]:
model.save_pretrained('/content/bert_cyberbullying_model')
tokenizer.save_pretrained('/content/bert_cyberbullying_model')
print('Model saved successfully.')

Model saved successfully.


In [18]:
from google.colab import drive, files
import os, shutil, glob, sys

FILE_NAME = "content_bert_cyberbullying_model.ZIP"

print("1) Mounting Google Drive...")
drive.mount('/content/drive', force_remount=False)

print("\n2) Searching Drive for file name:", FILE_NAME)
matches = []

for root, dirs, files_list in os.walk('/content/drive/MyDrive'):
    for f in files_list:
        if f == FILE_NAME:
            matches.append(os.path.join(root, f))

if not matches:
    matches = glob.glob(f"/content/drive/MyDrive/**/{FILE_NAME}", recursive=True)

if matches:
    print(f"\nFound {len(matches)} match(es). Using first one:")
    src = matches[0]
    print(" ->", src)
    dst = os.path.join('/content', FILE_NAME)
    try:
        shutil.copy(src, dst)
        print("\nCopied to:", dst)
    except Exception as e:
        print("Failed to copy file from Drive to /content:", e)
        print("You can still use the Drive path directly in your code:", src)
        dst = src
else:

    print("\nOpening upload dialog — choose your ZIP file (it will be saved to /content)...")
    uploaded = files.upload()
    if uploaded:
        uploaded_name = next(iter(uploaded.keys()))
        dst = os.path.join('/content', uploaded_name)
        print("Uploaded file saved to:", dst)
        FILE_NAME = uploaded_name
    else:
        print("No file uploaded. Please upload or place the file in Google Drive and re-run.")
        dst = None


1) Mounting Google Drive...
Mounted at /content/drive

2) Searching Drive for file name: content_bert_cyberbullying_model.ZIP

Opening upload dialog — choose your ZIP file (it will be saved to /content)...


Saving content_bert_cyberbullying_model.zip to content_bert_cyberbullying_model.zip
Uploaded file saved to: /content/content_bert_cyberbullying_model.zip


In [20]:
!pip install -q transformers gradio torch sentencepiece --upgrade

import os
import zipfile
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import gradio as gr

MODEL_ZIP = "/content/content_bert_cyberbullying_model.zip"
EXTRACT_DIR = "/content/bert_model_extracted"
LABELS = ["age","ethnicity","gender","religion","other_cyberbullying","not_cyberbullying"]
THRESHOLD = 0.5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def extract_model(zip_path, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(out_dir)
    return out_dir

def load_model_and_tokenizer(model_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.to(DEVICE).eval()
    return tokenizer, model

if not os.path.isdir(EXTRACT_DIR) or not os.listdir(EXTRACT_DIR):
    extract_model(MODEL_ZIP, EXTRACT_DIR)

def find_model_root(d):
    for root, dirs, files in os.walk(d):
        if "config.json" in files:
            return root
    return d

MODEL_ROOT = find_model_root(EXTRACT_DIR)
tokenizer, model = load_model_and_tokenizer(MODEL_ROOT)

sigmoid = torch.nn.Sigmoid()
softmax = torch.nn.Softmax(dim=-1)

def predict_text(text):
    if not text or not text.strip():
        return "Enter text.", []

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits.squeeze(0).cpu()

    num_labels = logits.shape[-1]
    problem_type = getattr(model.config, "problem_type", None)

    if problem_type == "multi_label_classification" or num_labels == len(LABELS):
        probs = sigmoid(logits).numpy()
        predictions = [LABELS[i] for i,p in enumerate(probs) if p >= THRESHOLD]
        if not predictions:
            predictions = [LABELS[int(np.argmax(probs))]]
        prob_table = [[LABELS[i], float(probs[i])] for i in range(len(LABELS))]
        return ", ".join(predictions), prob_table

    probs = softmax(logits.unsqueeze(0)).squeeze(0).numpy()
    top = int(np.argmax(probs))
    pred = LABELS[top]
    prob_table = [[LABELS[i], float(probs[i])] for i in range(len(LABELS))]
    return pred, prob_table


with gr.Blocks(title="Cyberbullying Entity Predictor") as demo:

    gr.Markdown(
        """
        <div style='padding:18px; border:1px solid #ddd; text-align:center; border-radius:10px; margin-bottom:20px;'>
            <h3 style='margin-top:0;'>Cyberbullying Entity Predictor</h3>
            <div style='text-align:center; font-size:16px; color:gray; margin-bottom:25px;'>
            Enter text and get predictions for 6 cyberbullying categories.
        </div>
        </div>
        """,
        elem_id="input_header"
    )

    text_input = gr.Textbox(
        lines=4,
        label="Enter text",
        placeholder="Type your message here...",
    )

    predict_btn = gr.Button("Predict")

    gr.Markdown(
        """
        <div style='padding:18px; border:1px solid #ddd; border-radius:10px; margin-top:20px;'>
            <h3 style='margin-top:0;'>Prediction Output</h3>
        </div>
        """,
        elem_id="output_header"
    )

    pred_out = gr.Textbox(
        label="Predicted Label(s)",
        interactive=False
    )

    prob_out = gr.Dataframe(
        headers=["Label", "Probability"],
        label="Probabilities",
        interactive=False
    )

    predict_btn.click(
        fn=predict_text,
        inputs=text_input,
        outputs=[pred_out, prob_out]
    )


demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7f465811c45b33ca26.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


