In [1]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from torch.nn import CrossEntropyLoss
from google.colab import files

In [3]:
# Open the Colab upload widget; the result is indexed by uploaded filename and
# holds each file's content.
uploaded = files.upload()

# Fail right here with a clear message instead of hitting a NameError on `df`
# in a later cell when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; upload the tweets CSV to continue.")

for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))

# Colab saves each upload into the working directory under its own name, so the
# file can be read by filename. Parse once, after the loop: if several files
# were uploaded, the last one is used (as before) without parsing the others.
df = pd.read_csv(fn)

Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv
User uploaded file "cyberbullying_tweets.csv" with length 7174545 bytes


In [4]:
# Keep only the tweet text and its label, then discard any row missing either one
relevant_columns = ['tweet_text', 'cyberbullying_type']
df = df[relevant_columns].dropna()

In [5]:
# Replace the cyberbullying_type labels with integer class ids.
# The fitted encoder is kept so ids can be mapped back to the original labels.
label_encoder = LabelEncoder().fit(df['cyberbullying_type'])
df['cyberbullying_type'] = label_encoder.transform(df['cyberbullying_type'])

In [6]:
# Count the distinct encoded classes; this sizes the classifier head later on
distinct_labels = df['cyberbullying_type'].unique()
num_labels = len(distinct_labels)
print(f"Number of unique labels: {num_labels}")

Number of unique labels: 6


In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Give both splits a fresh 0..n-1 index (the shuffled original index is dropped)
train_df, val_df = (
    split.reset_index(drop=True) for split in (train_df, val_df)
)

In [9]:
# Dataset Class for PyTorch
class CyberbullyingDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        dataframe: pandas DataFrame with a 'tweet_text' column and a
            'cyberbullying_type' column whose values convert to torch.long.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask' (e.g. a Hugging Face tokenizer).
        max_len: length passed to the tokenizer as ``max_length``; sequences
            are padded/truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.texts = dataframe['tweet_text']
        self.labels = dataframe['cyberbullying_type']
        self.max_len = max_len

    def __len__(self):
        """Number of rows (samples) in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized sample at position ``index``.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels', all
            torch.long tensors.
        """
        # Both columns are read positionally. The previous label-based lookup
        # of the text only worked when the frame's index was exactly 0..n-1,
        # and could raise KeyError or pair a text with the wrong label.
        text = str(self.texts.iloc[index])
        label = self.labels.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,  # no second sequence: single-sentence classification
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)  # LongTensor for CrossEntropyLoss
        }

In [10]:
# Tokenizer of the multilingual cased BERT checkpoint (chosen for Hindi-English text)
tokenizer_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [11]:
# Wrap each split in a CyberbullyingDataset; both share one sequence length
sequence_length = 128
train_dataset = CyberbullyingDataset(train_df, tokenizer, max_len=sequence_length)
val_dataset = CyberbullyingDataset(val_df, tokenizer, max_len=sequence_length)

In [12]:
# Pre-trained multilingual BERT with a classification head sized to num_labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_labels,
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Trainer subclass whose loss is computed with an explicit CrossEntropyLoss
class CustomTrainer(Trainer):
    """Trainer variant that applies CrossEntropyLoss to the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: model called as ``model(**inputs)``; its result must expose ``.logits``.
            inputs: batch mapping containing a "labels" entry, which is removed
                from the mapping before the forward pass.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: accepted for signature compatibility; not used here.
        """
        targets = inputs.pop("labels")  # taken out so only the remaining inputs reach the model
        model_outputs = model(**inputs)
        criterion = CrossEntropyLoss()
        batch_loss = criterion(model_outputs.logits, targets)
        if return_outputs:
            return (batch_loss, model_outputs)
        return batch_loss

# Define Training Arguments
# Evaluation and checkpointing both run once per epoch; load_best_model_at_end
# requires the two strategies to match.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and may be rejected by
    # newer transformers releases, which the unpinned install can pull in.
    eval_strategy="epoch",        # Evaluate at the end of every epoch
    save_strategy="epoch",        # Save model at the end of every epoch
    load_best_model_at_end=True,
    report_to="none"
)

# Accuracy metric handed to the Trainer for evaluation
def compute_metrics(pred):
    """Return ``{'accuracy': ...}`` for an object carrying ``predictions`` and ``label_ids``.

    The predicted class of each row is the arg-max over axis 1 of
    ``pred.predictions``; accuracy is computed against ``pred.label_ids``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Assemble the CrossEntropyLoss-based CustomTrainer from the model, arguments,
# datasets and metric function
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # validation split
    compute_metrics=compute_metrics,
)
trainer = CustomTrainer(**trainer_kwargs)

# Run fine-tuning; as the cell's last expression, the returned value is displayed
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.3296,0.43108,0.824195
2,0.2861,0.341584,0.865395
3,0.2009,0.360511,0.869169


TrainOutput(global_step=3579, training_loss=0.3810048966009412, metrics={'train_runtime': 2937.1002, 'train_samples_per_second': 38.97, 'train_steps_per_second': 1.219, 'total_flos': 7529127465641472.0, 'train_loss': 0.3810048966009412, 'epoch': 3.0})

In [14]:
# Run evaluation with the trainer; metric names carry the "eval" prefix
eval_result = trainer.evaluate(metric_key_prefix="eval")

In [15]:
# Report the evaluation accuracy as a percentage with two decimals
accuracy_percent = eval_result['eval_accuracy'] * 100
print(f"Accuracy: {accuracy_percent:.2f}%")

Accuracy: 86.54%
