# Download/Upload Data

In [None]:
# Fetch the bootcamp repository; the dataset CSVs read later in this notebook live inside it.
!git clone https://github.com/ciol-researchlab/CIOL-Winter-ML-Bootcamp.git

Cloning into 'CIOL-Winter-ML-Bootcamp'...
remote: Enumerating objects: 149, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 149 (delta 0), reused 4 (delta 0), pack-reused 142 (from 1)[K
Receiving objects: 100% (149/149), 44.71 MiB | 17.97 MiB/s, done.
Resolving deltas: 100% (26/26), done.


# 2. Setting up the environment

In [None]:
# Tabular Data Analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import random
import time
import warnings
warnings.filterwarnings('ignore')

# 4. Load the dataset

In [None]:
# Load the labelled comments from the cloned repository.
# The path is absolute; adjust it if the repository was cloned somewhere else.
df = pd.read_csv("/content/CIOL-Winter-ML-Bootcamp/datasets/session4/support/bangla_political_comments.csv")
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,Comment,Label,Language
0,এখন এদের এক এক করে ধরেন....,Negative,BANGLA
1,হাসিনার তার দালালদের সবার বিচার করতে হবে,Negative,BANGLA
2,"ভাই এখানে রাম রাজত্ব হবে না,এখানে হবে শেখ রাজত্ব।",Negative,BANGLA
3,সাংগাতিক নিউজ 😂🥱,Negative,BANGLA
4,"ওদের অস্থিত্ব বিলিন করে দিলাম,,,খুনের প্রতিসোধ...",Negative,BANGLA
5,আদু ভাই মামা ভারি পলাইছে,Negative,BANGLA
6,আল্লাহ শয়তানদের বিচার করেছে,Negative,BANGLA
7,সাদ্দাম আর ইনান এই দুইটাকে ধরতে হবে। ধরার পর ব...,Negative,BANGLA
8,সাদ্দাম হালা কই,Negative,BANGLA
9,সাদ্দাম তো বলছেই তার জন্ম রাজপথে 🤣,Negative,BANGLA


In [None]:
# Load the held-out test comments from the same repository folder as the training file.
test_df = pd.read_csv("/content/CIOL-Winter-ML-Bootcamp/datasets/session4/support/test.csv")
# Preview the first 3 rows.
test_df.head(3)

Unnamed: 0,Comment,Label,Language
0,ইদুরের গর্তে,Negative,BANGLA
1,এরা এখন কই,Negative,BANGLA
2,আদু ভাই,Negative,BANGLA


Labels are not numerical. Let's make them numerical.

In [None]:
# Map text labels to numerical values
# Ids are assigned in order of first appearance of each label in df.
# NOTE: this cell overwrites the "Label" columns in place. Re-running it without first
# reloading df rebuilds label_mapping from the integer ids, so the original text names
# (used later when printing misclassified examples) are no longer available.
label_mapping = {label: idx for idx, label in enumerate(df["Label"].unique())}
df["Label"] = df["Label"].map(label_mapping)  # Change as necessary
# The test labels reuse the mapping built from df so both frames share the same ids.
test_df["Label"] = test_df["Label"].map(label_mapping)  # Change as necessary

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim

No validation set is provided, so let's hold out 10% of the training data (stratified by label) to use for validation.

In [None]:
# Split into train, validation
# 10% of df is held out for validation; stratifying on "Label" keeps the class
# proportions the same in both parts, and random_state=42 makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df["Label"], random_state=42)

In [None]:
# Hugging Face BanglaBERT Model
# model_name is the Hugging Face Hub identifier; the tokenizer and the encoder are both
# loaded from it so that they match each other.
model_name = "csebuetnlp/banglabert"  # Change as necessary
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

In [None]:
# Dataset Preparation
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one comment per item on demand.

    Args:
        data: DataFrame providing a "Comment" (text) and a "Label" (integer id) column.
        tokenizer: Callable applied to a single text with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments; its result must
            expose "input_ids" and "attention_mask" tensors with a leading batch axis.
        max_length: Length every text is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized comment and label tensor for the row at position ``index``."""
        record = self.data.iloc[index]  # positional lookup, independent of the frame's index labels
        comment, target = record["Comment"], record["Label"]  # Change as necessary
        encoded = self.tokenizer(
            comment,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a batch axis of size 1; drop it so that the
        # DataLoader can stack items into (batch, max_length) tensors.
        sample = {field: encoded[field].squeeze(0) for field in ("input_ids", "attention_mask")}
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample


In [None]:
# Hyperparameters
# max_length: every comment is padded/truncated to this many tokens by the dataset.
max_length = 256  # Change as necessary
# batch_size: number of samples per DataLoader batch (train, validation and test).
batch_size = 8  # Keep it small for low GPU memory  # Change as necessary
# learning_rate: step size handed to the Adam optimizer.
learning_rate = 1e-4  # Change as necessary
# epochs: number of full passes over the training set in the training loop.
epochs = 5  # Change as necessary

In [None]:
# Create DataLoaders
# One dataset per split; all share the same tokenizer and max_length.
train_dataset = SentimentDataset(train_df, tokenizer, max_length)
val_dataset = SentimentDataset(val_df, tokenizer, max_length)
test_dataset = SentimentDataset(test_df, tokenizer, max_length)

# Only the training loader shuffles. The validation and test loaders keep row order,
# which later cells rely on when they line predictions up with DataFrame rows.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# MLP Model
class SentimentClassifier(nn.Module):
    """Frozen transformer encoder followed by a three-layer MLP classification head.

    The encoder is used purely as a feature extractor: it runs without gradient
    tracking and is always kept in eval mode, so only ``fc1``/``fc2``/``fc3`` learn.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an object
            with ``last_hidden_state`` of shape (batch, seq_len, hidden_size).
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        num_classes: Number of output scores per example.
    """

    def __init__(self, bert_model, hidden_dim1, hidden_dim2, num_classes):
        super().__init__()
        self.bert_model = bert_model
        self.fc1 = nn.Linear(bert_model.config.hidden_size, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def train(self, mode=True):
        """Set train/eval mode on the head while pinning the encoder to eval mode.

        ``torch.no_grad()`` in ``forward`` only disables gradient tracking; without this
        override ``model.train()`` would also enable the encoder's own dropout layers and
        make the "frozen" embeddings random from one forward pass to the next.

        Returns:
            self, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.bert_model.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores of shape (batch, num_classes)."""
        with torch.no_grad():  # Freeze BERT for embedding extraction
            # Keyword arguments avoid depending on the encoder's positional parameter order.
            outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token (first position)
        x = self.relu(self.fc1(cls_embeddings))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        return self.fc3(x)

In [None]:
# Initialize Model
# One output score per distinct value in df["Label"].
num_labels = len(df["Label"].unique())
model = SentimentClassifier(bert_model, hidden_dim1=512, hidden_dim2=256, num_classes=num_labels)   # Change hidden_dim1 hidden_dim2 as necessary
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell, this also displays the model's module tree.
model.to(device)

SentimentClassifier(
  (bert_model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [None]:
# Loss and Optimizer
# Cross-entropy over the raw class scores returned by the model.
criterion = nn.CrossEntropyLoss()
# model.parameters() also contains the encoder's weights, but the encoder runs under
# torch.no_grad() in forward(), so only the fc layers receive gradients.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training Function
def train_model(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Uses the notebook-level ``device`` to place each batch.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns per-class scores.
        dataloader: Yields dict batches with "input_ids", "attention_mask" and "label".
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(scores, labels)``.

    Returns:
        (mean loss per batch, accuracy over all samples seen).
    """
    model.train()
    running_loss = 0
    predicted, expected = [], []
    for batch in dataloader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(ids, mask)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predicted.extend(logits.argmax(dim=1).detach().cpu().numpy())
        expected.extend(targets.detach().cpu().numpy())
    accuracy = accuracy_score(expected, predicted)
    mean_loss = running_loss / len(dataloader)  # average over batches, not samples
    return mean_loss, accuracy

In [None]:
# Validation Function
def evaluate_model(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Uses the notebook-level ``device`` to place each batch.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns per-class scores.
        dataloader: Yields dict batches with "input_ids", "attention_mask" and "label".
        criterion: Loss called as ``criterion(scores, labels)``.

    Returns:
        (mean loss per batch, accuracy, weighted F1, weighted precision, weighted recall).
    """
    model.eval()
    loss_sum = 0
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["label"].to(device)
            logits = model(ids, mask)
            loss_sum += criterion(logits, targets).item()
            y_pred.extend(logits.argmax(dim=1).detach().cpu().numpy())
            y_true.extend(targets.detach().cpu().numpy())
    acc = accuracy_score(y_true, y_pred)
    # Class-frequency-weighted averages of the per-class metrics.
    f1, precision, recall = (
        metric(y_true, y_pred, average="weighted")
        for metric in (f1_score, precision_score, recall_score)
    )
    return loss_sum / len(dataloader), acc, f1, precision, recall

In [None]:
# Training Loop
# Each epoch: one training pass over train_loader, then a full evaluation on val_loader.
for epoch in range(epochs):
    train_loss, train_acc = train_model(model, train_loader, optimizer, criterion)
    val_loss, val_acc, val_f1, val_precision, val_recall = evaluate_model(model, val_loader, criterion)
    # Report 1-based epoch numbers together with the metrics of both passes.
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")

Epoch 1/5
Train Loss: 0.6272, Train Acc: 0.6732
Val Loss: 0.5924, Val Acc: 0.6794, Val F1: 0.5497, Precision: 0.4616, Recall: 0.6794
Epoch 2/5
Train Loss: 0.5898, Train Acc: 0.6903
Val Loss: 0.5380, Val Acc: 0.7328, Val F1: 0.6594, Precision: 0.8082, Recall: 0.7328
Epoch 3/5
Train Loss: 0.5459, Train Acc: 0.7193
Val Loss: 0.4880, Val Acc: 0.7481, Val F1: 0.6931, Precision: 0.7897, Recall: 0.7481
Epoch 4/5
Train Loss: 0.4952, Train Acc: 0.7491
Val Loss: 0.4365, Val Acc: 0.8321, Val F1: 0.8283, Precision: 0.8286, Recall: 0.8321
Epoch 5/5
Train Loss: 0.4567, Train Acc: 0.7952
Val Loss: 0.4084, Val Acc: 0.8244, Val F1: 0.8213, Precision: 0.8208, Recall: 0.8244


Let's identify and display 5 random incorrect predictions from the validation set.

In [None]:
def get_error_examples(model, dataloader, tokenizer, data):
    """Print up to 5 randomly chosen misclassified examples from ``dataloader``.

    Relies on the notebook-level ``device`` and ``label_mapping`` (label name -> id).

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns per-class scores.
        dataloader: Yields dict batches with "input_ids", "attention_mask" and "label".
            It must iterate ``data`` in row order (no shuffling), because the comment
            text is looked up by position.
        tokenizer: Unused; kept so existing calls keep working.
        data: DataFrame with a "Comment" column, aligned row-for-row with ``dataloader``.

    Returns:
        None. The examples are printed.
    """
    model.eval()
    # Invert the mapping once instead of scanning its keys/values for every printed label.
    id_to_label = {idx: name for name, idx in label_mapping.items()}
    incorrect_examples = []
    row_offset = 0  # position in `data` of the first sample of the current batch

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].cpu().numpy()
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).detach().cpu().numpy()

            # Collect incorrect examples
            for idx, (pred, label) in enumerate(zip(preds, labels)):
                if pred != label:  # Only collect incorrect ones
                    incorrect_examples.append({
                        "text": data.iloc[row_offset + idx]["Comment"],
                        "true_label": int(label),
                        "predicted_label": int(pred),
                    })
            row_offset += len(labels)

    # Randomly sample 5 incorrect examples
    if len(incorrect_examples) >= 5:
        sampled_errors = random.sample(incorrect_examples, 5)
    else:
        sampled_errors = incorrect_examples  # If less than 5 errors exist

    # Display the examples
    for error in sampled_errors:
        print(f"Text: {error['text']}")
        print(f"True Label: {error['true_label']} ({id_to_label[error['true_label']]})")
        print(f"Predicted Label: {error['predicted_label']} ({id_to_label[error['predicted_label']]})")
        print("-" * 50)

In [None]:
# Call the function on validation data
# val_loader is not shuffled, so its batches line up positionally with the rows of val_df.
get_error_examples(model, val_loader, tokenizer, val_df)

Text: আমরা চাকুরিদাতা ❤️❤️❤️
True Label: 1 (Positive)
Predicted Label: 0 (Negative)
--------------------------------------------------
Text: সবে মিলে করি কাজ হারি জিতি নাহি লাজ 😀😀
True Label: 1 (Positive)
Predicted Label: 0 (Negative)
--------------------------------------------------
Text: ভালোবাসা অবিরাম ❤❤
True Label: 1 (Positive)
Predicted Label: 0 (Negative)
--------------------------------------------------
Text: প্রাইভেট বিশ্ববিদ্যালয়ের কথা কোথাও বলা হয় না অথচ আমরাই বন্ধ হওয়া আন্দোলন আবার জাগিয়ে তুলেছিলাম
True Label: 1 (Positive)
Predicted Label: 0 (Negative)
--------------------------------------------------
Text: বাঘের মত হুংকার দিয়ে শিয়ালের মত লেজ গুটিয়ে পালালো
True Label: 0 (Negative)
Predicted Label: 1 (Positive)
--------------------------------------------------


Let's predict on test data.

In [None]:
# Predictions on Test Set
# Eval mode plus no_grad: inference only, no weight updates or gradient bookkeeping.
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        # The batch's "label" entry is not needed for prediction.
        input_ids, attention_mask = batch["input_ids"].to(device), batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask)
        # Predicted class id = index of the highest score for each sample.
        preds = torch.argmax(outputs, dim=1).detach().cpu().numpy()
        # test_loader is not shuffled, so test_preds ends up in the same order as test_df.
        test_preds.extend(preds)

In [None]:
# Attach the predictions before previewing so the Predicted_Label column is shown
# regardless of cell execution order; assigning the same values again when saving is harmless.
test_df["Predicted_Label"] = test_preds
test_df.head(10)

Unnamed: 0,Comment,Label,Language,Predicted_Label
0,ইদুরের গর্তে,0,BANGLA,0
1,এরা এখন কই,0,BANGLA,0
2,আদু ভাই,0,BANGLA,0
3,"সাদ্দাম তুই কই,,,?\nতোকে খুঁজতাছি",0,BANGLA,0
4,এখন দেখি বাংলাদেশে ছাত্রলীগের কোনো জায়গা নেই,0,BANGLA,0
5,ক্ষমতা পেয়ে ভাবছিলো পুরা দেশটাই ওর মার,0,BANGLA,0
6,জারজ সাদ্দাম গেল কই?,0,BANGLA,0
7,তুই কোথায় আছিস ফুটফুটে সুন্দর 😅😅😅😅😮,0,BANGLA,0
8,দুই বাটপার কোথায়,0,BANGLA,0
9,ও নিজেই উড়ে গেছে,0,BANGLA,0


In [None]:
# Save Test Predictions
# Store one predicted class id per test row (test_preds follows the row order of test_df).
test_df["Predicted_Label"] = test_preds
# Written to the current working directory; index=False leaves the DataFrame index out of the file.
test_df.to_csv("test_predictions.csv", index=False)