In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be opened by
# path from this notebook (uses google.colab, so this only works on Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Read the dataset CSV from the mounted Drive folder. The 'Text' column
# supplies the model inputs and the 'Label' column the class labels.
file_path = '/content/drive/MyDrive/Colab Notebooks/Bengali Hate Meme.csv'
data = pd.read_csv(file_path)

texts = data['Text'].tolist()
labels = data['Label'].tolist()

# Map every distinct label to a class index.
# Iterating a bare set() has no guaranteed order (string hashes are randomized
# per interpreter process), so the same label could receive a different index
# after a kernel restart. Sorting the distinct labels pins the mapping;
# key=str keeps the ordering well-defined even if the column mixes types.
label_dict = {label: idx for idx, label in enumerate(sorted(set(labels), key=str))}
numeric_labels = [label_dict[label] for label in labels]

# First hold out 10% of all rows as the test split, then carve 10% of the
# remaining rows off as the validation split. random_state fixes both shuffles.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, numeric_labels, test_size=0.1, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

tokenizer = AutoTokenizer.from_pretrained('sagorsarker/bangla-bert-base')

def filter_texts_and_labels(texts, labels):
    """Keep only the (text, label) pairs whose text is a non-empty string.

    Args:
        texts: Iterable of candidate texts; entries may be non-strings
            (e.g. None or NaN) or empty strings.
        labels: Iterable of labels, paired positionally with ``texts``.

    Returns:
        A ``(filtered_texts, filtered_labels)`` tuple of two lists of equal
        length, in the original order. Pairing stops at the shorter of the
        two inputs.
    """
    surviving_pairs = [
        (candidate, target)
        for candidate, target in zip(texts, labels)
        if isinstance(candidate, str) and candidate
    ]
    kept_texts = [candidate for candidate, _ in surviving_pairs]
    kept_labels = [target for _, target in surviving_pairs]
    return kept_texts, kept_labels

# Drop non-string / empty texts from each split; the matching labels are
# removed with them so texts and labels stay aligned.
train_texts_filtered, train_labels = filter_texts_and_labels(train_texts, train_labels)
val_texts_filtered, val_labels = filter_texts_and_labels(val_texts, val_labels)
test_texts_filtered, test_labels = filter_texts_and_labels(test_texts, test_labels)

# One set of tokenizer options shared by all three splits: truncate to at most
# 128 tokens, pad within the split, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors='pt')
train_encodings = tokenizer(train_texts_filtered, **encode_options)
val_encodings = tokenizer(val_texts_filtered, **encode_options)
test_encodings = tokenizer(test_texts_filtered, **encode_options)

# Convert the integer label lists to tensors so they can be sliced per batch.
train_labels, val_labels, test_labels = (
    torch.tensor(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained encoder with a classification head sized to the number of
# distinct labels in label_dict.
model = AutoModelForSequenceClassification.from_pretrained(
    'sagorsarker/bangla-bert-base',
    num_labels=len(label_dict),
)
# Kept as the cell's last expression so the module summary is displayed.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(102025, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
epochs = 8
batch_size = 32
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Dedicated, seeded generator so the per-epoch batch order is reproducible
# without touching the global RNG state.
shuffle_generator = torch.Generator().manual_seed(42)


def _evaluate_split(encodings, targets):
    """Score the model on one split without updating its weights.

    Args:
        encodings: Tokenizer output for the split (mapping of name -> tensor).
        targets: 1-D tensor of class indices aligned with ``encodings``.

    Returns:
        ``(accuracy, mean_loss, predictions, gold_labels)`` where accuracy and
        mean_loss are averaged per sample (not per batch) and the last two are
        plain Python lists of class indices.

    Raises:
        ValueError: if the split contains no samples.
    """
    model.eval()
    correct, loss_sum, seen = 0, 0.0, 0
    predictions, gold_labels = [], []

    with torch.no_grad():
        for start in range(0, len(encodings['input_ids']), batch_size):
            batch_input = {key: val[start:start + batch_size].to(device) for key, val in encodings.items()}
            batch_labels = targets[start:start + batch_size].to(device)

            logits = model(**batch_input).logits
            loss = criterion(logits, batch_labels)
            batch_predictions = torch.argmax(logits, dim=1)

            # criterion returns the batch mean; re-weight by batch size so a
            # short final batch does not count as much as a full one.
            n = batch_labels.size(0)
            loss_sum += loss.item() * n
            correct += (batch_predictions == batch_labels).sum().item()
            seen += n

            predictions.extend(batch_predictions.cpu().tolist())
            gold_labels.extend(batch_labels.cpu().tolist())

    if seen == 0:
        raise ValueError("cannot evaluate an empty split")
    return correct / seen, loss_sum / seen, predictions, gold_labels


train_accuracy_values, train_loss_values = [], []
test_accuracy_values, test_loss_values = [], []

num_train = len(train_encodings['input_ids'])
if num_train == 0:
    raise ValueError("training split is empty")

# NOTE(review): per-epoch monitoring below uses the held-out test split, while
# the validation split (val_encodings / val_labels) is not used in this cell.
# Prefer the validation split for any model-selection decision; the test
# metrics are kept here because later cells may read test_*_values.
for epoch in range(epochs):
    model.train()
    correct, loss_sum, seen = 0, 0.0, 0

    # Visit the training samples in a fresh random order every epoch.
    permutation = torch.randperm(num_train, generator=shuffle_generator)

    for start in range(0, num_train, batch_size):
        batch_idx = permutation[start:start + batch_size]
        batch_input = {key: val[batch_idx].to(device) for key, val in train_encodings.items()}
        # Named batch_labels so the module-level `labels` list is not overwritten.
        batch_labels = train_labels[batch_idx].to(device)

        optimizer.zero_grad()
        logits = model(**batch_input).logits
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Sample-weighted running totals (see _evaluate_split for rationale).
        n = batch_labels.size(0)
        loss_sum += loss.item() * n
        correct += (torch.argmax(logits, dim=1) == batch_labels).sum().item()
        seen += n

    epoch_train_accuracy = correct / seen
    epoch_train_loss = loss_sum / seen
    train_accuracy_values.append(epoch_train_accuracy)
    train_loss_values.append(epoch_train_loss)
    print(f"Epoch {epoch+1} - Train Accuracy: {epoch_train_accuracy:.4f} - Train Loss: {epoch_train_loss:.4f}")

    epoch_test_accuracy, epoch_test_loss, _, _ = _evaluate_split(test_encodings, test_labels)
    test_accuracy_values.append(epoch_test_accuracy)
    test_loss_values.append(epoch_test_loss)
    print(f"Epoch {epoch+1} - Test Accuracy: {epoch_test_accuracy:.4f} - Test Loss: {epoch_test_loss:.4f}")

# Final pass over the test split to collect per-sample predictions for the
# class-weighted precision / recall / F1 summary.
_, _, all_test_predictions, all_test_labels = _evaluate_split(test_encodings, test_labels)

precision = precision_score(all_test_labels, all_test_predictions, average='weighted')
recall = recall_score(all_test_labels, all_test_predictions, average='weighted')
f1 = f1_score(all_test_labels, all_test_predictions, average='weighted')

print(f"Test Precision: {precision:.4f} - Test Recall: {recall:.4f} - Test F1 Score: {f1:.4f}")


Epoch 1 - Train Accuracy: 0.659304838564838 - Train Loss: 0.504
Epoch 1 - Test Accuracy: 0.4946073717940053 - Test Loss: 0.617
Epoch 2 - Train Accuracy:0.6893048385623145 - Train Loss: 0.334
Epoch 2 - Test Accuracy: 0.5339743589711782- Test Loss: 0.529
Epoch 3 - Train Accuracy: 0.719304838236790 - Train Loss: 0.297
Epoch 3 - Test Accuracy: 0.5796073479487180- Test Loss: 0.486
Epoch 4 - Train Accuracy:0.7534048300948382 - Train Loss: 0.235
Epoch 4 - Test Accuracy: 0.5867912717909367 - Test Loss: 0.4244
Epoch 5 - Train Accuracy:0.8203836485648811 - Train Loss: 0.191
Epoch 5 - Test Accuracy: 0.6167948717948743 - Test Loss: 0.3330
Epoch 6 - Train Accuracy: 0.879311333856567 - Train Loss: 0.13433
Epoch 6 - Test Accuracy: 0.6467948717126741 - Test Loss: 0.5342
Epoch 7 - Train Accuracy: 0.917085918114144 - Train Loss: 0.197
Epoch 7 - Test Accuracy: 0.6594871793471112- Test Loss: 0.4232
Epoch 8 - Train Accuracy: 0.977085918113254 - Train Loss: 0.0923
Epoch 8 - Test Accuracy: 0.6867948717948713