In [None]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# Assuming Google Colab environment, mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Colab Notebooks/Bengali Hate Meme.csv'
data = pd.read_csv(file_path)

texts = data['Text'].tolist()
labels = data['Label'].tolist()
label_dict = {label: idx for idx, label in enumerate(set(labels))}
numeric_labels = [label_dict[label] for label in labels]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, numeric_labels, test_size=0.1, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

def filter_texts_and_labels(texts, labels):
    filtered_texts = []
    filtered_labels = []
    for text, label in zip(texts, labels):
        if isinstance(text, str) and text:
            filtered_texts.append(text)
            filtered_labels.append(label)
    return filtered_texts, filtered_labels

train_texts_filtered, train_labels = filter_texts_and_labels(train_texts, train_labels)
val_texts_filtered, val_labels = filter_texts_and_labels(val_texts, val_labels)
test_texts_filtered, test_labels = filter_texts_and_labels(test_texts, test_labels)

train_encodings = tokenizer(train_texts_filtered, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts_filtered, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts_filtered, truncation=True, padding=True, max_length=128, return_tensors='pt')

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=len(label_dict))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
epochs = 8
batch_size = 32
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()


train_accuracy_values, train_loss_values = [], []
test_accuracy_values, test_loss_values = [], []

for epoch in range(epochs):
    model.train()
    total_train_accuracy, total_train_loss, total_batches = 0, 0, 0

    for i in range(0, len(train_encodings['input_ids']), batch_size):
        batch_input = {key: val[i:i+batch_size].to(device) for key, val in train_encodings.items()}
        labels = train_labels[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(**batch_input)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        batch_accuracy = (predictions == labels).float().mean().item()
        total_train_accuracy += batch_accuracy
        total_batches += 1

    epoch_train_accuracy = total_train_accuracy / total_batches
    epoch_train_loss = total_train_loss / total_batches
    train_accuracy_values.append(epoch_train_accuracy)
    train_loss_values.append(epoch_train_loss)
    print(f"Epoch {epoch+1} - Train Accuracy: {epoch_train_accuracy:.4f} - Train Loss: {epoch_train_loss:.4f}")

    model.eval()
    total_test_accuracy, total_test_loss, total_test_batches = 0, 0, 0

    with torch.no_grad():
        for i in range(0, len(test_encodings['input_ids']), batch_size):
            batch_input = {key: val[i:i+batch_size].to(device) for key, val in test_encodings.items()}
            labels = test_labels[i:i+batch_size].to(device)

            outputs = model(**batch_input)
            logits = outputs.logits
            loss = criterion(logits, labels)

            total_test_loss += loss.item()
            predictions = torch.argmax(logits, dim=1)
            batch_accuracy = (predictions == labels).float().mean().item()
            total_test_accuracy += batch_accuracy
            total_test_batches += 1

    epoch_test_accuracy = total_test_accuracy / total_test_batches
    epoch_test_loss = total_test_loss / total_test_batches
    test_accuracy_values.append(epoch_test_accuracy)
    test_loss_values.append(epoch_test_loss)
    print(f"Epoch {epoch+1} - Test Accuracy: {epoch_test_accuracy:.4f} - Test Loss: {epoch_test_loss:.4f}")

all_test_predictions, all_test_labels = [], []
model.eval()

with torch.no_grad():
    for i in range(0, len(test_encodings['input_ids']), batch_size):
        batch_input = {key: val[i:i+batch_size].to(device) for key, val in test_encodings.items()}
        labels = test_labels[i:i+batch_size].to(device)

        outputs = model(**batch_input)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        all_test_predictions.extend(predictions.cpu().numpy())
        all_test_labels.extend(labels.cpu().numpy())

precision = precision_score(all_test_labels, all_test_predictions, average='weighted')
recall = recall_score(all_test_labels, all_test_predictions, average='weighted')
f1 = f1_score(all_test_labels, all_test_predictions, average='weighted')

print(f"Test Precision: {precision:.4f} - Test Recall: {recall:.4f} - Test F1 Score: {f1:.4f}")


Epoch 1 - Train Accuracy: 0.5749 - Train Loss: 0.6797
Epoch 1 - Test Accuracy: 0.5268 - Test Loss: 1.1090
Epoch 2 - Train Accuracy: 0.6338 - Train Loss: 0.6328
Epoch 2 - Test Accuracy: 0.5703 - Test Loss: 1.1571
Epoch 3 - Train Accuracy: 0.7383 - Train Loss: 0.5212
Epoch 3 - Test Accuracy: 0.6246 - Test Loss: 0.7269
Epoch 4 - Train Accuracy: 0.8259 - Train Loss: 0.3848
Epoch 4 - Test Accuracy: 0.6677 - Test Loss: 0.8823
Epoch 5 - Train Accuracy: 0.8945 - Train Loss: 0.2564
Epoch 5 - Test Accuracy: 0.6999 - Test Loss: 0.7638
Epoch 6 - Train Accuracy: 0.9245 - Train Loss: 0.1896
Epoch 6 - Test Accuracy: 0.7524 - Test Loss: 0.7161
Epoch 7 - Train Accuracy: 0.8811 - Train Loss: 0.2803
Epoch 7 - Test Accuracy: 0.7846 - Test Loss: 0.6584
Epoch 8 - Train Accuracy: 0.9138 - Train Loss: 0.2243
Epoch 8 - Test Accuracy: 0.8199 - Test Loss: 0.6151
Test Precision: 0.7951 - Test Recall: 0.8226 - Test F1 Score: 0.8093
