In [None]:
# Mount Google Drive so the notebook can read and write files under
# /content/drive (Colab-only: google.colab is not available elsewhere).
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install scikit-learn
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import pandas as pd

# Load the meme annotation spreadsheet from the mounted Drive folder and
# print the first rows as a quick sanity check of the columns.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/updated_meme_classification_data.xlsx')
print(df.head())


   Image No                                               Text  Celebrity  \
0  1135.jpg                               শুভ জন্মদিন বোকাচোদা          0   
1  1136.jpg  আমি আমাদের সম্পর্কে একটি রসিকতা করতে যাচ্ছিলাম...          0   
2  1137.jpg                                         বরিশাইল্লা          0   
3  1138.jpg  ভাই বাংলাদেশে এখন কি চলে?কিছু নটির পোলার "নটির...          0   
4  1139.jpg  নুনুবেলের নতুন গান একটু শুনি ও মা গো!! বাবাগো!...          0   

   Explicit Sexual Content  Individual  Misogynist  Nationalistic Propaganda  \
0                        0           1           0                         0   
1                        0           0           0                         0   
2                        0           1           0                         0   
3                        0           0           0                         0   
4                        0           1           0                         0   

   Racism  Religion  Sports  Vulgar Language  
0       0

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# NOTE(review): the Text column shown by df.head() is Bengali, while
# 'bert-base-uncased' is an English checkpoint - confirm this is intended.
# If the checkpoint changes, the model cell must load the same one.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    """Tokenize one text, padded/truncated to exactly 128 tokens."""
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

# Per-row encodings of the full dataset (not consumed by the split below).
encodings = df['Text'].apply(tokenize_function)

# The nine binary label columns shown by df.head(); everything except
# 'Image No' and 'Text'. Previously `labels` was never defined before use.
LABEL_COLUMNS = [
    'Celebrity',
    'Explicit Sexual Content',
    'Individual',
    'Misogynist',
    'Nationalistic Propaganda',
    'Racism',
    'Religion',
    'Sports',
    'Vulgar Language',
]
# .values yields a plain array, so the splits can be indexed by position.
labels = df[LABEL_COLUMNS].values

# 80/20 train/test, then 80/20 train/validation inside the training part.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Batch-tokenize each split; padding=True pads to the longest text of the split.
train_encodings = tokenizer(list(train_texts), padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(list(val_texts), padding=True, truncation=True, max_length=128)
test_encodings = tokenizer(list(test_texts), padding=True, truncation=True, max_length=128)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MemeDataset(Dataset):
    """Pairs tokenizer encodings with their multi-label targets.

    Args:
        encodings: mapping from feature name to per-example sequences
            (anything exposing ``.items()`` whose values are indexable).
        labels: per-example label rows. pandas objects are converted to an
            array so that lookups are positional; after ``train_test_split``
            a pandas index is shuffled and ``labels[idx]`` would otherwise
            raise ``KeyError`` or select a column.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Duck-typed conversion keeps this block free of new imports.
        if hasattr(labels, 'to_numpy'):
            labels = labels.to_numpy()
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Wrap every split's encodings and labels in a MemeDataset.
train_dataset, val_dataset, test_dataset = (
    MemeDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 16; only the training split is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)


In [None]:
from torch.optim import AdamW
from transformers import BertForSequenceClassification, BertTokenizer

# Load the same checkpoint the tokenizer cell uses ('bert-base-uncased') so the
# vocabulary and embedding matrix agree. The previous id
# 'google/bert-base-discriminator' is not a published checkpoint, and neither
# BertForSequenceClassification nor AdamW had been imported.
# num_labels=9: one output logit per label column.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=9)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train on the GPU when Colab provides one, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


In [None]:
from sklearn.metrics import f1_score
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

# Binary cross-entropy on raw logits, applied to each label independently.
loss_fn = BCEWithLogitsLoss()


for epoch in range(3):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        # BCEWithLogitsLoss needs float targets.
        loss = loss_fn(outputs.logits, batch_labels.float())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f'Epoch: {epoch + 1}, Training Loss: {train_loss / len(train_loader)}')

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    val_true_labels = []
    val_pred_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            batch_labels = batch['labels'].to(device)
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, batch_labels.float())
            val_loss += loss.item()

            # Sigmoid probability > 0.5 counts as a predicted label.
            predictions = (torch.sigmoid(outputs.logits) > 0.5).int()
            val_true_labels.extend(batch_labels.cpu().numpy())
            val_pred_labels.extend(predictions.cpu().numpy())

    print(f'Epoch: {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}')

    # zero_division=0 scores labels with no predicted/true positives as 0
    # without emitting a warning every epoch.
    val_f1 = f1_score(val_true_labels, val_pred_labels, average='weighted', zero_division=0)
    print(f'Epoch: {epoch + 1}, Validation F1 Score: {val_f1}')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the fine-tuned model on the held-out test split.
model.eval()
test_true_labels = []
test_pred_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)
        outputs = model(**inputs)
        # Sigmoid probability > 0.5 counts as a predicted label.
        predictions = (torch.sigmoid(outputs.logits) > 0.5).int()

        test_true_labels.extend(batch_labels.cpu().numpy())
        test_pred_labels.extend(predictions.cpu().numpy())

# With multi-label targets accuracy_score is subset accuracy: a sample only
# counts as correct when all of its labels match.
test_accuracy = accuracy_score(test_true_labels, test_pred_labels)
# zero_division=0 scores labels with no predicted/true positives as 0 silently.
test_precision = precision_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_recall = recall_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_f1 = f1_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')


Test Results:
Accuracy : 0.53
Precision: 0.55
Recall   : 0.52
F1 Score : 0.62


In [None]:
# Persist the fine-tuned weights and the tokenizer files to the same Drive
# folder so both can be reloaded together with from_pretrained.
# NOTE(review): the folder is named 'electra_multilabel_model' although the
# model above is created through a BERT class - confirm the intended name.
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model')
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model')

('/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/electra_multilabel_model/added_tokens.json')

**DISTILBERT**

In [None]:
!pip install transformers
!pip install scikit-learn
!pip install torch



In [None]:
import pandas as pd

# Reload the meme annotation spreadsheet from the mounted Drive folder and
# print the first rows as a quick sanity check of the columns.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/updated_meme_classification_data.xlsx')
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

# NOTE(review): 'distilbert-base-uncased' is an English checkpoint - confirm it
# suits the language of the Text column. If the checkpoint changes, the model
# cell must load the same one.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(text):
    """Tokenize one text, padded/truncated to exactly 128 tokens."""
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

# Per-row encodings of the full dataset (not consumed by the split below).
encodings = df['Text'].apply(tokenize_function)

# Build the label matrix here instead of relying on a `labels` variable from
# an earlier cell: it was never defined from df, and the training/evaluation
# loops assign batch tensors to that same name.
LABEL_COLUMNS = [
    'Celebrity',
    'Explicit Sexual Content',
    'Individual',
    'Misogynist',
    'Nationalistic Propaganda',
    'Racism',
    'Religion',
    'Sports',
    'Vulgar Language',
]
# .values yields a plain array, so the splits can be indexed by position.
labels = df[LABEL_COLUMNS].values

# 80/20 train/test, then 80/20 train/validation inside the training part.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Batch-tokenize each split; padding=True pads to the longest text of the split.
train_encodings = tokenizer(list(train_texts), padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(list(val_texts), padding=True, truncation=True, max_length=128)
test_encodings = tokenizer(list(test_texts), padding=True, truncation=True, max_length=128)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MemeDataset(Dataset):
    """Pairs tokenizer encodings with their multi-label targets.

    Args:
        encodings: mapping from feature name to per-example sequences
            (anything exposing ``.items()`` whose values are indexable).
        labels: per-example label rows. pandas objects are converted to an
            array so that lookups are positional; after ``train_test_split``
            a pandas index is shuffled and ``labels[idx]`` would otherwise
            raise ``KeyError`` or select a column.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Duck-typed conversion keeps this block free of new imports.
        if hasattr(labels, 'to_numpy'):
            labels = labels.to_numpy()
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Wrap every split's encodings and labels in a MemeDataset.
train_dataset, val_dataset, test_dataset = (
    MemeDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 16; only the training split is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

from torch.optim import AdamW
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the same checkpoint the tokenizer cell uses ('distilbert-base-uncased')
# so the vocabulary and embedding matrix agree. The previous class name
# 'DistilBertTokenizerForSequenceClassification' does not exist, the id
# 'google/distilbert-base-discriminator' is not a published checkpoint, and
# AdamW had not been imported.
# num_labels=9: one output logit per label column.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=9)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train on the GPU when Colab provides one, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


In [None]:
from sklearn.metrics import f1_score
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

# Binary cross-entropy on raw logits, applied to each label independently.
loss_fn = BCEWithLogitsLoss()


for epoch in range(3):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        # BCEWithLogitsLoss needs float targets.
        loss = loss_fn(outputs.logits, batch_labels.float())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f'Epoch: {epoch + 1}, Training Loss: {train_loss / len(train_loader)}')

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    val_true_labels = []
    val_pred_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            batch_labels = batch['labels'].to(device)
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, batch_labels.float())
            val_loss += loss.item()

            # Sigmoid probability > 0.5 counts as a predicted label.
            predictions = (torch.sigmoid(outputs.logits) > 0.5).int()
            val_true_labels.extend(batch_labels.cpu().numpy())
            val_pred_labels.extend(predictions.cpu().numpy())

    print(f'Epoch: {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}')

    # zero_division=0 scores labels with no predicted/true positives as 0
    # without emitting a warning every epoch.
    val_f1 = f1_score(val_true_labels, val_pred_labels, average='weighted', zero_division=0)
    print(f'Epoch: {epoch + 1}, Validation F1 Score: {val_f1}')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the fine-tuned model on the held-out test split.
model.eval()
test_true_labels = []
test_pred_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)
        outputs = model(**inputs)
        # Sigmoid probability > 0.5 counts as a predicted label.
        predictions = (torch.sigmoid(outputs.logits) > 0.5).int()

        test_true_labels.extend(batch_labels.cpu().numpy())
        test_pred_labels.extend(predictions.cpu().numpy())

# With multi-label targets accuracy_score is subset accuracy: a sample only
# counts as correct when all of its labels match.
test_accuracy = accuracy_score(test_true_labels, test_pred_labels)
# zero_division=0 scores labels with no predicted/true positives as 0 silently.
test_precision = precision_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_recall = recall_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_f1 = f1_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')


Test Results:
Accuracy : 0.76
Precision: 0.82
Recall   : 0.81
F1 Score : 0.82


**VISUALBERT**

In [None]:
import pandas as pd

# Reload the meme annotation spreadsheet from the mounted Drive folder and
# print the first rows as a quick sanity check of the columns.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/updated_meme_classification_data.xlsx')
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# transformers has no 'VisualBertTokenizer'; VisualBERT text inputs are
# tokenized with the BERT tokenizer.
# NOTE(review): 'bert-base-uncased' is an English vocabulary - confirm it
# suits the language of the Text column.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    """Tokenize one text, padded/truncated to exactly 128 tokens."""
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

# Per-row encodings of the full dataset (not consumed by the split below).
encodings = df['Text'].apply(tokenize_function)

# Build the label matrix here instead of relying on a `labels` variable from
# an earlier cell: it was never defined from df, and the training/evaluation
# loops assign batch tensors to that same name.
LABEL_COLUMNS = [
    'Celebrity',
    'Explicit Sexual Content',
    'Individual',
    'Misogynist',
    'Nationalistic Propaganda',
    'Racism',
    'Religion',
    'Sports',
    'Vulgar Language',
]
# .values yields a plain array, so the splits can be indexed by position.
labels = df[LABEL_COLUMNS].values

# 80/20 train/test, then 80/20 train/validation inside the training part.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Batch-tokenize each split; padding=True pads to the longest text of the split.
train_encodings = tokenizer(list(train_texts), padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(list(val_texts), padding=True, truncation=True, max_length=128)
test_encodings = tokenizer(list(test_texts), padding=True, truncation=True, max_length=128)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MemeDataset(Dataset):
    """Pairs tokenizer encodings with their multi-label targets.

    Args:
        encodings: mapping from feature name to per-example sequences
            (anything exposing ``.items()`` whose values are indexable).
        labels: per-example label rows. pandas objects are converted to an
            array so that lookups are positional; after ``train_test_split``
            a pandas index is shuffled and ``labels[idx]`` would otherwise
            raise ``KeyError`` or select a column.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Duck-typed conversion keeps this block free of new imports.
        if hasattr(labels, 'to_numpy'):
            labels = labels.to_numpy()
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

# Wrap every split's encodings and labels in a MemeDataset.
train_dataset, val_dataset, test_dataset = (
    MemeDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 16; only the training split is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

from torch import nn
from torch.optim import AdamW
from transformers import DistilBertTokenizer, VisualBertModel
from transformers.modeling_outputs import SequenceClassifierOutput


class VisualBertForMultiLabelClassification(nn.Module):
    """VisualBERT encoder with a linear head emitting one logit per label.

    transformers provides no VisualBERT sequence-classification class (the
    previous 'VisualBertTokenizerForSequenceClassification' does not exist),
    so the head is attached here. ``forward`` returns an object exposing
    ``.logits``, which is what the training and evaluation cells read.

    NOTE(review): the data loaders only supply tokenized text, so no
    ``visual_embeds`` reach the encoder and the meme images are not used.
    Feed image features if a truly multimodal model is intended.
    """

    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.visual_bert = VisualBertModel.from_pretrained(checkpoint)
        config = self.visual_bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, **inputs):
        """Encode the inputs and classify the pooled representation."""
        pooled = self.visual_bert(**inputs).pooler_output
        return SequenceClassifierOutput(logits=self.classifier(self.dropout(pooled)))


# 'google/distilbert-base-discriminator' is not a published checkpoint; use a
# released VisualBERT checkpoint instead. num_labels=9: one logit per label.
model = VisualBertForMultiLabelClassification('uclanlp/visualbert-vqa-coco-pre', num_labels=9)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train on the GPU when Colab provides one, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


In [None]:
from sklearn.metrics import f1_score
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm

# Binary cross-entropy on raw logits, applied to each label independently.
loss_fn = BCEWithLogitsLoss()


for epoch in range(3):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        # BCEWithLogitsLoss needs float targets.
        loss = loss_fn(outputs.logits, batch_labels.float())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f'Epoch: {epoch + 1}, Training Loss: {train_loss / len(train_loader)}')

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    val_true_labels = []
    val_pred_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            batch_labels = batch['labels'].to(device)
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, batch_labels.float())
            val_loss += loss.item()

            # Sigmoid probability > 0.5 counts as a predicted label.
            predictions = (torch.sigmoid(outputs.logits) > 0.5).int()
            val_true_labels.extend(batch_labels.cpu().numpy())
            val_pred_labels.extend(predictions.cpu().numpy())

    print(f'Epoch: {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}')

    # zero_division=0 scores labels with no predicted/true positives as 0
    # without emitting a warning every epoch.
    val_f1 = f1_score(val_true_labels, val_pred_labels, average='weighted', zero_division=0)
    print(f'Epoch: {epoch + 1}, Validation F1 Score: {val_f1}')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the fine-tuned model on the held-out test split.
model.eval()
test_true_labels = []
test_pred_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` variable that
        # train_test_split reads is not overwritten by a batch tensor.
        batch_labels = batch['labels'].to(device)
        outputs = model(**inputs)
        # Sigmoid probability > 0.5 counts as a predicted label.
        predictions = (torch.sigmoid(outputs.logits) > 0.5).int()

        test_true_labels.extend(batch_labels.cpu().numpy())
        test_pred_labels.extend(predictions.cpu().numpy())

# With multi-label targets accuracy_score is subset accuracy: a sample only
# counts as correct when all of its labels match.
test_accuracy = accuracy_score(test_true_labels, test_pred_labels)
# zero_division=0 scores labels with no predicted/true positives as 0 silently.
test_precision = precision_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_recall = recall_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)
test_f1 = f1_score(test_true_labels, test_pred_labels, average='weighted', zero_division=0)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')


Test Results:
Accuracy : 0.79
Precision: 0.80
Recall   : 0.79
F1 Score : 0.79
