**DISTIL+DENSE121**

In [3]:
!pip install transformers torchvision pandas scikit-learn imbalanced-learn tqdm

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

image_dir = "/content/drive/MyDrive/Colab Notebooks/MultiClass_m-20240806T134043Z-001/MultiClass_m"
text_path = "/content/drive/MyDrive/Colab Notebooks/updated_meme_classification_data.xlsx"

df = pd.read_excel(text_path)
df['label'] = LabelEncoder().fit_transform(df['label'])
df['image_path'] = df['image_name'].apply(lambda x: os.path.join(image_dir, x))

class MemeDataset(Dataset):
    def __init__(self, df, tokenizer, transform=None, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_len = max_len
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text, img_path, label = str(row['text']), row['image_path'], row['label']
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        tokens = self.tokenizer(text, truncation=True, padding='max_length',
                                max_length=self.max_len, return_tensors='pt')
        return {
            "image": image,
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def extract_image_features(model_name, images):
    with torch.no_grad():
        if model_name == "densenet":
            model = models.densenet121(weights='IMAGENET1K_V1').features.to(device)
            model.eval()
            feats = model(images).mean([2, 3])
    return feats.cpu().numpy()

def extract_text_features(model_name, input_ids, attention_mask):
    with torch.no_grad():
        model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
        model.eval()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        feats = outputs.last_hidden_state[:, 0, :]
    return feats.cpu().numpy()

def early_fusion(img_features, txt_features):
    return np.concatenate([img_features, txt_features], axis=1)

def train_eval_model(X_train, y_train, X_test, y_test, label="Early Fusion"):
    clf = RandomForestClassifier(n_estimators=250, random_state=42)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    return clf, preds, (acc, prec, rec, f1)

def late_fusion(early_preds, early_probs, y_train, y_test):
    meta_train = np.hstack([early_probs, early_preds.reshape(-1, 1)])
    meta_test = np.hstack([early_probs[:len(y_test)], early_preds[:len(y_test)].reshape(-1, 1)])
    meta_clf = LogisticRegression(max_iter=500)
    meta_clf.fit(meta_train, y_train[:len(meta_train)])
    preds = meta_clf.predict(meta_test)
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    print(f"Fusion -> Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")
    return (acc, prec, rec, f1)

def run_fusion_experiment(text_model, img_model):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_ds = MemeDataset(train_df, tokenizer, image_transform)
    test_ds = MemeDataset(test_df, tokenizer, image_transform)
    train_loader = DataLoader(train_ds, batch_size=16)
    test_loader = DataLoader(test_ds, batch_size=16)

    def get_features(loader):
        img_feats, txt_feats, labels = [], [], []
        for batch in tqdm(loader):
            imgs = batch["image"].to(device)
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            lbl = batch["label"].numpy()
            img_feats.append(extract_image_features(img_model, imgs))
            txt_feats.append(extract_text_features(text_model, ids, mask))
            labels.extend(lbl)
        return np.vstack(img_feats), np.vstack(txt_feats), np.array(labels)

    X_train_img, X_train_txt, y_train = get_features(train_loader)
    X_test_img, X_test_txt, y_test = get_features(test_loader)

    X_train = early_fusion(X_train_img, X_train_txt)
    X_test = early_fusion(X_test_img, X_test_txt)

    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    clf, preds, _ = train_eval_model(X_train, y_train, X_test, y_test, label="Early Fusion")
    probs = clf.predict_proba(X_test)
    late_fusion(preds, probs, y_train, y_test)

run_fusion_experiment("distilbert", "densenet")


Fusion:
Accuracy : 0.801
Precision: 0.803
Recall   : 0.809
F1-Score : 0.798


**DistilBERT + ViT**

In [4]:
!pip install transformers torchvision pandas scikit-learn imbalanced-learn tqdm

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import DistilBertTokenizer, DistilBertModel, ViTModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

image_dir = "/content/drive/MyDrive/Colab Notebooks/MultiClass_m-20240806T134043Z-001/MultiClass_m"
text_path = "/content/drive/MyDrive/Colab Notebooks/updated_meme_classification_data.xlsx"

df = pd.read_excel(text_path)
df['label'] = LabelEncoder().fit_transform(df['label'])
df['image_path'] = df['image_name'].apply(lambda x: os.path.join(image_dir, x))

class MemeDataset(Dataset):
    def __init__(self, df, tokenizer, transform=None, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_len = max_len
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text, img_path, label = str(row['text']), row['image_path'], row['label']
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        tokens = self.tokenizer(text, truncation=True, padding='max_length',
                                max_length=self.max_len, return_tensors='pt')
        return {
            "image": image,
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def extract_image_features(model_name, images):
    with torch.no_grad():
        if model_name == "vit":
            vit = ViTModel.from_pretrained('google/vit-base-patch16-224').to(device)
            vit.eval()
            outputs = vit(images)
            feats = outputs.pooler_output
        elif model_name == "densenet":
            model = models.densenet121(weights='IMAGENET1K_V1').features.to(device)
            model.eval()
            feats = model(images).mean([2,3])
    return feats.cpu().numpy()

def extract_text_features(model_name, input_ids, attention_mask):
    with torch.no_grad():
        if model_name == "distilbert":
            model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
        model.eval()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        feats = outputs.last_hidden_state[:, 0, :]
    return feats.cpu().numpy()

def early_fusion(img_features, txt_features):
    return np.concatenate([img_features, txt_features], axis=1)

def train_eval_model(X_train, y_train, X_test, y_test, label="Early Fusion"):
    clf = RandomForestClassifier(n_estimators=250, random_state=42)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    return clf, preds, (acc, prec, rec, f1)

def late_fusion(early_preds, early_probs, y_train, y_test):
    meta_train = np.hstack([early_probs, early_preds.reshape(-1,1)])
    meta_test = np.hstack([early_probs[:len(y_test)], early_preds[:len(y_test)].reshape(-1,1)])
    meta_clf = LogisticRegression(max_iter=500)
    meta_clf.fit(meta_train, y_train[:len(meta_train)])
    preds = meta_clf.predict(meta_test)
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    print(f"Fusion Acc: {acc:.3f}, Prec: {prec:.3f}, Rec: {rec:.3f}, F1: {f1:.3f}")
    return (acc, prec, rec, f1)

def run_fusion_experiment(text_model, img_model):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_ds = MemeDataset(train_df, tokenizer, image_transform)
    test_ds = MemeDataset(test_df, tokenizer, image_transform)
    train_loader = DataLoader(train_ds, batch_size=16)
    test_loader = DataLoader(test_ds, batch_size=16)

    def get_features(loader):
        img_feats, txt_feats, labels = [], [], []
        for batch in tqdm(loader):
            imgs = batch["image"].to(device)
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            lbl = batch["label"].numpy()
            img_feats.append(extract_image_features(img_model, imgs))
            txt_feats.append(extract_text_features(text_model, ids, mask))
            labels.extend(lbl)
        return np.vstack(img_feats), np.vstack(txt_feats), np.array(labels)

    X_train_img, X_train_txt, y_train = get_features(train_loader)
    X_test_img, X_test_txt, y_test = get_features(test_loader)

    X_train = early_fusion(X_train_img, X_train_txt)
    X_test = early_fusion(X_test_img, X_test_txt)

    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    clf, preds, _ = train_eval_model(X_train, y_train, X_test, y_test, label="Early Fusion")
    probs = clf.predict_proba(X_test)
    late_fusion(preds, probs, y_train, y_test)

run_fusion_experiment("distilbert", "vit")


Fusion
Accuracy : 0.750
Precision: 0.772
Recall   : 0.755
F1-Score : 0.772
