In [1]:
import os

import pandas as pd

In [1]:
from transformers import pipeline

# Smoke test: load an off-the-shelf SST-2 sentiment pipeline and classify one
# sentence to confirm that transformers and the compute device are working.
classifier = pipeline(
    task='text-classification',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)
print(classifier("I love this!"))

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


[{'label': 'POSITIVE', 'score': 0.9998764991760254}]


In [2]:
# Root folder of the meme dataset. The default is the original local checkout;
# set the MEME_DATASET_DIR environment variable to run the notebook against a
# copy stored elsewhere without editing this cell.
DATASET_DIR = os.environ.get(
    "MEME_DATASET_DIR",
    "/Users/sahilpandey/Projects/Sentiment_Analysis/Dataset",
)

# Both locations are derived from the single root so they cannot drift apart.
csv_path = os.path.join(DATASET_DIR, "labels.csv")  # annotation table
img_path = os.path.join(DATASET_DIR, "images")      # folder holding the image files
df = pd.read_csv(csv_path)

In [3]:

# Integer codes for the three four-level annotations.
H = {"not_funny": 0, "funny": 1, "very_funny": 2, "hilarious": 3}
S = {"not_sarcastic": 0, "general": 1, "twisted_meaning": 2, "very_twisted": 3}
O = {"not_offensive": 0, "slight": 1, "very_offensive": 2, "hateful_offensive": 3}

# The five sentiment strings are collapsed onto three integer codes:
# both negative variants -> 0, neutral -> 1, both positive variants -> 2.
sentiment_mapping = {
    "negative": 0, "very_negative": 0,
    "neutral": 1,
    "positive": 2, "very_positive": 2,
}

# (raw annotation column, encoded column to create, mapping to apply)
for source_col, target_col, codes in (
    ("humour", "Humor_label", H),
    ("sarcasm", "Sarcasm_label", S),
    ("offensive", "Offensive_label", O),
    ("overall_sentiment", "sentiment_label", sentiment_mapping),
):
    # Values absent from the mapping (including NaN) fall back to code 0.
    encoded = df[source_col].map(codes)
    df[target_col] = encoded.fillna(0).astype(int)

df = df.reset_index(drop=True)
print(df.head(1))

   index   image_name                                           text_ocr  \
0      0  image_1.jpg  LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...   

                                      text_corrected     humour  sarcasm  \
0  LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...  hilarious  general   

       offensive      motivational overall_sentiment  Humor_label  \
0  not_offensive  not_motivational     very_positive            3   

   Sarcasm_label  Offensive_label  sentiment_label  
0              1                0                2  


In [5]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image, UnidentifiedImageError

class MemeDataset(Dataset):
    """Map-style dataset returning one meme (image, tokenised text, four labels) per row.

    Args:
        df: DataFrame providing the columns ``image_name``, ``text_corrected``,
            ``sentiment_label``, ``Humor_label``, ``Sarcasm_label`` and
            ``Offensive_label``. Rows are addressed positionally (``iloc``).
        img_dir: Directory that ``image_name`` is joined onto.
        transform: Optional callable applied to the RGB PIL image. When it is
            ``None`` the PIL image itself is returned under ``'image'``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``. It is
            required by ``__getitem__`` even though the default is ``None``.
        max_length: Token length every text is padded/truncated to.
        image_size: Side length of the all-zero placeholder image used when an
            image cannot be loaded or transformed.
    """

    def __init__(self, df, img_dir, transform=None, tokenizer=None, max_length=256, image_size=224):
        self.data = df
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_size = image_size

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the sample at positional index ``idx``.

        Returns:
            dict with keys ``'image'``, ``'text'`` (tokenizer output with the
            leading batch dimension squeezed away) and the ``torch.long``
            scalars ``'sentiment_label'``, ``'humor_label'``,
            ``'sarcasm_label'`` and ``'offense_label'``.
        """
        # Fetch the row once instead of repeating the positional lookup per field.
        row = self.data.iloc[idx]
        img_path = os.path.join(self.img_dir, row['image_name'])

        try:
            # The context manager closes the underlying file as soon as the
            # pixels have been copied into the converted image, so file handles
            # do not pile up while iterating over the whole dataset.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert('RGB')
            if self.transform:
                image = self.transform(image)
        except Exception as e:
            # Deliberately best-effort: one unreadable image must not abort an epoch.
            print(f"Error loading image {img_path}: {e}")
            image = torch.zeros((3, self.image_size, self.image_size))  # fallback if image fails to load

        text = str(row['text_corrected'])
        text_encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        text_encoded = {k: v.squeeze(0) for k, v in text_encoded.items()}  # remove batch dim

        # Labels as tensors
        sentiment = torch.tensor(int(row['sentiment_label']), dtype=torch.long)
        humor     = torch.tensor(int(row['Humor_label']), dtype=torch.long)
        sarcasm   = torch.tensor(int(row['Sarcasm_label']), dtype=torch.long)
        offense   = torch.tensor(int(row['Offensive_label']), dtype=torch.long)

        return {
            'image': image,
            'text': text_encoded,
            'sentiment_label': sentiment,
            'humor_label': humor,
            'sarcasm_label': sarcasm,
            'offense_label': offense,
        }

In [6]:
import torchvision.transforms as transforms
from transformers import ViTModel, BertModel, ViTFeatureExtractor, BertTokenizer

vit_feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Image preprocessing applied to every meme before it reaches the ViT backbone.
# The normalisation statistics are read from the checkpoint's own preprocessing
# config rather than hard-coding the ImageNet values, so the inputs match what
# this ViT checkpoint saw during pre-training.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=vit_feature_extractor.image_mean,
        std=vit_feature_extractor.image_std,
    ),
])



In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# 80/20 shuffled hold-out split; the fixed seed keeps the partition stable across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Both partitions share the same image folder, image transform and tokenizer.
train_dataset, test_dataset = (
    MemeDataset(partition, img_path, transform=transform, tokenizer=bert_tokenizer)
    for partition in (train_df, test_df)
)

# Only the training loader reshuffles; the held-out loader keeps a fixed order.
_loader_options = {"batch_size": 16, "num_workers": 0}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

In [8]:
import torch
import torch.nn as nn
from transformers import ViTModel, BertModel

class CrossAttentionBlock(nn.Module):
    """Multi-head attention in which ``query`` attends over a second sequence.

    Args:
        dim_q: Embedding size of the query (also the output size).
        dim_kv: Embedding size of the sequence used as both key and value.
        num_heads: Number of attention heads.
    """

    def __init__(self, dim_q, dim_kv, num_heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=dim_q, kdim=dim_kv, vdim=dim_kv, num_heads=num_heads, batch_first=True)

    def forward(self, query, key_value, key_padding_mask=None):
        """Attend from ``query`` over ``key_value`` and return the attended output.

        Args:
            query: Batch-first query sequence.
            key_value: Batch-first sequence used as both key and value.
            key_padding_mask: Optional boolean tensor, one entry per
                ``key_value`` position, where ``True`` marks a position to
                ignore. ``None`` (the default) attends over every position.

        Returns:
            The attention output; the attention weights are discarded.
        """
        out, _ = self.attn(query, key_value, key_value, key_padding_mask=key_padding_mask)
        return out

class MemeCrossAttentionClassifier(nn.Module):
    """ViT + BERT encoders fused by two cross-attention blocks, with four linear heads.

    The image [CLS] state attends over the text tokens and the text [CLS] state
    attends over the image tokens; the two attended vectors are concatenated
    and fed to the sentiment (3 classes), humor, sarcasm and offense
    (4 classes each) classifiers.
    """

    def __init__(self):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.cross_attn_img_to_txt = CrossAttentionBlock(dim_q=768, dim_kv=768)
        self.cross_attn_txt_to_img = CrossAttentionBlock(dim_q=768, dim_kv=768)

        # Two 768-wide attended vectors are concatenated before classification.
        self.classifier_input_dim = 768 * 2

        self.sentiment_classifier = nn.Linear(self.classifier_input_dim, 3)
        self.humor_classifier     = nn.Linear(self.classifier_input_dim, 4)
        self.sarcasm_classifier   = nn.Linear(self.classifier_input_dim, 4)
        self.offense_classifier   = nn.Linear(self.classifier_input_dim, 4)

    def forward(self, image, text):
        """Compute the logits of all four tasks for one batch.

        Args:
            image: Pixel tensor passed to the ViT encoder as ``pixel_values``.
            text: Mapping of tokenizer outputs, unpacked into the BERT encoder.
                If it contains ``'attention_mask'``, positions whose mask value
                is 0 are excluded from the image-to-text attention.

        Returns:
            dict with ``'sentiment_logits'``, ``'humor_logits'``,
            ``'sarcasm_logits'`` and ``'offense_logits'``.
        """
        vit_out = self.vit(pixel_values=image).last_hidden_state
        bert_out = self.bert(**text).last_hidden_state

        # Keep the sequence axis (length 1) so the [CLS] states can act as queries.
        img_cls = vit_out[:, 0:1, :]
        text_cls = bert_out[:, 0:1, :]

        # The text is padded to a fixed length, so without a mask the image
        # query would spread attention over padding tokens.
        attention_mask = text.get('attention_mask')
        text_padding_mask = None if attention_mask is None else attention_mask == 0

        img_attn = self.cross_attn_img_to_txt(img_cls, bert_out, key_padding_mask=text_padding_mask)
        # Image tokens carry no padding, so no mask is needed in this direction.
        text_attn = self.cross_attn_txt_to_img(text_cls, vit_out)

        combined = torch.cat([img_attn.squeeze(1), text_attn.squeeze(1)], dim=-1)

        return {
            'sentiment_logits': self.sentiment_classifier(combined),
            'humor_logits': self.humor_classifier(combined),
            'sarcasm_logits': self.sarcasm_classifier(combined),
            'offense_logits': self.offense_classifier(combined),
        }

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Device preference: Apple MPS when available and built, otherwise CUDA, otherwise CPU.
use_mps = torch.backends.mps.is_available() and torch.backends.mps.is_built()
if use_mps:
    device = torch.device("mps")
    print("Using MPS (Apple GPU)")
else:
    fallback_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(fallback_name)
    print(f"Using device: {device}")

model = MemeCrossAttentionClassifier().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Cut the learning rate tenfold once validation loss has stalled for more than 3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
ce_loss = nn.CrossEntropyLoss()
num_epochs = 20

# Prefixes shared by the model's '<task>_logits' outputs and the batch's '<task>_label' targets.
TASK_NAMES = ('sentiment', 'humor', 'sarcasm', 'offense')


def _forward_and_loss(batch):
    """Run the model on one batch and return the unweighted sum of the four task losses."""
    images = batch['image'].to(device)
    text_input = {name: tensor.to(device) for name, tensor in batch['text'].items()}

    outputs = model(images, text_input)

    task_losses = [
        ce_loss(outputs[f'{task}_logits'], batch[f'{task}_label'].to(device))
        for task in TASK_NAMES
    ]
    # Accumulate left to right, in task order.
    total = task_losses[0]
    for extra in task_losses[1:]:
        total = total + extra
    return total


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()

        loss = _forward_and_loss(batch)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_train_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"[Epoch {epoch+1}] Avg Train Loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients) ----
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _forward_and_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"[Epoch {epoch+1}] Avg Validation Loss: {avg_val_loss:.4f}")

    # The plateau scheduler is driven by the mean validation loss of this epoch.
    scheduler.step(avg_val_loss)

Using MPS (Apple GPU)


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/20:   0%|          | 0/350 [01:12<?, ?it/s]


KeyboardInterrupt: 