# 1. Importing Necessary Libraries

Importing required libraries for data processing, model building, and image/text handling.

In [10]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import numpy as np

# 2. Dataset Class Preparation

Defining a custom dataset class to handle text, images, and labels.

In [11]:
class MemeDataset(Dataset):
    """Dataset that pairs each meme image with its transcribed text and label.

    Every row of ``df`` is expected to provide an ``image_id`` (used to build
    ``<image_folder>/<image_id>.jpg``) and a ``transcriptions`` entry. A
    ``labels`` entry is optional; when it is absent the sentinel ``-1`` is
    returned instead of a tensor.

    Args:
        df: pandas DataFrame with one row per sample.
        image_folder: Directory containing the ``.jpg`` files.
        tokenizer: Callable tokenizer invoked with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors='pt'``.
        max_len: Length every text sequence is padded/truncated to.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, image_folder, tokenizer, max_len, transform=None):
        self.df = df
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.df)

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as a string, mapping missing values to ``""``.

        Empty CSV cells are read by pandas as float NaN, which a tokenizer
        cannot consume; non-string scalars are converted with ``str``.
        """
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Load, tokenize and preprocess the sample at position ``idx``.

        Returns:
            dict with keys ``'image'`` (transformed image, or the PIL image if
            no transform was given), ``'text_inputs'`` (tokenizer outputs with
            the leading dimension squeezed away) and ``'label'`` (a ``long``
            tensor, or the int ``-1`` when the row has no ``labels`` entry).
        """
        row = self.df.iloc[idx]
        image_path = f"{self.image_folder}/{row['image_id']}.jpg"
        text = self._clean_text(row['transcriptions'])
        label = row.get('labels', -1)

        # Text tokenization
        text_inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Image preprocessing: convert() yields an independent in-memory image,
        # so the file handle can be released as soon as the block exits.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Return the preprocessed sample; squeeze(0) drops the leading
        # dimension of each tokenizer tensor so batching adds it back once.
        return {
            'image': image,
            'text_inputs': {k: v.squeeze(0) for k, v in text_inputs.items()},
            'label': torch.tensor(label, dtype=torch.long) if label != -1 else label
        }


# 3. Model Definition

In [12]:
class MultimodalModel(nn.Module):
    """Late-fusion classifier over a text encoder and a ResNet-50 image encoder.

    Each branch is projected to a 256-dimensional vector; the two vectors are
    concatenated and passed through a small MLP that outputs ``num_labels``
    logits.

    Args:
        text_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
    """

    def __init__(self, text_model_name, num_labels):
        super(MultimodalModel, self).__init__()
        # Text branch
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 256)

        # Image branch: replace the ImageNet classification head with a
        # 256-d projection so it matches the text branch.
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True` — confirm against the installed version.
        self.image_model = models.resnet50(pretrained=True)
        self.image_model.fc = nn.Linear(self.image_model.fc.in_features, 256)

        # Combined classifier
        self.classifier = nn.Sequential(
            nn.Linear(256 + 256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, text_inputs, image):
        """Compute class logits for a batch.

        Args:
            text_inputs: Mapping of tokenizer tensors, unpacked as keyword
                arguments into the text encoder.
            image: Batch of image tensors accepted by the ResNet branch.

        Returns:
            Tensor of shape ``(batch, num_labels)`` holding unnormalised logits.
        """
        # Text features
        # Assumes the encoder output exposes `pooler_output` (as BERT-style
        # models do) — TODO confirm when swapping in a different backbone.
        text_outputs = self.text_model(**text_inputs)
        text_features = self.text_fc(text_outputs.pooler_output)

        # Image features
        image_features = self.image_model(image)

        # Combine features: concatenate along the feature axis to obtain the
        # 256 + 256 input the classifier's first layer expects.
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.classifier(combined_features)

# 4. Training Function Definition

In [13]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(text_inputs, images)``.
        dataloader: Iterable of batches with ``'image'``, ``'text_inputs'``
            and ``'label'`` entries; must support ``len()``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for sample in dataloader:
        pixel_batch = sample['image'].to(device)
        encoded_text = {}
        for field, tensor in sample['text_inputs'].items():
            encoded_text[field] = tensor.to(device)
        targets = sample['label'].to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        batch_loss = criterion(model(encoded_text, pixel_batch), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss = running_loss + batch_loss.item()

    num_batches = len(dataloader)
    return running_loss / num_batches

# 5. Validation Function Definition

In [14]:
def validate_model(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(text_inputs, images)``.
        dataloader: Iterable of batches with ``'image'``, ``'text_inputs'``
            and ``'label'`` entries; must support ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for sample in dataloader:
            pixel_batch = sample['image'].to(device)
            encoded_text = {}
            for field, tensor in sample['text_inputs'].items():
                encoded_text[field] = tensor.to(device)
            targets = sample['label'].to(device)

            logits = model(encoded_text, pixel_batch)
            running_loss = running_loss + criterion(logits, targets).item()

    num_batches = len(dataloader)
    return running_loss / num_batches

# 6. Prediction Function Definition

In [15]:
def predict(model, dataloader, device):
    """Return the predicted class index for every sample in ``dataloader``.

    Args:
        model: Module called as ``model(text_inputs, images)`` that returns
            per-class scores along dimension 1.
        dataloader: Iterable of batches with ``'image'`` and ``'text_inputs'``
            entries.
        device: Device every batch tensor is moved to.

    Returns:
        Flat list of numpy integer scalars in dataloader order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for sample in dataloader:
            pixel_batch = sample['image'].to(device)
            encoded_text = {}
            for field, tensor in sample['text_inputs'].items():
                encoded_text[field] = tensor.to(device)

            scores = model(encoded_text, pixel_batch)
            # The highest-scoring class per row is the prediction.
            batch_preds = scores.argmax(dim=1).cpu().numpy()
            for class_index in batch_preds:
                collected.append(class_index)
    return collected

# 7. Main Function Definition

In [16]:
def main(
    train_csv="train.csv",
    dev_csv="dev.csv",
    test_csv="test.csv",
    train_image_folder="train",
    dev_image_folder="dev",
    test_image_folder="test",
    output_csv="predictions.csv",
):
    """Train the multimodal classifier, log per-epoch losses and save test predictions.

    The path defaults are placeholders: pass the real dataset locations (or
    edit the defaults) before running.

    Args:
        train_csv, dev_csv, test_csv: CSV files read with ``pd.read_csv``.
        train_image_folder, dev_image_folder, test_image_folder: Directories
            holding the images referenced by each CSV.
        output_csv: Destination file for the test frame with the predicted
            ``labels`` column.
    """
    model_name = "bert-base-uncased"
    max_len = 128
    batch_size = 16
    num_epochs = 45
    lr = 2e-5
    seed = 42
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fix the RNG so weight initialisation and shuffling are repeatable.
    torch.manual_seed(seed)

    # Data loader
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The transforms must be active: without ToTensor the dataset yields PIL
    # images, which can neither be batched by the DataLoader nor fed to ResNet.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    train_df = pd.read_csv(train_csv)
    dev_df = pd.read_csv(dev_csv)
    test_df = pd.read_csv(test_csv)

    train_dataset = MemeDataset(train_df, train_image_folder, tokenizer, max_len, transform)
    dev_dataset = MemeDataset(dev_df, dev_image_folder, tokenizer, max_len, transform)
    test_dataset = MemeDataset(test_df, test_image_folder, tokenizer, max_len, transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Model Initialization
    # Assumes `labels` holds integer class ids starting at 0 (the dataset
    # converts them to long tensors) — TODO confirm against the data.
    num_labels = int(train_df['labels'].max()) + 1
    model = MultimodalModel(model_name, num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # The loop below calls scheduler.step(val_loss), i.e. a metric-driven
    # scheduler that lowers the learning rate when validation loss plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")

    # Train and validate model
    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        val_loss = validate_model(model, dev_loader, criterion, device)
        scheduler.step(val_loss)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Validation Loss = {val_loss:.4f}")

    # Predict on test set
    predictions = predict(model, test_loader, device)
    test_df['labels'] = predictions

    # Save the predictions.
    # NOTE(review): writes every column of the test frame; trim to the
    # required submission format if one is specified.
    test_df.to_csv(output_csv, index=False)

In [18]:
# Run the full train / validate / predict pipeline only when this file is
# executed directly, not when its definitions are imported elsewhere.
if __name__ == "__main__":
    main()