In [2]:
!pip install torch torchvision transformers pandas pillow python-multipart scikit-learn

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 

In [1]:
import os
import json
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
import torchvision
from torchvision import transforms
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
class MemeDataset(Dataset):
    def __init__(self, df, base_img_path, tokenizer, max_length, mode='train'):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode
        
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        self.image_paths = []
        if mode == 'train':
            for _, row in df.iterrows():
                folder = 'Hate' if row['label'] == 1 else 'No Hate'
                self.image_paths.append(os.path.join(base_img_path, folder, row['index']))
        else:
            self.image_paths = [os.path.join(base_img_path, fname) for fname in df['index']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx]).convert('RGB')
        img = self.transform(img)

        text = self.df.iloc[idx]['text']
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'image': img,
            'index': self.df.iloc[idx]['index']
        }

        if 'label' in self.df.columns:
            item['label'] = torch.tensor(self.df.iloc[idx]['label'], dtype=torch.long)

        return item


In [3]:
class MultimodalModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained('bert-base-uncased')
        self.image_encoder = torchvision.models.resnet50(pretrained=True)
        self.image_encoder = nn.Sequential(*list(self.image_encoder.children())[:-1])

        self.classifier = nn.Sequential(
            nn.Linear(768 + 2048, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 2)
        )

    def forward(self, input_ids, attention_mask, image):
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)

        image_features = self.image_encoder(image)
        image_features = image_features.view(image_features.size(0), -1)

        combined = torch.cat([text_features, image_features], dim=1)
        return self.classifier(combined)


In [4]:
TRAIN_CSV = '/kaggle/input/subtask-a-train/STask_A_train.csv'
VAL_CSV = '/kaggle/input/subtask-a-eval/STask-A(indextext)val.csv'  # used only for test time
TRAIN_IMG_DIR = '/kaggle/input/subtask-a-train/Subtask A Train/Subtask A Train'
VAL_IMG_DIR = '/kaggle/input/subtask-a-eval/Subtask A Eval/STask_A_val_img'
BATCH_SIZE = 8
MAX_LEN = 128
EPOCHS = 5
LR = 2e-5

# Load labeled training data
full_df = pd.read_csv(TRAIN_CSV)

# Split train/val from labeled data
train_df, val_df = train_test_split(full_df, test_size=0.2, stratify=full_df['label'], random_state=42)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_dataset = MemeDataset(train_df, TRAIN_IMG_DIR, tokenizer, MAX_LEN, 'train')
val_dataset = MemeDataset(val_df, TRAIN_IMG_DIR, tokenizer, MAX_LEN, 'train')

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
optimizer = AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-05-12 05:29:19.709513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747027759.890638      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747027759.942125      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 204MB/s]


In [5]:
from sklearn.metrics import f1_score, accuracy_score

best_f1 = 0.0
save_path = 'best_multimodal_model.pth'

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        optimizer.zero_grad()

        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'image': batch['image'].to(device)
        }
        labels = batch['label'].to(device)

        outputs = model(**inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} | Avg Training Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} - Validation"):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'image': batch['image'].to(device)
            }
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    f1 = f1_score(all_labels, all_preds, average='macro')
    acc = accuracy_score(all_labels, all_preds)

    print(f"Epoch {epoch+1} | Validation Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), save_path)
        print(f"✅ Best model saved with F1: {f1:.4f}")


Epoch 1 - Training: 100%|██████████| 405/405 [04:36<00:00,  1.46it/s]



Epoch 1 | Avg Training Loss: 0.6358


Epoch 1 - Validation: 100%|██████████| 102/102 [00:47<00:00,  2.14it/s]


Epoch 1 | Validation Accuracy: 0.7136 | F1 Score: 0.7136
✅ Best model saved with F1: 0.7136


Epoch 2 - Training: 100%|██████████| 405/405 [04:01<00:00,  1.68it/s]



Epoch 2 | Avg Training Loss: 0.4274


Epoch 2 - Validation: 100%|██████████| 102/102 [00:38<00:00,  2.63it/s]


Epoch 2 | Validation Accuracy: 0.6963 | F1 Score: 0.6916


Epoch 3 - Training: 100%|██████████| 405/405 [04:01<00:00,  1.68it/s]



Epoch 3 | Avg Training Loss: 0.1546


Epoch 3 - Validation: 100%|██████████| 102/102 [00:38<00:00,  2.63it/s]


Epoch 3 | Validation Accuracy: 0.7086 | F1 Score: 0.7084


Epoch 4 - Training: 100%|██████████| 405/405 [04:01<00:00,  1.68it/s]



Epoch 4 | Avg Training Loss: 0.0803


Epoch 4 - Validation: 100%|██████████| 102/102 [00:39<00:00,  2.61it/s]


Epoch 4 | Validation Accuracy: 0.7284 | F1 Score: 0.7284
✅ Best model saved with F1: 0.7284


Epoch 5 - Training: 100%|██████████| 405/405 [04:02<00:00,  1.67it/s]



Epoch 5 | Avg Training Loss: 0.0443


Epoch 5 - Validation: 100%|██████████| 102/102 [00:39<00:00,  2.61it/s]

Epoch 5 | Validation Accuracy: 0.7235 | F1 Score: 0.7232





In [6]:
# Load best model
model.load_state_dict(torch.load('best_multimodal_model.pth'))
model.eval()

# Inference on unlabeled test set
test_df = pd.read_csv(VAL_CSV)
test_dataset = MemeDataset(test_df, VAL_IMG_DIR, tokenizer, MAX_LEN, mode='eval')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Prediction"):
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'image': batch['image'].to(device)
        }
        outputs = model(**inputs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

        for idx, pred in zip(batch['index'], preds):
            predictions.append({'index': idx, 'prediction': int(pred)})

# Save submission
with open('submission.json', 'w') as f:
    for pred in predictions:
        json.dump(pred, f)
        f.write('\n')

!zip -j ref.zip submission.json
print("✅ Final predictions written to ref.zip")


  model.load_state_dict(torch.load('best_multimodal_model.pth'))
Final Test Prediction: 100%|██████████| 64/64 [00:29<00:00,  2.18it/s]

  adding: submission.json (deflated 91%)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Final predictions written to ref.zip
