In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer, BertModel
from torchvision.models import ResNet50_Weights

# --- Hyperparameters ---
BATCH_SIZE = 8  # Reduced for faster training
EPOCHS = 5
LR = 1e-4


**Path**

In [26]:
TRAIN_CSV = '/content/drive/MyDrive/Project 3/train.csv'
DEV_CSV = '/content/drive/MyDrive/Project 3/dev.csv'
TRAIN_IMG_DIR = '/content/drive/MyDrive/Project 3/train'
DEV_IMG_DIR = '/content/drive/MyDrive/Project 3/dev'
TEST_IMG_DIR = '/content/drive/MyDrive/Project 3/test'


In [27]:
import pandas as pd

df = pd.read_csv(TRAIN_CSV)
print(df.head())
print(df.columns)


  image_name        labels                  transcriptions
0   1134.jpg      Misogyny  如果结婚是好事的话 按照我国的国情 应该托关系走后门才能结婚
1    901.jpg  Not-Misogyny                        狠狠期待 好了嘛
2    683.jpg      Misogyny         为什么不生孩子？ 让男的生呗 他们不是先生嘛？
3    768.jpg  Not-Misogyny        搞不懂呢 你知道滴 我不过是 来自大山深处的吗喽
4    178.jpg  Not-Misogyny                 买完同款买同款 家里可以开展览
Index(['image_name', 'labels', 'transcriptions'], dtype='object')


In [28]:
import pandas as pd

df = pd.read_csv(DEV_CSV)
print(df.head())
print(df.columns)


  image_name        labels                                     transcriptions
0    423.jpg  Not-Misogyny                                   想我的生活...总是差一点就顺呐
1    204.jpg      Misogyny                 孩子不听话是你不对 孩子学习成绩不好 还是你不对 EN EN 听我说
2    571.jpg      Misogyny                       同事： 你啥时候生孩子啊 我： 你是不是着急投胎叫我妈啊
3    323.jpg  Not-Misogyny                                              去你的调休
4   1403.jpg  Not-Misogyny  国庆7天+中秋3天 调休完等于总共放4天 上次遇到这种7+3=4的题目 还是树上骑个猴地上三...
Index(['image_name', 'labels', 'transcriptions'], dtype='object')


**Dataset**

In [29]:
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir, tokenizer, transform, is_test=False):
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.transform = transform
        self.is_test = is_test

        if not is_test:
            self.data = pd.read_csv(csv_file)
        else:
            self.image_names = sorted([f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])

    def __len__(self):
        return len(self.image_names) if self.is_test else len(self.data)

    def __getitem__(self, idx):
        if self.is_test:
            image_name = self.image_names[idx]
            image_path = os.path.join(self.image_dir, image_name)
            image = self.transform(Image.open(image_path).convert('RGB'))
            dummy_text = ""
            encoded_text = self.tokenizer(dummy_text, padding='max_length', truncation=True, max_length=64, return_tensors='pt')
            return image, encoded_text['input_ids'].squeeze(0), encoded_text['attention_mask'].squeeze(0), image_name
        else:
            image_name = self.data.iloc[idx]['image_name']
            image_path = os.path.join(self.image_dir, image_name)
            image = self.transform(Image.open(image_path).convert('RGB'))
            transcription = self.data.iloc[idx]['transcriptions']
            encoded_text = self.tokenizer(transcription, padding='max_length', truncation=True, max_length=64, return_tensors='pt')
            label_str = self.data.iloc[idx]['labels']
            label = 1 if label_str.lower() == 'misogyny' else 0
            return image, encoded_text['input_ids'].squeeze(0), encoded_text['attention_mask'].squeeze(0), label, image_name

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
sample_dataset = MemeDataset(TRAIN_CSV, TRAIN_IMG_DIR, tokenizer, transform)
sample_item = sample_dataset[0]
print("✅ Sample image tensor shape:", sample_item[0].shape)
print("✅ Sample text input ids shape:", sample_item[1].shape)
print("✅ Sample label:", sample_item[3])


✅ Sample image tensor shape: torch.Size([3, 224, 224])
✅ Sample text input ids shape: torch.Size([64])
✅ Sample label: 1


**Model**

In [31]:
class MultimodalClassifier(nn.Module):
    def __init__(self):
        super(MultimodalClassifier, self).__init__()
        self.image_model = models.resnet50(weights=ResNet50_Weights.DEFAULT)
        self.image_model.fc = nn.Identity()
        for param in self.image_model.parameters():  # Freeze ResNet
            param.requires_grad = False

        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        for param in self.text_model.parameters():  # Freeze BERT
            param.requires_grad = False

        self.classifier = nn.Sequential(
            nn.Linear(2048 + 768, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 2)
        )

    def forward(self, img, input_ids, attention_mask):
        img_feat = self.image_model(img)
        text_feat = self.text_model(input_ids=input_ids, attention_mask=attention_mask).pooler_output.detach()
        combined = torch.cat((img_feat, text_feat), dim=1)
        return self.classifier(combined)

In [32]:
model = MultimodalClassifier()
img, ids, mask, label, _ = sample_dataset[0]
with torch.no_grad():
    out = model(img.unsqueeze(0), ids.unsqueeze(0), mask.unsqueeze(0))
print("✅ Model output shape:", out.shape)


✅ Model output shape: torch.Size([1, 2])


**Initialize**

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

model = MultimodalClassifier().to(device)
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)  # Only train classifier
criterion = nn.CrossEntropyLoss()

**Dataloader**

In [34]:
train_dataset = MemeDataset(TRAIN_CSV, TRAIN_IMG_DIR, tokenizer, transform)
dev_dataset = MemeDataset(DEV_CSV, DEV_IMG_DIR, tokenizer, transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)

In [35]:
sample_batch = next(iter(train_loader))
print("✅ Batch image shape:", sample_batch[0].shape)
print("✅ Batch input_ids shape:", sample_batch[1].shape)
print("✅ Batch labels:", sample_batch[3])


✅ Batch image shape: torch.Size([8, 3, 224, 224])
✅ Batch input_ids shape: torch.Size([8, 64])
✅ Batch labels: tensor([0, 0, 0, 0, 0, 0, 1, 0])


**Training**

In [19]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for img, ids, mask, labels, _ in tqdm(train_loader):
        img, ids, mask, labels = img.to(device), ids.to(device), mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(img, ids, mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")

100%|██████████| 149/149 [15:55<00:00,  6.41s/it]


Epoch 1/5 - Loss: 0.5570


100%|██████████| 149/149 [09:38<00:00,  3.88s/it]


Epoch 2/5 - Loss: 0.4731


100%|██████████| 149/149 [09:35<00:00,  3.86s/it]


Epoch 3/5 - Loss: 0.4247


100%|██████████| 149/149 [09:36<00:00,  3.87s/it]


Epoch 4/5 - Loss: 0.3853


100%|██████████| 149/149 [09:37<00:00,  3.88s/it]

Epoch 5/5 - Loss: 0.3853





**Inference**

In [20]:
test_dataset = MemeDataset(None, TEST_IMG_DIR, tokenizer, transform, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model.eval()
predictions = []
with torch.no_grad():
    for img, ids, mask, image_names in tqdm(test_loader):
        img, ids, mask = img.to(device), ids.to(device), mask.to(device)
        outputs = model(img, ids, mask)
        preds = torch.argmax(outputs, dim=1).cpu().tolist()
        predictions.extend(zip(image_names, preds))

100%|██████████| 43/43 [02:57<00:00,  4.12s/it]


In [22]:
submission = pd.DataFrame(predictions, columns=['image_name', 'labels'])
submission['labels'] = submission['labels'].map({1: 'Misogyny', 0: 'Not-Misogyny'})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as 'SSNCSE_run1.csv'")

✅ Submission file saved as 'SSNCSE_run1.csv'


**Classification report for Development data**

In [23]:
from sklearn.metrics import classification_report

# --- Evaluation on Dev Set ---
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for img, ids, mask, labels, _ in tqdm(dev_loader):
        img, ids, mask = img.to(device), ids.to(device), mask.to(device)
        outputs = model(img, ids, mask)
        preds = torch.argmax(outputs, dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.tolist())

# --- Classification Report ---
report = classification_report(all_labels, all_preds, target_names=['Not-Misogyny', 'Misogyny'])
print("📊 Classification Report on Dev Set:")
print(report)


100%|██████████| 22/22 [02:34<00:00,  7.01s/it]

📊 Classification Report on Dev Set:
              precision    recall  f1-score   support

Not-Misogyny       0.86      0.92      0.89       123
    Misogyny       0.74      0.62      0.67        47

    accuracy                           0.84       170
   macro avg       0.80      0.77      0.78       170
weighted avg       0.83      0.84      0.83       170




