In [1]:
import os
import json
import torch
import numpy as np
from PIL import Image
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import SiglipModel, SiglipProcessor

In [2]:
import torch.optim as optim  # Add this with other imports

In [4]:
# Step 3: Extract the dataset from the zip file
import zipfile
import os

# Path to your zip file in Google Drive
zip_path = '/content/drive/MyDrive/FBHM.zip'  # Update with your zip file path
extract_path = '/content/dataset'  # Directory to extract files

# Create extraction directory if it doesn't exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Dataset extracted to: {extract_path}")

Dataset extracted to: /content/dataset


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load dataset
DATASET_PATH = "/content/dataset/FBHM/data"
with open("/content/balanced_new_siglip.json", "r") as f:
    dataset = json.load(f)

In [8]:
# Extract paths, texts, and labels
image_paths = [os.path.join(DATASET_PATH, data["img"]) for data in dataset]
texts = [f"{data['text']} {data['caption']}" for data in dataset]  # Combine text fields
labels = [data["label"] for data in dataset]


In [9]:
# Split dataset
train_paths, test_paths, train_texts, test_texts, train_labels, test_labels = train_test_split(
    image_paths, texts, labels, test_size=0.2, random_state=42
)

In [10]:
# Initialize SigLIP components with a larger model
siglip_processor = SiglipProcessor.from_pretrained("google/siglip-large-patch16-384")
siglip_model = SiglipModel.from_pretrained("google/siglip-large-patch16-384")

# Note: This model uses 384x384 resolution and has larger hidden_size (1152 vs 768),
# which may improve feature quality but increases memory usage
print("Vision Config:", siglip_model.config.vision_config)
print("Text Config:", siglip_model.config.text_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

Vision Config: SiglipVisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 1024,
  "image_size": 384,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-06,
  "model_type": "siglip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "transformers_version": "4.48.3"
}

Text Config: SiglipTextConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 49406,
  "eos_token_id": 49407,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 1024,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-06,
  "max_position_embeddings": 64,
  "model_type": "siglip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "transformers_version": "4.48.3",
  "vocab_size": 32000
}



In [11]:
class MemeDataset(Dataset):
    def __init__(self, image_paths, texts, labels, processor):
        self.image_paths = image_paths
        self.texts = texts
        self.labels = labels
        self.processor = processor
        self.pad_token_id = processor.tokenizer.pad_token_id

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and process image
        image = Image.open(self.image_paths[idx]).convert("RGB")
        text = self.texts[idx]

        # Process inputs (larger model uses 384x384 images)
        inputs = self.processor(
            text=text,
            images=image,
            return_tensors="pt",
            padding="max_length",
            max_length=64,
            truncation=True
        )

        # Ensure attention mask is present
        if 'attention_mask' not in inputs:
            inputs['attention_mask'] = (inputs['input_ids'] != self.pad_token_id).long()

        return {
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [12]:
# Create datasets and dataloaders
train_dataset = MemeDataset(train_paths, train_texts, train_labels, siglip_processor)
test_dataset = MemeDataset(test_paths, test_texts, test_labels, siglip_processor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [13]:
class MultimodalMemeClassifier(nn.Module):
    def __init__(self, siglip_model, num_classes=2):
        super().__init__()
        self.siglip = siglip_model

        # Freeze SigLIP parameters
        for param in self.siglip.parameters():
            param.requires_grad = False

        # Get embedding dimensions (1152 for large model)
        self.image_embed_dim = self.siglip.config.vision_config.hidden_size  # 1152
        self.text_embed_dim = self.siglip.config.text_config.hidden_size     # 1152

        assert self.image_embed_dim == self.text_embed_dim, "Embedding dimensions must match!"

        # Classifier adjusted for larger input (1152 + 1152 = 2304)
        self.classifier = nn.Sequential(
            nn.Linear(self.image_embed_dim + self.text_embed_dim, 512),  # 2304 -> 512
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, pixel_values, input_ids, attention_mask):
        outputs = self.siglip(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Combine embeddings
        combined = torch.cat([outputs.image_embeds, outputs.text_embeds], dim=1)
        return self.classifier(combined)

In [None]:
print(siglip_model.config.vision_config)  # Shows vision-specific parameters
print(siglip_model.config.text_config)    # Shows text-specific parameters

SiglipVisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 768,
  "image_size": 224,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "model_type": "siglip_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "transformers_version": "4.48.3"
}

SiglipTextConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 49406,
  "eos_token_id": 49407,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 768,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "max_position_embeddings": 64,
  "model_type": "siglip_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "transformers_version": "4.48.3",
  "vocab_size": 32000
}



In [15]:
# Initialize training components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalMemeClassifier(siglip_model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)

print(f"Using device: {device}")

Using device: cuda


In [None]:
# Re-initialize datasets with updated class
train_dataset = MemeDataset(train_paths, train_texts, train_labels, siglip_processor)
test_dataset = MemeDataset(test_paths, test_texts, test_labels, siglip_processor)

# Check one sample
sample = train_dataset[0]
print("Sample keys:", sample.keys())  # Should show all required keys

Sample keys: dict_keys(['pixel_values', 'input_ids', 'attention_mask', 'label'])


In [16]:
def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
        labels = batch['label'].to(device)

        outputs = model(**inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    return avg_loss

In [17]:
def validate(model, test_loader, device):
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            outputs = model(**inputs)
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_labels.extend(batch['label'].numpy())

    val_acc = accuracy_score(val_labels, val_preds)
    return val_acc, val_preds, val_labels

In [18]:
def train_model(model, train_loader, test_loader, optimizer, criterion, device, epochs=10):
    best_accuracy = 0
    for epoch in range(epochs):
        # Training phase
        avg_loss = train_epoch(model, train_loader, optimizer, criterion, device)

        # Validation phase
        val_acc, _, _ = validate(model, test_loader, device)

        print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Save best model
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), "best_model_large.pth")

    print(f"\nBest Validation Accuracy: {best_accuracy:.4f}")

In [19]:
# Run training
train_model(model, train_loader, test_loader, optimizer, criterion, device, epochs=10)

Epoch 1/10 | Loss: 0.6547 | Val Acc: 0.6965
Epoch 2/10 | Loss: 0.5670 | Val Acc: 0.7155
Epoch 3/10 | Loss: 0.5240 | Val Acc: 0.7144
Epoch 4/10 | Loss: 0.4923 | Val Acc: 0.7292
Epoch 5/10 | Loss: 0.4698 | Val Acc: 0.7281
Epoch 6/10 | Loss: 0.4543 | Val Acc: 0.7313
Epoch 7/10 | Loss: 0.4344 | Val Acc: 0.7260
Epoch 8/10 | Loss: 0.4222 | Val Acc: 0.7345
Epoch 9/10 | Loss: 0.4099 | Val Acc: 0.7323
Epoch 10/10 | Loss: 0.3947 | Val Acc: 0.7302

Best Validation Accuracy: 0.7345


In [None]:
# Training Loop with Validation
def train_model(model, train_loader, test_loader, epochs=10):
    best_accuracy = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                outputs = model(**inputs)
                val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                val_labels.extend(batch['label'].numpy())

        val_acc = accuracy_score(val_labels, val_preds)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f}")

        # Save best model
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), "best_model.pth")

    print(f"\nBest Validation Accuracy: {best_accuracy:.4f}")


In [None]:
# Recreate dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalMemeClassifier(siglip_model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)

# Run training
train_model(model, train_loader, test_loader, epochs=10)

Epoch 1/10 | Loss: 0.6762 | Val Acc: 0.6038
Epoch 2/10 | Loss: 0.6183 | Val Acc: 0.6797
Epoch 3/10 | Loss: 0.5799 | Val Acc: 0.6965
Epoch 4/10 | Loss: 0.5584 | Val Acc: 0.7039
Epoch 5/10 | Loss: 0.5385 | Val Acc: 0.6986
Epoch 6/10 | Loss: 0.5250 | Val Acc: 0.6955
Epoch 7/10 | Loss: 0.5122 | Val Acc: 0.7092
Epoch 8/10 | Loss: 0.4995 | Val Acc: 0.7071
Epoch 9/10 | Loss: 0.4915 | Val Acc: 0.7050
Epoch 10/10 | Loss: 0.4803 | Val Acc: 0.7113

Best Validation Accuracy: 0.7113


In [None]:
model.load_state_dict(torch.load("best_model.pth"))

In [None]:
# Final Evaluation

model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
        outputs = model(**inputs)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_labels.extend(batch['label'].numpy())

print("\nFinal Test Results:")
print(f"Accuracy: {accuracy_score(test_labels, test_preds):.4f}")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))


Final Test Results:
Accuracy: 0.7113
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       469
           1       0.75      0.64      0.69       480

    accuracy                           0.71       949
   macro avg       0.72      0.71      0.71       949
weighted avg       0.72      0.71      0.71       949

Confusion Matrix:
[[369 100]
 [174 306]]
