In [15]:
import os
import pandas as pd
from PIL import Image
import torch
from lavis.models import load_model_and_preprocess

# Load LAVIS VQA model and preprocessors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_vqa", model_type="vqav2", is_eval=True, device=device
)

# Define VQA questions (each question text becomes one output column)
questions = [
    "What is the race of the person in the image?",
    "What is the gender of the person in the image?",
    "What is the religion of the person in the image?",
    "Which country does the person in the image come from?",
    "Are there disabled people in the image?",
    "What animal is in the image?",
    "Is there a person in the image?",
    "Is there an animal in the image?",
]

# Define a generic answer list (passed unchanged to every predict_answers call)
answer_list = [
    "Caucasian", "Asian", "African", "Male", "Female", "Christianity", "Islam", "Judaism",
    "USA", "India", "China", "Yes", "No", "Dog", "Cat", "Bird", "Person", "Animal"
]

# Load CSV file
csv_path = "/kaggle/input/labelandcap/merged_data.csv"
output_csv_path = "output_vqa_answers.csv"
df = pd.read_csv(csv_path)

# Image folder path
image_folder = "/kaggle/input/requirement/resources/resources/datasets/harmeme/img"

# Generate VQA answers: exactly one dict per CSV row, appended in row order,
# so the answers can be joined back onto `df` afterwards.
answers = []

for index, row in df.iterrows():
    image_filename = row['image']
    image_path = os.path.join(image_folder, image_filename)

    if not os.path.exists(image_path):
        # Keep the row, but mark every answer column with a sentinel value.
        print(f"Image not found: {image_path}")
        answers.append({question: "Image Not Found" for question in questions})
        continue

    try:
        # Load image. The context manager closes the underlying file as soon
        # as the RGB copy exists instead of leaving the handle open until GC.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        processed_image = vis_processors["eval"](image).unsqueeze(0).to(device)

        # Generate answers for each question. This is inference only, so
        # autograd bookkeeping is switched off for the whole question loop.
        answer_dict = {}
        with torch.no_grad():
            for question in questions:
                inputs = {"image": processed_image, "text_input": question}
                answer = model.predict_answers(
                    samples=inputs, num_beams=5, num_answers=1, answer_list=answer_list
                )
                answer_dict[question] = answer[0]  # Top answer
        answers.append(answer_dict)

    except Exception as e:
        # Deliberate best-effort: one unreadable image must not abort the run.
        # The row is kept and flagged with a sentinel value instead.
        print(f"Error processing image {image_filename}: {e}")
        answers.append({question: "Error Processing Image" for question in questions})

# Merge VQA answers with the original DataFrame. Reusing df's index makes the
# column-wise concat line up row by row even if df's index is not 0..n-1.
answers_df = pd.DataFrame(answers, index=df.index)
df = pd.concat([df, answers_df], axis=1)

# Save updated DataFrame to a new CSV file
df.to_csv(output_csv_path, index=False)
print(f"Updated CSV file saved at: {output_csv_path}")

  checkpoint = torch.load(cached_file, map_location="cpu")


Updated CSV file saved at: output_vqa_answers.csv


In [1]:
# Pinned to the commit this install resolved to on the recorded run, so a
# re-run installs the same CLIP code.
%pip install git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-e86qe0iy
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-e86qe0iy
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=f282ced5a0020855e975f65900b4d5c064a7b32e8a5688ed8ef01a7c17688017
  Stored in directory: /tmp/pip-ephem-wheel-cache-5mpl4kb9/wheels/da/2b/4c/d6691fa9597aac8bb

In [4]:
# Pinned to the version installed on the recorded run so re-runs are reproducible.
%pip install torch-geometric==2.6.1 -f https://data.pyg.org/whl/torch-2.0.1+cu118.html

Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu118.html
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Note: you may need to restart the kernel to use updated packages.


In [43]:
import torch
import pandas as pd

csv_path = "/kaggle/input/with-vqa-data/output_vqa_answers (2).csv"
df = pd.read_csv(csv_path)

embedding_path = "/kaggle/input/combine-embedding/combined_embeddings.pt"
# NOTE(review): torch.load unpickles the file; only load embedding files from a trusted source.
embeddings = torch.load(embedding_path)

# Partition the metadata rows by their "split" column.
train_df = df[df["split"] == "train"]
val_df = df[df["split"] == "val"]
test_df = df[df["split"] == "test"]

# Build each split's image names once as a set, so assigning an embedding
# record to a split is a hash lookup rather than a scan of the whole column
# for every record.
train_images = set(train_df["image"])
val_images = set(val_df["image"])
test_images = set(test_df["image"])

# Each record is matched to a split through its "image" key; records whose
# image appears in no split are dropped.
train_embeddings = [e for e in embeddings if e["image"] in train_images]
val_embeddings = [e for e in embeddings if e["image"] in val_images]
test_embeddings = [e for e in embeddings if e["image"] in test_images]

print(f"Train embeddings: {len(train_embeddings)}")
print(f"Validation embeddings: {len(val_embeddings)}")
print(f"Test embeddings: {len(test_embeddings)}")

  embeddings = torch.load(embedding_path)


Train embeddings: 3013
Validation embeddings: 177
Test embeddings: 354


In [44]:
# Persist each split to the working directory so later cells can reload it.
for split_embeddings, split_path in (
    (train_embeddings, "/kaggle/working/train_embeddings.pt"),
    (val_embeddings, "/kaggle/working/val_embeddings.pt"),
    (test_embeddings, "/kaggle/working/test_embeddings.pt"),
):
    torch.save(split_embeddings, split_path)

print("Embeddings saved successfully!")

Embeddings saved successfully!


In [45]:
# Reload the three per-split files from the working directory.
train_embeddings, val_embeddings, test_embeddings = [
    torch.load(split_path)
    for split_path in (
        "/kaggle/working/train_embeddings.pt",
        "/kaggle/working/val_embeddings.pt",
        "/kaggle/working/test_embeddings.pt",
    )
]

print(f"Train: {len(train_embeddings)}, Validation: {len(val_embeddings)}, Test: {len(test_embeddings)}")

  train_embeddings = torch.load("/kaggle/working/train_embeddings.pt")


Train: 3013, Validation: 177, Test: 354


  val_embeddings = torch.load("/kaggle/working/val_embeddings.pt")
  test_embeddings = torch.load("/kaggle/working/test_embeddings.pt")


MAX_MODEL = IMAGE + TEXT + CAPTION + VQA

In [46]:
from torch_geometric.data import Data

def create_graphs_from_embeddings(embeddings):
    """Turn every embedding record into a fully connected ``Data`` graph.

    Each record yields one node per vector: the image, caption and meme-text
    embeddings, followed by every entry of ``vqa_embeddings``.

    Args:
        embeddings: iterable of records indexable by the keys
            ``image_embedding``, ``caption_embedding``,
            ``meme_text_embedding``, ``vqa_embeddings`` and ``label``.

    Returns:
        A list with one ``Data`` object per record, in input order.
    """

    def build_graph(record):
        vectors = [
            record["image_embedding"],
            record["caption_embedding"],
            record["meme_text_embedding"],
        ]
        vectors.extend(record["vqa_embeddings"])
        x = torch.stack(vectors)  # first dimension indexes the nodes

        # One directed edge for every ordered pair of distinct nodes.
        node_count = x.size(0)
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        y = torch.tensor(record["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [build_graph(record) for record in embeddings]


# Build one graph list per split using every available node type.
train_graphs, val_graphs, test_graphs = [
    create_graphs_from_embeddings(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
]

In [47]:
from torch_geometric.loader import DataLoader


batch_size = 32
# Only the training loader reshuffles; validation and test keep a fixed order.
train_loader, val_loader, test_loader = [
    DataLoader(split_graphs, batch_size=batch_size, shuffle=reshuffle)
    for split_graphs, reshuffle in (
        (train_graphs, True),
        (val_graphs, False),
        (test_graphs, False),
    )
]

In [48]:
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch.optim import Adam
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np


def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Define the Enhanced GAT model
class EnhancedGAT(torch.nn.Module):
    """Three stacked GATConv layers, mean pooling, and a two-layer MLP head.

    Args:
        input_dim: width of each node feature vector.
        hidden_dim: per-head output width of every GATConv layer.
        output_dim: number of classes produced by the final linear layer.
        num_heads: attention heads per GATConv layer.
        dropout: dropout probability applied after the first two conv layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_heads=4, dropout=0.5):
        super().__init__()
        # conv1 and conv2 concatenate their heads, so the next layer's input
        # is hidden_dim * num_heads wide.
        concat_width = hidden_dim * num_heads
        self.conv1 = GATConv(input_dim, hidden_dim, heads=num_heads, concat=True)
        self.conv2 = GATConv(concat_width, hidden_dim, heads=num_heads, concat=True)
        # conv3 does not concatenate, so fc1 receives hidden_dim features.
        self.conv3 = GATConv(concat_width, hidden_dim, heads=num_heads, concat=False)
        self.fc1 = Linear(hidden_dim, hidden_dim // 2)
        self.fc2 = Linear(hidden_dim // 2, output_dim)
        self.dropout = dropout

    def forward(self, data):
        """Return per-graph log-probabilities (log_softmax over dim 1)."""
        x = data.x
        edge_index = data.edge_index
        batch = data.batch
        # Drop a singleton middle axis if the node features arrive as 3-D.
        if x.dim() == 3:
            x = x.squeeze(1)

        for conv in (self.conv1, self.conv2):
            x = F.elu(conv(x, edge_index))
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv3(x, edge_index))

        pooled = global_mean_pool(x, batch)  # Graph-level pooling
        hidden = F.relu(self.fc1(pooled))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


# Data loaders for the full graph variant; only training is shuffled.
train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=32, shuffle=False)
test_loader = DataLoader(test_graphs, batch_size=32, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Node feature width, read from the first training graph. The previous
# `graphs[0]` referred to a name that is never defined at notebook scope, so
# the cell failed with NameError on a fresh kernel.
input_dim = train_graphs[0].x.size(-1)
hidden_dim = 128
output_dim = 2  # Binary classification
model = EnhancedGAT(input_dim, hidden_dim, output_dim).to(device)
print(f"Total trainable parameters in the model: {count_parameters(model):,}")

optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
# Halve the learning rate every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)


# Early-stopping state: best validation loss so far, and the number of
# consecutive epochs without improvement compared against `patience`.
best_val_loss = float('inf')
patience = 5
early_stop_counter = 0

# Training and validation loop
# Runs up to `epochs` epochs. Each epoch does one pass over train_loader with
# gradient updates, then one pass over val_loader without gradients. The
# weights with the lowest validation loss so far are written to "best_model.pt".
epochs = 50
print("Starting training...")
for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Mean of the per-batch losses (sum divided by number of batches).
    train_loss /= len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model(batch)
            loss = criterion(out, batch.y)
            val_loss += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels)
    val_loss /= len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Early stopping logic
    # A strict improvement resets the counter and overwrites the checkpoint;
    # otherwise the counter grows and training stops once it reaches `patience`.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Not reached on the epoch that triggers early stopping (the break above).
    scheduler.step()

print("Training complete.")

# Test phase
print("Starting testing...")
# Restore the checkpoint with the lowest validation loss. `weights_only=True`
# limits unpickling to tensors and plain containers, which is all a state_dict
# holds, and avoids the torch.load warning seen in this cell's output.
# `map_location` keeps the load working if the checkpoint was written on a
# different device.
model.load_state_dict(
    torch.load("best_model.pt", map_location=device, weights_only=True)
)
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(labels)

test_accuracy = accuracy_score(test_labels, test_preds)
conf_matrix = confusion_matrix(test_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Total trainable parameters in the model: 799,042
Starting training...
Epoch 1/50, Train Loss: 0.5033, Validation Loss: 0.4206, Validation Accuracy: 0.8362
Epoch 2/50, Train Loss: 0.4333, Validation Loss: 0.5028, Validation Accuracy: 0.7458
Epoch 3/50, Train Loss: 0.3908, Validation Loss: 0.5059, Validation Accuracy: 0.7345
Epoch 4/50, Train Loss: 0.3517, Validation Loss: 0.4698, Validation Accuracy: 0.7853
Epoch 5/50, Train Loss: 0.3296, Validation Loss: 0.4883, Validation Accuracy: 0.8192
Epoch 6/50, Train Loss: 0.2863, Validation Loss: 0.6922, Validation Accuracy: 0.7853
Early stopping at epoch 6
Training complete.
Starting testing...
Test Accuracy: 0.8503
Confusion Matrix:
[[204  26]
 [ 27  97]]


  model.load_state_dict(torch.load("best_model.pt"))


Without VQA

In [63]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch.optim import Adam
from sklearn.metrics import accuracy_score, confusion_matrix


def create_graphs_without_vqa(embeddings):
    """Build fully connected ``Data`` graphs that omit the VQA nodes.

    Each record yields three nodes: the image, caption and meme-text
    embeddings.

    Args:
        embeddings: iterable of records indexable by the keys
            ``image_embedding``, ``caption_embedding``,
            ``meme_text_embedding`` and ``label``.

    Returns:
        A list with one ``Data`` object per record, in input order.
    """

    def build_graph(record):
        vectors = [
            record["image_embedding"],
            record["caption_embedding"],
            record["meme_text_embedding"],
        ]
        x = torch.stack(vectors)  # first dimension indexes the nodes

        # Create fully connected edges: one per ordered pair of distinct nodes.
        node_count = x.size(0)
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        # Create the graph object.
        y = torch.tensor(record["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [build_graph(record) for record in embeddings]

# Create the train, validation and test graphs (no VQA nodes).
train_graphs_no_vqa, val_graphs_no_vqa, test_graphs_no_vqa = [
    create_graphs_without_vqa(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
]

# Create the data loaders; only the training loader reshuffles.
batch_size = 32
train_loader_no_vqa, val_loader_no_vqa, test_loader_no_vqa = [
    DataLoader(split_graphs, batch_size=batch_size, shuffle=reshuffle)
    for split_graphs, reshuffle in (
        (train_graphs_no_vqa, True),
        (val_graphs_no_vqa, False),
        (test_graphs_no_vqa, False),
    )
]

# Define Enhanced GAT model
class EnhancedGAT(torch.nn.Module):
    """Three stacked GATConv layers, mean pooling, and a two-layer MLP head.

    Args:
        input_dim: width of each node feature vector.
        hidden_dim: per-head output width of every GATConv layer.
        output_dim: number of classes produced by the final linear layer.
        num_heads: attention heads per GATConv layer.
        dropout: dropout probability applied after the first two conv layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_heads=4, dropout=0.5):
        super().__init__()
        # conv1 and conv2 concatenate their heads, so the next layer's input
        # is hidden_dim * num_heads wide.
        concat_width = hidden_dim * num_heads
        self.conv1 = GATConv(input_dim, hidden_dim, heads=num_heads, concat=True)
        self.conv2 = GATConv(concat_width, hidden_dim, heads=num_heads, concat=True)
        # conv3 does not concatenate, so fc1 receives hidden_dim features.
        self.conv3 = GATConv(concat_width, hidden_dim, heads=num_heads, concat=False)
        self.fc1 = Linear(hidden_dim, hidden_dim // 2)
        self.fc2 = Linear(hidden_dim // 2, output_dim)
        self.dropout = dropout

    def forward(self, data):
        """Return per-graph log-probabilities (log_softmax over dim 1)."""
        x = data.x
        edge_index = data.edge_index
        batch = data.batch
        # Drop a singleton middle axis if the node features arrive as 3-D.
        if x.dim() == 3:
            x = x.squeeze(1)

        for conv in (self.conv1, self.conv2):
            x = F.elu(conv(x, edge_index))
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv3(x, edge_index))

        pooled = global_mean_pool(x, batch)  # Graph-level pooling
        hidden = F.relu(self.fc1(pooled))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

# Model, optimizer, and loss function
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
hidden_dim = 128
output_dim = 2  # Binary classification
# Node feature width, read from the first training graph.
input_dim_no_vqa = train_graphs_no_vqa[0].x.shape[-1]
model_no_vqa = EnhancedGAT(
    input_dim=input_dim_no_vqa, hidden_dim=hidden_dim, output_dim=output_dim
).to(device)
optimizer = Adam(params=model_no_vqa.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
# Halve the learning rate every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)

# Early stopping settings
patience = 5
best_val_loss_no_vqa = float('inf')
early_stop_counter_no_vqa = 0

# Training and validation loop
# Runs up to `epochs` epochs. Each epoch does one pass over train_loader_no_vqa
# with gradient updates, then one pass over val_loader_no_vqa without
# gradients. The weights with the lowest validation loss so far are written to
# "best_model_no_vqa.pt".
epochs = 50
print("Starting training without VQA...")
for epoch in range(epochs):
    # Training phase
    model_no_vqa.train()
    train_loss_no_vqa = 0
    for batch in train_loader_no_vqa:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model_no_vqa(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss_no_vqa += loss.item()
    # Mean of the per-batch losses (sum divided by number of batches).
    train_loss_no_vqa /= len(train_loader_no_vqa)

    # Validation phase
    model_no_vqa.eval()
    val_loss_no_vqa = 0
    val_preds_no_vqa, val_labels_no_vqa = [], []
    with torch.no_grad():
        for batch in val_loader_no_vqa:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model_no_vqa(batch)
            loss = criterion(out, batch.y)
            val_loss_no_vqa += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds_no_vqa.extend(preds)
            val_labels_no_vqa.extend(labels)
    val_loss_no_vqa /= len(val_loader_no_vqa)
    val_accuracy_no_vqa = accuracy_score(val_labels_no_vqa, val_preds_no_vqa)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss_no_vqa:.4f}, Validation Loss: {val_loss_no_vqa:.4f}, Validation Accuracy: {val_accuracy_no_vqa:.4f}")

    # Early stopping logic
    # A strict improvement resets the counter and overwrites the checkpoint;
    # otherwise the counter grows and training stops once it reaches `patience`.
    if val_loss_no_vqa < best_val_loss_no_vqa:
        best_val_loss_no_vqa = val_loss_no_vqa
        early_stop_counter_no_vqa = 0
        torch.save(model_no_vqa.state_dict(), "best_model_no_vqa.pt")
    else:
        early_stop_counter_no_vqa += 1
        if early_stop_counter_no_vqa >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Not reached on the epoch that triggers early stopping (the break above).
    scheduler.step()

print("Training without VQA complete.")

# Test phase
print("Starting testing without VQA...")
# Restore the checkpoint with the lowest validation loss. `weights_only=True`
# limits unpickling to tensors and plain containers, which is all a state_dict
# holds, and avoids the torch.load warning seen in this cell's output.
# `map_location` keeps the load working if the checkpoint was written on a
# different device.
model_no_vqa.load_state_dict(
    torch.load("best_model_no_vqa.pt", map_location=device, weights_only=True)
)
model_no_vqa.eval()
test_preds_no_vqa, test_labels_no_vqa = [], []
with torch.no_grad():
    for batch in test_loader_no_vqa:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model_no_vqa(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds_no_vqa.extend(preds)
        test_labels_no_vqa.extend(labels)

test_accuracy_no_vqa = accuracy_score(test_labels_no_vqa, test_preds_no_vqa)
conf_matrix_no_vqa = confusion_matrix(test_labels_no_vqa, test_preds_no_vqa)
print(f"Test Accuracy without VQA: {test_accuracy_no_vqa:.4f}")
print("Confusion Matrix without VQA:")
print(conf_matrix_no_vqa)

Starting training without VQA...
Epoch 1/50, Train Loss: 0.4842, Validation Loss: 0.4364, Validation Accuracy: 0.7627
Epoch 2/50, Train Loss: 0.4095, Validation Loss: 0.5545, Validation Accuracy: 0.7853
Epoch 3/50, Train Loss: 0.3770, Validation Loss: 0.5801, Validation Accuracy: 0.7401
Epoch 4/50, Train Loss: 0.3423, Validation Loss: 0.5402, Validation Accuracy: 0.8136
Epoch 5/50, Train Loss: 0.3164, Validation Loss: 0.5519, Validation Accuracy: 0.7571
Epoch 6/50, Train Loss: 0.2868, Validation Loss: 0.5775, Validation Accuracy: 0.7571
Early stopping at epoch 6
Training without VQA complete.
Starting testing without VQA...
Test Accuracy without VQA: 0.8503
Confusion Matrix without VQA:
[[192  38]
 [ 15 109]]


  model_no_vqa.load_state_dict(torch.load("best_model_no_vqa.pt"))


Without Captioning (BEST MODEL)

In [59]:
from torch_geometric.data import Data


def create_graphs_without_captioning(embeddings):
    """Build fully connected ``Data`` graphs that omit the caption node.

    Each record yields one node for the image embedding, one for the
    meme-text embedding, and one per entry of ``vqa_embeddings``.

    Args:
        embeddings: iterable of records indexable by the keys
            ``image_embedding``, ``meme_text_embedding``,
            ``vqa_embeddings`` and ``label``.

    Returns:
        A list with one ``Data`` object per record, in input order.
    """

    def build_graph(record):
        vectors = [record["image_embedding"], record["meme_text_embedding"]]
        vectors.extend(record["vqa_embeddings"])
        x = torch.stack(vectors)  # first dimension indexes the nodes

        # One directed edge for every ordered pair of distinct nodes.
        node_count = x.size(0)
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        y = torch.tensor(record["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [build_graph(record) for record in embeddings]


# Graphs for the caption-ablation run, one list per split.
train_graphs_no_caption, val_graphs_no_caption, test_graphs_no_caption = [
    create_graphs_without_captioning(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
]


# Only the training loader reshuffles.
batch_size = 32
train_loader_no_caption, val_loader_no_caption, test_loader_no_caption = [
    DataLoader(split_graphs, batch_size=batch_size, shuffle=reshuffle)
    for split_graphs, reshuffle in (
        (train_graphs_no_caption, True),
        (val_graphs_no_caption, False),
        (test_graphs_no_caption, False),
    )
]


# Node feature size, read from the first training graph.
# `hidden_dim`, `output_dim` and `device` are not set in this cell; they must
# already exist in the kernel.
input_dim_no_caption = train_graphs_no_caption[0].x.shape[-1]
model_no_caption = EnhancedGAT(
    input_dim=input_dim_no_caption, hidden_dim=hidden_dim, output_dim=output_dim
).to(device)

optimizer = Adam(params=model_no_caption.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
# Halve the learning rate every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)


# Early-stopping state for this run.
early_stop_counter_no_caption = 0
best_val_loss_no_caption = float('inf')

# Training and validation loop
# `epochs` and `patience` are not set in this cell; they must already exist in
# the kernel. Each epoch does one pass over train_loader_no_caption with
# gradient updates, then one pass over val_loader_no_caption without
# gradients. The weights with the lowest validation loss so far are written to
# "best_model_no_caption.pt".
print("Starting training without Captioning...")
for epoch in range(epochs):
    # Training phase
    model_no_caption.train()
    train_loss_no_caption = 0
    for batch in train_loader_no_caption:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model_no_caption(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss_no_caption += loss.item()
    # Mean of the per-batch losses (sum divided by number of batches).
    train_loss_no_caption /= len(train_loader_no_caption)

    # Validation phase
    model_no_caption.eval()
    val_loss_no_caption = 0
    val_preds_no_caption, val_labels_no_caption = [], []
    with torch.no_grad():
        for batch in val_loader_no_caption:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model_no_caption(batch)
            loss = criterion(out, batch.y)
            val_loss_no_caption += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds_no_caption.extend(preds)
            val_labels_no_caption.extend(labels)
    val_loss_no_caption /= len(val_loader_no_caption)
    val_accuracy_no_caption = accuracy_score(val_labels_no_caption, val_preds_no_caption)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss_no_caption:.4f}, Validation Loss: {val_loss_no_caption:.4f}, Validation Accuracy: {val_accuracy_no_caption:.4f}")

    # Early stopping logic
    # A strict improvement resets the counter and overwrites the checkpoint;
    # otherwise the counter grows and training stops once it reaches `patience`.
    if val_loss_no_caption < best_val_loss_no_caption:
        best_val_loss_no_caption = val_loss_no_caption
        early_stop_counter_no_caption = 0
        torch.save(model_no_caption.state_dict(), "best_model_no_caption.pt")
    else:
        early_stop_counter_no_caption += 1
        if early_stop_counter_no_caption >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Not reached on the epoch that triggers early stopping (the break above).
    scheduler.step()

print("Training without Captioning complete.")

# Test phase
print("Starting testing without Captioning...")
# Restore the checkpoint with the lowest validation loss. `weights_only=True`
# limits unpickling to tensors and plain containers, which is all a state_dict
# holds, and avoids the torch.load warning seen in this cell's output.
# `map_location` keeps the load working if the checkpoint was written on a
# different device.
model_no_caption.load_state_dict(
    torch.load("best_model_no_caption.pt", map_location=device, weights_only=True)
)
model_no_caption.eval()
test_preds_no_caption, test_labels_no_caption = [], []
with torch.no_grad():
    for batch in test_loader_no_caption:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model_no_caption(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds_no_caption.extend(preds)
        test_labels_no_caption.extend(labels)

test_accuracy_no_caption = accuracy_score(test_labels_no_caption, test_preds_no_caption)
conf_matrix_no_caption = confusion_matrix(test_labels_no_caption, test_preds_no_caption)
print(f"Test Accuracy without Captioning: {test_accuracy_no_caption:.4f}")
print("Confusion Matrix without Captioning:")
print(conf_matrix_no_caption)

Starting training without Captioning...
Epoch 1/50, Train Loss: 0.5061, Validation Loss: 0.4751, Validation Accuracy: 0.8023
Epoch 2/50, Train Loss: 0.4165, Validation Loss: 0.5448, Validation Accuracy: 0.7571
Epoch 3/50, Train Loss: 0.3877, Validation Loss: 0.5772, Validation Accuracy: 0.7571
Epoch 4/50, Train Loss: 0.3537, Validation Loss: 0.5575, Validation Accuracy: 0.7288
Epoch 5/50, Train Loss: 0.3133, Validation Loss: 0.5660, Validation Accuracy: 0.7401
Epoch 6/50, Train Loss: 0.2994, Validation Loss: 0.6502, Validation Accuracy: 0.7514
Early stopping at epoch 6
Training without Captioning complete.
Starting testing without Captioning...
Test Accuracy without Captioning: 0.8729
Confusion Matrix without Captioning:
[[202  28]
 [ 17 107]]


  model_no_caption.load_state_dict(torch.load("best_model_no_caption.pt"))


Without Image

In [60]:
from torch_geometric.data import Data


def create_graphs_without_image(embeddings):
    """Build fully connected ``Data`` graphs that omit the image node.

    Each record yields one node for the caption embedding, one for the
    meme-text embedding, and one per entry of ``vqa_embeddings``.

    Args:
        embeddings: iterable of records indexable by the keys
            ``caption_embedding``, ``meme_text_embedding``,
            ``vqa_embeddings`` and ``label``.

    Returns:
        A list with one ``Data`` object per record, in input order.
    """

    def build_graph(record):
        vectors = [record["caption_embedding"], record["meme_text_embedding"]]
        vectors.extend(record["vqa_embeddings"])
        x = torch.stack(vectors)  # first dimension indexes the nodes

        # One directed edge for every ordered pair of distinct nodes.
        node_count = x.size(0)
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        y = torch.tensor(record["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [build_graph(record) for record in embeddings]


# Graphs for the image-ablation run, one list per split.
train_graphs_no_image, val_graphs_no_image, test_graphs_no_image = [
    create_graphs_without_image(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
]


# Only the training loader reshuffles.
batch_size = 32
train_loader_no_image, val_loader_no_image, test_loader_no_image = [
    DataLoader(split_graphs, batch_size=batch_size, shuffle=reshuffle)
    for split_graphs, reshuffle in (
        (train_graphs_no_image, True),
        (val_graphs_no_image, False),
        (test_graphs_no_image, False),
    )
]


# Node feature size, read from the first training graph.
# `hidden_dim`, `output_dim` and `device` are not set in this cell; they must
# already exist in the kernel.
input_dim_no_image = train_graphs_no_image[0].x.shape[-1]
model_no_image = EnhancedGAT(
    input_dim=input_dim_no_image, hidden_dim=hidden_dim, output_dim=output_dim
).to(device)

optimizer = Adam(params=model_no_image.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
# Halve the learning rate every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)


# Early-stopping state for this run.
early_stop_counter_no_image = 0
best_val_loss_no_image = float('inf')

# Training and validation loop
# `epochs` and `patience` are not set in this cell; they must already exist in
# the kernel. Each epoch does one pass over train_loader_no_image with
# gradient updates, then one pass over val_loader_no_image without gradients.
# The weights with the lowest validation loss so far are written to
# "best_model_no_image.pt".
print("Starting training without Image...")
for epoch in range(epochs):
    # Training phase
    model_no_image.train()
    train_loss_no_image = 0
    for batch in train_loader_no_image:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model_no_image(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss_no_image += loss.item()
    # Mean of the per-batch losses (sum divided by number of batches).
    train_loss_no_image /= len(train_loader_no_image)

    # Validation phase
    model_no_image.eval()
    val_loss_no_image = 0
    val_preds_no_image, val_labels_no_image = [], []
    with torch.no_grad():
        for batch in val_loader_no_image:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model_no_image(batch)
            loss = criterion(out, batch.y)
            val_loss_no_image += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds_no_image.extend(preds)
            val_labels_no_image.extend(labels)
    val_loss_no_image /= len(val_loader_no_image)
    val_accuracy_no_image = accuracy_score(val_labels_no_image, val_preds_no_image)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss_no_image:.4f}, Validation Loss: {val_loss_no_image:.4f}, Validation Accuracy: {val_accuracy_no_image:.4f}")

    # Early stopping logic
    # A strict improvement resets the counter and overwrites the checkpoint;
    # otherwise the counter grows and training stops once it reaches `patience`.
    if val_loss_no_image < best_val_loss_no_image:
        best_val_loss_no_image = val_loss_no_image
        early_stop_counter_no_image = 0
        torch.save(model_no_image.state_dict(), "best_model_no_image.pt")
    else:
        early_stop_counter_no_image += 1
        if early_stop_counter_no_image >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Not reached on the epoch that triggers early stopping (the break above).
    scheduler.step()

print("Training without Image complete.")

# Test phase
print("Starting testing without Image...")
# Restore the checkpoint with the lowest validation loss. `weights_only=True`
# limits unpickling to tensors and plain containers, which is all a state_dict
# holds, and avoids the torch.load warning seen in this cell's output.
# `map_location` keeps the load working if the checkpoint was written on a
# different device.
model_no_image.load_state_dict(
    torch.load("best_model_no_image.pt", map_location=device, weights_only=True)
)
model_no_image.eval()
test_preds_no_image, test_labels_no_image = [], []
with torch.no_grad():
    for batch in test_loader_no_image:
        batch = batch.to(device)
        # Same 3-D -> 2-D squeeze that the model's forward performs.
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model_no_image(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds_no_image.extend(preds)
        test_labels_no_image.extend(labels)

test_accuracy_no_image = accuracy_score(test_labels_no_image, test_preds_no_image)
conf_matrix_no_image = confusion_matrix(test_labels_no_image, test_preds_no_image)
print(f"Test Accuracy without Image: {test_accuracy_no_image:.4f}")
print("Confusion Matrix without Image:")
print(conf_matrix_no_image)

Starting training without Image...
Epoch 1/50, Train Loss: 0.5301, Validation Loss: 0.4867, Validation Accuracy: 0.7684
Epoch 2/50, Train Loss: 0.4483, Validation Loss: 0.5085, Validation Accuracy: 0.7232
Epoch 3/50, Train Loss: 0.3979, Validation Loss: 0.5315, Validation Accuracy: 0.7740
Epoch 4/50, Train Loss: 0.3803, Validation Loss: 0.7081, Validation Accuracy: 0.7175
Epoch 5/50, Train Loss: 0.3467, Validation Loss: 0.5962, Validation Accuracy: 0.7401
Epoch 6/50, Train Loss: 0.3330, Validation Loss: 0.7158, Validation Accuracy: 0.7345
Early stopping at epoch 6
Training without Image complete.
Starting testing without Image...
Test Accuracy without Image: 0.8277
Confusion Matrix without Image:
[[194  36]
 [ 25  99]]


  model_no_image.load_state_dict(torch.load("best_model_no_image.pt"))


Without Meme TEXT

In [61]:
from torch_geometric.data import Data


def create_graphs_without_text(embeddings):
    """Build one graph per sample from image, caption and VQA embeddings.

    The meme-text embedding is deliberately left out of the node set.

    Args:
        embeddings: iterable of mappings, each indexed with the keys
            "image_embedding", "caption_embedding", "vqa_embeddings"
            (an iterable of tensors) and "label".

    Returns:
        A list with one ``Data`` object per input sample, in input order.
        Nodes are stacked along a new leading axis in the order image,
        caption, then each VQA embedding; edges connect every ordered pair
        of distinct nodes (no self-loops).
    """

    def _sample_to_graph(sample):
        # Node order: image, caption, then every VQA embedding.
        nodes = [sample["image_embedding"], sample["caption_embedding"]]
        nodes.extend(sample["vqa_embeddings"])
        x = torch.stack(nodes)

        # Directed edge for every ordered pair of distinct node indices.
        node_count = x.shape[0]
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        y = torch.tensor(sample["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [_sample_to_graph(sample) for sample in embeddings]


# Variant without meme text: build one graph list per split with the same builder.
train_graphs_no_text, val_graphs_no_text, test_graphs_no_text = (
    create_graphs_without_text(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
)

# Mini-batch loaders; only the training split is shuffled.
batch_size = 32
train_loader_no_text = DataLoader(train_graphs_no_text, shuffle=True, batch_size=batch_size)
val_loader_no_text = DataLoader(val_graphs_no_text, shuffle=False, batch_size=batch_size)
test_loader_no_text = DataLoader(test_graphs_no_text, shuffle=False, batch_size=batch_size)

# Input width is read from the last axis of the first training graph's features.
input_dim_no_text = train_graphs_no_text[0].x.shape[-1]
model_no_text = EnhancedGAT(input_dim_no_text, hidden_dim, output_dim).to(device)

# Fresh optimiser, loss and step-wise LR schedule for this run.
optimizer = Adam(model_no_text.parameters(), weight_decay=1e-4, lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)

# Early-stopping bookkeeping: best validation loss so far and epochs without improvement.
best_val_loss_no_text = float("inf")
early_stop_counter_no_text = 0

# Training and validation loop
# Runs at most `epochs` epochs (`epochs` and `patience` are defined outside this
# cell). Each epoch: one optimisation pass over the training loader, one
# evaluation pass over the validation loader, then checkpoint / early-stop
# bookkeeping based on the validation loss.
print("Starting training without Text...")
for epoch in range(epochs):
    # Training phase
    model_no_text.train()
    train_loss_no_text = 0
    for batch in train_loader_no_text:
        batch = batch.to(device)
        # Drop axis 1 when node features arrive as a 3-D tensor
        # (squeeze(1) only removes that axis if its size is 1).
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model_no_text(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss_no_text += loss.item()
    # Average of the per-batch loss values (one term per batch).
    train_loss_no_text /= len(train_loader_no_text)

    # Validation phase
    model_no_text.eval()
    val_loss_no_text = 0
    val_preds_no_text, val_labels_no_text = [], []
    with torch.no_grad():
        for batch in val_loader_no_text:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model_no_text(batch)
            loss = criterion(out, batch.y)
            val_loss_no_text += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds_no_text.extend(preds)
            val_labels_no_text.extend(labels)
    # Average of the per-batch validation losses.
    val_loss_no_text /= len(val_loader_no_text)
    val_accuracy_no_text = accuracy_score(val_labels_no_text, val_preds_no_text)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss_no_text:.4f}, Validation Loss: {val_loss_no_text:.4f}, Validation Accuracy: {val_accuracy_no_text:.4f}")

    # Early stopping logic
    # A strictly lower validation loss resets the counter and overwrites the
    # checkpoint; otherwise the counter grows and the loop exits once it
    # reaches `patience`.
    if val_loss_no_text < best_val_loss_no_text:
        best_val_loss_no_text = val_loss_no_text
        early_stop_counter_no_text = 0
        torch.save(model_no_text.state_dict(), "best_model_no_text.pt")
    else:
        early_stop_counter_no_text += 1
        if early_stop_counter_no_text >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Reached only when the epoch did not trigger early stopping.
    scheduler.step()

print("Training without Text complete.")

# Test phase: reload the checkpoint written by the training loop and score it
# on the test loader.
print("Starting testing without Text...")
# The file was produced with torch.save(model_no_text.state_dict(), ...), i.e.
# it holds only a state_dict, so the restricted loader (weights_only=True) is
# sufficient and avoids unpickling arbitrary objects. map_location places the
# tensors on the device the model currently lives on.
model_no_text.load_state_dict(
    torch.load("best_model_no_text.pt", map_location=device, weights_only=True)
)
model_no_text.eval()
test_preds_no_text, test_labels_no_text = [], []
with torch.no_grad():
    for batch in test_loader_no_text:
        batch = batch.to(device)
        # Drop axis 1 when node features arrive as a 3-D tensor
        # (squeeze(1) only removes that axis if its size is 1).
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model_no_text(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds_no_text.extend(preds)
        test_labels_no_text.extend(labels)

test_accuracy_no_text = accuracy_score(test_labels_no_text, test_preds_no_text)
conf_matrix_no_text = confusion_matrix(test_labels_no_text, test_preds_no_text)
print(f"Test Accuracy without Text: {test_accuracy_no_text:.4f}")
print("Confusion Matrix without Text:")
print(conf_matrix_no_text)

Starting training without Text...
Epoch 1/50, Train Loss: 0.5120, Validation Loss: 0.5899, Validation Accuracy: 0.7458
Epoch 2/50, Train Loss: 0.4507, Validation Loss: 0.4952, Validation Accuracy: 0.7684
Epoch 3/50, Train Loss: 0.4001, Validation Loss: 0.6357, Validation Accuracy: 0.7345
Epoch 4/50, Train Loss: 0.3720, Validation Loss: 0.5893, Validation Accuracy: 0.7627
Epoch 5/50, Train Loss: 0.3413, Validation Loss: 0.5574, Validation Accuracy: 0.7571
Epoch 6/50, Train Loss: 0.3247, Validation Loss: 0.6544, Validation Accuracy: 0.7458
Epoch 7/50, Train Loss: 0.2921, Validation Loss: 0.6084, Validation Accuracy: 0.7345
Early stopping at epoch 7
Training without Text complete.
Starting testing without Text...
Test Accuracy without Text: 0.8362
Confusion Matrix without Text:
[[194  36]
 [ 22 102]]


  model_no_text.load_state_dict(torch.load("best_model_no_text.pt"))


Without captioning and VQA (only meme text and image)

In [62]:
from torch_geometric.data import Data


def create_graphs_without_caption_vqa(embeddings):
    """Build one two-node graph per sample from image and meme-text embeddings.

    Caption and VQA embeddings are deliberately left out of the node set.

    Args:
        embeddings: iterable of mappings, each indexed with the keys
            "image_embedding", "meme_text_embedding" and "label".

    Returns:
        A list with one ``Data`` object per input sample, in input order.
        Nodes are stacked along a new leading axis in the order image,
        meme text; edges connect every ordered pair of distinct nodes
        (no self-loops).
    """

    def _sample_to_graph(sample):
        # Node order: image first, meme text second.
        x = torch.stack([sample["image_embedding"], sample["meme_text_embedding"]])

        # Directed edge for every ordered pair of distinct node indices.
        node_count = x.shape[0]
        pairs = []
        for src in range(node_count):
            for dst in range(node_count):
                if src != dst:
                    pairs.append([src, dst])
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()

        y = torch.tensor(sample["label"], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)

    return [_sample_to_graph(sample) for sample in embeddings]

# Variant without caption and VQA nodes: build one graph list per split
# (train, validation, test) with the same builder.
train_graphs_no_caption_vqa, val_graphs_no_caption_vqa, test_graphs_no_caption_vqa = (
    create_graphs_without_caption_vqa(split_embeddings)
    for split_embeddings in (train_embeddings, val_embeddings, test_embeddings)
)

# Mini-batch loaders; only the training split is shuffled.
batch_size = 32
train_loader_no_caption_vqa = DataLoader(train_graphs_no_caption_vqa, shuffle=True, batch_size=batch_size)
val_loader_no_caption_vqa = DataLoader(val_graphs_no_caption_vqa, shuffle=False, batch_size=batch_size)
test_loader_no_caption_vqa = DataLoader(test_graphs_no_caption_vqa, shuffle=False, batch_size=batch_size)

# Input width is read from the last axis of the first training graph's features.
input_dim_no_caption_vqa = train_graphs_no_caption_vqa[0].x.shape[-1]
model_no_caption_vqa = EnhancedGAT(input_dim_no_caption_vqa, hidden_dim, output_dim).to(device)

# Fresh optimiser, loss and step-wise LR schedule for this run.
optimizer = Adam(model_no_caption_vqa.parameters(), weight_decay=1e-4, lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)

# Early-stopping bookkeeping: best validation loss so far and epochs without improvement.
best_val_loss_no_caption_vqa = float("inf")
early_stop_counter_no_caption_vqa = 0

# Training and validation loop
# Runs at most `epochs` epochs (`epochs` and `patience` are defined outside this
# cell). Each epoch: one optimisation pass over the training loader, one
# evaluation pass over the validation loader, then checkpoint / early-stop
# bookkeeping based on the validation loss.
print("Starting training without Caption & VQA...")
for epoch in range(epochs):
    # Training phase
    model_no_caption_vqa.train()
    train_loss_no_caption_vqa = 0
    for batch in train_loader_no_caption_vqa:
        batch = batch.to(device)
        # Drop axis 1 when node features arrive as a 3-D tensor
        # (squeeze(1) only removes that axis if its size is 1).
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        optimizer.zero_grad()
        out = model_no_caption_vqa(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        train_loss_no_caption_vqa += loss.item()
    # Average of the per-batch loss values (one term per batch).
    train_loss_no_caption_vqa /= len(train_loader_no_caption_vqa)

    # Validation phase
    model_no_caption_vqa.eval()
    val_loss_no_caption_vqa = 0
    val_preds_no_caption_vqa, val_labels_no_caption_vqa = [], []
    with torch.no_grad():
        for batch in val_loader_no_caption_vqa:
            batch = batch.to(device)
            if batch.x.dim() == 3:
                batch.x = batch.x.squeeze(1)
            out = model_no_caption_vqa(batch)
            loss = criterion(out, batch.y)
            val_loss_no_caption_vqa += loss.item()
            # Predicted class = index of the largest output along dim 1.
            preds = out.argmax(dim=1).cpu().numpy()
            labels = batch.y.cpu().numpy()
            val_preds_no_caption_vqa.extend(preds)
            val_labels_no_caption_vqa.extend(labels)
    # Average of the per-batch validation losses.
    val_loss_no_caption_vqa /= len(val_loader_no_caption_vqa)
    val_accuracy_no_caption_vqa = accuracy_score(val_labels_no_caption_vqa, val_preds_no_caption_vqa)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss_no_caption_vqa:.4f}, Validation Loss: {val_loss_no_caption_vqa:.4f}, Validation Accuracy: {val_accuracy_no_caption_vqa:.4f}")

    # Early stopping logic
    # A strictly lower validation loss resets the counter and overwrites the
    # checkpoint; otherwise the counter grows and the loop exits once it
    # reaches `patience`.
    if val_loss_no_caption_vqa < best_val_loss_no_caption_vqa:
        best_val_loss_no_caption_vqa = val_loss_no_caption_vqa
        early_stop_counter_no_caption_vqa = 0
        torch.save(model_no_caption_vqa.state_dict(), "best_model_no_caption_vqa.pt")
    else:
        early_stop_counter_no_caption_vqa += 1
        if early_stop_counter_no_caption_vqa >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Reached only when the epoch did not trigger early stopping.
    scheduler.step()

print("Training without Caption & VQA complete.")

# Test phase: reload the checkpoint written by the training loop and score it
# on the test loader.
print("Starting testing without Caption & VQA...")
# The file was produced with torch.save(model_no_caption_vqa.state_dict(), ...),
# i.e. it holds only a state_dict, so the restricted loader (weights_only=True)
# is sufficient and avoids unpickling arbitrary objects. map_location places
# the tensors on the device the model currently lives on.
model_no_caption_vqa.load_state_dict(
    torch.load("best_model_no_caption_vqa.pt", map_location=device, weights_only=True)
)
model_no_caption_vqa.eval()
test_preds_no_caption_vqa, test_labels_no_caption_vqa = [], []
with torch.no_grad():
    for batch in test_loader_no_caption_vqa:
        batch = batch.to(device)
        # Drop axis 1 when node features arrive as a 3-D tensor
        # (squeeze(1) only removes that axis if its size is 1).
        if batch.x.dim() == 3:
            batch.x = batch.x.squeeze(1)
        out = model_no_caption_vqa(batch)
        # Predicted class = index of the largest output along dim 1.
        preds = out.argmax(dim=1).cpu().numpy()
        labels = batch.y.cpu().numpy()
        test_preds_no_caption_vqa.extend(preds)
        test_labels_no_caption_vqa.extend(labels)

test_accuracy_no_caption_vqa = accuracy_score(test_labels_no_caption_vqa, test_preds_no_caption_vqa)
conf_matrix_no_caption_vqa = confusion_matrix(test_labels_no_caption_vqa, test_preds_no_caption_vqa)
print(f"Test Accuracy without Caption & VQA: {test_accuracy_no_caption_vqa:.4f}")
print("Confusion Matrix without Caption & VQA:")
print(conf_matrix_no_caption_vqa)

Starting training without Caption & VQA...
Epoch 1/50, Train Loss: 0.4715, Validation Loss: 0.5536, Validation Accuracy: 0.7627
Epoch 2/50, Train Loss: 0.4136, Validation Loss: 0.5747, Validation Accuracy: 0.7175
Epoch 3/50, Train Loss: 0.3911, Validation Loss: 0.6733, Validation Accuracy: 0.7062
Epoch 4/50, Train Loss: 0.3521, Validation Loss: 0.5845, Validation Accuracy: 0.7401
Epoch 5/50, Train Loss: 0.3266, Validation Loss: 0.6044, Validation Accuracy: 0.7458
Epoch 6/50, Train Loss: 0.2965, Validation Loss: 0.6521, Validation Accuracy: 0.6949
Early stopping at epoch 6
Training without Caption & VQA complete.
Starting testing without Caption & VQA...
Test Accuracy without Caption & VQA: 0.8531
Confusion Matrix without Caption & VQA:
[[193  37]
 [ 15 109]]


  model_no_caption_vqa.load_state_dict(torch.load("best_model_no_caption_vqa.pt"))
