In [2]:
!pip install wget tqdm

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=79fbbdf24a96279d288e75c11b718549c3dc5b385e701ff069a4102ad46051f0
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [49]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, ViTFeatureExtractor, ViTModel
from PIL import Image
import pandas as pd
import numpy as np
from torchvision import transforms
import json
import os
import wget
import zipfile
from tqdm import tqdm
import os
import json
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

# Constants
MAX_LENGTH = 128
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
NUM_EPOCHS = 50
IMAGE_SIZE = 224
HIDDEN_SIZE = 768



In [40]:
def prepare_kaggle_daquar(input_dir, output_dir):
    """Prepare DAQUAR dataset from Kaggle format with compound answer handling"""
    print("Processing Kaggle DAQUAR dataset...")
    
    # Read CSV files
    train_data = pd.read_csv(os.path.join(input_dir, 'data_train.csv'))
    eval_data = pd.read_csv(os.path.join(input_dir, 'data_eval.csv'))
    
    # Read image lists
    with open(os.path.join(input_dir, 'train_images_list.txt'), 'r') as f:
        train_images = [line.strip() for line in f.readlines()]
    with open(os.path.join(input_dir, 'test_images_list.txt'), 'r') as f:
        test_images = [line.strip() for line in f.readlines()]
    
    # Process answers to handle compound answers
    def get_all_answers(data):
        answer_set = set()
        for answer in data['answer']:
            # Split compound answers and strip whitespace
            parts = [part.strip() for part in str(answer).split(',')]
            answer_set.update(parts)
        return sorted(list(answer_set))
    
    # Create answer vocabulary from both train and eval sets
    answer_vocab = get_all_answers(pd.concat([train_data, eval_data]))
    answer_to_idx = {ans: idx for idx, ans in enumerate(answer_vocab)}
    
    def create_annotations(data):
        annotations = []
        for _, row in data.iterrows():
            # Split compound answers into individual answers
            answers = [ans.strip() for ans in str(row['answer']).split(',')]
            # Use the first answer as the primary answer
            primary_answer = answers[0]
            
            ann = {
                'image': f"{row['image_id']}.png",
                'question': row['question'],
                'answer': primary_answer,  # Use only the primary answer
                'all_answers': answers  # Keep all answers for potential future use
            }
            annotations.append(ann)
        return annotations
    
    # Create annotation files
    train_annotations = create_annotations(train_data)
    test_annotations = create_annotations(eval_data)
    
    # Create processed directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Save processed annotations
    with open(os.path.join(output_dir, 'train_annotations.json'), 'w') as f:
        json.dump(train_annotations, f)
    with open(os.path.join(output_dir, 'test_annotations.json'), 'w') as f:
        json.dump(test_annotations, f)
    
    # Save vocabulary
    with open(os.path.join(output_dir, 'answer_vocab.json'), 'w') as f:
        json.dump({
            'answer_vocab': answer_vocab,
            'answer_to_idx': answer_to_idx
        }, f)
    
    print(f"Dataset prepared successfully!")
    print(f"Total training examples: {len(train_annotations)}")
    print(f"Total test examples: {len(test_annotations)}")
    print(f"Total unique answers: {len(answer_vocab)}")
    
    # Print first few examples
    print("\nFirst few training examples:")
    for i in range(3):
        print(f"Example {i+1}:")
        print(f"Image: {train_annotations[i]['image']}")
        print(f"Question: {train_annotations[i]['question']}")
        print(f"Primary Answer: {train_annotations[i]['answer']}")
        print(f"All Answers: {train_annotations[i]['all_answers']}\n")
    
    return len(answer_vocab)

In [41]:
class KaggleDAQUARDataset(Dataset):
    def __init__(self, input_dir, processed_dir, split='train'):
        self.input_dir = input_dir
        self.processed_dir = processed_dir
        self.split = split
        
        # Load annotations
        with open(os.path.join(processed_dir, f'{split}_annotations.json'), 'r') as f:
            self.annotations = json.load(f)
            
        # Load answer vocabulary
        with open(os.path.join(processed_dir, 'answer_vocab.json'), 'r') as f:
            vocab_data = json.load(f)
            self.answer_vocab = vocab_data['answer_vocab']
            self.answer_to_idx = vocab_data['answer_to_idx']
            
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
        
    def __len__(self):
        return len(self.annotations)
        
    def __getitem__(self, idx):
        ann = self.annotations[idx]
        
        # Load and preprocess image
        img_path = os.path.join(self.input_dir, 'images', ann['image'])
        image = Image.open(img_path).convert('RGB')
        
        # Process image with ViT feature extractor
        image_features = self.feature_extractor(images=image, return_tensors="pt")
        
        # Tokenize question
        question_encoding = self.tokenizer(
            ann['question'],
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation=True,
            return_tensors='pt'
        )
        
        # Get answer index (using primary answer)
        answer_idx = self.answer_to_idx[ann['answer']]
        
        return {
            'image': image_features.pixel_values[0],
            'input_ids': question_encoding['input_ids'][0],
            'attention_mask': question_encoding['attention_mask'][0],
            'answer': torch.tensor(answer_idx)
        }

In [42]:
    input_dir = '/kaggle/input/processed-daquar-dataset'
    output_dir = '/kaggle/working/processed'
    
    # Prepare dataset
    print("Preparing dataset...")
    num_classes = prepare_kaggle_daquar(input_dir, output_dir)
    
    print(f"\nNumber of answer classes: {num_classes}")
    
    # Test dataset loading
    print("\nTesting dataset loading...")
    try:
        dataset = KaggleDAQUARDataset(
            input_dir=input_dir,
            processed_dir=output_dir,
            split='train'
        )
        print(f"Successfully loaded dataset with {len(dataset)} examples")
        
        # Test loading first item
        first_item = dataset[0]
        print("\nFirst item shapes:")
        print(f"Image: {first_item['image'].shape}")
        print(f"Input IDs: {first_item['input_ids'].shape}")
        print(f"Attention Mask: {first_item['attention_mask'].shape}")
        print(f"Answer: {first_item['answer']}")
        
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")

Preparing dataset...
Processing Kaggle DAQUAR dataset...
Dataset prepared successfully!
Total training examples: 6795
Total test examples: 5673
Total unique answers: 582

First few training examples:
Example 1:
Image: image3.png
Question: what is on the right side of the black telephone and on the left side of the red chair
Primary Answer: desk
All Answers: ['desk']

Example 2:
Image: image3.png
Question: what is in front of the white door on the left side of the desk
Primary Answer: telephone
All Answers: ['telephone']

Example 3:
Image: image3.png
Question: what is on the desk
Primary Answer: book
All Answers: ['book', 'scissor', 'papers', 'tape_dispenser']


Number of answer classes: 582

Testing dataset loading...
Successfully loaded dataset with 6795 examples

First item shapes:
Image: torch.Size([3, 224, 224])
Input IDs: torch.Size([128])
Attention Mask: torch.Size([128])
Answer: 160




In [43]:
class CrossModalAttention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads=8, batch_first=True)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ReLU(),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        
    def forward(self, x, y):
        # Cross attention
        attended_x, _ = self.attention(x, y, y)
        x = self.norm1(x + attended_x)
        
        # Feed forward
        ff_output = self.feed_forward(x)
        x = self.norm2(x + ff_output)
        
        return x

In [44]:
class KaggleDAQUARDataset(Dataset):
    def __init__(self, input_dir, processed_dir, split='train', transform=None):
        self.input_dir = input_dir
        self.processed_dir = processed_dir
        self.split = split
        
        # Load annotations
        with open(os.path.join(processed_dir, f'{split}_annotations.json'), 'r') as f:
            self.annotations = json.load(f)
            
        # Load answer vocabulary
        with open(os.path.join(processed_dir, 'answer_vocab.json'), 'r') as f:
            vocab_data = json.load(f)
            self.answer_vocab = vocab_data['answer_vocab']
            self.answer_to_idx = vocab_data['answer_to_idx']
            
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
        
    def __len__(self):
        return len(self.annotations)
        
    def __getitem__(self, idx):
        ann = self.annotations[idx]
        
        # Load and preprocess image
        img_path = os.path.join(self.input_dir, 'images', ann['image'])
        image = Image.open(img_path).convert('RGB')
        
        # Process image with ViT feature extractor (handles resizing and normalization)
        image_features = self.feature_extractor(images=image, return_tensors="pt")
        
        # Tokenize question
        question_encoding = self.tokenizer(
            ann['question'],
            padding='max_length',
            max_length=MAX_LENGTH,
            truncation=True,
            return_tensors='pt'
        )
        
        # Convert answer to index
        answer_idx = self.answer_to_idx[ann['answer']]
        
        return {
            'image': image_features.pixel_values[0],  # Remove the batch dimension
            'input_ids': question_encoding['input_ids'][0],
            'attention_mask': question_encoding['attention_mask'][0],
            'answer': torch.tensor(answer_idx)
        }

In [45]:
class MultimodalVQAModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        # Load pre-trained models
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224')
        
        # Cross-modal attention modules
        self.image_to_text_attention = CrossModalAttention(HIDDEN_SIZE)
        self.text_to_image_attention = CrossModalAttention(HIDDEN_SIZE)
        
        # Freeze some layers for transfer learning
        for param in self.bert.parameters():
            param.requires_grad = False
        for param in self.vit.parameters():
            param.requires_grad = False
            
        # Unfreeze the last few layers
        for param in self.bert.encoder.layer[-2:].parameters():
            param.requires_grad = True
        for param in self.vit.encoder.layer[-2:].parameters():
            param.requires_grad = True
        
        # Final fusion and classification layers
        self.fusion = nn.Sequential(
            nn.Linear(HIDDEN_SIZE * 2, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        
        self.classifier = nn.Linear(HIDDEN_SIZE, num_classes)
        
    def forward(self, image, input_ids, attention_mask):
        # Process image with ViT
        image_features = self.vit(image).last_hidden_state  # [batch_size, num_patches, hidden_size]
        
        # Process text with BERT
        text_features = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        # Cross-modal attention
        attended_image = self.image_to_text_attention(image_features, text_features)
        attended_text = self.text_to_image_attention(text_features, image_features)
        
        # Pool attended features
        image_pooled = attended_image.mean(dim=1)  # [batch_size, hidden_size]
        text_pooled = attended_text.mean(dim=1)    # [batch_size, hidden_size]
        
        # Concatenate pooled features
        combined_features = torch.cat((image_pooled, text_pooled), dim=1)
        
        # Final fusion and classification
        fused_features = self.fusion(combined_features)
        logits = self.classifier(fused_features)
        
        return logits

In [46]:
def create_data_loaders(input_dir, processed_dir, batch_size):
    # Create datasets without additional transforms since ViT feature extractor handles preprocessing
    train_dataset = KaggleDAQUARDataset(
        input_dir=input_dir,
        processed_dir=processed_dir,
        split='train'
    )
    
    val_dataset = KaggleDAQUARDataset(
        input_dir=input_dir,
        processed_dir=processed_dir,
        split='test'  # Using test split for validation
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )
    
    return train_loader, val_loader


In [47]:
def train_model(input_dir, processed_dir, num_classes):
    # Create data loaders
    train_loader, val_loader = create_data_loaders(input_dir, processed_dir, BATCH_SIZE)
    
    # Initialize model
    print("Initializing model...")
    model = MultimodalVQAModel(num_classes=num_classes)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    # Train the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on {device}")
    
    model = model.to(device)
    
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}')
        for batch in progress_bar:
            image = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['answer'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(image, input_ids, attention_mask)
            loss = criterion(outputs, answers)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += answers.size(0)
            correct += predicted.eq(answers).sum().item()
            
            progress_bar.set_postfix({
                'loss': f'{train_loss/total:.4f}',
                'acc': f'{100.*correct/total:.2f}%'
            })
        
        # Validation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                image = batch['image'].to(device)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                answers = batch['answer'].to(device)
                
                outputs = model(image, input_ids, attention_mask)
                loss = criterion(outputs, answers)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += answers.size(0)
                val_correct += predicted.eq(answers).sum().item()
        
        print(f'\nEpoch {epoch+1}/{NUM_EPOCHS}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {100.*correct/total:.2f}%')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {100.*val_correct/val_total:.2f}%')
    
    return model

In [48]:
    input_dir = '/kaggle/input/processed-daquar-dataset'
    processed_dir = '/kaggle/working/processed'
    
    # Prepare dataset
    print("Preparing dataset...")
    num_classes = prepare_kaggle_daquar(input_dir, processed_dir)
    
    print(f"\nNumber of answer classes: {num_classes}")
    
    # Train the model
    model = train_model(input_dir, processed_dir, num_classes)

Preparing dataset...
Processing Kaggle DAQUAR dataset...
Dataset prepared successfully!
Total training examples: 6795
Total test examples: 5673
Total unique answers: 582

First few training examples:
Example 1:
Image: image3.png
Question: what is on the right side of the black telephone and on the left side of the red chair
Primary Answer: desk
All Answers: ['desk']

Example 2:
Image: image3.png
Question: what is in front of the white door on the left side of the desk
Primary Answer: telephone
All Answers: ['telephone']

Example 3:
Image: image3.png
Question: what is on the desk
Primary Answer: book
All Answers: ['book', 'scissor', 'papers', 'tape_dispenser']


Number of answer classes: 582




Initializing model...


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on cuda


Epoch 1/10: 100%|██████████| 213/213 [03:28<00:00,  1.02it/s, loss=0.1685, acc=6.56%]
Validation: 100%|██████████| 178/178 [01:59<00:00,  1.50it/s]



Epoch 1/10:
Train Loss: 5.3746, Accuracy: 6.56%
Val Loss: 4.4467, Accuracy: 14.26%


Epoch 2/10: 100%|██████████| 213/213 [03:27<00:00,  1.03it/s, loss=0.1345, acc=14.36%]
Validation: 100%|██████████| 178/178 [01:59<00:00,  1.49it/s]



Epoch 2/10:
Train Loss: 4.2903, Accuracy: 14.36%
Val Loss: 3.9223, Accuracy: 21.82%


Epoch 3/10: 100%|██████████| 213/213 [03:27<00:00,  1.03it/s, loss=0.1211, acc=19.25%]
Validation: 100%|██████████| 178/178 [01:59<00:00,  1.50it/s]



Epoch 3/10:
Train Loss: 3.8631, Accuracy: 19.25%
Val Loss: 3.7164, Accuracy: 23.30%


Epoch 4/10: 100%|██████████| 213/213 [03:27<00:00,  1.03it/s, loss=0.1127, acc=22.68%]
Validation: 100%|██████████| 178/178 [01:58<00:00,  1.50it/s]



Epoch 4/10:
Train Loss: 3.5937, Accuracy: 22.68%
Val Loss: 3.5973, Accuracy: 24.29%


Epoch 5/10: 100%|██████████| 213/213 [03:28<00:00,  1.02it/s, loss=0.1061, acc=25.61%]
Validation: 100%|██████████| 178/178 [01:58<00:00,  1.50it/s]



Epoch 5/10:
Train Loss: 3.3854, Accuracy: 25.61%
Val Loss: 3.5263, Accuracy: 25.47%


Epoch 6/10: 100%|██████████| 213/213 [03:27<00:00,  1.03it/s, loss=0.1005, acc=27.79%]
Validation: 100%|██████████| 178/178 [01:58<00:00,  1.50it/s]



Epoch 6/10:
Train Loss: 3.2071, Accuracy: 27.79%
Val Loss: 3.4622, Accuracy: 25.89%


Epoch 7/10: 100%|██████████| 213/213 [03:27<00:00,  1.02it/s, loss=0.0950, acc=30.95%]
Validation: 100%|██████████| 178/178 [01:58<00:00,  1.50it/s]



Epoch 7/10:
Train Loss: 3.0298, Accuracy: 30.95%
Val Loss: 3.4073, Accuracy: 26.79%


Epoch 8/10: 100%|██████████| 213/213 [03:27<00:00,  1.02it/s, loss=0.0907, acc=33.51%]
Validation: 100%|██████████| 178/178 [01:59<00:00,  1.50it/s]



Epoch 8/10:
Train Loss: 2.8936, Accuracy: 33.51%
Val Loss: 3.3818, Accuracy: 27.32%


Epoch 9/10: 100%|██████████| 213/213 [03:28<00:00,  1.02it/s, loss=0.0865, acc=35.58%]
Validation: 100%|██████████| 178/178 [01:58<00:00,  1.50it/s]



Epoch 9/10:
Train Loss: 2.7584, Accuracy: 35.58%
Val Loss: 3.3322, Accuracy: 28.03%


Epoch 10/10: 100%|██████████| 213/213 [03:28<00:00,  1.02it/s, loss=0.0825, acc=37.26%]
Validation: 100%|██████████| 178/178 [01:59<00:00,  1.50it/s]


Epoch 10/10:
Train Loss: 2.6333, Accuracy: 37.26%
Val Loss: 3.3250, Accuracy: 27.99%



