In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
!pip install torchtext sentence-transformers transformers



In [None]:
!pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow)
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras-3.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, keras, tensorflow
  At

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import ViTFeatureExtractor, ViTModel
from sentence_transformers import SentenceTransformer
from PIL import Image
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import nltk
from nltk.translate.bleu_score import sentence_bleu
import os
import nltk
from nltk.translate.bleu_score import sentence_bleu

import pickle
from tqdm import tqdm
import h5py

In [None]:
EMBED_SIZE = 768  # ViT has 12 layers
HIDDEN_SIZE = 512
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001

    # Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


In [None]:
def extract_vit_features(image, feature_extractor, vit_model):
    """Extract features from ViT"""
    with torch.no_grad():
        inputs = feature_extractor(images=image, return_tensors="pt")
        outputs = vit_model(**inputs)
        features = outputs.last_hidden_state[:, 0, :]
        print(f"ViT feature shape: {features.shape}")
        return features

In [21]:
class DataPreprocessor:
    def __init__(self, image_dir, captions_file, feature_extractor, max_len=50, cache_dir='/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/cached_data'):
        self.image_dir = image_dir
        self.captions_file = captions_file  # Added this line
        self.max_len = max_len
        self.cache_dir = cache_dir
        self.feature_extractor = feature_extractor
        self.vit_model = None

        # Create cache directory
        os.makedirs(cache_dir, exist_ok=True)

        # Cache file paths
        self.vocab_cache = os.path.join(cache_dir, 'vocabulary.pkl')
        self.train_cache = os.path.join(cache_dir, 'train_data.pkl')
        self.test_cache = os.path.join(cache_dir, 'test_data.pkl')

        # Initialize ViT model (do it once)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        print("Loading ViT model...")
        self.vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224').to(self.device)
        self.vit_model.eval()

    def extract_features(self, image_path):
        try:
            image = Image.open(image_path).convert('RGB')

            with torch.no_grad():
                inputs = self.feature_extractor(images=image, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = self.vit_model(**inputs)
                features = outputs.last_hidden_state[:, 0, :].cpu()
                return features.squeeze(0)
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            return torch.zeros(768)

    def process_data(self):
        if (os.path.exists(self.train_cache) and
            os.path.exists(self.test_cache) and
            os.path.exists(self.vocab_cache)):
            print("Loading cached data...")
            return self.load_cached_data()

        print("Processing data from scratch...")
        return self.create_and_cache_data()

    def create_and_cache_data(self):
        print("Reading captions file...")
        img_captions = {}
        all_captions = []

        # Read and process captions
        with open(self.captions_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()[1:]  # Skip header
            for line in tqdm(lines, desc="Reading captions"):
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    img_name = parts[0].strip()
                    caption = parts[1].strip().strip('"\'')

                    if caption:
                        if img_name not in img_captions:
                            img_captions[img_name] = []
                        img_captions[img_name].append(caption)
                        all_captions.append(caption)

        # Build vocabulary
        word2idx, idx2word = self.build_vocabulary(all_captions)

        # Process all images and captions
        print("\nProcessing images and creating batches...")
        features_list = []
        captions_list = []

        for img_name in tqdm(img_captions.keys(), desc="Processing images"):
            image_path = os.path.join(self.image_dir, img_name)
            if os.path.exists(image_path):
                features = self.extract_features(image_path)

                for caption in img_captions[img_name]:
                    features_list.append(features)

                    # Process caption
                    words = caption.lower().split()
                    caption_indices = [word2idx.get(word, word2idx['<UNK>']) for word in words]
                    caption_indices = [word2idx['<START>']] + caption_indices + [word2idx['<END>']]

                    # Pad sequence
                    if len(caption_indices) < self.max_len:
                        caption_indices += [word2idx['<PAD>']] * (self.max_len - len(caption_indices))
                    else:
                        caption_indices = caption_indices[:self.max_len]

                    captions_list.append(caption_indices)
            else:
                print(f"Warning: Image not found: {image_path}")

        # Convert to tensors
        print("\nConverting to tensors...")
        features_tensor = torch.stack(features_list)
        captions_tensor = torch.tensor(captions_list)

        # Split into train and test
        print("Splitting into train and test sets...")
        indices = torch.randperm(len(features_tensor))
        train_size = int(0.8 * len(indices))

        train_indices = indices[:train_size]
        test_indices = indices[train_size:]

        train_data = (features_tensor[train_indices], captions_tensor[train_indices])
        test_data = (features_tensor[test_indices], captions_tensor[test_indices])

        # Cache the processed data
        print("Caching processed data...")
        with open(self.train_cache, 'wb') as f:
            pickle.dump(train_data, f)
        with open(self.test_cache, 'wb') as f:
            pickle.dump(test_data, f)

        print(f"\nProcessing completed!")
        print(f"Train set size: {len(train_indices)}")
        print(f"Test set size: {len(test_indices)}")

        return (word2idx, idx2word), train_data, test_data
    def build_vocabulary(self, captions):
        print("Building vocabulary...")
        word_freq = {}

        for caption in captions:
            words = caption.lower().split()
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

        word2idx = {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}
        for word, freq in sorted(word_freq.items(), key=lambda x: x[1], reverse=True):
            if len(word2idx) < 10000:
                word2idx[word] = len(word2idx)

        idx2word = {v: k for k, v in word2idx.items()}

        # Cache vocabulary
        with open(self.vocab_cache, 'wb') as f:
            pickle.dump((word2idx, idx2word), f)

        print(f"Vocabulary size: {len(word2idx)}")
        return word2idx, idx2word
    def load_cached_data(self):
        print("Loading vocabulary...")
        with open(self.vocab_cache, 'rb') as f:
            vocab = pickle.load(f)

        print("Loading train data...")
        with open(self.train_cache, 'rb') as f:
            train_data = pickle.load(f)

        print("Loading test data...")
        with open(self.test_cache, 'rb') as f:
            test_data = pickle.load(f)

        print(f"Train set size: {len(train_data[0])}")
        print(f"Test set size: {len(test_data[0])}")

        return vocab, train_data, test_data

In [22]:


    # Initialize ViT feature extractor
print("Initializing ViT feature extractor...")
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

    # Create dataset
preprocessor = DataPreprocessor(
        image_dir="/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/Images",
        captions_file="/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/captions.txt",
        feature_extractor=feature_extractor
    )
(word2idx, idx2word), (train_features, train_captions), (test_features, test_captions) = preprocessor.process_data()
# print(f"Dataset size: {len(dataset)}")
    # Split dataset
train_dataset = TensorDataset(train_features, train_captions)
test_dataset = TensorDataset(test_features, test_captions)
    # Create data loaders
train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=16,
        pin_memory=True
    )

test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=16,
        pin_memory=True
    )

Initializing ViT feature extractor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Using device: cuda
Loading ViT model...


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading cached data...
Loading vocabulary...
Loading train data...
Loading test data...
Train set size: 127131
Test set size: 31783


In [None]:
(word2idx, idx2word), (train_features, train_captions), (test_features, test_captions) = preprocessor.process_data()
# print(f"Dataset size: {len(dataset)}")


Processing data from scratch...
Reading captions file...


Reading captions: 100%|██████████| 158915/158915 [00:00<00:00, 844690.69it/s]


Building vocabulary...
Vocabulary size: 10000

Processing images and creating batches...


Processing images:   4%|▍         | 1339/31783 [04:29<1:42:10,  4.97it/s]


KeyboardInterrupt: 

In [15]:
import pickle

from torch.utils.data import Dataset, DataLoader, TensorDataset
def load_dataset(cache_dir='cached_data', batch_size=32):
    """
    Load preprocessed data from pickle files and create DataLoaders
    Returns vocabulary and data loaders for train and test sets
    """
    vocab_path = os.path.join(cache_dir, 'vocabulary.pkl')
    train_path = os.path.join(cache_dir, 'train_data.pkl')
    test_path = os.path.join(cache_dir, 'test_data.pkl')

    # Check if pickle files exist
    if not all(os.path.exists(p) for p in [vocab_path, train_path, test_path]):
        raise FileNotFoundError("Required pickle files not found. Run preprocessing first.")

    # Load vocabulary
    print("Loading vocabulary...")
    with open(vocab_path, 'rb') as f:
        word2idx, idx2word = pickle.load(f)

    # Load train features and captions
    print("Loading train data...")
    with open(train_path, 'rb') as f:
        train_features, train_captions = pickle.load(f)

    # Load test features and captions
    print("Loading test data...")
    with open(test_path, 'rb') as f:
        test_features, test_captions = pickle.load(f)

    # Create datasets
    train_dataset = TensorDataset(train_features, train_captions)
    test_dataset = TensorDataset(test_features, test_captions)

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=16,
        pin_memory=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        num_workers=16,
        pin_memory=True
    )

    print(f"Vocabulary size: {len(word2idx)}")
    print(f"Train set size: {len(train_features)}")
    print(f"Test set size: {len(test_features)}")

    # Return both raw data and loaders
    raw_data = {
        'vocab': (word2idx, idx2word),
        'train_data': (train_features, train_captions),
        'test_data': (test_features, test_captions)
    }

    loaders = {
        'train': train_loader,
        'test': test_loader
    }

    return raw_data, loaders
# Load both raw data and DataLoaders
raw_data, loaders = load_dataset(cache_dir='/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/cached_data', batch_size=32)

# Access vocabulary and raw data if needed
word2idx, idx2word = raw_data['vocab']
train_features, train_captions = raw_data['train_data']
test_features, test_captions = raw_data['test_data']

# Access DataLoaders
train_loader = loaders['train']
test_loader = loaders['test']

Loading vocabulary...
Loading train data...
Loading test data...
Vocabulary size: 10000
Train set size: 127131
Test set size: 31783




In [8]:
import torch.nn.functional as F
class AttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.attention = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.v.data.normal_(mean=0, std=0.1)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: (batch_size, 1, hidden_size)
        encoder_outputs: (batch_size, seq_len, hidden_size)
        """
        batch_size, seq_len, hidden_size = encoder_outputs.size()

        # Ensure hidden has correct shape
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(1)

        # Repeat hidden state for each encoder output
        hidden = hidden.repeat(1, seq_len, 1)

        # Calculate attention scores
        energy = torch.tanh(self.attention(torch.cat((hidden, encoder_outputs), dim=2)))

        # Reshape v for batch processing
        v = self.v.repeat(batch_size, 1).unsqueeze(1)

        # Calculate attention weights
        attention_weights = torch.bmm(v, energy.transpose(1, 2)).squeeze(1)
        attention_weights = F.softmax(attention_weights, dim=1)

        # Apply attention to encoder outputs
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)

        return context, attention_weights


#MODEL

In [9]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, input_size, hidden_size, vocab_size, embed_size=256, num_layers=2, dropout_p=0.3):
        super(ImageCaptioningModel, self).__init__()

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.embed_size = embed_size

        # Image feature processing
        self.feature_encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        )

        # Word embeddings
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_dropout = nn.Dropout(dropout_p)

        # Additional embedding processing
        self.embed_process = nn.Linear(embed_size, hidden_size)

        # Attention
        self.attention = AttentionLayer(hidden_size)

        # Decoder GRU
        self.decoder_rnn = nn.GRU(
            input_size=hidden_size * 2,  # Concatenated context and processed embedding
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_p if num_layers > 1 else 0
        )

        # Output projection
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size, vocab_size)
        )

        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, images, captions, teacher_forcing_ratio=0.5):
        batch_size = images.size(0)
        max_length = captions.size(1) - 1  # -1 because we don't predict for last token
        device = images.device

        # Encode images
        image_features = self.feature_encoder(images)
        image_features = image_features.unsqueeze(1)  # (batch_size, 1, hidden_size)

        # Initialize outputs tensor
        outputs = torch.zeros(batch_size, max_length, self.vocab_size).to(device)

        # Initialize decoder input
        decoder_input = captions[:, 0]  # Start tokens

        # Initialize hidden state (GRU only needs one hidden state, not cell state)
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

        for t in range(max_length):
            # Embed input tokens
            embedded = self.embedding(decoder_input)  # (batch_size, embed_size)
            embedded = self.embed_dropout(embedded)
            embedded = self.embed_process(embedded)  # (batch_size, hidden_size)

            # Add sequence dimension
            embedded = embedded.unsqueeze(1)  # (batch_size, 1, hidden_size)

            # Calculate attention
            context, _ = self.attention(embedded, image_features)

            # Combine embedding and context
            decoder_input_combined = torch.cat((embedded, context), dim=2)

            # GRU forward pass
            output, hidden = self.decoder_rnn(decoder_input_combined, hidden)

            # Process output
            output = self.layer_norm(output.squeeze(1))
            output = self.output_layer(output)

            # Store output
            outputs[:, t] = output

            # Teacher forcing
            if random.random() < teacher_forcing_ratio and t < max_length - 1:
                decoder_input = captions[:, t + 1]
            else:
                decoder_input = output.argmax(dim=1)

        return outputs

In [10]:
def evaluate_model(model, test_loader, criterion, device, pad_idx):
    model.eval()
    total_loss = 0
    total_word_accuracy = 0
    total_sentence_accuracy = 0
    num_batches = 0

    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Evaluating")

        for images, captions in progress_bar:
            # Move to device
            images = images.to(device)
            captions = captions.to(device)

            # Get input and target sequences
            input_captions = captions[:, :-1]
            target_captions = captions[:, 1:]

            # Forward pass
            outputs = model(images, input_captions)
            outputs = outputs[:, :-1, :]

            # Reshape for loss calculation
            outputs_flat = outputs.reshape(-1, outputs.size(-1))
            targets_flat = target_captions.reshape(-1)

            # Calculate metrics
            loss = criterion(outputs_flat, targets_flat)
            word_acc, sent_acc = calculate_accuracy(outputs_flat, targets_flat, pad_idx)

            # Update metrics
            total_loss += loss.item()
            total_word_accuracy += word_acc
            total_sentence_accuracy += sent_acc
            num_batches += 1

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'word_acc': f'{word_acc:.4f}',
                'sent_acc': f'{sent_acc:.4f}'
            })

    # Calculate averages
    avg_loss = total_loss / num_batches
    avg_word_acc = total_word_accuracy / num_batches
    avg_sent_acc = total_sentence_accuracy / num_batches

    return {
        'loss': avg_loss,
        'word_accuracy': avg_word_acc,
        'sentence_accuracy': avg_sent_acc
    }

In [11]:
def calculate_accuracy(outputs, targets, pad_idx):
    """
    Calculate word-level
    outputs: (batch_size * seq_len, vocab_size)
    targets: (batch_size * seq_len)
    """
    # Get predictions
    predictions = outputs.argmax(dim=1)  # (batch_size * seq_len)

    # Create mask to ignore padding tokens
    mask = (targets != pad_idx)

    # Word-level accuracy
    correct_words = ((predictions == targets) & mask).sum().item()
    total_words = mask.sum().item()
    word_accuracy = correct_words / total_words if total_words > 0 else 0

    # Reshape for sentence-level accuracy
    batch_size = len(targets) // targets.shape[0]
    predictions = predictions.view(-1, batch_size)
    targets = targets.view(-1, batch_size)
    mask = mask.view(-1, batch_size)

    return word_accuracy

In [12]:
from torch.nn.parallel import DataParallel
from tqdm import tqdm
import threading
from queue import Queue
import random
from torch.cuda.amp import autocast, GradScaler

In [None]:
import random
def train_model(model, train_loader, criterion, optimizer, device, epoch, total_epochs, teacher_forcing_ratio=0.5):
    model.train()
    total_loss = 0
    total_words = 0
    correct_words = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{total_epochs}")

    for i, (images, captions) in enumerate(progress_bar):
        try:
            # Move to device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            outputs = model(images, captions)

            # Calculate loss
            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)),
                captions[:, 1:].reshape(-1)
            )

            # Calculate accuracy
            predictions = outputs.argmax(dim=2)
            mask = captions[:, 1:] != 0  # Ignore padding
            correct = (predictions == captions[:, 1:]) & mask
            total_words += mask.sum().item()
            correct_words += correct.sum().item()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Update metrics
            total_loss += loss.item()
            current_accuracy = correct_words / total_words if total_words > 0 else 0

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{current_accuracy:.4f}'
            })

            # Print batch statistics
            if (i + 1) % 100 == 0:
                print(f"\nBatch {i+1}/{len(train_loader)}")
                print(f"Loss: {loss.item():.4f}")

        except Exception as e:
            print(f"\nError in batch {i}:")
            print(f"Exception: {str(e)}")
            continue

    avg_loss = total_loss / len(train_loader)
    avg_accuracy = correct_words / total_words if total_words > 0 else 0

    return avg_loss, avg_accuracy



In [18]:
import random
import torch
import threading
from queue import Queue
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def train_model(model, train_loader, criterion, optimizer, device, epoch, total_epochs, teacher_forcing_ratio=0.5, num_threads=2):
    model.train()
    total_loss = 0
    total_words = 0
    correct_words = 0

    # Create queues for batch processing and results
    batch_queue = Queue(maxsize=num_threads * 2)
    result_queue = Queue()

    # Lock for synchronizing updates
    update_lock = threading.Lock()

    def process_batch(batch_data):
        try:
            images, captions = batch_data
            batch_results = {}

            # Move to device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            outputs = model(images, captions)

            # Calculate loss
            loss = criterion(
                outputs.reshape(-1, outputs.size(-1)),
                captions[:, 1:].reshape(-1)
            )

            # Calculate accuracy
            predictions = outputs.argmax(dim=2)
            mask = captions[:, 1:] != 0  # Ignore padding
            correct = (predictions == captions[:, 1:]) & mask

            batch_results['loss'] = loss
            batch_results['correct'] = correct.sum().item()
            batch_results['total'] = mask.sum().item()

            return batch_results

        except Exception as e:
            print(f"\nError processing batch:")
            print(f"Exception: {str(e)}")
            return None

    def update_metrics(results):
        nonlocal total_loss, total_words, correct_words

        with update_lock:
            if results:
                total_loss += results['loss'].item()
                total_words += results['total']
                correct_words += results['correct']

                # Backward pass (needs to be done in main thread for thread safety)
                optimizer.zero_grad()
                results['loss'].backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{total_epochs}")

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []

        for i, batch in enumerate(progress_bar):
            future = executor.submit(process_batch, batch)
            futures.append(future)

            while futures:
                done_futures = [f for f in futures if f.done()]
                for future in done_futures:
                    results = future.result()
                    update_metrics(results)
                    futures.remove(future)

                    current_accuracy = correct_words / total_words if total_words > 0 else 0
                    progress_bar.set_postfix({
                        'loss': f'{total_loss/(i+1):.4f}',
                        'acc': f'{current_accuracy:.4f}'
                    })

                    if (i + 1) % 500 == 0:
                        print(f"\nBatch {i+1}/{len(train_loader)}")
                        print(f"Loss: {total_loss/(i+1):.4f}")
                        print(f"Accuracy: {current_accuracy:.4f}")
                        print(f"Total words: {total_words}")
                        print(f"Correct words: {correct_words}")

        for future in futures:
            results = future.result()
            update_metrics(results)

    avg_loss = total_loss / len(train_loader)
    avg_accuracy = correct_words / total_words if total_words > 0 else 0

    return avg_loss, avg_accuracy


In [23]:
def main():
    LEARNING_RATE = 0.001
    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = ImageCaptioningModel(
        input_size=768,  # ViT base size
        hidden_size=HIDDEN_SIZE,
        vocab_size=len(word2idx),
        embed_size=EMBED_SIZE,
        num_layers=2,
        dropout_p=0.3
    ).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    print("Starting training...")
    for epoch in range(EPOCHS):
        loss, accuracy = train_model(
        model=model,
        train_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        epoch=epoch+1,
        total_epochs=5,
        num_threads=16
    )

        print(f"\nEpoch {epoch + 1}/{EPOCHS}")
        print(f"Average Loss: {loss:.4f}")

        # Save checkpoint
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'accuracy': accuracy,
            'word2idx': word2idx,
            'idx2word': idx2word
        }, f'improved_model_epoch_{epoch+1}.pth')

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"Main error: {str(e)}")
        raise

Using device: cuda
Starting training...


Epoch 1/5:   0%|          | 0/3973 [00:00<?, ?it/s]Exception in thread Thread-21 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 59, in _pin_memory_loop
    do_one_step()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 35, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
  File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_co

KeyboardInterrupt: 

In [None]:
def generate_caption(image_path, model_path="best_model.pth", max_length=50):
    """
    Generate a caption for a single image using the saved model
    """
    # Load model checkpoint
    checkpoint = torch.load(model_path, map_location='cpu')
    word2idx = checkpoint['word2idx']
    idx2word = checkpoint['idx2word']

    # Initialize model and load weights
    model = ImageCaptioningModel(
        input_size=768,  # ViT base size
        hidden_size=512,
        vocab_size=len(word2idx)
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # Load and process image
    feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

    # Extract features
    image = Image.open(image_path).convert('RGB')

    with torch.no_grad():
        # Get ViT features
        inputs = feature_extractor(images=image, return_tensors="pt")
        outputs = vit_model(**inputs)
        image_features = outputs.last_hidden_state[:, 0, :]  # [1, 768]
        print(f"Image Embedding shape:",{image_features.shape})
    # Generate caption
    with torch.no_grad():
        # Initialize with start token
        current_token = torch.tensor([[word2idx['<START>']]])
        caption = []

        # Generate words until max length or end token
        for _ in range(max_length):
            # Generate next word
            output = model(image_features, current_token)
            next_word_idx = output[0, -1].argmax().item()

            # Convert to word
            word = idx2word[next_word_idx]

            # Stop if end token or pad
            if word in ['<END>', '<PAD>']:
                break

            caption.append(word)

            # Update current token
            current_token = torch.cat([current_token, torch.tensor([[next_word_idx]])], dim=1)

    return ' '.join(caption)

# Example usage:
if __name__ == "__main__":
    # Test with a sample image
    image_path = "/content/8192398089.jpg"  # Replace with your image path
    caption = generate_caption(image_path)
    print(f"\nGenerated caption: {caption}")

    # # Test with multiple images
    # test_images = [
    #     "flickr30k/images/image1.jpg",
    #     "flickr30k/images/image2.jpg",
    #     "flickr30k/images/image3.jpg"
    # ]

    # print("\nGenerating captions for multiple images:")
    # for img_path in test_images:
    #     try:
    #         caption = generate_caption(img_path)
    #         print(f"\nImage: {img_path}")
    #         print(f"Caption: {caption}")
    #     except Exception as e:
    #         print(f"Error processing {img_path}: {str(e)}")

  checkpoint = torch.load(model_path, map_location='cpu')
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Image Embedding shape: {torch.Size([1, 768])}

Generated caption: women black in and outfits in dance


In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import os
import random
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from transformers import ViTFeatureExtractor, ViTModel
import numpy as np

class AttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.attention = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.v.data.normal_(mean=0, std=0.1)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: (batch_size, 1, hidden_size)
        encoder_outputs: (batch_size, seq_len, hidden_size)
        """
        batch_size, seq_len, hidden_size = encoder_outputs.size()

        # Ensure hidden has correct shape
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(1)

        # Repeat hidden state for each encoder output
        hidden = hidden.repeat(1, seq_len, 1)

        # Calculate attention scores
        energy = torch.tanh(self.attention(torch.cat((hidden, encoder_outputs), dim=2)))

        # Reshape v for batch processing
        v = self.v.repeat(batch_size, 1).unsqueeze(1)

        # Calculate attention weights
        attention_weights = torch.bmm(v, energy.transpose(1, 2)).squeeze(1)
        attention_weights = F.softmax(attention_weights, dim=1)

        # Apply attention to encoder outputs
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)

        return context, attention_weights

class ImageCaptioningModel(nn.Module):
    def __init__(self, input_size, hidden_size, vocab_size, embed_size=256, num_layers=2, dropout_p=0.3):
        super(ImageCaptioningModel, self).__init__()

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.embed_size = embed_size

        # Image feature processing
        self.feature_encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        )

        # Word embeddings
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.embed_dropout = nn.Dropout(dropout_p)

        # Additional embedding processing
        self.embed_process = nn.Linear(embed_size, hidden_size)

        # Attention
        self.attention = AttentionLayer(hidden_size)

        # Decoder GRU
        self.decoder_rnn = nn.GRU(
            input_size=hidden_size * 2,  # Concatenated context and processed embedding
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_p if num_layers > 1 else 0
        )

        # Output projection
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size, vocab_size)
        )

        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_size)

    def generate_caption(self, image_features, word2idx, idx2word, max_length=50):
        self.eval()
        with torch.no_grad():
            batch_size = 1
            device = image_features.device

            # Encode image features
            image_features = self.feature_encoder(image_features)
            image_features = image_features.unsqueeze(1)  # [1, 1, hidden_size]

            # Initialize hidden state
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

            # Start with <START> token
            decoder_input = torch.tensor([[word2idx['<START>']]]).to(device)

            generated_words = []
            attention_weights_list = []  # Store attention weights for visualization

            for _ in range(max_length):
                # Embed input token
                embedded = self.embedding(decoder_input)
                embedded = self.embed_dropout(embedded)
                embedded = self.embed_process(embedded)

                # Calculate attention
                context, attention_weights = self.attention(hidden[-1], image_features)
                attention_weights_list.append(attention_weights)

                # Combine embedding and context
                decoder_input_combined = torch.cat((embedded, context), dim=2)

                # GRU forward pass
                output, hidden = self.decoder_rnn(decoder_input_combined, hidden)

                # Process output
                output = self.layer_norm(output.squeeze(1))
                output = self.output_layer(output)

                # Get predicted word
                predicted_idx = output.argmax(dim=1).item()
                predicted_word = idx2word[predicted_idx]

                if predicted_word in ['<END>', '<PAD>']:
                    break

                generated_words.append(predicted_word)
                decoder_input = torch.tensor([[predicted_idx]]).to(device)

            return generated_words, attention_weights_list

    def forward(self, images, captions, teacher_forcing_ratio=0.5):
        batch_size = images.size(0)
        max_length = captions.size(1) - 1
        device = images.device

        # Encode images
        image_features = self.feature_encoder(images)
        image_features = image_features.unsqueeze(1)

        # Initialize outputs tensor
        outputs = torch.zeros(batch_size, max_length, self.vocab_size).to(device)

        # Initialize decoder input
        decoder_input = captions[:, 0]

        # Initialize hidden state
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

        for t in range(max_length):
            # Embed input tokens
            embedded = self.embedding(decoder_input)
            embedded = self.embed_dropout(embedded)
            embedded = self.embed_process(embedded)
            embedded = embedded.unsqueeze(1)

            # Calculate attention using last layer hidden state
            context, _ = self.attention(hidden[-1], image_features)

            # Combine embedding and context
            decoder_input_combined = torch.cat((embedded, context), dim=2)

            # GRU forward pass
            output, hidden = self.decoder_rnn(decoder_input_combined, hidden)

            # Process output
            output = self.layer_norm(output.squeeze(1))
            output = self.output_layer(output)

            # Store output
            outputs[:, t] = output

            # Teacher forcing
            if random.random() < teacher_forcing_ratio and t < max_length - 1:
                decoder_input = captions[:, t + 1]
            else:
                decoder_input = output.argmax(dim=1)

        return outputs

# The evaluate_model function remains the same as before

In [39]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

def load_image_paths_and_captions(image_dir, caption_file, max_images=5):
    """Load image paths and their corresponding captions from a text file"""
    image_paths = []
    reference_captions = []

    with open(caption_file, 'r') as f:
        for i, line in enumerate(f):
            if i >= max_images:
                break

            try:
                # Split by first comma only
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name = parts[0].strip()
                    caption = parts[1].strip()

                    # Remove .jpg if present in image name
                    if not image_name.endswith('.jpg'):
                        image_name += '.jpg'

                    image_path = os.path.join(image_dir, image_name)
                    if os.path.exists(image_path):
                        image_paths.append(image_path)
                        reference_captions.append(caption)
                        print(f"Loaded: {image_name} with caption: {caption}")
                    else:
                        print(f"Warning: Image not found: {image_path}")
            except Exception as e:
                print(f"Error processing line {i+1}: {line.strip()}")
                print(f"Error details: {str(e)}")
                continue

    if not image_paths:
        raise ValueError("No valid images found in the caption file")

    return image_paths, reference_captions

def evaluate_model(model_path, image_dir, caption_file, device='cuda'):
    """Evaluate the model on specified images"""
    print("Loading model...")
    checkpoint = torch.load(model_path, map_location=device)
    word2idx = checkpoint['word2idx']
    idx2word = checkpoint['idx2word']

    # Initialize model
    model = ImageCaptioningModel(
        input_size=768,  # ViT feature size
        hidden_size=512,
        vocab_size=len(word2idx),
        embed_size=768
    )

    # Load model weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    # Initialize ViT models
    feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224').to(device)
    vit_model.eval()

    # Load image paths and captions
    print("\nLoading image paths and captions...")
    print("\nLoading image paths and captions...")
    image_paths = []
    reference_captions = []

    with open(caption_file, 'r') as f:
        for i, line in enumerate(f):
            if i >= 5:  # Limit to 5 images
                break
            parts = line.strip().split(',', 1)
            if len(parts) == 2:
                image_name, caption = parts
                if not image_name.endswith('.jpg'):
                    image_name += '.jpg'
                image_path = os.path.join(image_dir, image_name.strip())
                if os.path.exists(image_path):
                    image_paths.append(image_path)
                    reference_captions.append(caption.strip())

    # Initialize BLEU score calculator
    smoothing = SmoothingFunction().method1
    all_bleu_scores = []
    all_references = []
    all_hypotheses = []

    print("\nGenerating captions and calculating BLEU scores...")
    with torch.no_grad():
        for i, (image_path, reference) in enumerate(zip(image_paths, reference_captions)):
            try:
                print(f"\nProcessing image {i+1}: {os.path.basename(image_path)}")

                # Load and process image
                image = Image.open(image_path).convert('RGB')
                inputs = feature_extractor(images=image, return_tensors="pt")
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Get ViT features
                image_features = vit_model(**inputs).last_hidden_state[:, 0, :]

                # Generate caption
                generated_caption, _ = model.generate_caption(image_features, word2idx, idx2word)

                # Convert generated caption and reference to tokens
                generated_tokens = generated_caption  # Already a list of tokens
                reference_tokens = reference.lower().split()

                print(f"Generated caption: {' '.join(generated_tokens)}")
                print(f"Reference caption: {reference}")

                # Calculate BLEU scores
                try:
                    bleu1 = sentence_bleu([reference_tokens], generated_tokens,
                                        weights=(1, 0, 0, 0),
                                        smoothing_function=smoothing)
                    bleu4 = sentence_bleu([reference_tokens], generated_tokens,
                                        weights=(0.25, 0.25, 0.25, 0.25),
                                        smoothing_function=smoothing)

                    print(f"BLEU-1: {bleu1:.4f}")
                    print(f"BLEU-4: {bleu4:.4f}")

                    all_bleu_scores.append({'bleu1': bleu1, 'bleu4': bleu4})
                    all_references.append([reference_tokens])
                    all_hypotheses.append(generated_tokens)

                except Exception as e:
                    print(f"Error calculating BLEU scores: {str(e)}")
                    continue

            except Exception as e:
                print(f"Error processing image {image_path}: {str(e)}")
                continue

    # Calculate final scores
    if all_bleu_scores:
        avg_bleu1 = np.mean([s['bleu1'] for s in all_bleu_scores])
        avg_bleu4 = np.mean([s['bleu4'] for s in all_bleu_scores])

        try:
            corpus_bleu1 = corpus_bleu(all_references, all_hypotheses,
                                    weights=(1, 0, 0, 0),
                                    smoothing_function=smoothing)
            corpus_bleu4 = corpus_bleu(all_references, all_hypotheses,
                                    weights=(0.25, 0.25, 0.25, 0.25),
                                    smoothing_function=smoothing)
        except Exception as e:
            print(f"Error calculating corpus BLEU: {str(e)}")
            corpus_bleu1 = 0.0
            corpus_bleu4 = 0.0

        print("\nFinal Results:")
        print(f"Average BLEU-1: {avg_bleu1:.4f}")
        print(f"Average BLEU-4: {avg_bleu4:.4f}")
        print(f"Corpus BLEU-1: {corpus_bleu1:.4f}")
        print(f"Corpus BLEU-4: {corpus_bleu4:.4f}")

        return {
            'avg_bleu1': avg_bleu1,
            'avg_bleu4': avg_bleu4,
            'corpus_bleu1': corpus_bleu1,
            'corpus_bleu4': corpus_bleu4
        }
    else:
        print("No scores calculated.")
        return None


if __name__ == "__main__":
    MODEL_PATH = '/content/improved_model_epoch_4.pth'
    IMAGE_DIR = '/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/Images'
    CAPTION_FILE = '/content/drive/MyDrive/Tech India/Preprocessed-Dataset/Rams-approach-preprocess/flickr30k/captions.txt'
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Evaluate model
    results = evaluate_model(MODEL_PATH, IMAGE_DIR, CAPTION_FILE, DEVICE)

Loading model...


  checkpoint = torch.load(model_path, map_location=device)
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Loading image paths and captions...

Loading image paths and captions...

Generating captions and calculating BLEU scores...

Processing image 1: 1000092795.jpg
Generated caption: a man in a a shirt is a a a .
Reference caption: Two young guys with shaggy hair look at their hands while hanging out in the yard .
BLEU-1: 0.1054
BLEU-4: 0.0130

Processing image 2: 1000092795.jpg
Generated caption: a man in a a shirt is a a a .
Reference caption: " Two young , White males are outside near many bushes ."
BLEU-1: 0.0000
BLEU-4: 0.0000

Processing image 3: 1000092795.jpg
Generated caption: a man in a a shirt is a a a .
Reference caption: Two men in green shirts are standing in a yard .
BLEU-1: 0.2727
BLEU-4: 0.0441

Processing image 4: 1000092795.jpg
Generated caption: a man in a a shirt is a a a .
Reference caption: A man in a blue shirt standing in a garden .
BLEU-1: 0.6364
BLEU-4: 0.2699

Final Results:
Average BLEU-1: 0.2536
Average BLEU-4: 0.0817
Corpus BLEU-1: 0.2326
Corpus BLEU-4: 0.0

In [36]:
!pip install openai

Collecting openai
  Downloading openai-1.53.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.53.0-py3-none-any.whl (387 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.1/387.1 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━

In [38]:
import base64
import openai


openai.api_key = "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"


# Function to encode the image
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')

# Path to your image
image_path = "/content/1000092795.jpg"

# Getting the base64 string
base64_image = encode_image(image_path)

response = openai.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image? and compare the description 'A man in a blue shirt standing in a garden ' to give score different from your output",
        },
        {
          "type": "image_url",
          "image_url": {
            "url":  f"data:image/jpeg;base64,{base64_image}"
          },
        },
      ],
    }
  ],
)

print(response.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I can\'t directly analyze the content of the image, but I can help you craft a description based on common features in images with people in gardens.\n\n**Possible Description:**\nTwo individuals are interacting near a garden gate, surrounded by lush greenery. One person is trying to open the gate while the other stands nearby.\n\n**Comparison to Given Description:**\nThe given description, "A man in a blue shirt standing in a garden," differs in several ways:\n\n1. **Number of Individuals**: The description mentions "a man," while the scene includes two people.\n2. **Action**: The given description implies stillness ("standing"), whereas the individuals are engaged in an action (interacting with the gate).\n3. **Details**: The color of the shirt is specified (blue), but there is no mention of the other person\'s clothing or their action.\n\n**Score (on a scale of 1 to 10)**:\n- *Given Descripti