### Step 1: Import Necessary Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import numpy as np
import pickle
from PIL import Image

ModuleNotFoundError: No module named 'torch'

In [None]:
# Clear cached memory (only attempted when a CUDA device is actually available)
if torch.cuda.is_available():
    # Release cached allocator blocks before reporting allocator statistics.
    torch.cuda.empty_cache()
    # print() renders the multi-line report; a bare expression would be shown
    # as an escaped string repr in the cell output.
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; skipping GPU cache cleanup.")





### Step 2: Define the Dataset

In [None]:
class ImageTextDataset(Dataset):
    """Dataset yielding (image, caption embedding, match label) triples.

    Records are read from a tab-separated text file with one sample per line:
    ``image_path<TAB>caption<TAB>label``.
    """

    def __init__(self, image_dir, data_file, sentence_embeddings, transform=None):
        """
        image_dir: Directory with all the images.
        data_file: Path to the file containing image paths, captions, and match labels.
        sentence_embeddings: Pre-loaded sentence embeddings dictionary.
        transform: Optional transform to be applied on a sample.

        Raises:
            ValueError: If a non-blank line does not contain exactly three
                tab-separated fields.
        """
        self.image_dir = image_dir
        self.sentence_embeddings = sentence_embeddings
        self.transform = transform
        self.data = []

        # Load data
        with open(data_file, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                record = line.strip()
                if not record:
                    # Blank lines carry no sample; skip them instead of
                    # crashing on the tuple unpack below.
                    continue
                fields = record.split('\t')
                if len(fields) != 3:
                    raise ValueError(
                        f"{data_file}:{line_number}: expected 3 tab-separated "
                        f"fields (image_path, caption, label), got {len(fields)}"
                    )
                image_path, caption, label = fields
                self.data.append((image_path, caption, label))

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, text_embedding, label)`` for record ``idx``.

        The image is opened from ``image_dir`` and converted to RGB, then passed
        through ``transform`` when one was given. The caption is looked up in
        ``sentence_embeddings`` (a missing caption raises ``KeyError``). The
        label is a float tensor: 1.0 when the label field is exactly 'match',
        0.0 for any other value.
        """
        image_path, caption, label = self.data[idx]
        # convert() returns a new image, so the file handle can be closed
        # as soon as the pixels have been decoded.
        with Image.open(f"{self.image_dir}/{image_path}") as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        text_embedding = self.sentence_embeddings[caption]
        label = torch.tensor(int(label == 'match'), dtype=torch.float)

        return image, torch.tensor(text_embedding, dtype=torch.float), label


### Step 3: Define the Siamese Network Model

In [None]:
class SiameseNetwork(nn.Module):
    """Two-branch encoder: a ResNet-50 trunk for images and a linear layer for text."""

    def __init__(self, embedding_dim):
        """Build the image trunk and the text projection.

        embedding_dim: Input width of the text projection layer, i.e. the size
            of each incoming sentence embedding.
        """
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        # Keep every child module except the last one (the classification head).
        trunk_layers = list(backbone.children())[:-1]
        self.cnn = nn.Sequential(*trunk_layers)
        # Text embeddings are projected to 2048 features so both branches
        # produce vectors of the same width.
        self.fc_text = nn.Linear(embedding_dim, 2048)

    def forward(self, image, text):
        """Return ``(image_features, text_features)`` for one batch.

        The trunk output is flattened to ``(batch, -1)``; the text input goes
        through the linear projection only.
        """
        pooled = self.cnn(image)
        batch_size = pooled.size(0)
        image_features = pooled.view(batch_size, -1)
        return image_features, self.fc_text(text)


### Step 4: Define Contrastive Loss

In [None]:
class ContrastiveLoss(nn.Module):
    """
    Contrastive loss over (image, text) feature pairs.

    Label convention: ``label == 1`` marks a matching pair and ``label == 0``
    a non-matching pair, which is the encoding produced by
    ``ImageTextDataset.__getitem__`` in this notebook.

    Matching pairs are penalised by their squared distance (pulled together);
    non-matching pairs are penalised only while they are closer than
    ``margin`` (pushed apart).
    """
    def __init__(self, margin=1.0):
        """margin: Distance beyond which non-matching pairs incur no loss."""
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, image_features, text_features, label):
        """Return the mean contrastive loss over the batch.

        image_features, text_features: Feature tensors compared row by row
            with ``F.pairwise_distance``.
        label: Float tensor with 1 for matching pairs and 0 otherwise.
        """
        # Euclidean distance
        distance = F.pairwise_distance(image_features, text_features)
        # Matching pairs: minimise the squared distance.
        pull_term = label * torch.pow(distance, 2)
        # Non-matching pairs: hinge on the margin so far-apart pairs cost nothing.
        push_term = (1 - label) * torch.pow(torch.clamp(self.margin - distance, min=0.0), 2)
        return torch.mean(pull_term + push_term)


### Step 5: Load Data and Model Training
First, load the sentence embeddings and instantiate your dataset and dataloaders:

In [None]:
# Data root: every input path below is derived from this single location,
# so pointing the notebook at another checkout is a one-line change.
DATA_DIR = '/home/rinzler/Github/Image-Text-Matching/data'
BATCH_SIZE = 32

# Load sentence embeddings
# NOTE: pickle.load can execute arbitrary code while unpickling; only load an
# embeddings file that comes from a trusted source.
with open(f'{DATA_DIR}/flickr8k.cmp9137.sentence_transformers.pkl', 'rb') as f:
    sentence_embeddings = pickle.load(f)

# Data paths
image_dir = f'{DATA_DIR}/images'
train_data_file = f'{DATA_DIR}/flickr8k.TrainImages.txt'
val_data_file = f'{DATA_DIR}/flickr8k.DevImages.txt'
test_data_file = f'{DATA_DIR}/flickr8k.TestImages.txt'

# Transformations
# The image branch uses ImageNet-pretrained ResNet-50 weights, which expect
# inputs normalised with the ImageNet channel statistics.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Dataset and DataLoader setup
train_dataset = ImageTextDataset(image_dir, train_data_file, sentence_embeddings, transform)
val_dataset = ImageTextDataset(image_dir, val_data_file, sentence_embeddings, transform)
test_dataset = ImageTextDataset(image_dir, test_data_file, sentence_embeddings, transform)

# Only the training split is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model, Loss, and Optimizer
embedding_dim = 384  # Adjust according to your sentence embeddings
model = SiameseNetwork(embedding_dim)
loss_fn = ContrastiveLoss(margin=1.0)
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [None]:
# Training Loop
def train_epoch(loader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    loader: Iterable of ``(images, texts, labels)`` batches; must support ``len()``.
    model: Module called as ``model(images, texts)`` and returning two feature tensors.
    loss_fn: Callable taking ``(image_features, text_features, labels)``.
    optimizer: Optimizer stepped once per batch.
    device: Device every batch tensor is moved to before the forward pass.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        images, texts, labels = (tensor.to(device) for tensor in batch)
        # Gradients are reset per batch; nothing is accumulated across batches.
        optimizer.zero_grad()
        image_features, text_features = model(images, texts)
        batch_loss = loss_fn(image_features, text_features, labels)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Average over the number of batches, not the number of samples.
    return sum(batch_losses) / len(loader)

In [None]:
# Validation Loop
def validate_epoch(loader, model, loss_fn, device):
    """Evaluate ``model`` on ``loader`` without gradients and return the mean batch loss.

    loader: Iterable of ``(images, texts, labels)`` batches; must support ``len()``.
    model: Module called as ``model(images, texts)`` and returning two feature tensors.
    loss_fn: Callable taking ``(image_features, text_features, labels)``.
    device: Device every batch tensor is moved to before the forward pass.
    """
    model.eval()
    loss_sum = 0
    with torch.no_grad():
        for batch in loader:
            images, texts, labels = (tensor.to(device) for tensor in batch)
            image_features, text_features = model(images, texts)
            loss_sum += loss_fn(image_features, text_features, labels).item()
    # Average over the number of batches, not the number of samples.
    return loss_sum / len(loader)

In [None]:
# Run Training and Validation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10  # Adjust according to your needs
accumulation_steps = 4  # Example value, adjust based on your needs
num_batches = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    running_loss = 0.0
    for step, (images, texts, labels) in enumerate(train_loader):
        images, texts, labels = images.to(device), texts.to(device), labels.to(device)
        image_features, text_features = model(images, texts)
        loss = loss_fn(image_features, text_features, labels) / accumulation_steps  # Scale loss
        loss.backward()
        # Undo the scaling so the reported value is the plain per-batch loss.
        running_loss += loss.item() * accumulation_steps

        is_group_end = (step + 1) % accumulation_steps == 0
        # Also step on the final batch: otherwise the gradients of a trailing
        # partial group would be discarded by the next epoch's zero_grad().
        is_last_batch = (step + 1) == num_batches
        if is_group_end or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    # Report progress once per epoch: mean training loss and validation loss.
    train_loss = running_loss / max(num_batches, 1)
    val_loss = validate_epoch(val_loader, model, loss_fn, device)
    print(f"Epoch {epoch + 1}/{num_epochs} - train loss: {train_loss:.4f} - val loss: {val_loss:.4f}")


Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn19HardwareInformationC1ERKNS_7SmModelEiff, version libcudnn_cnn_infer.so.8
Could not load library libcudnn_cnn_train.so.8. Error: /usr/local/cuda-11.8/lib64/libcudnn_cnn_train.so.8: undefined symbol: _ZN10cask_cudnn

RuntimeError: GET was unable to find an engine to execute this computation