In [None]:
!pip install -q torch torchvision pandas ftfy

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# === Paths ===
CSV_PATH = '/content/drive/My Drive/CLIP_Project/Data/results.csv'
IMG_DIR = '/content/drive/My Drive/CLIP_Project/Data/Testing_Images'
MODEL_PATH = '/content/drive/My Drive/CLIP_Project/Model_Files'

In [4]:
# === Load Libraries ===
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# === Load Model & Processor ===
model = CLIPModel.from_pretrained(MODEL_PATH).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_PATH)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
# === Load and Filter Data ===
df = pd.read_csv(CSV_PATH, sep='|', engine='python')
df.columns = [col.strip() for col in df.columns]
df['image_name'] = df['image_name'].astype(str).str.strip()
df['comment'] = df['comment'].astype(str).str.strip()
df['comment_number'] = pd.to_numeric(df['comment_number'], errors='coerce')
df = df[df['comment_number'] == 0].reset_index(drop=True)
df['filepath'] = df['image_name'].apply(lambda x: os.path.join(IMG_DIR, x))
df = df[df['filepath'].apply(os.path.exists)].reset_index(drop=True)
df = df.sample(n=500, random_state=42).reset_index(drop=True)

In [8]:
print(df.head(5))

      image_name  comment_number  \
0  505062117.jpg             0.0   
1  513390919.jpg             0.0   
2  535529555.jpg             0.0   
3  493507605.jpg             0.0   
4  516433137.jpg             0.0   

                                             comment  \
0  A dog swims in the water with a tennis ball in...   
1  Two young children and a young adult are worki...   
2  Two dogs try to get the chewed-up red Frisbee ...   
3  A wet black dog is running away from another b...   
4  An elderly man lies on a couch in an alleyway ...   

                                            filepath  
0  /content/drive/My Drive/CLIP_Project/Data/Test...  
1  /content/drive/My Drive/CLIP_Project/Data/Test...  
2  /content/drive/My Drive/CLIP_Project/Data/Test...  
3  /content/drive/My Drive/CLIP_Project/Data/Test...  
4  /content/drive/My Drive/CLIP_Project/Data/Test...  


In [9]:
# === Dataset and DataLoader ===
class FlickrTestDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image = Image.open(self.data.iloc[idx]['filepath']).convert("RGB")
        text = self.data.iloc[idx]['comment']
        return {'image': image, 'text': text}

def collate_fn(batch):
    texts = [item['text'] for item in batch]
    images = [item['image'] for item in batch]
    return processor(text=texts, images=images, return_tensors="pt", padding=True, truncation=True)

test_dataset = FlickrTestDataset(df)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [10]:
# === Evaluation ===
model.eval()
loss_fn = CrossEntropyLoss()

all_image_embeds = []
all_text_embeds = []
total_loss = 0.0
total_samples = 0
correct_image_to_text = 0
correct_text_to_image = 0

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Normalize embeddings
        image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
        text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)

        # Store for global similarity matrix
        all_image_embeds.append(image_embeds)
        all_text_embeds.append(text_embeds)

        # Compute similarity
        logits_per_image = image_embeds @ text_embeds.T
        logits_per_text = text_embeds @ image_embeds.T

        labels = torch.arange(image_embeds.size(0), device=device)
        loss_i = loss_fn(logits_per_image, labels)
        loss_t = loss_fn(logits_per_text, labels)
        loss = (loss_i + loss_t) / 2

        total_loss += loss.item()
        total_samples += 1

        # Accuracy (Recall@1)
        pred_i2t = torch.argmax(logits_per_image, dim=1)
        pred_t2i = torch.argmax(logits_per_text, dim=1)
        correct_image_to_text += (pred_i2t == labels).sum().item()
        correct_text_to_image += (pred_t2i == labels).sum().item()

In [11]:
# === Final Metrics ===
avg_loss = total_loss / total_samples
accuracy_i2t = correct_image_to_text / len(df)
accuracy_t2i = correct_text_to_image / len(df)

print(f"\n✅ Testing Completed")
print(f"📉 Average Contrastive Loss: {avg_loss:.4f}")
print(f"🎯 Accuracy (Image -> Text): {accuracy_i2t*100:.2f}%")
print(f"🎯 Accuracy (Text -> Image): {accuracy_t2i*100:.2f}%")


✅ Testing Completed
📉 Average Contrastive Loss: 2.7667
🎯 Accuracy (Image -> Text): 75.60%
🎯 Accuracy (Text -> Image): 76.60%
