<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/CLIP_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from pycocotools.coco import COCO
from transformers import BertTokenizer, BertModel
import os
from PIL import Image

# Configuration
# Prefer the GPU when PyTorch can see one; otherwise everything stays on CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 32       # samples per DataLoader batch
num_epochs = 5        # full passes over the data loader
learning_rate = 1e-4  # step size handed to the Adam optimizer
feature_dim = 512     # Width of the shared image/text embedding space

In [4]:
# MS COCO 2017 caption data.
# NOTE: nothing is downloaded here -- both paths must already exist on disk.
# (Author's note: the dataset is too large to train on within a limited
# Google account.)
coco_image_dir = './train2017/'
coco_ann_file = './annotations/captions_train2017.json'

# Parse the caption annotation file through the COCO API.
coco = COCO(coco_ann_file)

# Every image id in the annotation file, followed by the annotation records
# attached to those images (one record per caption, several per image).
image_ids = coco.getImgIds()
_all_ann_ids = coco.getAnnIds(imgIds=image_ids)
captions = coco.loadAnns(_all_ann_ids)

# Define a custom dataset for MS COCO
class COCODataset(torch.utils.data.Dataset):
    """Image/caption pairs backed by a pycocotools-style ``COCO`` index.

    Each item is ``(image, caption)`` where ``caption`` is the first caption
    annotation attached to *that* image.

    Args:
        coco: Object exposing ``imgs``, ``getAnnIds``, ``loadAnns`` and
            ``loadImgs`` (e.g. ``pycocotools.coco.COCO``).
        image_dir: Directory that the ``file_name`` of each image record is
            joined onto.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, coco, image_dir, transform=None):
        self.coco = coco
        self.image_dir = image_dir
        self.transform = transform
        self.image_ids = list(coco.imgs.keys())
        # Flat list of every annotation of every image. Kept only for
        # backward compatibility: an image can own several captions, so this
        # list is NOT index-aligned with ``image_ids`` and must not be
        # indexed with a dataset index.
        self.annotations = coco.loadAnns(coco.getAnnIds(imgIds=self.image_ids))

    def __len__(self):
        """Return the number of images (one sample per image)."""
        return len(self.image_ids)

    def _first_caption(self, img_id):
        """Return the first caption annotated for ``img_id``.

        Raises:
            ValueError: If the image has no caption annotation at all.
        """
        ann_ids = self.coco.getAnnIds(imgIds=[img_id])
        if not ann_ids:
            raise ValueError(f'image id {img_id!r} has no caption annotations')
        return self.coco.loadAnns(ann_ids[:1])[0]['caption']

    def __getitem__(self, idx):
        """Load the ``idx``-th image and return ``(image, caption)``."""
        img_id = self.image_ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.image_dir, img_info['file_name'])

        # ``convert`` returns a fully loaded copy, so the file handle can be
        # released right away instead of waiting for garbage collection.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Look the caption up by image id: indexing the flat annotation list
        # with ``idx`` would pair the image with another image's caption.
        caption = self._first_caption(img_id)

        return image, caption

# Image transformations for the ImageNet-pretrained ResNet backbone:
# resize to 224x224, convert to a float tensor in [0, 1], then apply the
# ImageNet channel statistics the pretrained weights were trained with.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Load the dataset and wrap it in a shuffling batch loader
coco_dataset = COCODataset(coco, coco_image_dir, transform=transform)
data_loader = DataLoader(coco_dataset, batch_size=batch_size, shuffle=True)

loading annotations into memory...


FileNotFoundError: [Errno 2] No such file or directory: './annotations/captions_train2017.json'

In [None]:

# Image Encoder (e.g., ResNet-18)
class ImageEncoder(nn.Module):
    """ResNet-18 backbone followed by a linear projection to ``feature_dim``.

    The backbone's classification head is replaced with ``nn.Identity`` so
    the network emits pooled features, which are then projected into the
    shared image/text embedding space.
    """

    def __init__(self):
        super().__init__()
        # ``pretrained=True`` is deprecated in newer torchvision releases in
        # favour of the ``weights`` enum; IMAGENET1K_V1 is the checkpoint
        # ``pretrained=True`` used to load. Fall back for old installs.
        if hasattr(models, 'ResNet18_Weights'):
            self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet18(pretrained=True)
        # Read the feature width from the head before discarding it rather
        # than hard-coding 512.
        backbone_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()  # Remove final classification layer
        self.projection = nn.Linear(backbone_dim, feature_dim)  # Project features to shared space

    def forward(self, images):
        """Return the projected embeddings for a batch of image tensors."""
        features = self.resnet(images)
        return self.projection(features)

# Text Encoder (e.g., BERT for captions)
class TextEncoder(nn.Module):
    """BERT (``bert-base-uncased``) plus a linear projection to ``feature_dim``.

    Tokenization happens inside ``forward``, so callers pass raw caption
    strings rather than token ids.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Take the input width from the loaded model's config instead of
        # hard-coding 768.
        self.projection = nn.Linear(self.bert.config.hidden_size, feature_dim)  # Project features to shared space

    def forward(self, text):
        """Return projected embeddings for a caption or a batch of captions.

        Args:
            text: A string or a sequence of strings accepted by the tokenizer.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=40)
        # Follow the module's own parameters rather than the global
        # ``device`` so the encoder still works after ``.to(...)`` / ``.cpu()``.
        target_device = self.projection.weight.device
        inputs = {key: val.to(target_device) for key, val in inputs.items()}
        outputs = self.bert(**inputs)
        return self.projection(outputs.pooler_output)

# CLIP Model (Image + Text Encoder)
class CLIPModel(nn.Module):
    """Two-tower model: one encoder for images, one for captions.

    Both encoders are created in the constructor and moved to the
    module-level ``device``.
    """

    def __init__(self):
        super().__init__()
        image_tower = ImageEncoder()
        self.image_encoder = image_tower.to(device)
        text_tower = TextEncoder()
        self.text_encoder = text_tower.to(device)

    def forward(self, images, captions):
        """Encode both modalities and return ``(image_features, text_features)``."""
        return self.image_encoder(images), self.text_encoder(captions)

# Loss Function: Contrastive Loss (Cross-entropy on cosine similarity)
def contrastive_loss(image_features, text_features, temperature=0.07):
    """Symmetric cross-entropy loss over image/text similarity logits.

    Row ``i`` of ``image_features`` is treated as the positive match of row
    ``i`` of ``text_features``; every other row in the batch is a negative.

    Args:
        image_features: 2-D tensor with one embedding per row.
        text_features: 2-D tensor with the same number of rows and columns.
        temperature: Divisor applied to the similarity matrix.

    Returns:
        Scalar tensor: the mean of the image-to-text and text-to-image
        cross-entropy losses.

    Note:
        The dot products are cosine similarities only when both inputs are
        already L2-normalized; this function does not normalize them.
    """
    logits = torch.matmul(image_features, text_features.T) / temperature
    # Build the targets on the logits' own device so the loss does not
    # depend on a module-level ``device`` matching the inputs.
    labels = torch.arange(logits.size(0), device=logits.device)
    loss_i = nn.functional.cross_entropy(logits, labels)
    loss_t = nn.functional.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2


In [None]:

# Build the two-tower model and an Adam optimizer over all of its parameters
model = CLIPModel()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, captions in data_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        images = images.to(device)

        # Forward pass through both encoders
        image_features, text_features = model(images, captions)

        # Scale each embedding to unit length so that the dot products taken
        # in the loss are cosine similarities
        image_features = nn.functional.normalize(image_features, dim=-1)
        text_features = nn.functional.normalize(text_features, dim=-1)

        # Loss, backward pass and parameter update
        loss = contrastive_loss(image_features, text_features)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss as a Python float for epoch reporting
        total_loss = total_loss + loss.item()

    # Report the mean batch loss of this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(data_loader):.4f}')

print("Training complete.")
