# Environment setup ============================================================================================


## Installing CLIP and YoloV5 and Imports

In the first section of this file, the needed components are installed. The first bash lines install CLIP and the YoloV5 requirements respectively. These two neural networks form the foundation of the project.

In [None]:
%%bash

# Install CLIP straight from the OpenAI GitHub repository
pip install git+https://github.com/openai/CLIP.git
# Install (quietly) the Python requirements listed by the YOLOv5 repository;
# the YOLOv5 model itself is fetched later in the notebook through torch.hub
pip install -qr https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

# Command to install some needed system dependencies in the AWS machine
sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6  -y


## List of imports

In [None]:
# general imports
import pickle
import json
import tarfile
import os
import torch
import clip
from PIL import Image, ImageDraw

# utility libraries imports
import pandas as pd
import numpy as np
from tqdm import tqdm

# torch imports
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam




## Setting the Clip model and Yolo model variables

In [None]:
# Choosing the device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Choosing the CLIP model and the YOLO version.
# `preprocess` is the image transform returned alongside the CLIP model; it is used
# later in the notebook as the dataset transform.
clip_model, preprocess = clip.load('RN50', device)
# NOTE(review): the YOLO model is not explicitly moved to `device` here — confirm this is intended.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Fine-tuning Clip

## Creation of the train and validation splits

Class definition for reading the RefCOCOg dataset

In [None]:
class RefCOCOgDataset(Dataset):
    """RefCOCOg samples made of a cropped image region, its box and a referring sentence.

    The dataset reads its files from fixed relative paths:
    ``refcocog/annotations/refs(umd).p`` (pickled referring expressions),
    ``refcocog/annotations/instances.json`` (COCO-style instances) and
    ``refcocog/images`` (image files). One sample is produced for every
    sentence of every reference belonging to the requested split.

    Args:
        transform: optional callable applied to the cropped PIL image in
            ``__getitem__``. When ``None`` the raw PIL crop is returned.
        split: value matched against the ``'split'`` field of each reference.
    """

    def __init__(self, transform=None, split='train'):
        # needed paths
        self.image_dir = os.path.join('refcocog', 'images')

        # variables directly set
        self.transform = transform
        self.split = split

        # reading annotations and instances
        self.refs = self.load_refs()
        self.instances = self.load_instances()

        # lookup tables, built once so that __getitem__ never has to scan the
        # full image / annotation lists
        self.image_id_to_filename = {
            img['id']: img['file_name'] for img in self.instances['images']
        }
        self.ann_id_to_bbox = {
            ann['id']: ann['bbox'] for ann in self.instances['annotations']
        }

        # define samples list: one entry per (reference, sentence) pair
        self.samples = []
        for ref in self.refs:
            for sentence in ref['sentences']:
                self.samples.append({
                    'image_id': ref['image_id'],
                    'ann_id': ref['ann_id'],
                    'sentence': sentence['sent']
                })

    def __len__(self):
        """Return the number of (region, sentence) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load the sample at ``idx``.

        Returns:
            dict with keys ``'image'`` (the crop of the referred region, after
            ``transform`` if one is set), ``'bbox'`` (float tensor
            ``[x1, y1, x2, y2]`` computed from the unclipped annotation box)
            and ``'sentence'`` (the referring expression string).

        Raises:
            KeyError: if the sample refers to an image id or annotation id
                that is missing from the instances file.
        """
        sample = self.samples[idx]

        # Direct indexing gives an explicit KeyError for an unknown image id
        # instead of passing None to os.path.join.
        image_name = self.image_id_to_filename[sample['image_id']]
        image_path = os.path.join(self.image_dir, image_name)

        # Image.open is lazy and keeps the file open; convert() returns an
        # independent in-memory copy, so the file can be closed right away.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')

        # Boxes are stored as [x, y, width, height]
        box = self.ann_id_to_bbox[sample['ann_id']]
        x, y, width, height = box[:4]

        # Crop region, clipped to the image borders
        x1 = max(0, int(x))
        y1 = max(0, int(y))
        x2 = min(image.size[0], int(x + width))
        y2 = min(image.size[1], int(y + height))
        cropped_image = image.crop((x1, y1, x2, y2))

        # Apply transforms if specified
        if self.transform:
            cropped_image = self.transform(cropped_image)

        # The returned box keeps the original (unclipped, non-truncated) corners
        bbox = torch.tensor([x, y, x + width, y + height], dtype=torch.float32)

        return {
            'image': cropped_image,
            'bbox': bbox,
            'sentence': sample['sentence']
        }

    def load_refs(self):
        """Return the referring expressions whose ``'split'`` equals ``self.split``."""
        annotation_file = os.path.join('refcocog', 'annotations', 'refs(umd).p')
        # NOTE(security): pickle.load can execute arbitrary code; only use it on
        # an annotation file obtained from a trusted source.
        with open(annotation_file, 'rb') as f:
            data = pickle.load(f)
        return [item for item in data if item['split'] == self.split]

    def load_instances(self):
        """Return the parsed content of the instances JSON file."""
        instances_file = os.path.join('refcocog', 'annotations', 'instances.json')
        # Context manager so the file handle is closed deterministically
        with open(instances_file, 'r') as f:
            return json.load(f)

    def define_entries(self):
        """Unfinished helper: gathers per-image data but stores and returns nothing.

        NOTE(review): the values computed below are discarded and the method is
        not called anywhere in the visible notebook; complete it or remove it.
        """
        for img in self.instances['images']:
            image_name = img["file_name"]
            images_annotations = [obj for obj in self.instances['annotations'] if obj['image_id'] == img["id"]]
            images_sentences = [obj for obj in self.refs if obj['image_id'] == img["id"]]

In [None]:
# Image transformation: CLIP's own preprocessing, set at construction time so that
# every crop leaves the dataset in the form the CLIP image encoder expects
transform = preprocess

# Train set and validation set split
train_dataset = RefCOCOgDataset(transform=transform, split='train')
val_dataset = RefCOCOgDataset(transform=transform, split='val')

# DataLoader batch size and shuffling (the same settings are used for both loaders)
batch_size = 2
shuffle = True

# Worker-related settings. They are plain values (a trailing comma would silently turn
# them into one-element tuples), but they are NOT passed to the loaders yet, so data
# loading currently runs in the main process. Pass them as keyword arguments to enable them.
num_workers = 1
pin_memory = True
persistent_workers = True

# DataLoaders creating iterable batches of `batch_size` examples each.
# Keyword arguments are used so no option can be bound to the wrong positional parameter.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
)


In [None]:
# Sanity check: run CLIP on two validation batches and print the similarity logits.
# Both batches are drawn from the same iterator so they are guaranteed to be distinct.
val_iterator = iter(val_loader)
batch1 = next(val_iterator)
batch2 = next(val_iterator)

# Nothing is trained here, so gradient tracking is disabled to save memory
with torch.no_grad():
    images1 = batch1['image'].to(device)
    texts1 = clip.tokenize(batch1['sentence']).to(device)
    logits_per_image1, logits_per_text1 = clip_model(images1, texts1)

    images2 = batch2['image'].to(device)
    texts2 = clip.tokenize(batch2['sentence']).to(device)
    logits_per_image2, logits_per_text2 = clip_model(images2, texts2)

print(logits_per_image1)
print(logits_per_text1)
print(logits_per_image2)
print(logits_per_text2)

## Training and storing the model

**Train** and **Validation** functions for each training epoch

In [None]:
# Learning rate and optimizer: Adam over every parameter of the CLIP model
# NOTE(review): 1e-3 looks large for fine-tuning a pretrained model — confirm this value.
learning_rate = 1e-3
optimizer = Adam(clip_model.parameters(), lr=learning_rate)

# Loss functions: one cross-entropy for the per-image logits and one for the per-text
# logits; the train/validation functions below average the two
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

def train_epoch(model, dataloader, optimizer, device):
    """Run one training epoch with the symmetric contrastive loss.

    For every batch the i-th image crop and the i-th sentence form the positive
    pair, so the target class of row i is i for both logit matrices.

    Args:
        model: callable returning ``(logits_per_image, logits_per_text)`` for
            a batch of images and tokenized texts.
        dataloader: iterable of batches with ``'image'`` and ``'sentence'`` keys.
        optimizer: optimizer holding the parameters of ``model``.
        device: device the batch tensors are moved to.

    Returns:
        Mean loss over the batches of the epoch (``0.0`` if there were none).

    NOTE(review): the optimizer step presumably needs float32 parameters; the
    notebook's own comment about "using float32" suggests the CLIP weights
    should be converted with ``model.float()`` before training — confirm.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    pbar = tqdm(dataloader, total=len(dataloader))

    for batch in pbar:
        images = batch['image'].to(device)
        texts = clip.tokenize(batch['sentence']).to(device)

        optimizer.zero_grad()

        # Forward pass
        logits_per_image, logits_per_text = model(images, texts)

        # Standard CLIP contrastive loss: average of the two cross-entropies
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        loss_image = loss_img(logits_per_image, ground_truth)
        loss_text = loss_txt(logits_per_text, ground_truth)
        loss = (loss_image + loss_text) / 2

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        pbar.set_description(f'Loss: {batch_loss:.4f}')

    # Guard against an empty dataloader instead of dividing by zero
    return total_loss / num_batches if num_batches else 0.0

# Validation function
def validate(model, dataloader, device):
    """Evaluate the model on ``dataloader`` and return the mean batch loss.

    The model is switched to eval mode and gradients are disabled. The loss of
    a batch is the average of the image-side and text-side cross-entropies,
    with the i-th image and i-th sentence treated as the matching pair.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            pixel_batch = batch['image'].to(device)
            token_batch = clip.tokenize(batch['sentence']).to(device)

            # Forward pass
            image_logits, text_logits = model(pixel_batch, token_batch)

            # Row i must be classified as class i on both sides
            targets = torch.arange(len(pixel_batch), dtype=torch.long, device=device)
            image_term = loss_img(image_logits, targets)
            text_term = loss_txt(text_logits, targets)
            combined = (image_term + text_term) / 2

            batch_losses.append(combined.item())

    return sum(batch_losses) / len(dataloader)

Training loop that produces the CLIP model fine-tuned on RefCOCOg

In [None]:
# Ensure the model uses float32 weights before optimizing. clip.load keeps half-precision
# weights when the model is loaded on a CUDA device, and updating those directly with Adam
# is numerically unstable. The conversion is in place, so the optimizer created earlier
# still refers to the same parameters. Train/eval mode is set by the functions called below.
clip_model = clip_model.float()

# Training loop
num_epochs = 1

for epoch in range(num_epochs):
    train_loss = train_epoch(clip_model, train_loader, optimizer, device)
    val_loss = validate(clip_model, val_loader, device)

    # Report both losses so the run can actually be monitored
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    print()

# Save the fine-tuned model
torch.save(clip_model.state_dict(), 'fine_tuned_clip_refcocog.pth')

In [None]:
# Debugging aid: show the raw content of the validation batch fetched in an earlier cell
print(batch1)

# First Model

## Model definition

The base model is defined as a starting point to study the visual grounding task further and become familiar with it. It follows the approach described in the project statement.

In [None]:
class YoloClip(nn.Module):
    """Baseline module holding a CLIP model and a YOLO model (work in progress).

    NOTE(review): ``forward`` is unfinished. It only runs the detector and
    gathers crops and per-image detection counts; the CLIP encoding steps are
    still commented out and nothing is returned.
    """


    def __init__(self, clip_model, yolo_model):
        """Store the two models as attributes of the module."""
        super().__init__()
        self.clip_model = clip_model
        self.yolo_model = yolo_model

    def forward(self, image, text):
        """Run YOLO on ``image`` and collect its crops; currently returns ``None``.

        ``text`` is not used yet (its tokenization is part of the commented-out code).
        NOTE(review): assumes ``image`` is a sized batch of images and that the
        YOLO output offers ``crop()`` and ``pandas().xyxy`` — confirm against
        the YOLOv5 hub model API.
        """
        # Obtain yolo detections for the original image
        yolo_results = self.yolo_model(image)

        # Get crops from the detections
        # NOTE(review): `yolo_crop` is only referenced by the commented-out code below
        yolo_crop = yolo_results.crop(save = False)

        # Get the results in pandas form to get the number of crops for each image
        yolo_panda = yolo_results.pandas().xyxy

        # One entry per input image: the length of that image's detection table
        crops_nums = [len(yolo_panda[i]) for i in range(len(image))]

        # NOTE(review): the commented-out steps below reference `self.image_transform` and
        # `names_to_coco_cats`, neither of which is defined in the visible notebook.

        # Preprocess the cropped images before passing it to CLIP
        # crop_images = torch.stack([self.image_transform(crop['im'].copy()) for crop in yolo_crop]).to(device)
        # crop_classes = [yolo_results.names.get(crop['cls'].item()) for crop in yolo_crop]
        # crop_classes_tensor = names_to_coco_cats(crop_classes)
        # crop_classes_tensor = crop_classes_tensor.to(device)

        # Get the CLIP embedding for each of the cropped images
        # image_features = self.clip_model.encode_image(crop_images)
        # image_features /= image_features.norm(dim=-1, keepdim=True)

        # Get the CLIP embedding for each of the text tokens
        # text = clip.tokenize(text).to(device)
        # text_features = self.clip_model.encode_text(text)
        # text_features /= text_features.norm(dim=-1, keepdim=True)



Instantiating and evaluating the model

# Second Model: XGBoost integration

Describe the integration with XGBoost

# Bash utilities

In [None]:
%%bash
# Extract the gzipped RefCOCOg archive into the current directory, listing each file.
# Presumably this creates the `refcocog/` folder read by the dataset class, so it has
# to be run before the dataset cells — verify against the archive layout.
tar -xzvf refcocog.tar.gz