In [None]:
from IPython.display import clear_output
!pip install vit_keras
!pip install colorama
clear_output()

In [None]:
pip install -r /kaggle/input/requirement/requirement.txt

In [None]:
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Load the CLIP model and its processor from the "openai/clip-vit-base-patch32"
# checkpoint. These two module-level objects are the ones used by
# get_text_embedding() and get_image_embedding() below.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# List entries of the Hugging Face hub cache directory whose names contain "p-vit-base-p".
!ls ~/.cache/huggingface/hub | grep p-vit-base-p

In [None]:
# Display the parameter count reported by the loaded CLIP model.
clip_model.num_parameters()


In [None]:
# Debugging aid: list every attribute name on the model object (long output).
dir(clip_model)


In [None]:
def get_text_embedding(text: str):
    """Embed one text string with the notebook-level CLIP model.

    Relies on the module-level ``clip_processor`` and ``clip_model``.

    Args:
        text: The sentence to embed.

    Returns:
        Whatever ``clip_model.get_text_features`` returns for a batch that
        contains only ``text``. It is computed under ``torch.no_grad()``, so
        the result carries no autograd graph.
    """
    # Imported locally because this function is first called before the
    # notebook's own `import torch` cell has run.
    import torch

    inputs = clip_processor(text=[text], return_tensors="pt", padding=True)
    # Inference only: skip autograd bookkeeping so no graph is kept alive
    # alongside the returned embedding.
    with torch.no_grad():
        text_embeddings = clip_model.get_text_features(**inputs)
    return text_embeddings

In [None]:
import requests
from PIL import Image

In [None]:
def get_image_embedding(image_path: str):
    """Embed one image file with the notebook-level CLIP model.

    Relies on the module-level ``clip_processor`` and ``clip_model``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        Whatever ``clip_model.get_image_features`` returns for a batch that
        contains only this image. The embedding is not L2-normalised and is
        computed under ``torch.no_grad()``.
    """
    # Imported locally because this function is first called before the
    # notebook's own `import torch` cell has run.
    import torch

    # The context manager closes the file handle; convert() returns a new,
    # fully loaded RGB image that stays usable after the file is closed.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB")

    inputs = clip_processor(images=rgb_image, return_tensors="pt")
    # Inference only: no autograd graph is needed for the embedding.
    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**inputs)

    return image_embeddings

In [None]:
# NOTE: despite its name, `image_url` holds a local file path, not a URL.
# The name is kept because a later cell passes it to DisplayImage.
image_url = "/kaggle/input/satellite-image-caption-generation/train/00001.jpg"
image_embedding = get_image_embedding(image_url)

In [None]:
image_embedding.shape

In [None]:
from IPython.display import Image as DisplayImage


In [None]:
DisplayImage(image_url)

In [None]:
text = "there are many buildings"
text_embedding = get_text_embedding(text)

In [None]:
import torch

In [None]:
# Cosine similarity between the sample caption embedding and the sample image
# embedding, using CosineSimilarity's default arguments.
torch.nn.CosineSimilarity()(text_embedding, image_embedding)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory that the relative 'filepath' values in the CSV files are joined onto.
image_path = '/kaggle/input/satellite-image-caption-generation/'


def _load_split(csv_file):
    """Read one split CSV and prefix its 'filepath' column with `image_path`."""
    frame = pd.read_csv(csv_file)
    frame['filepath'] = image_path + frame['filepath']
    return frame


train_data = _load_split("/kaggle/input/satellite-image-caption-generation/train.csv")
valid_data = _load_split("/kaggle/input/satellite-image-caption-generation/valid.csv")
test_data = _load_split("/kaggle/input/satellite-image-caption-generation/test.csv")

In [None]:
len(test_data)

In [None]:
# the first stage
def text_preprocessing(data):
    """Clean the raw 'captions' column and return a new DataFrame.

    Per caption: replace "[", "]" and "''" with a space, lower-case the text,
    then drop whitespace-separated tokens of length 1 and re-join the rest
    with single spaces. Single quotes are kept on purpose: splitting_captions
    splits on the "' '" delimiter between captions.

    Args:
        data: DataFrame with a string 'captions' column.

    Returns:
        A copy of ``data`` with cleaned 'captions'. ``data`` itself is left
        untouched, so re-running the calling cell is idempotent and passing a
        slice of another frame does not write through to (or warn about) it.
    """
    def _clean(caption):
        for token in ("[", "]", "''"):
            caption = caption.replace(token, " ")
        caption = caption.lower()
        # str.replace matches literally, not as a regex. The former
        # "[^A-Za-z]" replacement could never match (every "[" is already
        # gone) and was dropped. Applying it as a real regex would also strip
        # the quotes that delimit captions. The "\s+" replacement is kept as
        # the same literal match, written as a raw string to avoid the
        # invalid-escape warning.
        caption = caption.replace(r"\s+", " ")
        return " ".join(word for word in caption.split() if len(word) > 1)

    cleaned = data.copy()
    cleaned['captions'] = cleaned['captions'].apply(_clean)
    return cleaned

# one output row per caption (an image can carry several captions)
def splitting_captions(df):
    """Explode each row's caption string into one row per caption.

    The 'captions' value of every row is split on the "' '" delimiter and
    each resulting piece is paired with that row's 'filepath'.

    Args:
        df: DataFrame with 'captions' and 'filepath' columns.

    Returns:
        A new DataFrame with columns 'captions' and 'filepath', in input
        order, containing one row per caption piece.
    """
    paths = df['filepath'].values
    joined_captions = df['captions'].values

    pairs = [
        (piece, paths[row])
        for row in range(df.shape[0])
        for piece in re.split(r"' '", joined_captions[row])
    ]

    return pd.DataFrame({
        'captions': [piece for piece, _ in pairs],
        'filepath': [path for _, path in pairs],
    })

# the last stage: strip leftover punctuation and add sequence markers
def last_preprocessing(data):
    """Remove "'" and "." from each caption and wrap it in startseq/endseq.

    Args:
        data: DataFrame with a string 'captions' column.

    Returns:
        The same ``data`` object; its 'captions' column is overwritten in
        place.
    """
    stripped = data['captions'].apply(
        lambda text: text.replace("'", "").replace(".", "")
    )
    data['captions'] = "startseq " + stripped + " endseq"

    return data

In [None]:
import re

In [None]:
# Apply the three preprocessing stages to every split.
#
# last_preprocessing() overwrites the 'captions' column of the frame it is given
# and returns that same frame, so each `*_new` name refers to the same object as
# the matching `*_preprocessed` name. Later cells read `*_preprocessed` and rely
# on the startseq/endseq markers being present.

# Number of leading training rows used for fine-tuning.
TRAIN_ROWS = 5000

# .copy() hands the pipeline an independent frame, so nothing is written through
# a slice of the loaded data and re-running this cell starts from raw captions.
train_data_preprocessed = text_preprocessing(train_data.iloc[:TRAIN_ROWS, :].copy())
train_data_preprocessed = splitting_captions(train_data_preprocessed)
train_data_new = last_preprocessing(train_data_preprocessed)

valid_data_preprocessed = text_preprocessing(valid_data.copy())
valid_data_preprocessed = splitting_captions(valid_data_preprocessed)
valid_data_new = last_preprocessing(valid_data_preprocessed)

test_data_preprocessed = text_preprocessing(test_data.copy())
test_data_preprocessed = splitting_captions(test_data_preprocessed)
test_data_new = last_preprocessing(test_data_preprocessed)


print('train shape -> ', train_data_new.shape[0])
print('valid shape -> ', valid_data_new.shape[0])
print('test shape -> ', test_data_new.shape[0])

In [None]:
test_data_preprocessed

In [None]:
type(train_data_preprocessed)

In [None]:
# Notebook-level constants.
IMG_SIZE = 224
BATCH_SIZE = 64
SEED = 10
import tensorflow as tf
# NOTE(review): in the cells visible here, tensorflow is used only to define AUTO,
# and IMG_SIZE / BATCH_SIZE / AUTO are not read anywhere — confirm before removing.
AUTO = tf.data.AUTOTUNE

In [None]:
# an example of training caption
train_data_new.iloc[SEED].captions

In [None]:
import time
from datetime import timedelta
from torch.optim import AdamW

In [None]:
from tqdm import tqdm 
import itertools

In [None]:
# Fine-tune CLIP on (image, caption) pairs.
#
# For every batch, the model's image-to-text and text-to-image logits are each
# scored with cross-entropy against the diagonal (row i matches column i) and the
# two losses are averaged.

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

optimizer = AdamW(model.parameters(), lr=5e-6)

num_epochs = 5
batch_size = 64

# Fall back to the CPU instead of failing when no CUDA device is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.train()
model.to(device)
epoch_losses = []
for epoch in range(num_epochs):
    start_time = time.time()
    epoch_loss = 0.0
    num_batches = 0

    # Reshuffle every epoch. The frame has one row per caption with an image's
    # captions on consecutive rows; unshuffled batches would repeat the same
    # image, and the diagonal labels would treat those repeats as negatives.
    shuffled = train_data_preprocessed.sample(frac=1, random_state=SEED + epoch)

    for i in tqdm(range(0, len(shuffled), batch_size)):
        batch = shuffled.iloc[i:i+batch_size]  # Batch using iloc
        captions = batch["captions"].tolist()  # Extract captions

        # Load each image fully and close its file handle straight away.
        images = []
        for filepath in batch["filepath"]:
            with Image.open(filepath) as img:
                images.append(img.convert("RGB"))

        inputs = processor(text=captions, images=images, return_tensors="pt", padding=True , truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        # Similarity logits in both directions
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text

        # The last batch may be smaller than batch_size, hence len(images).
        labels = torch.arange(len(images), device=device)
        image_loss = torch.nn.functional.cross_entropy(logits_per_image, labels)
        text_loss = torch.nn.functional.cross_entropy(logits_per_text, labels)
        loss = (image_loss + text_loss) / 2

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        num_batches += 1

    # loss.item() is already a per-batch mean, so average over the number of
    # batches (dividing by the number of rows under-reports the loss).
    avg_epoch_loss = epoch_loss / max(num_batches, 1)
    end_time = time.time()
    epoch_duration = end_time - start_time
    epoch_losses.append(avg_epoch_loss)

    # timedelta renders as H:MM:SS, so no "seconds" suffix.
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_epoch_loss:.4f} - Duration: {timedelta(seconds=epoch_duration)}")
    

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One point per epoch, numbered from 1 to match the "Epoch n/N" lines printed
# by the training loop.
fig, ax = plt.subplots()
ax.plot(range(1, len(epoch_losses) + 1), epoch_losses, marker="o")
ax.set_xlabel("epoch")
ax.set_ylabel("average loss")
ax.set_title("CLIP fine-tuning loss per epoch")
plt.show()

In [None]:
# Persist the fine-tuned model and its processor under /kaggle/working.
# get_image_embeddings() reloads them from exactly these two paths, so the
# paths must be changed in both places together.
model.save_pretrained("/kaggle/working/floodnet-finetuned-clip-v1-model")
processor.save_pretrained("/kaggle/working/floodnet-finetuned-clip-v1-processor")

In [None]:
# NOTE(review): second copy of the same model/processor, written to a hardcoded
# absolute path under one user's home directory. This presumably only makes
# sense on that machine — confirm whether this cell should run on Kaggle at all,
# or replace the path with a configurable output directory.
model.save_pretrained("/Users/suhelkhan/Major_Project/CLIP_model/fined_tuned_CLIP_model")
processor.save_pretrained("/Users/suhelkhan/Major_Project/CLIP_model/fined_tuned_processor")

In [None]:
# # Evaluate prior to fine-tuning

# # Function to evaluate the model
# def evaluate(model, processor, data, batch_size=32):
#     labels = ["A satellite image of a flooded area of land.", "A satellite image of a non-flooded area of land."]
#     model.eval()
#     print(f"Using {device} device")
#     model.to(device)
#     correct = 0
#     total = 0
#     for batch in tqdm(list(batched(data, batch_size))):
#         captions = np.array([record.caption for record in batch])
#         images = [Image.open(record.image) for record in batch]
#         inputs = processor(text=labels, images=images, return_tensors="pt", padding=True)
#         inputs = {k: v.to(device) for k, v in inputs.items()}
#         with torch.no_grad():
#             outputs = model(**inputs)
#             logits_per_image = outputs.logits_per_image # this is the image-text similarity score
#             # probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

#             pred_idx = torch.argmax(logits_per_image, dim=1)
#             pred_labels = np.array([labels[pred] for pred in pred_idx])
#             correct += (pred_labels == captions).sum().item()
#             total += len(batch)

#     return correct / total

In [None]:
# accuracy_after_training = evaluate(model, processor, train_data_preprocessed)

In [None]:
# Show one randomly chosen test image whose caption equals the query exactly.
query_caption = "startseq the airport is very large endseq"
idx = np.where(test_data_preprocessed['captions'] == query_caption)[0]

if len(idx) == 0:
    # np.random.choice raises on an empty array, so report the miss instead.
    print(f"No test caption equals {query_caption!r}")
else:
    for image_idx in np.random.choice(idx, 1):
        # np.where returns positions, so index positionally with .iloc.
        with Image.open(test_data_preprocessed['filepath'].iloc[image_idx]) as img:
            display(img.resize((650,500)))

In [None]:
from torchvision.utils import make_grid
from torchvision.io import read_image 
from torchvision import transforms

In [None]:
def load_transform_images(image_paths):
    """Read image files into tensors resized to 240x240 with 3 channels.

    Args:
        image_paths: Iterable of image file paths accepted by ``read_image``.

    Returns:
        A list with one tensor per path, in input order.
    """
    resize_to_target = transforms.Resize((240, 240))

    loaded = []
    for path in image_paths:
        resized = resize_to_target(read_image(path))
        # expand(3, -1, -1) broadcasts a size-1 leading dimension to 3 and
        # leaves a leading dimension that is already 3 unchanged.
        loaded.append(resized.expand(3, -1, -1))
    return loaded

In [None]:
from functools import lru_cache


@lru_cache(maxsize=2)
def _load_finetuned_clip(model_path, processor_path):
    """Load the CLIP model (in eval mode) and processor once per path pair."""
    loaded_model = CLIPModel.from_pretrained(model_path)
    loaded_model.eval()
    loaded_processor = CLIPProcessor.from_pretrained(processor_path)
    return loaded_model, loaded_processor


def get_image_embeddings(
    image_paths,
    model_path="/kaggle/working/floodnet-finetuned-clip-v1-model",
    processor_path="/kaggle/working/floodnet-finetuned-clip-v1-processor",
):
    """Embed a batch of image files with the saved fine-tuned CLIP model.

    The model and processor are loaded from disk on the first call and then
    reused, instead of being reloaded for every batch. If the checkpoint at
    the same path is overwritten later, call
    ``_load_finetuned_clip.cache_clear()`` (or re-run this cell) to pick up
    the new weights.

    Args:
        image_paths: Iterable of image file paths, passed to
            ``load_transform_images``.
        model_path: Directory of the saved model.
        processor_path: Directory of the saved processor.

    Returns:
        Whatever ``get_image_features`` returns for the batch, computed under
        ``torch.no_grad()`` so no autograd graph is kept for the whole batch.
    """
    clip_model, clip_processor = _load_finetuned_clip(model_path, processor_path)

    images = load_transform_images(image_paths)
    inputs = clip_processor(images=images, return_tensors="pt")
    # Inference only: without no_grad the activations of the entire batch
    # would be retained for a backward pass that never happens.
    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**inputs)
    return image_embeddings

In [None]:
from PIL import Image
import torch
import numpy as np
from itertools import islice
from tqdm import tqdm

# Yield consecutive chunks of an iterable
def batched(iterable, batch_size):
    """Yield successive lists of up to ``batch_size`` items from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    chunk = list(islice(source, batch_size))
    while chunk:
        yield chunk
        chunk = list(islice(source, batch_size))

# One image path per row of the preprocessed test frame, in row order.
images = test_data_preprocessed["filepath"].tolist()

# Number of images embedded (and saved) per file
batch_size = 500
total_batches = int(np.ceil(len(images) / batch_size))

i = 0
embedding_files = []
progress = tqdm(batched(images, batch_size), total=total_batches, desc="Processing Batches")
for i, image_fps in enumerate(progress, start=1):
    print(f"Processing batch {i}/{total_batches}")

    # Embed this batch and write it to its own numbered file.
    file_name = f"/kaggle/working/images_embeddings_batch_{i}.pt"
    torch.save(get_image_embeddings(image_fps), file_name)
    embedding_files.append(file_name)

print("Embedding processing complete. Saved files:", embedding_files)

In [None]:
embedding_files


In [None]:
# Reload every per-batch file, in the order it was written, and concatenate the
# tensors into one.
image_embeddings = torch.cat([torch.load(fp) for fp in embedding_files])

In [None]:
image_embeddings.shape

In [None]:
# Memory held by the embedding tensor's elements, in kB:
# bytes per element x total number of elements. (Summing element_size() once
# per row only counts one element per row.)
image_embeddings.element_size() * image_embeddings.nelement() / 1e3  # kB

In [None]:
def rank_items(text_embedding: torch.Tensor, image_embeddings: torch.Tensor):
    """Score every image embedding against one text embedding.

    Args:
        text_embedding: A single query embedding, shaped ``(D,)`` or
            ``(1, D)``.
        image_embeddings: ``N`` image embeddings stacked along the first
            dimension; each item is flattened to length ``D``.

    Returns:
        A list of ``N`` Python floats: the cosine similarity between the
        query and each image, in input order.

    Raises:
        ValueError: If the query and the images have different lengths.
    """
    if image_embeddings.numel() == 0:
        return []

    # Flatten the query to exactly one row. The previous per-image loop
    # unsqueezed a (1, D) query to (1, 1, D) and reduced over a size-1 axis,
    # which produced the mean sign agreement of the components rather than a
    # cosine similarity.
    query = text_embedding.detach().float().cpu().reshape(1, -1)
    candidates = image_embeddings.detach().float().cpu()
    candidates = candidates.reshape(candidates.shape[0], -1)

    if query.shape[1] != candidates.shape[1]:
        raise ValueError(
            f"text embedding has {query.shape[1]} values but image embeddings "
            f"have {candidates.shape[1]}"
        )

    # One vectorised call: (1, D) broadcasts against (N, D), reducing over D.
    scores = torch.nn.functional.cosine_similarity(query, candidates, dim=1)
    return scores.tolist()

In [None]:
# Embed the query with the same fine-tuned checkpoint that produced
# `image_embeddings`. get_text_embedding() uses the base `clip_model`, whose
# embedding space no longer matches the fine-tuned image embeddings.
query_text = "startseq there are red buildings and trees endseq"
finetuned_model = CLIPModel.from_pretrained("/kaggle/working/floodnet-finetuned-clip-v1-model").eval()
finetuned_processor = CLIPProcessor.from_pretrained("/kaggle/working/floodnet-finetuned-clip-v1-processor")

with torch.no_grad():
    query_inputs = finetuned_processor(text=[query_text], return_tensors="pt", padding=True)
    query_embedding = finetuned_model.get_text_features(**query_inputs)

scores = rank_items(query_embedding, image_embeddings)

In [None]:
scores

In [None]:
np.argsort(scores)

In [None]:
# Scores from highest to lowest: argsort is ascending, hence the [::-1] reversal.
idx = np.argsort(scores)
np.array(scores)[idx][::-1]

In [None]:
# File paths in the same best-first order. Row k of test_data_preprocessed lines
# up with scores[k] because the embeddings were computed from
# test_data_preprocessed["filepath"] in row order.
idx = np.argsort(scores)
np.array(test_data_preprocessed['filepath'])[idx][::-1]

In [None]:
from IPython.display import display
from IPython.display import Image as DisplayImage

In [None]:
# Show the TOP_K best-matching distinct images.
TOP_K = 5
ranked_paths = np.array(images)[idx][::-1]

# `images` holds one path per caption row, so a file with several captions
# appears several times in the ranking. dict.fromkeys drops the repeats while
# keeping the best-first order.
for image in list(dict.fromkeys(ranked_paths))[:TOP_K]:
    display(DisplayImage(image, width="200px"))