# V2

In [None]:
import cv2
import os
from tqdm import tqdm
from google.colab import drive

# Source folder on the mounted Drive that is scanned for .mp4 clips.
video_folder = "/content/drive/My Drive/msrvtt_dataset/TrainValVideo"
# Destination folder for the extracted .jpg frames; created up front if it is missing.
output_frame_folder = "/content/drive/My Drive/ExtractedFrames_2"
os.makedirs(output_frame_folder, exist_ok=True)

def extract_frames(video_path, output_path_template, fractions=(0.25, 0.5, 0.75)):
    """Save one frame per requested position of a video as an image file.

    Args:
        video_path: Path of the video to read.
        output_path_template: ``str.format`` template with a single ``{}``
            placeholder; it receives the 1-based frame number (1, 2, 3, ...)
            and must produce the output image path.
        fractions: Relative positions in the video (0.0 = start, 1.0 = end)
            at which a frame is grabbed. Defaults to the quarter, middle and
            three-quarter points.

    Returns:
        None. Every problem (unreadable video, unknown frame count, failed
        read or write, unexpected exception) is reported with ``print`` so a
        long batch run keeps going.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Cannot open video file: {video_path}")
            return

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            print(f"Error: Cannot determine frame count for video: {video_path}")
            return

        for frame_num, fraction in enumerate(fractions, start=1):
            # Clamp so that rounding can never seek outside [0, frame_count - 1].
            idx = min(max(int(frame_count * fraction), 0), frame_count - 1)

            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                print(f"Error: Failed to read frame {idx} from video: {video_path}")
                continue

            output_path = output_path_template.format(frame_num)
            # imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(output_path, frame):
                print(f"Error: Failed to write frame {idx} to: {output_path}")

    except Exception as e:
        print(f"Exception occurred while processing {video_path}: {e}")
    finally:
        # Release the capture on every path, including exceptions raised mid-loop.
        if cap is not None:
            cap.release()

# Walk every .mp4 in the source folder and extract its frames.
if not os.path.exists(video_folder):
    print(f"Error: The folder {video_folder} does not exist. Please check the path.")
else:
    # Sorted so the processing order is stable between runs.
    video_files = sorted(name for name in os.listdir(video_folder) if name.endswith(".mp4"))

    for video_file in tqdm(video_files, desc="Extracting Frames"):
        # The file stem doubles as the video ID used in the output filenames.
        video_id = os.path.splitext(video_file)[0]
        video_path = os.path.join(video_folder, video_file)
        # "{{}}" leaves a literal "{}" for extract_frames to fill with the frame number.
        output_path_template = os.path.join(output_frame_folder, f"{video_id}_frame_{{}}.jpg")

        extract_frames(video_path, output_path_template)

    print(f"Frames extracted and saved to: {output_frame_folder}")

Extracting Frames: 100%|██████████| 7010/7010 [30:32<00:00,  3.82it/s]

Frames extracted and saved to: /content/drive/My Drive/ExtractedFrames_2





In [None]:
# List the Drive root by absolute path so the result does not depend on the
# kernel's current working directory.
os.listdir("/content/drive/My Drive")


['Colab Notebooks',
 'ExtractedFrames',
 'FarsiCaption',
 'msrvtt_dataset',
 'ExtractedFrames_2',
 'saved_models']

# Train and Test Splits for Translated Captions

In [None]:
import os
# Make the caption folder the current working directory (affects every later relative path).
os.chdir('/content/drive/My Drive/FarsiCaption')


In [None]:
# Shell escape: list the files in the caption folder.
!ls /content/drive/My\ Drive/FarsiCaption


msrvtt_test_fa.csv  msrvtt_train_fa.csv  test.csv  train.csv


In [None]:
import pandas as pd
import os


In [None]:
# Absolute paths of the test and train caption CSV files on the mounted Drive.
test_file_path = '/content/drive/My Drive/FarsiCaption/test.csv'
train_file_path = '/content/drive/My Drive/FarsiCaption/train.csv'


In [None]:
# Read both caption splits into DataFrames (test first, then train).
test_df, train_df = (
    pd.read_csv(csv_path) for csv_path in (test_file_path, train_file_path)
)


In [None]:
print("Test Dataset:")
print(test_df.head())
print("\nTest Dataset Structure:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
test_df.info()


Test Dataset:
    video_id                                            caption  sen_id  \
0  video6513                     a family is having coversation   83560   
1  video6514  a girl is surfing and a guy is riding a bike o...   28980   
2  video6515  a man cuts a bell pepper and describes how to ...   98260   
3  video6516  a before treatment and after treatment of a ma...   25540   
4  video6517        a girl and man look at the sky in amazement   41200   

   category                                          url  start time  \
0        14  https://www.youtube.com/watch?v=A9pM9iOuAzM      116.03   
1         3  https://www.youtube.com/watch?v=Vs0QoCtvJmk      479.13   
2        17  https://www.youtube.com/watch?v=xQyZYPZT0tI      153.93   
3        12  https://www.youtube.com/watch?v=BFLLzIhKLBs      297.60   
4         7  https://www.youtube.com/watch?v=4eAgsrEX_CA       96.44   

   end time     split    id  __index_level_0__  \
0    126.21  validate  6513               6128   
1 

In [None]:
print("Train Dataset:")
print(train_df.head())
print("\nTrain Dataset Structure:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()


Train Dataset:
    video_id                                            caption  sen_id  \
0     video0                                     a car is shown   77300   
1     video1  in a kitchen a woman adds different ingredient...  110460   
2    video10                               a man holds two dogs   47320   
3   video100                 a basset hound sits outside a door   18360   
4  video1000                       a woman is wearing a costume   49000   

   category                                          url  start time  \
0         9  https://www.youtube.com/watch?v=9lZi22qLlEo      137.72   
1        16  https://www.youtube.com/watch?v=w4JM08PDEng      184.33   
2         6  https://www.youtube.com/watch?v=CcJwo2eyfI0       33.33   
3        12  https://www.youtube.com/watch?v=6S-47swQBBU     1146.06   
4         7  https://www.youtube.com/watch?v=ALrHNDBK-jw      738.93   

   end time  split    id  __index_level_0__  \
0    149.44  train     0                  0   
1    20

In [None]:
# Show the column index of each split, test first.
for split_label, split_frame in (("Test", test_df), ("Train", train_df)):
    print(f"{split_label} Dataset Columns:", split_frame.columns)


Test Dataset Columns: Index(['video_id', 'caption', 'sen_id', 'category', 'url', 'start time',
       'end time', 'split', 'id', '__index_level_0__', 'caption_farsi'],
      dtype='object')
Train Dataset Columns: Index(['video_id', 'caption', 'sen_id', 'category', 'url', 'start time',
       'end time', 'split', 'id', '__index_level_0__', 'caption_farsi'],
      dtype='object')


In [None]:
# Show (rows, columns) of each split, test first.
for split_label, split_frame in (("Test", test_df), ("Train", train_df)):
    print(f"{split_label} Dataset Shape:", split_frame.shape)


Test Dataset Shape: (497, 11)
Train Dataset Shape: (6513, 11)


In [None]:
# Make the extracted-frames folder the current working directory (affects every later relative path).
os.chdir('/content/drive/My Drive/ExtractedFrames_2')


In [None]:
# List the extracted frames by absolute path (independent of the current
# working directory) and print only a count plus a short sample: the folder
# holds several frames per video, far too many names to dump in full.
frames_dir = '/content/drive/My Drive/ExtractedFrames_2'
files = os.listdir(frames_dir)
print(f"{len(files)} files in {frames_dir}; first 10: {files[:10]}")


['video7006_frame_3.jpg', 'video7007_frame_1.jpg', 'video7007_frame_2.jpg', 'video7007_frame_3.jpg', 'video7008_frame_1.jpg', 'video7008_frame_2.jpg', 'video7008_frame_3.jpg', 'video7009_frame_1.jpg', 'video7009_frame_2.jpg', 'video7009_frame_3.jpg', 'video701_frame_1.jpg', 'video701_frame_2.jpg', 'video701_frame_3.jpg', 'video702_frame_1.jpg', 'video702_frame_2.jpg', 'video702_frame_3.jpg', 'video703_frame_1.jpg', 'video703_frame_2.jpg', 'video703_frame_3.jpg', 'video704_frame_1.jpg', 'video704_frame_2.jpg', 'video704_frame_3.jpg', 'video705_frame_1.jpg', 'video705_frame_2.jpg', 'video705_frame_3.jpg', 'video706_frame_1.jpg', 'video706_frame_2.jpg', 'video706_frame_3.jpg', 'video707_frame_1.jpg', 'video707_frame_2.jpg', 'video707_frame_3.jpg', 'video708_frame_1.jpg', 'video708_frame_2.jpg', 'video708_frame_3.jpg', 'video709_frame_1.jpg', 'video709_frame_2.jpg', 'video709_frame_3.jpg', 'video71_frame_1.jpg', 'video71_frame_2.jpg', 'video71_frame_3.jpg', 'video710_frame_1.jpg', 'video71

In [None]:
!pip install transformers==4.33.0 --quiet
!pip install torch torchvision torchaudio --quiet
!pip install tqdm --quiet

import os
import torch
import pandas as pd
from PIL import Image
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from IPython.display import display  # for inline image display in Colab

from torch.utils.data import Dataset, DataLoader
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPProcessor,
    CLIPModel
)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.3.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.33.0 which is incompatible.[0m[31m
[0m

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [None]:
from google.colab import drive

# (Re)mount Google Drive before any path under it is used.
drive.mount('/content/drive', force_remount=True)

# Root of the mounted Drive; adjust to match your folder structure.
BASE_PATH = "/content/drive/My Drive"

# Caption CSVs of the two splits, both inside the FarsiCaption folder.
TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_PATH, "FarsiCaption", csv_name)
    for csv_name in ("train.csv", "test.csv")
)

# Folder containing the extracted frames, 3 per video, named like
# "video6513_frame_1.jpg", "video6513_frame_2.jpg", "video6513_frame_3.jpg".
FRAMES_FOLDER = os.path.join(BASE_PATH, "ExtractedFrames_2")


Mounted at /content/drive


In [None]:
# Reload both caption splits from the configured CSV paths.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

_splits = (("Train", train_df), ("Test", test_df))

# Shapes first ...
for _label, _frame in _splits:
    print(f"{_label} DataFrame shape:", _frame.shape)

# ... then the first three rows of each split ...
for _label, _frame in _splits:
    print(f"\n--- {_label} sample rows ---")
    print(_frame.head(3))

# ... and finally the column names.
print("\nTrain columns:", train_df.columns)
print("Test columns:", test_df.columns)


Train DataFrame shape: (6513, 11)
Test DataFrame shape: (497, 11)

--- Train sample rows ---
  video_id                                            caption  sen_id  \
0   video0                                     a car is shown   77300   
1   video1  in a kitchen a woman adds different ingredient...  110460   
2  video10                               a man holds two dogs   47320   

   category                                          url  start time  \
0         9  https://www.youtube.com/watch?v=9lZi22qLlEo      137.72   
1        16  https://www.youtube.com/watch?v=w4JM08PDEng      184.33   
2         6  https://www.youtube.com/watch?v=CcJwo2eyfI0       33.33   

   end time  split  id  __index_level_0__  \
0    149.44  train   0                  0   
1    206.89  train   1                  1   
2     46.53  train  10                  2   

                                       caption_farsi  
0                         یک ماشین نشان داده شده است  
1  در یک آشپزخانه یک زن مواد مختلف

In [None]:
# Upper bounds on how many rows of each split are used; a bound larger than
# the split simply keeps every row.
MAX_TRAIN_SAMPLES = 5000  # Adjust as needed (1..6513 or more)
MAX_TEST_SAMPLES  = 400   # Adjust as needed (1..497 or more)

# Keep only the leading rows of each split.
train_df = train_df.iloc[:MAX_TRAIN_SAMPLES]
test_df  = test_df.iloc[:MAX_TEST_SAMPLES]

n_train_rows, n_test_rows = len(train_df), len(test_df)
print(f"\nUsing {n_train_rows} train samples and {n_test_rows} test samples.")



Using 5000 train samples and 400 test samples.


In [None]:
# Hub checkpoints: CLIP4Clip for the text side, standard CLIP for the image side.
_TEXT_CHECKPOINT = "Searchium-ai/clip4clip-webvid150k"
_IMAGE_CHECKPOINT = "openai/clip-vit-base-patch32"

# Text side: tokenizer plus text encoder with projection head, moved to `device`.
text_tokenizer = CLIPTokenizer.from_pretrained(_TEXT_CHECKPOINT)
text_model = CLIPTextModelWithProjection.from_pretrained(_TEXT_CHECKPOINT)
text_model = text_model.to(device)

# Image side: preprocessor plus CLIP model, moved to `device`.
image_processor = CLIPProcessor.from_pretrained(_IMAGE_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(_IMAGE_CHECKPOINT)
clip_model = clip_model.to(device)

print("Models loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location=map_location)


Models loaded successfully!


In [None]:
import random

class MSRVTTMultipleFrameDataset(Dataset):
    """Pairs each caption row with the path of one extracted frame of its video.

    Every record needs a ``video_id`` and a ``caption_farsi`` entry. Frames are
    looked up in ``frames_folder`` under the names
    ``<video_id>_frame_1.jpg``, ``<video_id>_frame_2.jpg`` and
    ``<video_id>_frame_3.jpg``.

    Args:
        df: DataFrame with at least the columns ``video_id`` and ``caption_farsi``.
        frames_folder: Directory containing the extracted frame images.
        mode: ``"train"`` picks one existing frame at random on every access;
            any other value (e.g. ``"test"``) always picks the middle existing
            frame so that evaluation is reproducible.
    """
    def __init__(self, df, frames_folder, mode="train"):
        self.records = df.to_dict("records")
        self.frames_folder = frames_folder
        self.mode = mode  # "train" or "test"

    def __len__(self):
        """Return the number of caption rows."""
        return len(self.records)

    def __getitem__(self, idx):
        """Return ``(caption, frame_path)`` for row ``idx``.

        ``frame_path`` is the string ``"blank"`` when none of the three frames
        exists on disk; the consumer is expected to substitute a placeholder
        image in that case.
        """
        row = self.records[idx]
        video_id = row["video_id"]           # e.g., "video6513"
        caption_fa = row["caption_farsi"]    # Persian caption

        # Keep only the frames that were actually extracted for this video.
        candidate_paths = []
        for i in (1, 2, 3):
            frame_path = os.path.join(self.frames_folder, f"{video_id}_frame_{i}.jpg")
            if os.path.exists(frame_path):
                candidate_paths.append(frame_path)

        if not candidate_paths:
            # No frame on disk: hand back the sentinel instead of raising.
            return caption_fa, "blank"

        if self.mode == "train":
            # Random frame per access acts as light augmentation during training.
            chosen_path = random.choice(candidate_paths)
        else:
            # Deterministic choice for test/eval: the middle available frame
            # (frame 2 when all three exist).
            chosen_path = candidate_paths[len(candidate_paths) // 2]

        return caption_fa, chosen_path


In [None]:
BATCH_SIZE = 8  # Adjust based on GPU memory

# One dataset per split; both read frames from the same folder.
train_dataset = MSRVTTMultipleFrameDataset(df=train_df, frames_folder=FRAMES_FOLDER, mode="train")
test_dataset = MSRVTTMultipleFrameDataset(df=test_df, frames_folder=FRAMES_FOLDER, mode="test")

# Shuffle only the training split; the test split keeps its row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoaders created!")


DataLoaders created!


In [None]:
def clip_style_contrastive_loss(text_embeds, image_embeds, temperature=0.07):
    """Symmetric cross-entropy loss over a batch of paired embeddings.

    Args:
        text_embeds: (N, D) tensor, expected to be L2-normalized.
        image_embeds: (N, D) tensor, expected to be L2-normalized.
        temperature: Divisor applied to the similarity matrix.

    Returns:
        Scalar tensor: the mean of the text->image and image->text
        cross-entropy losses, where row i of one side is matched with
        row i of the other.
    """
    # (N, N) pairwise similarities, scaled by the temperature.
    similarity = (text_embeds @ image_embeds.t()) / temperature

    # The matching pair of row i sits on the diagonal, i.e. class index i.
    targets = torch.arange(similarity.shape[0], device=similarity.device)

    text_to_image = nn.functional.cross_entropy(similarity, targets)
    image_to_text = nn.functional.cross_entropy(similarity.t(), targets)

    return (text_to_image + image_to_text) / 2.0


In [None]:
def set_requires_grad(model, value):
    """Set ``requires_grad`` to ``value`` on every parameter of ``model``.

    Passing ``False`` freezes the model; passing ``True`` makes it trainable
    again. Returns None.
    """
    for weight in model.parameters():
        weight.requires_grad = value


In [None]:
# Optionally freeze part of the model. For example, freeze the image side:
# set_requires_grad(clip_model, False)

def _load_frame(frame_path):
    """Return the frame stored at ``frame_path`` as an RGB PIL image.

    The sentinel path "blank" yields a black 224x224 placeholder. Real files
    are opened in a ``with`` block so the file handle is closed right after
    the pixels have been copied by ``convert``.
    """
    if frame_path == "blank":
        # fallback black image if no frame
        return Image.new("RGB", (224, 224), color="black")
    with Image.open(frame_path) as img:
        return img.convert("RGB")


# Combine parameters of text_model & clip_model
all_params = list(text_model.parameters()) + list(clip_model.parameters())

optimizer = torch.optim.AdamW(all_params, lr=1e-5)
EPOCHS = 5  # Try more epochs (e.g., 5-10) for better results

for epoch in range(EPOCHS):
    text_model.train()
    clip_model.train()

    total_loss = 0.0
    step_count = 0

    for captions, frame_paths in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{EPOCHS}"):
        # Tokenize the captions and move the tensors to the device
        inputs = text_tokenizer(
            text=captions,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        ).to(device)

        # Forward pass text
        text_out = text_model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
        text_embeds = text_out[0]  # shape (batch_size, embed_dim)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        # Load one image per caption, in the same order as the captions
        batch_images = [_load_frame(fp) for fp in frame_paths]

        # Convert images to tensors and embed them
        image_inputs = image_processor(images=batch_images, return_tensors="pt").to(device)
        image_features = clip_model.get_image_features(**image_inputs)
        image_embeds = image_features / image_features.norm(dim=-1, keepdim=True)

        # Compute contrastive loss
        loss = clip_style_contrastive_loss(text_embeds, image_embeds, temperature=0.07)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        step_count += 1

    # Guard against an empty loader so the epoch summary never divides by zero.
    avg_loss = total_loss / step_count if step_count > 0 else 0.0
    print(f"[Epoch {epoch+1}] Average Training Loss: {avg_loss:.4f}")

print("\nFine-tuning completed!")


Training Epoch 1/5: 100%|██████████| 625/625 [28:54<00:00,  2.78s/it]


[Epoch 1] Average Training Loss: 1.8091


Training Epoch 2/5: 100%|██████████| 625/625 [19:44<00:00,  1.90s/it]


[Epoch 2] Average Training Loss: 1.3884


Training Epoch 3/5: 100%|██████████| 625/625 [14:06<00:00,  1.35s/it]


[Epoch 3] Average Training Loss: 1.0875


Training Epoch 4/5: 100%|██████████| 625/625 [10:21<00:00,  1.00it/s]


[Epoch 4] Average Training Loss: 0.8290


Training Epoch 5/5: 100%|██████████| 625/625 [07:34<00:00,  1.37it/s]

[Epoch 5] Average Training Loss: 0.6041

Fine-tuning completed!





# Save the fine-tuned models

In [None]:
import torch

# Define the save path in your Google Drive
save_path = "/content/drive/MyDrive/saved_models/fine_tuned_model.pth"  # Adjust the path as needed

# Create the target folder first: torch.save does not create missing
# directories, and failing here would lose the whole training run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save the state dictionaries of both models in a single checkpoint file
torch.save({
    'text_model_state_dict': text_model.state_dict(),
    'clip_model_state_dict': clip_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),  # Optionally save the optimizer state
}, save_path)

print(f"Fine-tuned models saved to {save_path}")