# Download Fine Tune model - Dataset

In [3]:
import os
import cv2
import time
import gdown
import torch
import sqlite3
import zipfile
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, RobertaModel

In [4]:
VISION_ENCODER_URL = 'https://drive.google.com/uc?id=1Q5j4ZxDo_8b7xhWm1R8cyZlf4zi-EgLI'
TEXT_ENCODER_URL = 'https://drive.google.com/uc?id=1p4ogVQ5omTdCwnPmUh5avTMB58p0bw5V'

VISION_ENCODER_PATH = "final_vision_encoder.pt"
TEXT_ENCODER_PATH = "final_text_encoder.pt"

In [5]:
def download_model(url, output_path):
    print(f"Downloading model from: {url}")
    gdown.download(url, output_path, quiet=False)
    print(f"Model downloaded and saved to: {output_path}")

In [6]:
def load_finetuned_models(device):
    print("Loading fine-tuned models...")
    download_model(VISION_ENCODER_URL, VISION_ENCODER_PATH)
    download_model(TEXT_ENCODER_URL, TEXT_ENCODER_PATH)

    vision_encoder = torch.load(VISION_ENCODER_PATH, map_location=device)
    text_encoder = torch.load(TEXT_ENCODER_PATH, map_location=device)
    tokenizer = AutoTokenizer.from_pretrained('SajjadAyoubi/clip-fa-text')

    print("Models loaded successfully.")
    return vision_encoder, tokenizer, text_encoder

In [7]:
def download_and_extract_videos():
    print("Downloading video dataset...")
    url = 'https://drive.google.com/uc?id=1GYdaAsxRCqBI-N8KP1fIYOGmS_M3hxBW'
    output = 'videos.zip'
    gdown.download(url, output, quiet=False)
    print("Video dataset downloaded.")

    print("Extracting videos...")
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall('videos')
    print("Videos extracted.")

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
download_and_extract_videos()

Downloading video dataset...


Downloading...
From (original): https://drive.google.com/uc?id=1GYdaAsxRCqBI-N8KP1fIYOGmS_M3hxBW
From (redirected): https://drive.google.com/uc?id=1GYdaAsxRCqBI-N8KP1fIYOGmS_M3hxBW&confirm=t&uuid=3d42d5e9-141c-4792-be29-3084e3b2d915
To: /content/videos.zip
100%|██████████| 270M/270M [00:02<00:00, 92.7MB/s]


Video dataset downloaded.
Extracting videos...
Videos extracted.


In [9]:
vision_encoder, tokenizer, text_encoder = load_finetuned_models(device)

Loading fine-tuned models...
Downloading model from: https://drive.google.com/uc?id=1Q5j4ZxDo_8b7xhWm1R8cyZlf4zi-EgLI


Downloading...
From (original): https://drive.google.com/uc?id=1Q5j4ZxDo_8b7xhWm1R8cyZlf4zi-EgLI
From (redirected): https://drive.google.com/uc?id=1Q5j4ZxDo_8b7xhWm1R8cyZlf4zi-EgLI&confirm=t&uuid=4f231f04-5a38-4b72-9f4b-53efd36f3b88
To: /content/final_vision_encoder.pt
100%|██████████| 350M/350M [00:04<00:00, 76.5MB/s]


Model downloaded and saved to: final_vision_encoder.pt
Downloading model from: https://drive.google.com/uc?id=1p4ogVQ5omTdCwnPmUh5avTMB58p0bw5V


Downloading...
From (original): https://drive.google.com/uc?id=1p4ogVQ5omTdCwnPmUh5avTMB58p0bw5V
From (redirected): https://drive.google.com/uc?id=1p4ogVQ5omTdCwnPmUh5avTMB58p0bw5V&confirm=t&uuid=d234bf65-0d68-452d-a72f-57c3e46565f1
To: /content/final_text_encoder.pt
100%|██████████| 473M/473M [00:06<00:00, 76.7MB/s]
  vision_encoder = torch.load(VISION_ENCODER_PATH, map_location=device)


Model downloaded and saved to: final_text_encoder.pt


  text_encoder = torch.load(TEXT_ENCODER_PATH, map_location=device)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/354 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/875k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Models loaded successfully.


In [None]:
def extract_sampled_frames(video_path, vision_encoder, device, frame_step=5):
    print(f"🎥 Extracting frames from video: {video_path}")
    cap = cv2.VideoCapture(video_path)
    frame_embeddings = []
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_step == 0:
            resized_frame = cv2.resize(frame, (224, 224))
            image = Image.fromarray(cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB))
            image_tensor = torch.tensor(np.array(image) / 255.0).float().unsqueeze(0).to(device)
            image_tensor = image_tensor.permute(0, 3, 1, 2)

            with torch.no_grad():
                outputs = vision_encoder(image_tensor)

            image_embedding = outputs.pooler_output if hasattr(outputs, "pooler_output") else outputs.last_hidden_state.mean(dim=1)
            image_embedding = torch.nn.functional.normalize(image_embedding, p=2, dim=1)

            frame_embeddings.append(image_embedding.cpu().numpy())
        frame_count += 1
    cap.release()

    if frame_embeddings:
        video_embedding = np.mean(frame_embeddings, axis=0)
        video_embedding = video_embedding / np.linalg.norm(video_embedding)
    else:
        video_embedding = np.zeros((768,))
    print(f"✅ Frames extracted and normalized for: {video_path}")
    return video_embedding


In [None]:
def extract_text_features(text, tokenizer, text_encoder, device):
    print(f"📌 Extracting text features for: {text}")
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_embedding = text_encoder(**inputs).pooler_output.cpu().numpy()

    text_embedding = text_embedding / np.linalg.norm(text_embedding)

    print(f"✅ Text features extracted for: {text}")
    return text_embedding

In [None]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
def save_extract_feature():
    print("🚀 Starting feature extraction and saving process...")
    video_features = {}
    video_folder = 'videos/general'  
    for video_name in os.listdir(video_folder):
        if video_name.endswith(".mp4"):
            video_path = os.path.join(video_folder, video_name)
            video_embeddings = extract_sampled_frames(video_path, vision_encoder, device, frame_step=5)  
            video_features[video_name] = video_embeddings

    save_features_to_db(video_features)
    print("✅ Feature extraction and saving completed.")
    return video_features

In [None]:
def retrieve_all_video_similarities(caption, video_features, tokenizer, text_encoder, device):
    print(f"🔍 Retrieving similarities for caption: {caption}")
    caption_embedding = extract_text_features(caption, tokenizer, text_encoder, device)

    print(f"✅ Caption embedding shape: {caption_embedding.shape}")

    caption_embedding = caption_embedding.flatten()

    similarities = []
    for video_name, features in video_features.items():
        features = np.array(features).flatten()
        features = features[:768]
        features = features / np.linalg.norm(features)

        print(f"🧐 Comparing: {video_name} | Caption Shape: {caption_embedding.shape} | Video Shape: {features.shape}")

        similarity = cosine_similarity(caption_embedding, features)
        similarities.append((video_name, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)
    print(f"✅ Similarities retrieved for caption: {caption}")

    return similarities

In [33]:
video_features = save_extract_feature()

🚀 Starting feature extraction and saving process...
🎥 Extracting frames from video: videos/general/generalVid9.mp4
✅ Frames extracted and normalized for: videos/general/generalVid9.mp4
🎥 Extracting frames from video: videos/general/generalVid13.mp4
✅ Frames extracted and normalized for: videos/general/generalVid13.mp4
🎥 Extracting frames from video: videos/general/generalVid12.mp4
✅ Frames extracted and normalized for: videos/general/generalVid12.mp4
🎥 Extracting frames from video: videos/general/generalVid7.mp4
✅ Frames extracted and normalized for: videos/general/generalVid7.mp4
🎥 Extracting frames from video: videos/general/generalVid1.mp4
✅ Frames extracted and normalized for: videos/general/generalVid1.mp4
🎥 Extracting frames from video: videos/general/generalVid5.mp4
✅ Frames extracted and normalized for: videos/general/generalVid5.mp4
🎥 Extracting frames from video: videos/general/generalVid2.mp4
✅ Frames extracted and normalized for: videos/general/generalVid2.mp4
🎥 Extracting 

In [35]:
caption = "پیانو روی صحنه"
all_similarities = retrieve_all_video_similarities(caption, video_features, tokenizer, text_encoder, device)

for video_name, similarity in all_similarities:
    print(f"🎬 Video: {video_name} | 🔥 Similarity: {similarity:.4f}")

🔍 Retrieving similarities for caption: پیانو روی صحنه
📌 Extracting text features for: پیانو روی صحنه
✅ Text features extracted for: پیانو روی صحنه
✅ Caption embedding shape: (1, 768)
🧐 Comparing: generalVid9.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid13.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid12.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid7.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid1.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid5.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid2.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid4.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid6.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid3.mp4 | Caption Shape: (768,) | Video Shape: (768,)
🧐 Comparing: generalVid8.mp4 | Caption Shape: (768,) | Video Shap