In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# **ANIMOV-512x Fine-tuned Model**

In [3]:
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import torch.nn.functional as F
import os

# Define function to combine frames in a single horizontal row
def combine_frames_horizontally(frames):
    return np.hstack(frames)

# Specify the model folder, video path, and prompt
model_folder = "/content/drive/MyDrive/CLIPSIM/FinetunedModel/ANIMOV-512x"  # Updated path for Fine-tuned Model
video_path = os.path.join(model_folder, "Data.mp4")
prompt = "man doing pushups"

# Initialize frame capture
cap = cv2.VideoCapture(video_path)
frames = []
frame_count = 0

# Check if the video was opened successfully
if not cap.isOpened():
    print("Error opening video file")

# Read frames until video ends or reach desired count
while cap.isOpened() and frame_count < 100:
    ret, frame = cap.read()
    if ret:
        resized_frame = cv2.resize(frame, (256, 256))  # Resize frames to keep them manageable
        frames.append(resized_frame)
        frame_count += 1
    else:
        break

cap.release()

# Combine frames into a single horizontal image and save it in the model's folder
combined_image_path = os.path.join(model_folder, "combined_frames_output.jpg")
if len(frames) > 0:
    combined_image = combine_frames_horizontally(frames[:100])  # Use the first 100 frames
    cv2.imwrite(combined_image_path, combined_image)
    print(f'Combined image of multiple frames saved successfully in {combined_image_path}')
else:
    print('No frames to combine.')

# CLIP model for CLIPSIM score calculation
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Check if the combined image was created before using it
if os.path.exists(combined_image_path):
    image = Image.open(combined_image_path)
    inputs = processor(text=[prompt], images=image, return_tensors="pt", padding=True)

    # Model inference to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings and calculate cosine similarity
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    clip_score = torch.matmul(image_embeds, text_embeds.T).item()

    print(f"CLIP Score for (ANIMOV-512x Fine-tuned Model): {clip_score}")
else:
    print("Combined image was not created due to insufficient frames.")


Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/ANIMOV-512x/combined_frames_output.jpg
CLIP Score for (ANIMOV-512x Fine-tuned Model): 0.2550389766693115


# **POTAT1 Fine-tuned Model**

In [4]:
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import torch.nn.functional as F
import os

# Define function to combine frames in a single horizontal row
def combine_frames_horizontally(frames):
    return np.hstack(frames)

# Specify the model folder, video path, and prompt for the Fine-tuned Model
model_folder = "/content/drive/MyDrive/CLIPSIM/FinetunedModel/POTAT1"  # Fine-tuned model path
video_path = os.path.join(model_folder, "Data.mp4")
prompt = "man doing pushups"

# Initialize frame capture
cap = cv2.VideoCapture(video_path)
frames = []
frame_count = 0

# Check if the video was opened successfully
if not cap.isOpened():
    print("Error opening video file")

# Read frames until video ends or reach desired count
while cap.isOpened() and frame_count < 100:
    ret, frame = cap.read()
    if ret:
        resized_frame = cv2.resize(frame, (256, 256))  # Resize frames to keep them manageable
        frames.append(resized_frame)
        frame_count += 1
    else:
        break

cap.release()

# Combine frames into a single horizontal image and save it in the model's folder
combined_image_path = os.path.join(model_folder, "combined_frames_output.jpg")
if len(frames) > 0:
    combined_image = combine_frames_horizontally(frames[:100])  # Use the first 100 frames
    cv2.imwrite(combined_image_path, combined_image)
    print(f'Combined image of multiple frames saved successfully in {combined_image_path}')
else:
    print('No frames to combine.')

# CLIP model for CLIPSIM score calculation
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Check if the combined image was created before using it
if os.path.exists(combined_image_path):
    image = Image.open(combined_image_path)
    inputs = processor(text=[prompt], images=image, return_tensors="pt", padding=True)

    # Model inference to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings and calculate cosine similarity
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    clip_score = torch.matmul(image_embeds, text_embeds.T).item()

    print(f"CLIP Score for (POTAT1 Fine-tuned Model): {clip_score}")
else:
    print("Combined image was not created due to insufficient frames.")


Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/POTAT1/combined_frames_output.jpg
CLIP Score for (POTAT1 Fine-tuned Model): 0.2913568615913391


# **ZEROSCOPE Fine-tuned Model**

In [5]:
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import torch.nn.functional as F
import os

# Define function to combine frames in a single horizontal row
def combine_frames_horizontally(frames):
    return np.hstack(frames)

# Specify the model folder, video path, and prompt for the Fine-tuned Model
model_folder = "/content/drive/MyDrive/CLIPSIM/FinetunedModel/ZEROSCOPE"  # Fine-tuned model path
video_path = os.path.join(model_folder, "Data.mp4")
prompt = "Man doing pushups"

# Initialize frame capture
cap = cv2.VideoCapture(video_path)
frames = []
frame_count = 0

# Check if the video was opened successfully
if not cap.isOpened():
    print("Error opening video file")

# Read frames until video ends or reach desired count
while cap.isOpened() and frame_count < 100:
    ret, frame = cap.read()
    if ret:
        resized_frame = cv2.resize(frame, (256, 256))  # Resize frames to keep them manageable
        frames.append(resized_frame)
        frame_count += 1
    else:
        break

cap.release()

# Combine frames into a single horizontal image and save it in the model's folder
combined_image_path = os.path.join(model_folder, "combined_frames_output.jpg")

if len(frames) > 0:
    combined_image = combine_frames_horizontally(frames[:100])  # Use the first 100 frames
    cv2.imwrite(combined_image_path, combined_image)
    print(f'Combined image of multiple frames saved successfully in {combined_image_path}')
else:
    print('No frames to combine.')

# CLIP model for CLIPSIM score calculation
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Check if the combined image was created before using it
if os.path.exists(combined_image_path):
    image = Image.open(combined_image_path)
    inputs = processor(text=[prompt], images=image, return_tensors="pt", padding=True)

    # Model inference to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings and calculate cosine similarity
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    clip_score = torch.matmul(image_embeds, text_embeds.T).item()

    print(f"CLIP Score for (ZEROSCOPE Fine-tuned Model): {clip_score}")
else:
    print("Combined image was not created due to insufficient frames.")


Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/ZEROSCOPE/combined_frames_output.jpg
CLIP Score for (ZEROSCOPE Fine-tuned Model): 0.33747565746307373


# **DAMO VILAB Fine-tuned Model**

In [6]:
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import torch.nn.functional as F
import os

# Define function to combine frames in a single horizontal row
def combine_frames_horizontally(frames):
    return np.hstack(frames)

# Specify the model folder, video path, and prompt for the Fine-tuned Model
model_folder = "/content/drive/MyDrive/CLIPSIM/FinetunedModel/DAMO VILAB"  # Fine-tuned model path
video_path = os.path.join(model_folder, "Data.mp4")
prompt = "Man doing pushups"

# Initialize frame capture
cap = cv2.VideoCapture(video_path)
frames = []
frame_count = 0

# Check if the video was opened successfully
if not cap.isOpened():
    print("Error opening video file")

# Read frames until video ends or reach desired count
while cap.isOpened() and frame_count < 100:
    ret, frame = cap.read()
    if ret:
        resized_frame = cv2.resize(frame, (256, 256))  # Resize frames to keep them manageable
        frames.append(resized_frame)
        frame_count += 1
    else:
        break

cap.release()

# Combine frames into a single horizontal image and save it in the model's folder
combined_image_path = os.path.join(model_folder, "combined_frames_output.jpg")
if len(frames) > 0:
    combined_image = combine_frames_horizontally(frames[:100])  # Use the first 100 frames
    cv2.imwrite(combined_image_path, combined_image)
    print(f'Combined image of multiple frames saved successfully in {combined_image_path}')
else:
    print('No frames to combine.')

# CLIP model for CLIPSIM score calculation
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Check if the combined image was created before using it
if os.path.exists(combined_image_path):
    image = Image.open(combined_image_path)
    inputs = processor(text=[prompt], images=image, return_tensors="pt", padding=True)

    # Model inference to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings and calculate cosine similarity
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    clip_score = torch.matmul(image_embeds, text_embeds.T).item()

    print(f"CLIP Score for (DAMO VILAB Fine-tuned Model): {clip_score}")
else:
    print("Combined image was not created due to insufficient frames.")


Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/DAMO VILAB/combined_frames_output.jpg
CLIP Score for (DAMO VILAB Fine-tuned Model): 0.304095059633255


# **CLIPSim Score Calculation for All Models **


In [7]:
import cv2
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import torch.nn.functional as F
from pathlib import Path
import os
import csv

# Load pre-trained CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function to combine frames in a single horizontal row
def combine_frames_horizontally(frames):
    return np.hstack(frames)

# Function to extract 100 frames and save combined frames image
def extract_and_combine_frames(video_path, combined_image_path):
    cap = cv2.VideoCapture(str(video_path))
    frames = []
    frame_count = 0

    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return None

    # Read frames until video ends or reach desired count
    while cap.isOpened() and frame_count < 100:
        ret, frame = cap.read()
        if ret:
            resized_frame = cv2.resize(frame, (256, 256))  # Resize frames to keep them manageable
            frames.append(resized_frame)
            frame_count += 1
        else:
            break

    cap.release()

    # Combine frames into a single horizontal image and save
    if len(frames) > 0:
        combined_image = combine_frames_horizontally(frames[:100])  # Use the first 100 frames
        cv2.imwrite(combined_image_path, combined_image)
        print(f'Combined image of multiple frames saved successfully in {combined_image_path}')
        return combined_image_path
    else:
        print(f'No frames to combine for {video_path}')
        return None

# Function to calculate CLIP similarity for combined image and text prompt
def calculate_clipsim(combined_image_path, text_prompt):
    if os.path.exists(combined_image_path):
        image = Image.open(combined_image_path)
        inputs = processor(text=[text_prompt], images=image, return_tensors="pt", padding=True).to(device)

        # Model inference to get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            image_embeds = outputs.image_embeds
            text_embeds = outputs.text_embeds

        # Normalize embeddings and calculate cosine similarity
        image_embeds = F.normalize(image_embeds, p=2, dim=-1)
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
        clip_score = torch.matmul(image_embeds, text_embeds.T).item()

        return clip_score
    else:
        print(f"Combined image not found at {combined_image_path}")
        return None

# Model directories and prompts
model_directories = {
    "ANIMOV-512x": "man doing pushups",
    "POTAT1": "man doing pushups",
    "ZEROSCOPE": "man doing pushups",
    "DAMO VILAB": "man doing pushups"
}

# Directory containing model folders
base_directory = '/content/drive/MyDrive/CLIPSIM/FinetunedModel'
results = []

# Iterate over each model folder and calculate CLIPSim score
for model_name, prompt in model_directories.items():
    model_folder = os.path.join(base_directory, model_name)
    video_path = os.path.join(model_folder, "Data.mp4")  # Ensure each model folder has "Data.mp4"
    combined_image_path = os.path.join(model_folder, "combined_frames_output.jpg")

    # Extract frames and save combined image
    combined_image = extract_and_combine_frames(video_path, combined_image_path)

    if combined_image:
        # Calculate CLIPSim score
        clipsim_score = calculate_clipsim(combined_image, prompt)
        if clipsim_score is not None:
            results.append({"model": model_name, "CLIPSim_score": clipsim_score})
            print(f"CLIPSim score for {model_name} (Fine-tuned Model): {clipsim_score:.4f}")

# Save results to a CSV file
output_csv_path = '/content/drive/MyDrive/CLIPSim_scores_finetuned_models.csv'
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["model", "CLIPSim_score"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")


Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/ANIMOV-512x/combined_frames_output.jpg
CLIPSim score for ANIMOV-512x (Fine-tuned Model): 0.2550
Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/POTAT1/combined_frames_output.jpg
CLIPSim score for POTAT1 (Fine-tuned Model): 0.2914
Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/ZEROSCOPE/combined_frames_output.jpg
CLIPSim score for ZEROSCOPE (Fine-tuned Model): 0.3375
Combined image of multiple frames saved successfully in /content/drive/MyDrive/CLIPSIM/FinetunedModel/DAMO VILAB/combined_frames_output.jpg
CLIPSim score for DAMO VILAB (Fine-tuned Model): 0.3041
Results saved to /content/drive/MyDrive/CLIPSim_scores_finetuned_models.csv
