## download ViCILP weights and put its pth file in viclip folder. 

In [1]:
!pip install -q --upgrade setuptools==69.5.1

%cd ../../Data/InternVid
!pip install -q ftfy einops

/Users/qing/PycharmProjects/InternVideo/Data/InternVid


In [16]:
import numpy as np
import cv2

from viclip import get_viclip, retrieve_text, _frame_from_video
from iv2_utils.iv2 import *

def listfile(path):
    return [os.path.join(path, x) for x in os.listdir(path)]

$$\Large \textbf{Adding noise to video + Splitting Video to 8-bit Windows}$$

-----

In [3]:
from IPython.display import clear_output
from PIL import Image, ImageSequence
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import numpy as np
import shutil
import pickle
import cv2
import os

def split_video_to_mp4(video_path, output_dir, window_size=4):
    if output_dir in os.listdir('.'):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    if video_path.endswith('.gif'):
        gif = Image.open(video_path)
        frames = [frame.copy() for frame in ImageSequence.Iterator(gif)]
        total_frames = len(frames)

        width, height = frames[0].size
        duration = gif.info['duration']
        fps = 1000 / duration
    else:
        video = cv2.VideoCapture(video_path)
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        
        frames = []
        success, frame = video.read()
        while success:
            frames.append(frame)
            success, frame = video.read()
        video.release()

    for i in range(total_frames - window_size + 1):
        output_path = os.path.join(output_dir, f'{i + 1}.mp4')
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        for frame in frames[i:i + window_size]:
            if isinstance(frame, Image.Image):
                frame_rgb = frame.convert('RGB')
                frame_array = np.array(frame_rgb)
                frame_bgr = cv2.cvtColor(frame_array, cv2.COLOR_RGB2BGR)
            else:
                frame_bgr = frame
            out.write(frame_bgr)

        out.release()

def load_basketball(basketball_path, size):
    basketball = Image.open(basketball_path)
    basketball = basketball.resize((size, size), Image.LANCZOS)
    return basketball

def rotate_basketball(basketball):
    random_angle = np.random.randint(0, 360)
    return basketball.rotate(random_angle, expand=True)

def add_basketball_to_frame(frame, basketball):
    frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    basketball_rotated = rotate_basketball(basketball)

    frame_width, frame_height = frame_pil.size
    basketball_width, basketball_height = basketball_rotated.size

    max_x = frame_width - basketball_width
    max_y = frame_height - basketball_height
    rand_x = np.random.randint(0, max_x)
    rand_y = np.random.randint(0, max_y)

    frame_pil.paste(basketball_rotated, (rand_x, rand_y), basketball_rotated)
    return cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

def add_noise(input_video_path, output_video_path, basketball_path, basketball_size):
    basketball = load_basketball(basketball_path, basketball_size)
    cap = cv2.VideoCapture(input_video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_with_basketball = add_basketball_to_frame(frame, basketball)

        out.write(frame_with_basketball)

    cap.release()
    out.release()
    #cv2.destroyAllWindows()

$$\Large \textbf{Loading the ViCLIP Model}$$

-----------

In [4]:
checkpoint_path = './viclip.pth'
model_cfgs = {
    'viclip-l-internvid-10m-flt': {
        'size': 'l',
        'pretrained': checkpoint_path,
    },
    'viclip-l-internvid-200m': {
        'size': 'l',
        'pretrained': checkpoint_path,
    },
    'viclip-b-internvid-10m-flt': {
        'size': 'b',
        'pretrained': checkpoint_path,
    },
    'viclip-b-internvid-200m': {
        'size': 'b',
        'pretrained': checkpoint_path,
    },
}
cfg = model_cfgs['viclip-l-internvid-10m-flt']
model_l = get_viclip(cfg['size'], cfg['pretrained'])

  state_dict = torch.load(pretrain, map_location='cpu')['model']


$$\Large \textbf{Adding Backflip Videos}$$


--------

In [None]:
if 'backflip' in os.listdir('.'):
    shutil.rmtree('backflip')

os.system('wget -q \"https://s3.amazonaws.com/kinetics/600/val/backflip (human).tar.gz\"')
shutil.unpack_archive('backflip (human).tar.gz', 'backflip')

j = 1
for i in sorted(os.listdir('backflip')):
    os.rename(os.path.join('backflip', i), os.path.join('backflip', f'{j}.mp4'))
    j += 1
    
backflip_files = os.listdir('backflip')
backflip_files.sort(key=lambda x: int(x.split('.')[0]))
backflip_files = [os.path.join('backflip', file) for file in backflip_files]
print(backflip_files[:5])

**Adding Noise to Videos**

In [None]:
if 'aug1' in os.listdir('.'):
    shutil.rmtree('aug1')
if 'aug2' in os.listdir('.'):
    shutil.rmtree('aug2')

def get_dim(file_path):
    vid = cv2.VideoCapture(file_path)
    height = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    width = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    return height, width

os.mkdir('aug1')
os.mkdir('aug2')
for backflip in tqdm(backflip_files):
    height, width = get_dim(backflip)
    add_noise(backflip, os.path.join('aug1', backflip.split('/')[1]), 'Storage/cruise.png', int(min(height, width) / 3))

for aug1 in tqdm([os.path.join('aug1',x) for x in os.listdir('aug1')]):
    height, width = get_dim(aug1)
    add_noise(aug1, os.path.join('aug2', aug1.split('/')[1]), 'Storage/cruise.png', int(min(height, width) / 3))

shutil.rmtree('aug1')

$$\Large \textbf{Evaluation on K400}$$

------

In [6]:
import pandas as pd
import numpy as np
import os

val_data = pd.read_csv('../../InternVideo2/multi_modality/k600/val.csv').iloc[:,0:2]
videos = os.listdir('../../InternVideo2/multi_modality/k600/part_0')

print(val_data.head())
print()
print(videos[-5:])

id_label_map = {}
for row in range(len(val_data)):
    id_label_map[val_data.iloc[row]['youtube_id']] = val_data.iloc[row]['label']

remove_points = len(videos) - 500
sampled_points = np.random.choice(videos, remove_points, replace=False)
classes = val_data['label'].unique()
for i in sampled_points:
    os.remove(f'../../InternVideo2/multi_modality/k600/part_0/{i}')
print(f"Removed {remove_points} Videos!")

       label   youtube_id
0  abseiling  0wR5jVB-WPk
1  abseiling  3caPS4FHFF8
2  abseiling  3yaoNwz99xM
3  abseiling  6IbvOJxXnOo
4  abseiling  6_4kjPiQr7w

['-j3eNzQR-EI_000064_000074.mp4', '-C-PvafuvFE_000068_000078.mp4', '0yNXOIqJLtA_000012_000022.mp4', '13Ub1MDkiHc_000014_000024.mp4', '-IlFdaVdEyU_000001_000011.mp4']
Removed 0 Videos!


In [8]:
from IPython.display import clear_output
top1 = 0
top5 = 0
total = 0
collect_data = []
for check in videos:
    video = cv2.VideoCapture(f'../../InternVideo2/multi_modality/k600/part_0/{check}')
    frames = [x for x in _frame_from_video(video)]
    
    video_label = id_label_map[check[:11]]
    
    texts, probs = retrieve_text(frames, classes, models=model_l, topk=5, device="cpu")
    collect_data.append((texts, probs))
    clear_output(wait=True)
    if texts[0] == video_label:
        top1 += 1
    if video_label in texts:
        top5 += 1
    total += 1
    print(f"{total}/500 Completed")
    print('-'*25)
    print("Correct:", video_label)
    print('-'*25)
    print('Predictions ' + '-'*13)
    for i, v in zip(texts, probs):
        print(f'{v:.4f}', '|', i)
    print('-'*25)
    print("Top 1:",top1/total)
    print("Top 5:", top5/total)

500/500 Completed
-------------------------
Correct: playing harmonica
-------------------------
Predictions -------------
0.2789 | playing harmonica
0.2136 | playing flute
0.2064 | playing recorder
0.2021 | brushing teeth
0.2020 | playing xylophone
-------------------------
Top 1: 0.548
Top 5: 0.83


$$\Large \textbf{Evaluation on GIF100}$$

-----

In [26]:
videos = pickle_read('../../../photography-model/rustyjar/STOCK100.pkl')
print("Loaded",len(videos),"videos!")

Loaded 100 videos!


In [29]:
videos[4]

('GIF100/5.mp4',
 'A person splashes into the pool.',
 [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26])

In [14]:
model_l['viclip'].cache_txt

{'A small kid falls down onto the ground.': tensor([[-4.5776e-02, -2.1813e-02, -2.0643e-03,  5.7868e-02,  1.1160e-02,
          -1.6745e-02,  3.5187e-02, -4.3393e-03,  3.3565e-03, -1.8007e-02,
          -4.3136e-02,  1.9561e-02,  1.3579e-02,  2.2094e-03, -1.5398e-02,
          -1.3152e-03,  1.8768e-02,  2.0182e-02, -5.9828e-02, -2.0360e-02,
           7.3737e-03,  2.1527e-02, -1.5426e-02,  1.8080e-02,  7.5934e-03,
          -3.0730e-02, -3.8834e-02,  2.0356e-03,  1.3019e-02,  9.4537e-03,
           2.0870e-03,  4.2311e-02,  1.6326e-03, -8.5308e-03,  3.2212e-02,
          -2.4307e-02, -9.1623e-03,  1.5416e-02, -3.0801e-02, -1.2381e-02,
          -3.8146e-02, -2.8381e-02,  3.3975e-02,  1.2653e-02, -3.5661e-03,
           1.4811e-02,  2.6234e-02,  2.4688e-02,  2.0408e-02, -1.6954e-02,
           2.5903e-02, -4.1720e-02,  4.5704e-03,  4.4504e-03, -9.9096e-03,
           1.8941e-02,  4.2951e-02, -1.7330e-03,  3.6770e-02,  9.0733e-03,
           1.4584e-02,  1.2002e-02, -2.2910e-02,  2.0180e

In [18]:
from IPython.display import clear_output

logits = []
preds  = []
for video_path, phrase, frames in tqdm(videos[:51]):
    video = cv2.VideoCapture(os.path.join('../../../photography-model/', video_path))
    
    frames = [x for x in _frame_from_video(video)]

    logits_curr = []
    p = []
    for i in tqdm(range(len(frames) - 8)):
        texts, probs = retrieve_text(frames[i:i+8], [phrase], models=model_l, topk=1, device="cpu")
        logits_curr.append((probs[0].item(), i + 1))
        p.append(probs[0].item())
    logits.append(logits_curr)
    preds.append(np.argmax(p) + 1)

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/229 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
preds

[np.int64(25), np.int64(4), np.int64(154), np.int64(135)]

In [25]:
videos[2]

('GIF100/3.mp4',
 'A kid splashes into the water.',
 [237,
  238,
  239,
  240,
  241,
  242,
  243,
  244,
  245,
  246,
  247,
  248,
  249,
  250,
  251,
  252,
  253,
  254,
  255,
  256,
  257,
  258,
  259,
  260,
  261,
  262,
  263,
  264,
  265,
  266,
  267,
  268,
  269,
  270,
  271,
  272,
  273,
  274,
  276,
  277,
  278,
  279,
  280,
  281,
  282,
  275,
  283,
  284,
  285,
  286,
  287,
  288,
  289,
  290,
  291,
  292,
  293,
  294,
  295,
  296,
  297,
  298,
  299,
  300])

$$\Large \textbf{Predicting}$$

------

In [None]:
video = cv2.VideoCapture('example1.mp4')
frames = [x for x in _frame_from_video(video)]
text_candidates = ['A dog jumping in snow', "A car driving off a cliff", "A dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon."]
texts, probs = retrieve_text(frames, text_candidates, models=model_l, topk=2, device="cpu")
for t, p in list(zip(texts, probs)):
    print(t, p)

In [None]:
model_l['viclip'].cache_txt.keys()

In [None]:
import matplotlib.pyplot as plt
def predictVideo(videoPath, phrase):
    video = cv2.VideoCapture(videoPath)
    frames = [x for x in _frame_from_video(video)]
    text_candidates = [phrase]
    texts, probs = retrieve_text(frames, text_candidates, models=model_l, topk=1)
    return probs[0]

def predict(gifPath, phrase, outputDir = "output", output_logits = False, window_size = 8):
    assert(window_size >= 8)
    split_video_to_mp4(gifPath, outputDir, window_size=8)
    
    logits = []

    output_files = os.listdir(outputDir)
    output_files.sort(key=lambda x: int(x.split('.')[0]))

    videos = list(map(lambda x: os.path.join(outputDir, x), output_files))

    videoPbar = tqdm(videos)
    for videoPath in videoPbar:
        videoPbar.set_description(gifPath.split('/')[-1])
        logits.append((predictVideo(videoPath, phrase), int(videoPath.split('/')[-1].split('.')[0])))
        
#     if window_size > 8:
#         additional = window_size - 8
#         avg_logits = []
        
# #         if window_size == 13:
# #             for i in range(2, len(logits) - 2):
# #                 avg_logits.append((sum([logits[x][0] for x in [i - 2, i - 1, i, i + 1, i + 2]])/5, i + 1))
#     else:
#         avg_logits = logits
    
#     logits = avg_logits
    
#     if output_logits:
#         return avg_logits
    
    
    logits.sort(key=lambda x: -x[0])
    
    final_ans = logits[0][1] # Frame index (1 start)
    return final_ans

def runPredict(files):
    test_tqdm = tqdm(files)
    y_pred = []
    for test_file in test_tqdm:
        test_tqdm.set_description('Backflip files')
        y_pred.append(predict(test_file, 'A person performs a backflip.'))

    import pickle
    with open('y_pred.pkl', 'wb') as file:
        pickle.dump(y_pred, file)
    print("Done saving")

In [None]:
def getLogits(gifPath, phrase, outputDir = "output"):
    split_video_to_mp4(gifPath, outputDir, window_size=8)
    
    logits = []

    output_files = os.listdir(outputDir)
    output_files.sort(key=lambda x: int(x.split('.')[0]))

    videos = list(map(lambda x: os.path.join(outputDir, x), output_files))

    videoPbar = tqdm(videos)
    for videoPath in videoPbar:
        videoPbar.set_description(gifPath.split('/')[-1])
        logits.append((predictVideo(videoPath, phrase), int(videoPath.split('/')[-1].split('.')[0])))
        
    return logits

In [None]:
augmented = listfile('photography-model/augment')
augmented.sort(key = lambda x: int(x.split('/')[-1].split('.')[0]))
print(augmented[:5])

In [None]:
runPredict(augmented)

In [None]:
logitsList = []
for file in tqdm(augmented):
    logitsList.append(getLogits(file, 'A person performs a backflip.'))
print(logitsList)
with open('logits.pkl', 'wb') as file:
    pickle.dump(logitsList, file)

In [None]:
predict('photography-model/augment/22.mp4', 'A person performs a backflip.', window_size=11)

In [None]:
for idx, i in enumerate(graph_prediction):
    if i == max(graph_prediction):
        print("Original Prediction:",idx + 1)

for idx, i in enumerate(graph_prediction_16):
    if i == max(graph_prediction_16):
        print("New Prediction:     ",idx + 1)

In [None]:
graph_prediction_16[:5]

In [None]:
plt.plot([x[1] for x in graph_prediction], [x[0] for x in graph_prediction])
plt.plot([x[1] for x in graph_prediction_16], [x[0] for x in graph_prediction_16])
plt.title("Sliding Window Technique")

plt.xlabel('Frame #')
plt.ylabel('Similarity')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("output.svg")
plt.show()

In [None]:
predict("Storage/augment/24.mp4", 'A person performs a backflip.')

In [None]:
[1,2,3,4,5,6,7,8,9][:8]

In [None]:
runPredict([os.path.join('Storage/augment', x) for x in os.listdir('Storage/augment')])

In [None]:
aug2_files = os.listdir('aug2')
aug2_files.sort(key = lambda x: int(x.split('.')[0]))
runPredict([os.path.join('aug2', x) for x in aug2_files])

In [None]:
print(y_pred)

In [None]:
%cd Storage
os.system('git pull')
%cd ..

In [None]:
import shutil
shutil.make_archive('augment', 'zip', 'aug2')

In [None]:
import smtplib
import argparse
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
import os

# Email details
sender_email = "qxli2@students.everettcc.edu"
password = "miSTER3Man7Jig37"
# Email content

receiver_email = "qing.cminst@gmail.com"
attachment_path = "logits.pkl"

contents = "I'm done bro. (regular, not augmented)."
subject = "Helicopter Model Update"
body = f"""
<html>
<head>
    <style>
        body {{
            font-family: monospace;
        }}
    </style>
</head>
<body>
<pre>{contents}</pre>
</body>
</html>
"""

message = MIMEMultipart()
message["From"] = sender_email
message["To"] = receiver_email
message["Subject"] = subject

message.attach(MIMEText(body, "html"))

if attachment_path:
    try:
        with open(attachment_path, "rb") as attachment:
            part = MIMEBase("application", "octet-stream")
            part.set_payload(attachment.read())
        
        encoders.encode_base64(part)
        
        part.add_header(
            "Content-Disposition",
            f"attachment; filename= {os.path.basename(attachment_path)}"
        )
        
        message.attach(part)

    except Exception as e:
        print(f"Failed to attach file: {e}")

try:
    server = smtplib.SMTP("smtp.gmail.com", 587)
    server.starttls()
    server.login(sender_email, password)

    server.sendmail(sender_email, receiver_email, message.as_string())
    print("Email sent successfully!")

except Exception as e:
    print(f"Failed to send email: {e}")

finally:
    server.quit()

In [None]:
predict('backflip/49.mp4', 'A person performs a backflip')

In [None]:
predict('Storage/output_video2.mp4', 'A person performs a backflip')

In [None]:
import shutil
shutil.make_archive("backfliplol", 'zip', "backflip")

<a href="y_pred.pkl">Download</a>

In [None]:
y_pred

$$\Large \textbf{XCLIP testing}$$


------

In [10]:
!pip install av

Collecting av
  Downloading av-13.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading av-13.0.0-cp39-cp39-macosx_11_0_arm64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: av
Successfully installed av-13.0.0


In [38]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 8 frames
indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")


from IPython.display import clear_output
top1 = 0
top5 = 0
total = 0
collect_data = []
for check in videos:
    container = av.open(os.path.join('../../InternVideo2/multi_modality/k600/part_0/',check))
    indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)

    inputs = processor(
        text=classes.tolist(),
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )


    with torch.no_grad():
        outputs = model(**inputs)
    
    clear_output(wait=True)
    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    video_label = id_label_map[check[:11]]
    print(f"{total}/500 Completed")
    print('-'*25)
    print("Correct:", video_label)
    print('-'*25)
    print('Predictions ' + '-'*13)

    ind = np.argpartition(probs.numpy()[0], -5)[-5:]
    top5_list = list(zip([probs[0][i] for i in ind], [classes[i] for i in ind]))
    top5_list.sort(key = lambda x: -x[0])
    
    for i,v in top5_list:
        print(f'{i:.4f}', '|', v)

    texts = [top5_list[x][1] for x in range(5)]
    if texts[0] == video_label:
        top1 += 1
    if video_label in texts:
        top5 += 1
    total += 1
    print('-'*25)
    print("Top 1:",top1/total)
    print("Top 5:", top5/total)

499/500 Completed
-------------------------
Correct: playing harmonica
-------------------------
Predictions -------------
0.7587 | playing harmonica
0.0100 | whistling
0.0047 | playing accordion
0.0044 | trimming or shaving beard
0.0040 | crying
-------------------------
Top 1: 0.576
Top 5: 0.828


In [72]:
def get_score(video, phrase):
    container = av.open(video) #'../../../photography-model/GIF87/1.mp4')
    indices = list(range(8))
    video = read_video_pyav(container, indices)
    
    inputs = processor(
        text=[phrase],
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits_per_video = outputs.logits_per_video
    # probs = logits_per_video.softmax(dim=1)
    return logits_per_video[0].numpy()[0]

In [76]:
from iv2_utils.iv2 import split_video_to_mp4, get_output_dir, pickle_write, pickle_read

In [None]:
gif87_dir = '../../../photography-model/GIF87/'

preds = []
logits = []
for i in tqdm(range(1, 88)):
    split_video_to_mp4(os.path.join(backflip_dir, f'{i}.mp4'), output_dir='output', window_size=8)
    logit_curr = []
    pbar = tqdm(get_output_dir('output'))
    for clip in pbar:
        logit_curr.append(get_score(clip, 'A person performing a backflip.'))
        pbar.set_description(str(np.argmax(logit_curr) + 1))
    logits.append(list(zip(logit_curr, list(range(1, len(logit_curr) + 1)))))
    preds.append(np.argmax(logit_curr) + 1)
    # print(np.argmax(logit_curr) + 1)

In [75]:
backflip_dir = '../../../photography-model/backflip/'

preds = []
logits = []
for i in tqdm(range(1, 51)):
    split_video_to_mp4(os.path.join(backflip_dir, f'{i}.mp4'), output_dir='output', window_size=8)
    logit_curr = []
    pbar = tqdm(get_output_dir('output'))
    for clip in pbar:
        logit_curr.append(get_score(clip, 'A person performing a backflip.'))
        pbar.set_description(str(np.argmax(logit_curr) + 1))
    logits.append(list(zip(logit_curr, list(range(1, len(logit_curr) + 1)))))
    preds.append(np.argmax(logit_curr) + 1)
    # print(np.argmax(logit_curr) + 1)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

  0%|          | 0/264 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/128 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]

  0%|          | 0/191 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/264 [00:00<?, ?it/s]

  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/268 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/221 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

In [77]:
pickle_write(preds, 'XCLIP-r8.pkl')
pickle_write(logits, 'XCLIP-logits-r.pkl')

In [1]:
def showFrames(path, highlight=False):
    global current_pick
    video_path = path
    cap = cv2.VideoCapture(video_path)
    
    frames = []
    success, frame = cap.read()
    while success:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(frame)
        frames.append(img)
        success, frame = cap.read()
    
    cap.release()

    if not frames:
        print("No frames found in the video.")
        return
    
    root = tk.Tk()
    root.title(path)
    currIdx = -1

    frame_anno = []
    def display_frame(index):
        global currIdx
        frame_label.config(text=f"Frame {index}", fg = 'black' if index + 1 not in frame_anno else 'green')
        img = ImageTk.PhotoImage(frames[index])
        frame_canvas.create_image(0, 0, anchor=tk.NW, image=img)
        frame_canvas.image = img
        currIdx = index
    
    frame_label = tk.Label(root, text="Frame 0", font=('Hack', 14), fg="red" if highlight else "black")
    frame_label.pack()
    
    frame_canvas = tk.Canvas(root, width=frames[0].width, height=frames[0].height)
    frame_canvas.pack()
    
    display_frame(0)
    
    filename_entry = tk.Entry(root, font=('Hack', 12))
    filename_entry.pack(pady=10)

    def next_frame(event):
        current_frame = int(frame_label.cget("text").split()[1])
        next_index = (current_frame + 1) % len(frames)
        display_frame(next_index)

    def doubleSkip(event):
        next_frame(event)
        next_frame(event)
    
    def prev_frame(event):
        current_frame = int(frame_label.cget("text").split()[1])
        next_index = (current_frame - 1) % len(frames)
        display_frame(next_index)

    def doublePrev(event):
        prev_frame(event)
        prev_frame(event)

    def restart(event):
        if anno_stock100[-1][0] == path:
            anno_stock100.pop(-1)
            frame_anno = []
            print("Removed previous one and reset frame_anno")
            print(anno_stock100)
        else:
            print("Not same path, ignoring")
    
    def save_frame(event):
        global currIdx
        global anno_stock100

        phrase = filename_entry.get()  # Get the text from the entry box
        anno_stock100.append((path, phrase, frame_anno))
        print("Saved!!")
        print(anno_stock100)
    def add_frame(event):
        global currIdx
        if currIdx + 1 in frame_anno:
            frame_anno.remove(currIdx + 1)
            print("Removed",currIdx + 1,"as a correct frame.")
            display_frame(currIdx)
        else:
            frame_anno.append(currIdx + 1)
            print("Added", currIdx + 1, "as a correct frame.")
            display_frame(currIdx)
    
    root.bind('<Right>', next_frame)
    root.bind('<Left>', prev_frame)
    root.bind('<Up>', doubleSkip)
    root.bind('<Down>', doublePrev)
    root.bind('<Command-s>', save_frame)
    root.bind('<Command-f>', add_frame)
    root.bind('<Command-r>', restart)
    
    root.mainloop()


In [10]:
video = pickle_read('../../../photography-model/rustyjar/STOCK100-testing.pkl')
video[45]

('GIF100/46.mp4', [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51])

In [11]:
from pkg_resources import packaging
from collections import OrderedDict
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import IPython.display
from os import system
from PIL import Image, ImageTk
import urllib.request
import tkinter as tk
import pandas as pd
import numpy as np
import skimage
import pickle
import torch
import time
import math
import clip
import cv2
import os
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print("Torch version:", torch.__version__)
print("CLIP Models:",clip.available_models())

def pickle_read(file):
    with open(file, 'rb') as f:
        data = pickle.load(f)
    return data

def pickle_write(a, b):
    pickle_filename = a if len(a) >= 4 and a[-4:] == ".pkl" else b
    data = b if pickle_filename == a else a
    with open(pickle_filename, 'wb') as file:
        pickle.dump(data, file)
showFrames('../../../photography-model/GIF100/27.mp4')

Torch version: 2.4.1
CLIP Models: ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
