# YouTube Video Preprocessing

In [None]:
#Importing the YouTube class from pytube
from pytube import YouTube 

In [None]:
#Download the YouTube video as a local MP4 file
link="https://youtu.be/LjhCEhWiKXk"
yt = YouTube(link)
#The file is named after the last path segment of the link
filename = f'{link.split("/")[-1]}.mp4'

#Narrow the streams down to MP4, pick the 360p one and save it under that name
mp4_files = yt.streams.filter(file_extension="mp4")
mp4_files = mp4_files.get_by_resolution("360p")
mp4_files.download(filename=filename)

# Extracting Frames

In [None]:
#Importing CV2
import cv2
from PIL import Image

In [None]:
#Capturing Frames into an Array
cap = cv2.VideoCapture(filename)
video_frames=[]
N=60 #Keep one frame out of every N (later cells use N to turn a frame index into a timestamp)
frame_num=0
try:
    #Fail early with a clear message instead of silently producing an empty frame list
    if not cap.isOpened():
        raise IOError(f"Could not open video file: {filename}")
    fps=cap.get(cv2.CAP_PROP_FPS)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  #No frame at this position: end of the video
        #OpenCV decodes to BGR; reversing the channel axis gives the RGB order PIL expects
        video_frames.append(Image.fromarray(frame[:, :, ::-1]))
        frame_num += N
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num) #Skipping Frames
finally:
    #Always free the capture handle, even if decoding raised
    cap.release()

In [None]:
#Number of frames that were sampled from the video
len(video_frames)

# Feature Extraction

In [None]:
#Imports for Using CLIP
import torch
import clip
from PIL import Image
import math
import numpy 

#Model Import
#Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
#clip.load hands back the model and the preprocessing callable that is applied to each frame
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
#Preprocessing callable returned by clip.load; it is applied to every frame before encoding
preprocess

In [None]:
#Batch size, and how many batches are needed to cover all captured frames
batch_size = 256
#Integer ceiling division, so a final partial batch is still counted
batches = (len(video_frames) + batch_size - 1) // batch_size

In [None]:
#Per-batch feature tensors; they are concatenated once after the loop so that
#earlier batches are not re-copied on every iteration
feature_chunks = []

#Frames Processing
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")
    batch_frames = video_frames[i*batch_size : (i+1)*batch_size] #Getting the i'th Batch
    #Preprocessing batch
    batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)
    with torch.no_grad():
        batch_features = model.encode_image(batch_preprocessed)  #Encoding
        #Scale each row to unit length so a dot product with another unit vector is a cosine similarity
        batch_features /= batch_features.norm(dim=-1, keepdim=True)
    feature_chunks.append(batch_features)

if feature_chunks:
    #The result keeps whatever dtype the model produced
    video_features = torch.cat(feature_chunks)
else:
    #No frames were captured: fall back to an empty (0, 512) feature tensor
    video_features = torch.empty([0, 512], dtype=torch.float16).to(device)
# Print some stats
print(f"Features Shape: {video_features.shape}")

In [None]:
# Turn the search query into a unit-length CLIP text embedding
query_tokens = clip.tokenize("man playing piano").to(device)
with torch.no_grad():
    text_features = model.encode_text(query_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

In [None]:
# Score every frame against the query: dot product of unit vectors (cosine similarity), scaled by 100
scaled_features = 100.0 * video_features
similarities = scaled_features @ text_features.T
# Keep the three highest scores together with the frame indexes they belong to
values, best_photo_idx = torch.topk(similarities, 3, dim=0)

In [None]:
#Check what kind of object the similarity computation produced
type(similarities)

In [None]:
#Indexes of the three best-matching frames, as returned by topk
best_photo_idx

In [None]:
#Peek at the similarity scores of the first ten frames
similarities[:10]

In [None]:
#Heatmap of Video Search
import matplotlib.pyplot as plt
print("Search query heatmap over the frames of the video:")
#A single row with one column per captured frame; the colour encodes the similarity score
fig, ax = plt.subplots(figsize=(15,0.5))
ax.imshow(similarities.T.cpu().numpy())

plt.show()

In [None]:
import datetime
#Function to search and give timestamp
def search_video(search_query, top_k=3):
    """Show the captured frames that best match a text query, with their timestamps.

    Relies on notebook globals set by earlier cells: ``model`` and ``device``
    (CLIP), ``video_features`` (encoded frames), ``video_frames`` (the frames
    themselves), ``N`` (sampling step in frames) and ``fps``.

    Args:
        search_query: Text describing what to look for.
        top_k: Maximum number of frames to show (default 3). It is clamped to
            the number of encoded frames, so short videos do not raise.
    """
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # One score per frame: dot product of unit vectors, scaled by 100
    similarities = (100.0 * video_features @ text_features.T)
    # topk raises when asked for more entries than there are frames
    k = min(top_k, similarities.shape[0])
    _, best_photo_idx = similarities.topk(k, dim=0)

    # Display the best frames, best match first
    for frame_id in best_photo_idx.flatten().tolist():
        display(video_frames[frame_id])
        # Captured frame i sits at video frame i * N; dividing by fps gives seconds
        seconds = round(frame_id * N / fps)
        print(str(datetime.timedelta(seconds=seconds)))

In [None]:
#Try the search on a sample query
search_video("man playing piano")

# Making functions

In [None]:
#Imports
from pytube import YouTube 
import cv2
import torch
import clip
from PIL import Image
import math
import numpy 
import datetime

In [None]:
def video_download(link):
    """Download the 360p MP4 stream of a YouTube video into the working directory.

    Args:
        link: URL of the YouTube video.

    Returns:
        The name of the downloaded file, ``<video id>.mp4``.

    Raises:
        ValueError: If the video offers no 360p MP4 stream.
    """
    print("Video Downloading")
    #Downloading a Youtube Video
    yt = YouTube(link)
    #Name the file after the video id rather than the raw link tail, so links such as
    #".../watch?v=ID" or "youtu.be/ID?t=10" do not put "?" or "=" into the file name
    filename = yt.video_id + ".mp4"
    #Choosing a 360p MP4 video
    stream = yt.streams.filter(file_extension="mp4").get_by_resolution("360p")
    if stream is None:
        #get_by_resolution returns None when nothing matches; fail with a clear message
        raise ValueError(f"No 360p MP4 stream available for {link}")
    stream.download(filename=filename)
    return filename
def video_frames_capture(filename, frame_step=60):
    """Sample frames from a video file at a fixed frame interval.

    Args:
        filename: Path of the video file to read.
        frame_step: Distance, in frames, between two captured frames
            (default 60). Must be at least 1.

    Returns:
        A list of PIL images, one per sampled frame, in capture order. The
        list is empty when the file cannot be opened or no frame can be read.

    Raises:
        ValueError: If ``frame_step`` is smaller than 1 (the read position
            would never advance).
    """
    if frame_step < 1:
        raise ValueError("frame_step must be at least 1")
    print("Frames Capturing...")
    #Capturing Frames into an Array
    cap = cv2.VideoCapture(filename)
    video_frames=[]
    frame_num=0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  #No frame at this position: end of the video
            #OpenCV decodes to BGR; reversing the channel axis gives the RGB order PIL expects
            video_frames.append(Image.fromarray(frame[:, :, ::-1]))
            frame_num += frame_step
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num) #Skipping Frames
    finally:
        #Always free the capture handle, even if decoding raised
        cap.release()
    return video_frames
def frame_classifier(video_frames):
    """Encode frames into unit-length CLIP image features, batch by batch.

    Relies on the notebook globals ``model``, ``preprocess`` and ``device``.

    Args:
        video_frames: Sequence of images accepted by ``preprocess``.

    Returns:
        A tensor with one feature row per frame, in input order. For an empty
        input this is an empty ``(0, 512)`` float16 tensor on ``device``.
    """
    #Setting Batch Size and splitting in batches
    batch_size = 256
    batches = math.ceil(len(video_frames) / batch_size)
    #Per-batch feature tensors; concatenated once after the loop so that
    #earlier batches are not re-copied on every iteration
    feature_chunks = []

    #Frames Processing
    for i in range(batches):
        print(f"Processing batch {i+1}/{batches}")
        batch_frames = video_frames[i*batch_size : (i+1)*batch_size] #Getting the i'th Batch
        #Preprocessing batch
        batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)
        with torch.no_grad():
            batch_features = model.encode_image(batch_preprocessed)  #Encoding
            #Scale each row to unit length so a dot product with another unit vector is a cosine similarity
            batch_features /= batch_features.norm(dim=-1, keepdim=True)
        feature_chunks.append(batch_features)

    if feature_chunks:
        #The result keeps whatever dtype the model produced
        video_features = torch.cat(feature_chunks)
    else:
        #No frames: return an empty (0, 512) feature tensor
        video_features = torch.empty([0, 512], dtype=torch.float16).to(device)
    # Print some stats
    print(f"Features Shape: {video_features.shape}")
    return video_features
import datetime
#Function to search and give timestamp
def search_video_link(search_query,link):
    """Download a YouTube video and show the frames that best match a text query.

    The video is downloaded, sampled into frames and encoded with CLIP; the
    (up to) three frames most similar to the query are displayed, each
    followed by its timestamp in the video.

    Args:
        search_query: Text describing what to look for.
        link: URL of the YouTube video to search.
    """
    video=video_download(link)
    video_frames_captured=video_frames_capture(video)
    # Use the features of THIS video, not a global left over from an earlier cell
    video_features=frame_classifier(video_frames_captured)

    # Read the frame rate from the downloaded file so timestamps match this video
    cap = cv2.VideoCapture(video)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        cap.release()
    # video_frames_capture keeps every 60th frame; keep this value in sync with it
    frame_step = 60

    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)
    # One score per frame: dot product of unit vectors, scaled by 100
    similarities = (100.0 * video_features @ text_features.T)
    # topk raises when asked for more entries than there are frames
    k = min(3, similarities.shape[0])
    _, best_photo_idx = similarities.topk(k, dim=0)
    # Display the top frames, best match first
    for frame_id in best_photo_idx.flatten().tolist():
        display(video_frames_captured[frame_id])
        # Captured frame i sits at video frame i * frame_step; dividing by fps gives seconds
        seconds = round(frame_id * frame_step / fps)
        print(str(datetime.timedelta(seconds=seconds)))

In [None]:
#The query text is fed to the CLIP text encoder, so it is spelled correctly ("ringing")
search_video_link("Man ringing bell","https://youtu.be/LjhCEhWiKXk")

In [None]:
#The query text is fed to the CLIP text encoder, so it is spelled correctly ("Cassette")
search_video_link("Cassette Tape","https://youtu.be/LjhCEhWiKXk")