In [1]:
import json
import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import os

#Import the models
model = models.video.r3d_18(pretrained=True)
model.eval()

#for the shortlisted m videos using opencv the videos are visualized
def visualise(video_file):
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Visualizing the video frame using OpenCV
        cv2.imshow("The captured frame is: ", frame)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows() 


#Defining the hook for the output
def output_hook(desired_layers):
    layer_outputs = {}
     # Gather the output for modules for the specified layers
    def hook_function(module, input, output):
        for name, mod in model.named_modules():
            if mod == module:
                module_name = name
        if module_name in desired_layers:
            layer_outputs[module_name] = output.detach()
    # Registering the hook for the layer in desired_layer
    for name, module in model.named_modules():
        if name in desired_layers:
            module.register_forward_hook(hook_function)
    return layer_outputs

#Sliding Window Technique with window_size and step
def sliding_window(frames, window_size, step):
    slided_frames = []
    total_frames = len(frames)
    for i in range(0, total_frames - window_size +1, step):
        current_slide =[]
        for j in range(window_size):
            current_slide.append(frames[i+j])
        slided_frames.append(current_slide)
    return slided_frames

# Function to preprocess the video and apply sliding window approach  
def frames_collect(video_file_path):    
    #Capture the video
    desired_layers = ['layer3']
    layer_outputs = output_hook(desired_layers)
    video_frames = []
    cap = cv2.VideoCapture(video_file_path)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        video_frames.append(frame)
        # Visualizing the video frame using OpenCV
        #cv2.imshow("The captured frame is: ", frame)
        #if cv2.waitKey(30) & 0xFF == ord('q'):
        #    break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows()    
    #Defining the maximum frames to 32 and step as 16
    max_frames = 32
    step = 16
    sliding_frames = []
    # Get sequences using sliding window
    if len(video_frames)>=32:
        sliding_frames = sliding_window(video_frames, max_frames, step)
    else:
        print("Skip the video")
        return
    
    # Transformation pipeline
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor()
    ])
    #Gathering all layer outputs ffrom the sliding window function
    slided_output_layer = []
    transformed_frame=[]
    for frames in sliding_frames:
        transformed_frame=[]
        for frame in frames:
            transform_frame=transform(frame)
            transformed_frame.append(transform_frame)
        tensor_frames = torch.stack(transformed_frame)
        tensor_frames = tensor_frames.permute(1, 0, 2, 3).unsqueeze(0)
        # Append the layers
        with torch.no_grad():
            output = model(tensor_frames)
        for layer_name, output in layer_outputs.items():
            slided_output_layer.append(output)
    # Stack outputs along a new dimension
    stack_layers = torch.stack(slided_output_layer, dim=0)
    # Apply max pooling across the windows (dim=0)
    maxpooling_output = torch.max(stack_layers, dim=0).values
    maxpooling_output = maxpooling_output.squeeze(0)
    output_current_layer = maxpooling_output.mean(dim=[1,2,3])
    fully_connected = nn.Linear(256, 512)
    output_current_layer = fully_connected(output_current_layer)
    return output_current_layer.tolist()


# Function to preprocess the video and apply sliding window approach  
def frames_collect_2(video_file_path, desired_layer):    
    #Capture the video
    layer_outputs = output_hook(desired_layer)
    video_frames = []
    cap = cv2.VideoCapture(video_file_path)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        video_frames.append(frame)
        # Visualizing the video frame using OpenCV
        #cv2.imshow("The captured frame is: ", frame)
        #if cv2.waitKey(30) & 0xFF == ord('q'):
        #    break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows()    
    #Defining the maximum frames to 32 and step as 16
    max_frames = 32
    step = 16
    sliding_frames = []
    # Get sequences using sliding window
    if len(video_frames)>=32:
        sliding_frames = sliding_window(video_frames, max_frames, step)
    else:
        print("Skip the video")
        return
    
    # Transformation pipeline
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor()
    ])
    #Gathering all layer outputs ffrom the sliding window function
    slided_output_layer = []
    transformed_frame=[]
    for frames in sliding_frames:
        transformed_frame=[]
        for frame in frames:
            transform_frame=transform(frame)
            transformed_frame.append(transform_frame)
        tensor_frames = torch.stack(transformed_frame)
        tensor_frames = tensor_frames.permute(1, 0, 2, 3).unsqueeze(0)
        # Append the layers
        with torch.no_grad():
            output = model(tensor_frames)
        for layer_name, output in layer_outputs.items():
            slided_output_layer.append(output)
    # Stack outputs along a new dimension
    stack_layers = torch.stack(slided_output_layer, dim=0)
    # Apply max pooling across the windows (dim=0)
    maxpooling_output = torch.max(stack_layers, dim=0).values
    maxpooling_output = maxpooling_output.squeeze(0)
    output_current_layer = maxpooling_output.mean(dim=[1,2,3])
    return output_current_layer.tolist()


#Read Json
def read_json_file(file_name):
    with open(file_name, "r") as file:
        data = json.load(file)
    return data

#distance function - manhattan 
def manhattan(a, b):
    res=0
    for i in range(0,3):
        res = res + abs(a[i]-b[i])
    return res

res=[]
#Define Nearest Search
def nearest_search(video_file_path):
    sample_layer3 = frames_collect(video_file_path)
    sample_layer4 = frames_collect_2(video_file_path, ['layer4'] )
    sample_avgpool = frames_collect_2(video_file_path, ['avgpool'])
    #Define the Json Files to read
    json_files = ["sword_output.json", "cartwheel_output.json", "drink_output.json",
                 "ridebikeoutput.json", "swordexcercise_output.json", "wave_output.json"]
    #Loop throught the json files
    for k in range(0, len(json_files)):
        data = read_json_file(json_files[k])
        layer_3 = data[0]
        layer_4 = data[1]
        layer_5 = data[2]
        #On every layer calculate the manhattan distance
        for i in range(0, len(layer_3)):
            curr_1 = layer_3[i]
            curr_2 = layer_4[i]
            curr_3 = layer_5[i]
            dist=0
            for j in range(0, len(curr_1)):
                curr_sum=0
                a=[curr_1[j], curr_2[j], curr_3[j]]
                b=[sample_layer3[j], sample_layer4[j], sample_avgpool[j]]
                curr_sum += manhattan(a,b)
                dist=curr_sum
            res.append((dist, i, json_files[k]))
        #return res
    #Sort the result
    res.sort(key=lambda i:i[0])
    #Gather the similarity Videos
    similar_videos = []
    #Visualise the videos
    for i in range(0,10):
        print(res[i])
        similar_videos.append(res[i])
    file_path = {'sword_output.json':'/Users/srirupin/Downloads/target/sword' ,
            'cartwheel_output.json': '/Users/srirupin/Downloads/target/cartwheel',
            'ridebikeoutput.json':'/Users/srirupin/Downloads/target/ride_bike',
            'swordexcercise_output.json': '/Users/srirupin/Downloads/target/sword_exercise',
            'wave_output.json': '/Users/srirupin/Downloads/target/wave',
            'drink_output.json': '/Users/srirupin/Downloads/target/drink' 
            }
    output=[]
    for i in range(0,10):
        video_file_path = file_path[res[i][2]]
        j=0
        for k in os.listdir(video_file_path):
            if j ==res[i][1]:
                path = f'{video_file_path}/{k}'
                print("The Distance Measure: ", res[i][0])
                visualise(path)
                output.append((k, res[i][0]))
            j=j+1
    return output



In [2]:
sample_input_1 = '/Users/srirupin/Downloads/target/cartwheel/Bodenturnen_2004_cartwheel_f_cm_np1_le_med_0.avi'
sample_input_2 = '/Users/srirupin/Downloads/target/sword_exercise/Blade_Of_Fury_-_Scene_1_sword_exercise_f_cm_np1_ri_med_3.avi'
sample_input_3 = '/Users/srirupin/Downloads/target/sword/AHF_longsword_against_Rapier_and_Dagger_Fight_sword_f_cm_np2_ri_bad_0.avi'
sample_input_4 = '/Users/srirupin/Downloads/target/drink/CastAway2_drink_u_cm_np1_le_goo_8.avi'

In [3]:
ans = nearest_search(sample_input_1)

(0.05643364042043686, 29, 'swordexcercise_output.json')
(0.05719864368438721, 7, 'cartwheel_output.json')
(0.058262795209884644, 38, 'cartwheel_output.json')
(0.07027793675661087, 63, 'sword_output.json')
(0.08527994155883789, 142, 'drink_output.json')
(0.1098511666059494, 20, 'wave_output.json')
(0.11364439874887466, 109, 'swordexcercise_output.json')
(0.11459260806441307, 86, 'drink_output.json')
(0.12074108421802521, 106, 'cartwheel_output.json')
(0.12651728093624115, 75, 'ridebikeoutput.json')
The Distance Measure:  0.05643364042043686
The Distance Measure:  0.05719864368438721
The Distance Measure:  0.058262795209884644
The Distance Measure:  0.07027793675661087
The Distance Measure:  0.08527994155883789
The Distance Measure:  0.1098511666059494
The Distance Measure:  0.11364439874887466
The Distance Measure:  0.11459260806441307
The Distance Measure:  0.12074108421802521
The Distance Measure:  0.12651728093624115


In [4]:
ans_1 = nearest_search(sample_input_2)

(0.05217835307121277, 4, 'drink_output.json')
(0.05590900778770447, 106, 'drink_output.json')
(0.05643364042043686, 29, 'swordexcercise_output.json')
(0.056576214730739594, 39, 'drink_output.json')
(0.05719864368438721, 7, 'cartwheel_output.json')
(0.058262795209884644, 38, 'cartwheel_output.json')
(0.07015926390886307, 18, 'ridebikeoutput.json')
(0.07027793675661087, 63, 'sword_output.json')
(0.07097502425312996, 37, 'wave_output.json')
(0.07237998396158218, 57, 'ridebikeoutput.json')
The Distance Measure:  0.05217835307121277
The Distance Measure:  0.05590900778770447
The Distance Measure:  0.05643364042043686
The Distance Measure:  0.056576214730739594
The Distance Measure:  0.05719864368438721
The Distance Measure:  0.058262795209884644
The Distance Measure:  0.07015926390886307
The Distance Measure:  0.07027793675661087
The Distance Measure:  0.07097502425312996
The Distance Measure:  0.07237998396158218


In [5]:
ans_2 = nearest_search(sample_input_3)

(0.05217835307121277, 4, 'drink_output.json')
(0.05590900778770447, 106, 'drink_output.json')
(0.05643364042043686, 29, 'swordexcercise_output.json')
(0.056576214730739594, 39, 'drink_output.json')
(0.05719864368438721, 7, 'cartwheel_output.json')
(0.058262795209884644, 38, 'cartwheel_output.json')
(0.07015926390886307, 18, 'ridebikeoutput.json')
(0.07027793675661087, 63, 'sword_output.json')
(0.07097502425312996, 37, 'wave_output.json')
(0.07237998396158218, 57, 'ridebikeoutput.json')
The Distance Measure:  0.05217835307121277
The Distance Measure:  0.05590900778770447
The Distance Measure:  0.05643364042043686
The Distance Measure:  0.056576214730739594
The Distance Measure:  0.05719864368438721
The Distance Measure:  0.058262795209884644
The Distance Measure:  0.07015926390886307
The Distance Measure:  0.07027793675661087
The Distance Measure:  0.07097502425312996
The Distance Measure:  0.07237998396158218


In [6]:
ans_3 = nearest_search(sample_input_4)

(0.05217835307121277, 4, 'drink_output.json')
(0.05590900778770447, 106, 'drink_output.json')
(0.05643364042043686, 29, 'swordexcercise_output.json')
(0.056576214730739594, 39, 'drink_output.json')
(0.05719864368438721, 7, 'cartwheel_output.json')
(0.058262795209884644, 38, 'cartwheel_output.json')
(0.07015926390886307, 18, 'ridebikeoutput.json')
(0.07027793675661087, 63, 'sword_output.json')
(0.07097502425312996, 37, 'wave_output.json')
(0.07237998396158218, 57, 'ridebikeoutput.json')
The Distance Measure:  0.05217835307121277
The Distance Measure:  0.05590900778770447
The Distance Measure:  0.05643364042043686
The Distance Measure:  0.056576214730739594
The Distance Measure:  0.05719864368438721
The Distance Measure:  0.058262795209884644
The Distance Measure:  0.07015926390886307
The Distance Measure:  0.07027793675661087
The Distance Measure:  0.07097502425312996
The Distance Measure:  0.07237998396158218


In [9]:
final_result = {
    sample_input_1: ans,
    sample_input_2: ans_1,
    sample_input_3: ans_2,
    sample_input_4: ans_3,
}

In [10]:
with open("output_task1.json", "w") as file:
    json.dump(final_result, file, indent=4)