In [1]:
#import the libraries
import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn

In [2]:
sample_input_1 = '/Users/srirupin/Downloads/target/cartwheel/Bodenturnen_2004_cartwheel_f_cm_np1_le_med_0.avi'
sample_input_2 = '/Users/srirupin/Downloads/target/sword_exercise/Blade_Of_Fury_-_Scene_1_sword_exercise_f_cm_np1_ri_med_3.avi'
sample_input_3 = '/Users/srirupin/Downloads/target/sword/AHF_longsword_against_Rapier_and_Dagger_Fight_sword_f_cm_np2_ri_bad_0.avi'
sample_input_4 = '/Users/srirupin/Downloads/target/drink/CastAway2_drink_u_cm_np1_le_goo_8.avi'

# Gather the output for layer3

In [3]:
# Load the pre-trained model and sift to eval mode
model = models.video.r3d_18(pretrained=True)
model.eval()
#Defining the hook for the output
def output_hook(desired_layers):
    layer_outputs = {}
     # Gather the output for modules for the specified layers
    def hook_function(module, input, output):
        for name, mod in model.named_modules():
            if mod == module:
                module_name = name
        if module_name in desired_layers:
            layer_outputs[module_name] = output.detach()
    # Registering the hook for the layer in desired_layer
    for name, module in model.named_modules():
        if name in desired_layers:
            module.register_forward_hook(hook_function)
    return layer_outputs

# Define layers to capture
desired_layers = ['layer3']
layer_outputs = output_hook(desired_layers)

#Sliding Window Technique with window_size and step
def sliding_window(frames, window_size, step):
    slided_frames = []
    total_frames = len(frames)
    #loop through every window of size 32
    for i in range(0, total_frames - window_size +1, step):
        current_slide =[]
        for j in range(window_size):
            current_slide.append(frames[i+j])
        slided_frames.append(current_slide)
    return slided_frames
    
# Function to preprocess the video and apply sliding window approach  
def frames_collect(video_file_path):    
    #Capture the video
    video_frames = []
    cap = cv2.VideoCapture(video_file_path)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        video_frames.append(frame)
        # Visualizing the video frame using OpenCV
        cv2.imshow("The captured frame is: ", frame)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows()    
    #Defining the maximum frames to 32 and step as 16
    max_frames = 32
    step = 16
    sliding_frames = []
    # Get sequences using sliding window
    if len(video_frames)>=32:
        sliding_frames = sliding_window(video_frames, max_frames, step)
    else:
        print("Skipping the video", video_file_path)
        return
        #diff = 32-len(video_frames)
        #for i in range(0, diff):
            #video_frames.append(video_frames[i])
    
    # Transformation pipeline
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor()
    ])
    #Gathering all layer outputs ffrom the sliding window function
    slided_output_layer = []
    transformed_frame=[]
    for frames in sliding_frames:
        transformed_frame=[]
        for frame in frames:
            transform_frame=transform(frame)
            transformed_frame.append(transform_frame)
        tensor_frames = torch.stack(transformed_frame)
        tensor_frames = tensor_frames.permute(1, 0, 2, 3).unsqueeze(0)
        # Append the layers
        with torch.no_grad():
            output = model(tensor_frames)
        for layer_name, output in layer_outputs.items():
            slided_output_layer.append(output)
    # Stack outputs along a new dimension
    stack_layers = torch.stack(slided_output_layer, dim=0)
    print("The output size: is",stack_layers.shape)
    # Apply max pooling across the windows (dim=0)
    maxpooling_output = torch.max(stack_layers, dim=0).values
    maxpooling_output = maxpooling_output.squeeze(0)
    output_current_layer = maxpooling_output.mean(dim=[1,2,3])
    fully_connected = nn.Linear(256, 512)
    output_current_layer = fully_connected(output_current_layer)
    print("The output is: ", output_current_layer)
    print("The output size: is",output_current_layer.shape)



In [4]:
frames_collect(sample_input_1)

The output size: is torch.Size([1, 1, 256, 8, 14, 14])
The output is:  tensor([ 0.0990, -0.1084,  0.0919,  0.0403,  0.0946, -0.0779, -0.0827, -0.0630,
         0.0057,  0.1921,  0.0034, -0.0213, -0.0260, -0.0489, -0.1136, -0.0471,
         0.0205,  0.0069, -0.0073, -0.0289,  0.1373, -0.0526, -0.1285, -0.0909,
        -0.0782, -0.0896, -0.0206,  0.0644, -0.1007, -0.1039, -0.0247, -0.0106,
        -0.0576,  0.0726, -0.0192, -0.0970, -0.0066,  0.1217,  0.0262, -0.0237,
         0.1730,  0.0543, -0.0304,  0.0298, -0.0322, -0.0617, -0.0817, -0.0462,
        -0.0892, -0.0161, -0.0046,  0.0825,  0.1646,  0.0720, -0.0878, -0.0814,
         0.1157,  0.0181,  0.0241, -0.0275,  0.0079,  0.0264, -0.0884,  0.0378,
         0.0361,  0.0472,  0.0048,  0.1543,  0.1070,  0.0133,  0.0558,  0.0068,
        -0.0335,  0.0975,  0.0118, -0.0207,  0.0047,  0.0258, -0.1178,  0.0730,
         0.0428,  0.0471, -0.0853, -0.0902,  0.0585, -0.0584, -0.1048,  0.0268,
         0.0822, -0.0505,  0.0702, -0.0164, -0.05

In [5]:
frames_collect(sample_input_2)

The output size: is torch.Size([5, 1, 256, 8, 14, 14])
The output is:  tensor([ 1.4120e-02,  1.3162e-01,  1.2358e-01,  9.4904e-03,  5.8576e-02,
        -6.3274e-03, -1.4801e-01,  4.7116e-04,  1.7889e-01,  8.9640e-02,
         1.6909e-02, -1.4039e-01, -7.7162e-02,  3.4133e-02, -8.7308e-03,
        -1.0690e-01, -1.6448e-01, -1.9020e-01, -1.3194e-01, -9.1860e-02,
         1.5554e-02, -1.3204e-01,  7.8106e-02,  1.6684e-02, -2.3045e-01,
         2.4443e-02,  2.3388e-02,  1.1729e-02, -1.1378e-01, -5.5132e-02,
        -1.8625e-01, -2.0549e-01,  8.0442e-03,  1.5091e-01, -1.0420e-01,
         5.8827e-02, -1.1247e-01,  6.7035e-02,  3.2475e-01,  1.4229e-01,
         9.3153e-02,  6.2190e-02, -2.7965e-01, -1.2180e-01, -5.9984e-02,
         2.4322e-01, -1.2436e-01, -3.0442e-02,  1.7546e-03, -6.8871e-02,
        -4.2046e-03,  5.5112e-02, -2.5628e-02,  2.2816e-01,  1.0023e-01,
        -9.5258e-02,  3.6586e-02, -5.0268e-02,  2.6501e-02,  2.9987e-02,
         1.2623e-01, -4.6841e-02,  7.9813e-02, -2.172

In [6]:
frames_collect(sample_input_3)

The output size: is torch.Size([1, 1, 256, 8, 14, 14])
The output is:  tensor([ 8.1665e-02,  3.4364e-02,  6.1930e-02,  9.1873e-03,  4.9294e-02,
        -5.1080e-02, -9.9974e-02, -2.7652e-02, -2.8828e-02,  7.5942e-02,
         5.4733e-02, -1.2082e-01, -4.3095e-02,  7.4740e-03, -2.4453e-02,
         6.8239e-02,  1.0285e-01, -6.1899e-02, -7.7998e-03, -5.7646e-02,
         8.9190e-02, -2.4081e-03,  7.6920e-02, -1.2542e-01, -9.0308e-02,
         4.3935e-02, -6.1942e-02, -1.2831e-02,  3.3199e-02,  1.3606e-01,
        -6.9822e-02, -6.9676e-02, -5.0564e-02,  2.4191e-02, -6.1515e-02,
         3.1978e-02,  5.5891e-02,  3.8564e-02, -9.3211e-03,  2.6767e-02,
        -4.7748e-02, -1.3109e-02,  8.1868e-03, -4.0652e-02, -1.3115e-01,
        -1.1526e-01, -2.2859e-02,  3.0477e-02, -1.2870e-01, -4.5798e-03,
        -3.2916e-02,  3.7236e-02,  4.5703e-02,  5.1948e-02,  1.1599e-01,
        -1.5780e-02, -1.8405e-02,  1.8339e-01, -1.2520e-01,  6.5151e-02,
         7.2755e-02,  2.0302e-02,  6.5847e-03,  6.444

In [7]:
frames_collect(sample_input_4)

The output size: is torch.Size([9, 1, 256, 8, 14, 14])
The output is:  tensor([-0.2307, -0.0642, -0.0689, -0.1007, -0.2788,  0.1421,  0.1256,  0.0864,
        -0.0403,  0.1457,  0.1639,  0.3238, -0.0476, -0.3572,  0.2370, -0.0010,
         0.1660, -0.1253, -0.1086, -0.1417, -0.2164, -0.0291,  0.0639, -0.1175,
         0.3338,  0.2228, -0.0900, -0.2499, -0.0356,  0.2439, -0.0643, -0.1139,
         0.0230, -0.1626,  0.1055,  0.1143,  0.1869, -0.0109,  0.0686,  0.1791,
        -0.0630, -0.0791,  0.1917,  0.0611, -0.1312,  0.1573, -0.1952, -0.1504,
         0.1373, -0.1132, -0.1429,  0.0370,  0.1789,  0.1670,  0.2650,  0.0116,
        -0.0111, -0.2472,  0.0112, -0.0300,  0.2424, -0.0268, -0.0276, -0.0553,
         0.1722, -0.0864, -0.1903,  0.0412, -0.0608, -0.0573, -0.2163, -0.0894,
        -0.1644, -0.0783, -0.1227,  0.1677,  0.2340,  0.4091, -0.2211,  0.0345,
         0.0750, -0.0853,  0.1347, -0.0926,  0.0826,  0.0567, -0.0686,  0.0645,
         0.1049, -0.0031,  0.3269,  0.0791, -0.38

# Gather output for layer4

In [8]:
# Load the pre-trained model and sift to eval mode
model = models.video.r3d_18(pretrained=True)
model.eval()
#Defining the hook for the output
def output_hook(desired_layers):
    layer_outputs = {}
     # Gather the output for modules for the specified layers
    def hook_function(module, input, output):
        for name, mod in model.named_modules():
            if mod == module:
                module_name = name
        if module_name in desired_layers:
            layer_outputs[module_name] = output.detach()
    # Registering the hook for the layer in desired_layer
    for name, module in model.named_modules():
        if name in desired_layers:
            module.register_forward_hook(hook_function)
    return layer_outputs

# Define layers to capture
desired_layers = ['layer4']
layer_outputs = output_hook(desired_layers)

#Sliding Window Technique with window_size and step
def sliding_window(frames, window_size, step):
    slided_frames = []
    total_frames = len(frames)
    #loop through every window of size 32
    for i in range(0, total_frames - window_size +1, step):
        current_slide =[]
        for j in range(window_size):
            current_slide.append(frames[i+j])
        slided_frames.append(current_slide)
    return slided_frames
    
# Function to preprocess the video and apply sliding window approach  
def frames_collect(video_file_path):    
    #Capture the video
    video_frames = []
    cap = cv2.VideoCapture(video_file_path)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        video_frames.append(frame)
        # Visualizing the video frame using OpenCV
        cv2.imshow("The captured frame is: ", frame)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows()    
    #Defining the maximum frames to 32 and step as 16
    max_frames = 32
    step = 16
    sliding_frames = []
    # Get sequences using sliding window
    if len(video_frames)>=32:
        sliding_frames = sliding_window(video_frames, max_frames, step)
    else:
        print("Skipping the video", video_file_path)
        return
        #diff = 32-len(video_frames)
        #for i in range(0, diff):
            #video_frames.append(video_frames[i])
    
    # Transformation pipeline
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor()
    ])
    #Gathering all layer outputs ffrom the sliding window function
    slided_output_layer = []
    transformed_frame=[]
    for frames in sliding_frames:
        transformed_frame=[]
        for frame in frames:
            transform_frame=transform(frame)
            transformed_frame.append(transform_frame)
        tensor_frames = torch.stack(transformed_frame)
        tensor_frames = tensor_frames.permute(1, 0, 2, 3).unsqueeze(0)
        # Append the layers
        with torch.no_grad():
            output = model(tensor_frames)
        for layer_name, output in layer_outputs.items():
            slided_output_layer.append(output)
    # Stack outputs along a new dimension
    stack_layers = torch.stack(slided_output_layer, dim=0)
    # Apply max pooling across the windows (dim=0)
    maxpooling_output = torch.max(stack_layers, dim=0).values
    maxpooling_output = maxpooling_output.squeeze(0)
    output_current_layer = maxpooling_output.mean(dim=[1,2,3])
    print("The output is: ", output_current_layer.shape)
    print("The output size: is",output_current_layer)

In [9]:
frames_collect(sample_input_1)

The output is:  torch.Size([512])
The output size: is tensor([7.7579e-01, 4.5129e-01, 2.0717e-01, 1.6730e+00, 8.9261e-01, 3.6058e-01,
        2.4383e-01, 5.6395e-02, 1.7158e-01, 2.5123e-01, 3.4304e-01, 1.3392e+00,
        2.9629e-01, 2.7374e-01, 3.4943e-01, 3.2740e-01, 2.7913e-01, 9.6901e-01,
        7.1079e-01, 5.3321e-01, 3.8604e-01, 3.0036e-01, 5.4745e-01, 2.0259e-01,
        7.1918e-01, 7.5260e-01, 4.9647e-02, 2.1695e-01, 6.9807e-01, 1.0871e+00,
        4.1526e-01, 2.7820e-02, 5.7344e-01, 4.3983e-01, 5.9127e-01, 3.0679e-01,
        5.5573e-01, 7.2641e-01, 1.8070e-02, 1.5918e+00, 1.5167e-01, 1.2040e-01,
        4.5441e-01, 6.0110e-01, 1.4058e+00, 7.6430e-02, 2.6468e-01, 4.4147e-02,
        3.6267e-01, 1.6038e+00, 1.4299e+00, 6.2218e-01, 4.0504e-01, 3.3926e-01,
        8.3149e-01, 4.0040e-01, 9.8210e-01, 4.4458e-01, 3.9537e-03, 5.5132e-01,
        1.5499e-01, 5.2958e-01, 5.2714e-01, 1.9877e+00, 1.6646e+00, 4.6035e-01,
        2.7412e-01, 4.4741e-01, 3.3356e-01, 2.5586e-01, 2.7008e-01

In [10]:
frames_collect(sample_input_2)

The output is:  torch.Size([512])
The output size: is tensor([1.6837, 0.6771, 0.7004, 0.6793, 1.2531, 0.3773, 0.3125, 0.4773, 0.6487,
        0.5478, 1.4792, 1.7842, 0.6825, 0.9925, 1.1459, 1.1041, 1.0127, 2.1688,
        1.7884, 0.6928, 1.1464, 0.4939, 0.7932, 0.9736, 1.0556, 1.1016, 0.8514,
        1.0055, 0.9254, 1.1441, 0.5119, 1.1958, 0.5374, 1.2089, 1.1962, 0.8480,
        0.4494, 1.3445, 0.4514, 2.0262, 0.4957, 0.9180, 0.4583, 1.4063, 2.3397,
        1.0352, 1.2072, 0.3386, 0.4911, 1.7410, 2.0319, 1.0079, 1.8192, 1.0472,
        0.9110, 1.2798, 3.0166, 0.6431, 0.2193, 0.9200, 0.9569, 0.8908, 0.5870,
        0.9834, 3.0343, 1.9360, 1.7962, 1.4064, 1.7433, 1.2147, 0.8337, 1.1533,
        0.7220, 1.8619, 1.0576, 2.4818, 0.9272, 0.4529, 0.2872, 0.9707, 0.6102,
        0.9694, 1.2390, 0.7037, 0.3970, 0.6868, 1.5592, 0.9836, 0.8231, 1.7682,
        1.7261, 1.6992, 1.4541, 0.3497, 1.8535, 1.2403, 0.4826, 1.3325, 1.3015,
        0.7488, 2.2028, 0.5315, 0.3048, 2.6131, 0.8301, 1.7838, 1.

In [11]:
frames_collect(sample_input_3)

The output is:  torch.Size([512])
The output size: is tensor([0.9367, 0.4827, 1.0497, 0.4203, 0.3126, 0.4230, 1.2803, 0.3168, 0.6442,
        0.9106, 0.7423, 2.5590, 1.0426, 1.0067, 0.9097, 0.2099, 0.2941, 0.4455,
        0.3833, 0.5824, 0.0207, 0.4135, 0.8301, 0.8446, 0.5102, 1.0307, 0.7937,
        0.3483, 0.9858, 0.6824, 0.1545, 0.6911, 0.7652, 1.0923, 1.2133, 1.5107,
        0.3752, 2.1716, 0.9251, 0.7581, 0.3904, 0.1953, 0.2319, 1.0024, 1.1077,
        0.4836, 0.6344, 0.9908, 0.7618, 2.8145, 0.5304, 1.2291, 1.4844, 1.0228,
        0.2132, 0.5384, 0.4033, 0.5564, 0.7473, 0.8041, 0.1867, 1.1768, 0.1432,
        0.5795, 0.7811, 0.0133, 0.7557, 1.0523, 1.9415, 0.7934, 0.2676, 1.0487,
        1.9105, 2.5016, 1.7352, 0.1147, 0.1244, 0.2670, 0.1145, 0.2925, 0.3337,
        0.9030, 0.9493, 1.3562, 0.2458, 1.0983, 1.0096, 1.2707, 0.8021, 1.8631,
        0.7592, 0.5140, 0.3852, 0.8677, 0.7106, 1.4081, 0.3101, 0.3991, 0.9322,
        1.0870, 0.2850, 0.5233, 0.3938, 2.3760, 1.3855, 2.0014, 0.

In [12]:
frames_collect(sample_input_4)

The output is:  torch.Size([512])
The output size: is tensor([2.4411, 0.5666, 1.8598, 2.0171, 1.9870, 1.8907, 2.0599, 1.3425, 0.8550,
        2.7212, 1.6989, 1.2895, 1.3887, 1.9664, 0.9090, 1.5021, 1.9278, 1.6439,
        2.8154, 1.6013, 1.0401, 1.3525, 1.4834, 2.5557, 1.0083, 1.6117, 0.9043,
        1.1451, 1.3762, 2.2180, 0.3626, 0.6087, 2.4042, 1.2277, 1.4794, 1.1843,
        1.3084, 1.0991, 0.7163, 2.8924, 1.4175, 1.5645, 0.6755, 2.5139, 2.5528,
        1.3097, 0.7739, 0.3060, 1.3931, 1.0105, 2.1934, 3.1912, 1.5876, 1.7339,
        2.1120, 2.0152, 2.5356, 0.9265, 1.4910, 2.1628, 1.0012, 1.5088, 0.7987,
        2.6206, 1.2426, 1.9988, 1.3777, 2.1804, 2.5331, 1.6962, 0.7735, 1.3231,
        2.1047, 0.8420, 1.2953, 1.6759, 1.2353, 1.3414, 1.4294, 1.3430, 2.3848,
        3.3447, 2.1617, 1.1585, 1.7472, 0.7377, 2.9514, 1.0049, 1.8954, 1.7356,
        1.7746, 1.4275, 0.8666, 1.1147, 1.4269, 0.8324, 2.3343, 2.5240, 1.4658,
        1.6645, 0.1974, 2.3741, 0.9591, 1.3320, 0.4813, 1.1087, 2.

# Gather output for avgpool

In [13]:
# Load the pre-trained model and sift to eval mode
model = models.video.r3d_18(pretrained=True)
model.eval()
#Defining the hook for the output
def output_hook(desired_layers):
    layer_outputs = {}
     # Gather the output for modules for the specified layers
    def hook_function(module, input, output):
        for name, mod in model.named_modules():
            if mod == module:
                module_name = name
        if module_name in desired_layers:
            layer_outputs[module_name] = output.detach()
    # Registering the hook for the layer in desired_layer
    for name, module in model.named_modules():
        if name in desired_layers:
            module.register_forward_hook(hook_function)
    return layer_outputs

# Define layers to capture
desired_layers = ['avgpool']
layer_outputs = output_hook(desired_layers)

#Sliding Window Technique with window_size and step
def sliding_window(frames, window_size, step):
    slided_frames = []
    total_frames = len(frames)
    #loop through every window of size 32
    for i in range(0, total_frames - window_size +1, step):
        current_slide =[]
        for j in range(window_size):
            current_slide.append(frames[i+j])
        slided_frames.append(current_slide)
    return slided_frames
    
# Function to preprocess the video and apply sliding window approach  
def frames_collect(video_file_path):    
    #Capture the video
    video_frames = []
    cap = cv2.VideoCapture(video_file_path)
    if not cap.isOpened():
        print("Couldnot read the video")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        video_frames.append(frame)
        # Visualizing the video frame using OpenCV
        cv2.imshow("The captured frame is: ", frame)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
    
    # Release and destroy the windows
    cap.release()
    cv2.destroyAllWindows()    
    #Defining the maximum frames to 32 and step as 16
    max_frames = 32
    step = 16
    sliding_frames = []
    # Get sequences using sliding window
    if len(video_frames)>=32:
        sliding_frames = sliding_window(video_frames, max_frames, step)
    else:
        print("Skipping the video", video_file_path)
        return
        #diff = 32-len(video_frames)
        #for i in range(0, diff):
            #video_frames.append(video_frames[i])
    # Transformation pipeline
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor()
    ])
    #Gathering all layer outputs ffrom the sliding window function
    slided_output_layer = []
    transformed_frame=[]
    for frames in sliding_frames:
        transformed_frame=[]
        for frame in frames:
            transform_frame=transform(frame)
            transformed_frame.append(transform_frame)
        tensor_frames = torch.stack(transformed_frame)
        tensor_frames = tensor_frames.permute(1, 0, 2, 3).unsqueeze(0)
        # Append the layers
        with torch.no_grad():
            output = model(tensor_frames)
        for layer_name, output in layer_outputs.items():
            slided_output_layer.append(output)
    # Stack outputs along a new dimension
    stack_layers = torch.stack(slided_output_layer, dim=0)
    # Apply max pooling across the windows (dim=0)
    maxpooling_output = torch.max(stack_layers, dim=0).values
    maxpooling_output = maxpooling_output.squeeze(0)
    output_current_layer = maxpooling_output.mean(dim=[1,2,3])
    print("The output is: ", output_current_layer)
    print("The output size: is",output_current_layer.shape)

In [14]:
frames_collect(sample_input_1)

The output is:  tensor([7.7579e-01, 4.5129e-01, 2.0717e-01, 1.6730e+00, 8.9261e-01, 3.6058e-01,
        2.4383e-01, 5.6395e-02, 1.7158e-01, 2.5123e-01, 3.4304e-01, 1.3392e+00,
        2.9629e-01, 2.7374e-01, 3.4943e-01, 3.2740e-01, 2.7913e-01, 9.6901e-01,
        7.1079e-01, 5.3321e-01, 3.8604e-01, 3.0036e-01, 5.4745e-01, 2.0259e-01,
        7.1918e-01, 7.5260e-01, 4.9647e-02, 2.1695e-01, 6.9807e-01, 1.0871e+00,
        4.1526e-01, 2.7820e-02, 5.7344e-01, 4.3983e-01, 5.9127e-01, 3.0679e-01,
        5.5573e-01, 7.2641e-01, 1.8070e-02, 1.5918e+00, 1.5167e-01, 1.2040e-01,
        4.5441e-01, 6.0110e-01, 1.4058e+00, 7.6430e-02, 2.6468e-01, 4.4147e-02,
        3.6267e-01, 1.6038e+00, 1.4299e+00, 6.2218e-01, 4.0504e-01, 3.3926e-01,
        8.3149e-01, 4.0040e-01, 9.8210e-01, 4.4458e-01, 3.9537e-03, 5.5132e-01,
        1.5499e-01, 5.2958e-01, 5.2714e-01, 1.9877e+00, 1.6646e+00, 4.6035e-01,
        2.7412e-01, 4.4741e-01, 3.3356e-01, 2.5586e-01, 2.7008e-01, 8.6741e-02,
        1.1212e+00, 3.73

In [15]:
frames_collect(sample_input_2)

The output is:  tensor([1.5335, 0.4238, 0.4668, 0.5618, 0.8520, 0.2227, 0.2807, 0.3332, 0.4589,
        0.3058, 1.0729, 1.4955, 0.4570, 0.7664, 0.8046, 0.7344, 0.6005, 1.7180,
        1.6090, 0.4628, 0.7581, 0.2809, 0.4536, 0.7133, 0.8271, 0.7206, 0.7130,
        0.5719, 0.5614, 0.9395, 0.3142, 1.0028, 0.4288, 0.9812, 0.9470, 0.5246,
        0.2880, 0.9706, 0.4048, 1.5862, 0.2256, 0.7025, 0.3525, 1.1178, 1.9905,
        0.7110, 0.7580, 0.2655, 0.2853, 1.2196, 1.6573, 0.8917, 1.4085, 0.7297,
        0.5269, 0.8930, 2.3949, 0.4386, 0.1402, 0.5878, 0.7193, 0.5694, 0.4566,
        0.8101, 2.4654, 1.4529, 1.3236, 1.1483, 1.3578, 0.8419, 0.5992, 0.6983,
        0.6348, 1.2482, 0.6887, 1.8605, 0.6350, 0.2882, 0.1456, 0.7113, 0.3922,
        0.7331, 0.7470, 0.5255, 0.1934, 0.4903, 1.0999, 0.6590, 0.5998, 1.2706,
        1.3048, 1.3398, 1.1573, 0.2108, 1.5209, 0.7340, 0.3765, 0.8705, 0.8831,
        0.4856, 1.7266, 0.2835, 0.2238, 1.8971, 0.3803, 1.2640, 1.3930, 3.3544,
        1.0169, 1.1262, 

In [16]:
frames_collect(sample_input_3)

The output is:  tensor([0.9367, 0.4827, 1.0497, 0.4203, 0.3126, 0.4230, 1.2803, 0.3168, 0.6442,
        0.9106, 0.7423, 2.5590, 1.0426, 1.0067, 0.9097, 0.2099, 0.2941, 0.4455,
        0.3833, 0.5824, 0.0207, 0.4135, 0.8301, 0.8446, 0.5102, 1.0307, 0.7937,
        0.3483, 0.9858, 0.6824, 0.1545, 0.6911, 0.7652, 1.0923, 1.2133, 1.5107,
        0.3752, 2.1716, 0.9251, 0.7581, 0.3904, 0.1953, 0.2319, 1.0024, 1.1077,
        0.4836, 0.6344, 0.9908, 0.7618, 2.8145, 0.5304, 1.2291, 1.4844, 1.0228,
        0.2132, 0.5384, 0.4033, 0.5564, 0.7473, 0.8041, 0.1867, 1.1768, 0.1432,
        0.5795, 0.7811, 0.0133, 0.7557, 1.0523, 1.9415, 0.7934, 0.2676, 1.0487,
        1.9105, 2.5016, 1.7352, 0.1147, 0.1244, 0.2670, 0.1145, 0.2925, 0.3337,
        0.9030, 0.9493, 1.3562, 0.2458, 1.0983, 1.0096, 1.2707, 0.8021, 1.8631,
        0.7592, 0.5140, 0.3852, 0.8677, 0.7106, 1.4081, 0.3101, 0.3991, 0.9322,
        1.0870, 0.2850, 0.5233, 0.3938, 2.3760, 1.3855, 2.0014, 0.7741, 1.7549,
        0.3006, 0.7468, 

In [17]:
frames_collect(sample_input_4)

The output is:  tensor([1.4409, 0.2856, 1.0468, 1.3077, 1.1834, 1.2234, 1.2630, 1.0709, 0.4479,
        2.0818, 1.0171, 0.7398, 0.9103, 1.4350, 0.3696, 1.0779, 1.4036, 0.7182,
        2.2314, 1.1611, 0.6076, 0.9305, 0.9713, 1.9461, 0.4874, 0.8314, 0.4516,
        0.6637, 0.7863, 1.1021, 0.1933, 0.2794, 1.8596, 0.7272, 0.8621, 0.5504,
        0.7251, 0.5561, 0.4284, 1.9499, 0.8450, 1.1227, 0.3555, 1.7234, 1.5830,
        1.0052, 0.4463, 0.1038, 0.8854, 0.3777, 1.3387, 2.2872, 1.2170, 0.9491,
        1.3649, 1.1907, 1.7864, 0.3595, 1.0697, 1.4559, 0.4935, 0.8914, 0.3439,
        1.8230, 0.8692, 1.4212, 0.7097, 1.3653, 1.5365, 0.9737, 0.3791, 0.6992,
        1.5996, 0.3821, 0.7864, 1.1483, 0.8616, 0.7107, 1.1375, 0.6962, 1.7805,
        2.6542, 1.3253, 0.6685, 1.1710, 0.2853, 2.0289, 0.4045, 1.2759, 0.7948,
        1.1902, 0.6980, 0.4122, 0.5687, 0.7518, 0.4956, 1.9488, 1.7498, 0.8833,
        1.0536, 0.0978, 1.4926, 0.4372, 0.8673, 0.2266, 0.5248, 1.4031, 1.2539,
        0.8904, 1.0884, 