In [1]:
import pandas as pd
import numpy as np
import cv2
import torch
import os

In [None]:
# Pick the compute device once: the CUDA GPU when torch can see one, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

In [3]:
# Read the tab-separated annotation table, then preview its first five rows.
annot = pd.read_csv("data/annotations.csv", delimiter="\t")
annot.head()

Unnamed: 0,attachment_id,text,user_id,height,width,length,train,begin,end
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,1920,1080,156.0,True,36,112
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,1920,1080,150.0,True,36,76
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,1920,1080,133.0,True,40,97
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,1920,1080,144.0,True,43,107
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,1920,1080,96.0,True,20,70


In [4]:
# Phrases (values of the `text` column) that the subsets below are restricted to.
labels = [
    "Привет!",
    "Добро пожаловать!",
    "С днем рождения",
    "Пока",
    "MakDonalds",
]

In [6]:
# Training rows whose `text` is one of the selected labels.
# `.copy()` makes the subset an independent DataFrame, so writing into it later
# (e.g. with `.loc[...] = ...`) does not raise SettingWithCopyWarning.
train150 = annot.query("text in @labels and train").copy()

In [7]:
# Rows not flagged as `train` whose `text` is one of the selected labels.
# `.copy()` makes the subset an independent DataFrame, so writing into it later
# (e.g. with `.loc[...] = ...`) does not raise SettingWithCopyWarning.
# NOTE(review): the recorded output below shows 25 rows, not 50 as the name suggests.
val50 = annot.query("text in @labels and not train").copy()
val50.shape

(25, 9)

In [8]:
def crop_frame(frame):
    """
    Pads the frame with zeros so that it becomes square.

    Despite its name, this function never removes pixels: the shorter spatial
    dimension is extended with zero-valued borders until it matches the longer
    one. When the difference is odd, the extra row/column goes to the
    bottom/right side. A frame that is already square is returned as an
    unpadded float64 copy.

    :param frame: image array of shape (height, width, channels) or (height, width)
    :return: new float64 array whose first two dimensions both equal
             max(height, width); the original pixels sit in the middle
    """
    frame = np.asarray(frame)
    height, width = frame.shape[:2]

    dif = abs(height - width)
    first_side = dif // 2
    second_side = dif - first_side

    # Landscape (or square) frames grow along the rows, portrait frames along the columns.
    pad_axis = 0 if width >= height else 1

    # Only the chosen spatial axis is padded; the channel axis, if present, is untouched.
    pad_width = [(0, 0)] * frame.ndim
    pad_width[pad_axis] = (first_side, second_side)

    # Cast to float64 first: that is the dtype callers have always received, because the
    # zero borders used to be created as float64 and concatenated with the frame.
    return np.pad(frame.astype(np.float64), pad_width, mode="constant", constant_values=0)

In [9]:
def load_video(path, img_size, num_frames=132, frame_step=4):
    """
    Loads a video and returns its sub-sampled frames as tensors.

    Every ``frame_step``-th decoded frame (indices 0, frame_step, 2*frame_step, ...)
    is made square with ``crop_frame``, resized to ``img_size`` x ``img_size``,
    reordered from BGR to RGB, converted to a channels-first tensor and moved to
    the notebook-level ``device``.

    :param path: path to the video
    :param img_size: side length, in pixels, of the resized square frames
    :param num_frames: accepted for backward compatibility.
                       NOTE(review): this value is not used; the number of returned
                       frames depends only on the video length and ``frame_step``.
                       Confirm the intended meaning before enforcing it.
    :param frame_step: keep one frame out of every ``frame_step`` decoded frames
    :return: list of tensors of shape (3, img_size, img_size); the list is empty
             when no frame could be read from ``path``
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        frame_index = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream, or the video could not be opened/decoded.
                break
            if frame_index % frame_step == 0:
                frame = crop_frame(frame)
                frame = cv2.resize(frame, (img_size, img_size))
                frame = frame[:, :, [2, 1, 0]]  # BGR to RGB
                # (H, W, C) -> (C, H, W)
                frame_tensor = torch.Tensor(frame).permute(2, 0, 1).to(device)
                frames.append(frame_tensor)
            frame_index += 1
    finally:
        # Release the capture handle even if processing a frame raises.
        cap.release()

    return frames

In [10]:
from pathlib import Path

# Output folder for the serialized frame tensors; created (with parents) when missing.
tensor_dir = "tensors"
os.makedirs(tensor_dir, exist_ok=True)

In [None]:
def process_and_save_tensors(annot_subset, subset_name, subdir='train',
                             video_root="/kaggle/input/slovo/slovo/",
                             img_size=244, tensor_dir="tensors"):
    """
    Converts every video of a subset to frame tensors and saves them to disk.

    For each row, the video ``<video_root>/<subdir>/<attachment_id>.mp4`` is loaded
    with ``load_video`` and the resulting list of frame tensors is written to
    ``<tensor_dir>/<subset_name>_<attachment_id>.pt``. A copy of the annotations,
    with ``attachment_id`` replaced by those tensor paths, is written to
    ``processed_annotations_<subset_name>.csv`` in the working directory.

    The input DataFrame is NOT modified, so the function can be re-run on the
    same subset. Earlier versions rewrote ``attachment_id`` in place, which made
    a second run build video paths from tensor paths.

    :param annot_subset: DataFrame with annotations; must have an ``attachment_id`` column
    :param subset_name: name of the subset, used in tensor and CSV file names
    :param subdir: sub-folder of ``video_root`` that holds this subset's videos
    :param video_root: folder containing the per-subset video folders
    :param img_size: side length passed to ``load_video`` for resizing frames
    :param tensor_dir: folder the ``.pt`` files are written to (created if missing)
    :return: copy of ``annot_subset`` whose ``attachment_id`` column holds the tensor paths
    """
    # Make the function self-sufficient: do not rely on another cell having created the folder.
    os.makedirs(tensor_dir, exist_ok=True)

    tensor_paths = []
    for count, video_id in enumerate(annot_subset['attachment_id'], start=1):
        full_path = os.path.join(video_root, str(subdir), f"{video_id}.mp4")

        # Load and process video
        frames = load_video(full_path, img_size)

        # Save the frame tensors to disk and remember where they went
        tensor_path = os.path.join(tensor_dir, f"{subset_name}_{video_id}.pt")
        torch.save(frames, tensor_path)
        tensor_paths.append(str(tensor_path))

        if count % 10 == 0:
            print(f"We are done on the image number {count}")

    # Build the processed table on a copy so the caller's DataFrame keeps the raw ids.
    processed = annot_subset.copy()
    processed['attachment_id'] = tensor_paths

    # Save DataFrame to CSV
    processed.to_csv(f"processed_annotations_{subset_name}.csv", index=False)
    return processed


In [None]:
# Convert both subsets: training clips are read from "train", validation clips from "test".
for split_frame, split_name, split_dir in (
    (train150, "train", "train"),
    (val50, "valid", "test"),
):
    process_and_save_tensors(split_frame, split_name, subdir=split_dir)