# Transforming to tensors

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

In [None]:
# train_tensors = list(annot_train150['attachment_id'])
# val_tensors = list(annot_val50['attachment_id'])

In [None]:
# Image preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each of the three channels with the given mean / std.
# NOTE(review): the mean/std values look like the standard ImageNet channel
# statistics - confirm they match what the model was trained with.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
class PaddedSignLanguageDataset(Dataset):
    """Dataset of pre-extracted sign language clips, zero-padded to a fixed length.

    Every row of ``annotations`` provides a file path in its ``attachment_id``
    column (read with ``torch.load``) and a label in its ``text`` column.
    """

    # Clip length used when the caller does not pass ``max_length``.
    DEFAULT_MAX_LENGTH = 132

    def __init__(self, annotations, transform=None, max_length=None):
        """
        Args:
        annotations (DataFrame): DataFrame with ``attachment_id`` (path to a
            saved clip) and ``text`` (label) columns.
        transform (callable, optional): Optional transform applied to the
            padded clip tensor.
        max_length (int, optional): Number of frames every returned clip is
            padded (or truncated) to. Defaults to ``DEFAULT_MAX_LENGTH``.

        Raises:
        ValueError: If the resulting ``max_length`` is not positive.
        """
        self.annotations = annotations
        self.transform = transform
        # Honor an explicit value; otherwise keep the historical default of 132
        # frames rather than scanning (and loading) every file up front.
        self.max_length = self.DEFAULT_MAX_LENGTH if max_length is None else max_length

        if self.max_length <= 0:
            raise ValueError(f"max_length must be positive, got {self.max_length}")

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(padded_tensor, label)``.

        ``padded_tensor`` has ``max_length`` entries along its first dimension;
        positions past the end of the clip are zeros and clips longer than
        ``max_length`` are truncated. When the stored clip is empty or ``None``
        a message is printed and ``(None, label)`` is returned.
        NOTE(review): ``DataLoader``'s default collate function cannot batch
        ``None`` - filter such rows or supply a custom ``collate_fn``.
        """
        row = self.annotations.iloc[idx]
        tensor_path = row['attachment_id']
        label = row['text']

        # Load the clip on CPU. torch.load unpickles the file, so only point
        # the annotations at files from a trusted source.
        frames = torch.load(tensor_path, map_location=torch.device('cpu'))

        # Check if the clip is empty or None
        if frames is None or len(frames) == 0:
            print(f"Empty tensor found at index {idx}.")
            return None, label

        # Accept either a sequence of per-frame tensors or an already stacked
        # tensor whose first dimension indexes frames.
        if not isinstance(frames, torch.Tensor):
            frames = torch.stack(list(frames))

        # Truncate over-long clips so the copy below always fits.
        frames = frames[:self.max_length]

        # Zero-pad along the frame dimension up to max_length.
        padded_tensor = torch.zeros((self.max_length, *frames.shape[1:]))
        padded_tensor[:frames.shape[0]] = frames

        # Apply transform if any
        if self.transform is not None:
            padded_tensor = self.transform(padded_tensor)

        return padded_tensor, label

In [None]:
# Wrap the training and validation annotation frames in padded datasets
# (no transform, default padding length).
train_dataset, val_dataset = (
    PaddedSignLanguageDataset(split_annotations)
    for split_annotations in (annot_train150, annot_val50)
)

In [None]:
num_classes = 10

# Batches of four clips each; only the training split is shuffled.
train_data_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
val_data_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=4)

# Model

In [None]:
# Path of the serialized TorchScript model, loaded with torch.jit.load.
model_path = "/kaggle/input/mvit16-4/mvit16-4.pt"
model = torch.jit.load(f=model_path)

In [None]:
# Switch the loaded module to evaluation mode before running inference.
model.eval()

# window_size: frames per model call (16, taken from the model name).
# threshold: minimum top score for a prediction to be accepted.
# frame_interval: not referenced by the visible cells.
window_size, threshold, frame_interval = 16, 0.5, 1

# Per-channel mean / std on the 0-255 scale (255 x the values passed to
# Normalize in `transform`); not referenced by the visible cells.
mean = [123.675, 116.28, 103.53]
std = [58.395, 57.12, 57.375]

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# loaded module's repr.
model

In [None]:
# Run the model over every clip of the training loader in consecutive,
# non-overlapping windows of `window_size` frames and collect the sequence of
# confidently predicted glosses.
#
# `classes` (class index -> gloss) must be defined before this cell runs; it is
# not defined in the visible cells.
#
# NOTE(review): the previous version of this cell ended with video-overlay code
# (cv2.putText / writer.write / writer.release / cap.release). It sat after a
# `while True` loop with no exit and read `frame` before any assignment, so it
# could not run; this loop has no video frames to draw on, so the predictions
# are printed instead.

# Sentinel first entry so prediction_list[-1] is always valid.
prediction_list = ["---"]

for tensor, label in train_data_loader:
    # `tensor` is a batch of padded clips: walk the clips one by one so the
    # window slides over frames rather than over batch entries.
    for clip in tensor:
        num_frames = clip.shape[0]
        # A trailing partial window (fewer than window_size frames) is skipped.
        for start in range(0, num_frames - window_size + 1, window_size):
            window = clip[start:start + window_size]
            # Swap the first two axes, mirroring the commented-out
            # np.stack(frames, axis=1) of the earlier draft.
            # NOTE(review): assumes each frame is (C, H, W), giving
            # (C, T, H, W); the model may also expect leading batch
            # dimension(s) - confirm against the model's expected input.
            input_tensor = window.transpose(0, 1).contiguous().float()
            with torch.no_grad():
                outputs = model(input_tensor)[0]
            # Ignore windows whose top score is not confident enough.
            if outputs.max() <= threshold:
                continue
            gloss = str(classes[outputs.argmax().item()])
            # Skip the "---" class and immediate repeats of the last gloss.
            if gloss != "---" and gloss != prediction_list[-1]:
                prediction_list.append(gloss)

text = "  ".join(prediction_list)
print(text)