## Copy of the code run in Kaggle to train the model using supervised learning.

#### Paths are based on Kaggle input/working directories

### Initialize the data collection/preprocessing function

In [None]:
import cv2
import os
import torch
import json
from tqdm import tqdm

def compile_data(root, requested_split, split_file, num_classes):
    """Collect the usable video samples for one split of the dataset.

    An entry of the annotation file is kept only if it belongs to the
    requested split, its ``<id>.mp4`` file exists under ``root``, the file
    reports at least 9 frames, and the annotated clip spans at least 16
    frames.

    Args:
        root: Directory containing the ``<video id>.mp4`` files.
        requested_split: ``"train"`` selects entries whose ``"subset"`` is
            ``"train"`` or ``"val"``; any other value selects ``"test"``.
        split_file: Path to a JSON file mapping each video id to a dict with
            a ``"subset"`` string and an ``"action"`` list whose first three
            items are read as class label, start frame and end frame.
        num_classes: Not used by this function; kept so that existing
            callers keep working.

    Returns:
        A tuple ``(data, count)``. ``data`` is a list of
        ``(video_id, class_label, start_frame, num_frames)`` tuples, where
        ``start_frame`` is the annotated start minus one and ``num_frames``
        is ``end - start + 1``. ``count`` is ``len(data)``.
    """
    # Close the annotation file as soon as it has been parsed.
    with open(split_file, "r") as split_fh:
        labels = json.load(split_fh)

    if requested_split == "train":
        wanted_subsets = ("train", "val")
    else:
        wanted_subsets = ("test",)

    data = []

    for vid_id, entry in tqdm(labels.items()):
        if entry["subset"] not in wanted_subsets:
            continue

        path = os.path.join(root, vid_id + ".mp4")
        if not os.path.exists(path):
            continue

        # Release the capture explicitly: one is opened per video, and
        # leaving them to the garbage collector leaks decoder handles.
        cap = cv2.VideoCapture(path)
        try:
            frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            cap.release()

        if frames < 9:
            continue

        action = entry["action"]
        cls_label = action[0]
        start_frame = action[1] - 1
        num_frames = action[2] - action[1] + 1

        # The loader samples 16 frames per clip, so shorter clips are dropped.
        if num_frames < 16:
            continue

        data.append((vid_id, cls_label, start_frame, num_frames))

    return data, len(data)
        

#### Install pytorchvideo package to help with transforms

In [None]:
!pip install pytorchvideo

#### Define the custom dataset class and data loading function

In [None]:
import torchvision 
from torch.utils.data import Dataset
import pytorchvideo

def load_frames_from_video(path, start_frame, num_frames):
    """Decode 16 evenly spaced RGB frames from an annotated clip.

    Frames are taken every ``stride`` frames starting at ``start_frame``,
    where ``stride`` is the usable clip length divided by 16 (at least 1).
    The usable length is ``num_frames`` clamped to what the file reports as
    remaining after ``start_frame``, so sampling stays inside the file when
    the annotation is longer than the video.

    Args:
        path: Path of the video file to open with OpenCV.
        start_frame: Zero-based index of the first frame of the clip.
        num_frames: Annotated length of the clip in frames.

    Returns:
        A tensor stacking the 16 decoded frames along a new first
        dimension. Each frame is the array returned by OpenCV, converted
        from BGR to RGB.

    Raises:
        RuntimeError: If one of the 16 frames cannot be decoded.
    """
    clip_len = 16
    cap = cv2.VideoCapture(path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        available = min(num_frames, total - start_frame)
        # Falls back to consecutive frames when the clip is short or the
        # container does not report a usable frame count.
        stride = max(available // clip_len, 1)

        # CAP_PROP_POS_FRAMES is zero-based, like start_frame, so no extra
        # offset is added here.
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        frames = []
        for i in range(clip_len):
            if i:
                # Skip the frames between two samples without decoding them.
                for _ in range(stride - 1):
                    cap.grab()
            ret, frame = cap.read()
            if not ret:
                raise RuntimeError(
                    f"Could not decode frame {i + 1}/{clip_len} of {path!r} "
                    f"(start_frame={start_frame}, stride={stride})"
                )
            frames.append(torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    finally:
        cap.release()

    return torch.stack(frames)

class ASLDataset(Dataset):
    """Map-style dataset yielding ``(video, label)`` pairs for one split.

    The list of samples is built once, at construction time, by
    ``compile_data``; the frames themselves are decoded on demand in
    ``__getitem__``.
    """

    def __init__(self, root, requested_split, split_file, num_classes, transforms):
        """Build the sample index and remember the configuration.

        Args:
            root: Directory holding the ``<video id>.mp4`` files.
            requested_split: Split name forwarded to ``compile_data``.
            split_file: Annotation file forwarded to ``compile_data``.
            num_classes: Number of classes, forwarded to ``compile_data``.
            transforms: Callable applied to a ``{"video": tensor}`` dict; it
                must return a dict that still has a ``"video"`` key.
        """
        samples, count = compile_data(root, requested_split, split_file, num_classes)
        self.data = samples
        self.length = count
        self.root = root
        self.requested_split = requested_split
        self.split_file = split_file
        self.num_classes = num_classes
        self.transf = transforms

    def __getitem__(self, index):
        """Return the transformed clip and its class label for ``index``."""
        sample_id, target, first_frame, frame_count = self.data[index]
        video_file = os.path.join(self.root, sample_id + ".mp4")

        clip = load_frames_from_video(video_file, first_frame, frame_count)
        # Move the last axis to the front; presumably this turns
        # (frames, height, width, channels) into channels-first layout.
        clip = clip.permute(3, 0, 1, 2)

        transformed = self.transf({"video": clip})
        return transformed["video"], target

    def __len__(self):
        """Return the number of samples found by ``compile_data``."""
        return self.length
        

### Define transforms for the datasets

In [None]:
# Transforms derived from the PyTorch official X3D training examples: https://pytorch.org/hub/facebookresearch_pytorchvideo_x3d/
from torchvision.transforms import Compose, Lambda, RandomCrop, RandomHorizontalFlip
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
    RandomCropVideo,
    RandomHorizontalFlipVideo
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample, 
    Normalize, 
    RandomShortSideScale,
)

# Per-channel normalization statistics, applied after scaling pixels to [0, 1].
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]

# Sizes shared by the training and evaluation pipelines.
transform_params = {
    "side_size": 256,
    "num_frames": 16,
    "crop_size": 256,
}

# Training pipeline: same scaling and normalization as evaluation, followed
# by random scale / crop / flip augmentation.
train_transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x / 255.0),
            # Must match test_transform: without this step the model is
            # trained on [0, 1] inputs but validated on normalized ones.
            NormalizeVideo(mean, std),
            RandomShortSideScale(min_size=transform_params["side_size"], max_size=320),
            RandomCrop(transform_params["crop_size"]),
            RandomHorizontalFlip(p=0.5),
        ]
    ),
)

# Evaluation pipeline: deterministic resize and center crop.
test_transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            ),
        ]
    ),
)


#### Load model, define dataloaders and run training loop
###### The validation loader is a work in progress: it currently uses the original test split, but that split was found to be incomplete.

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
# Kaggle input locations and the number of classes in the chosen subset.
root = "../input/wlasl-processed/videos"
split_file = "../input/wlasl-processed/nslt_100.json"
num_classes = 100

print("Make Datasets")
train_dataset = ASLDataset(root, "train", split_file, num_classes, train_transform)
val_dataset = ASLDataset(root, "test", split_file, num_classes, test_transform)

print("Make Loaders")

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2, pin_memory=True)

# Validation metrics are averaged over the whole set, so order is irrelevant.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2, pin_memory=True)

# Fall back to the CPU so the cell still runs on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = 'x3d_m'
model = torch.hub.load('facebookresearch/pytorchvideo:main', model_name, pretrained=True)

# Replace the classification layer so it outputs one score per class.
model.blocks[5].proj = nn.Linear(2048, num_classes)

# Freeze every parameter whose name does not contain "proj"; only the new
# classification layer is trained.
for name, param in model.named_parameters():
    if "proj" not in name:
        param.requires_grad = False

# To resume from a checkpoint, uncomment the two lines below.
# PATH = "./Epoch23"
# model.load_state_dict(torch.load(PATH))

model.to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.00075)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4, factor=0.3)

num_epochs = 50

print("Start Training")

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")

    # ---- training pass ----
    model.train()
    train_count = 0
    train_correct = 0
    total_train_loss = 0.0
    for data, targets in tqdm(train_loader):
        data = data.to(device)
        labels = targets.to(device)

        scores = model(data)
        train_loss = criterion(scores, labels)

        _, preds = scores.max(1)
        batch_size = labels.size(0)
        # Weight each batch by its size so the epoch mean stays exact even
        # when the last batch is smaller.
        total_train_loss += train_loss.item() * batch_size
        train_correct += (preds == labels).sum().item()
        train_count += batch_size

        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    mean_train_loss = total_train_loss / max(train_count, 1)
    train_acc = train_correct / max(train_count, 1)
    print(f"Training Loss: {mean_train_loss}, Training Acc: {train_acc}, # Correct: {train_correct}")

    # ---- validation pass ----
    model.eval()
    val_count = 0
    val_correct = 0
    total_val_loss = 0.0
    with torch.no_grad():
        for data, targets in tqdm(val_loader):
            data = data.to(device)
            labels = targets.to(device)

            scores = model(data)
            val_loss = criterion(scores, labels)

            _, preds = scores.max(1)
            batch_size = labels.size(0)
            total_val_loss += val_loss.item() * batch_size
            val_correct += (preds == labels).sum().item()
            val_count += batch_size

    mean_val_loss = total_val_loss / max(val_count, 1)
    val_acc = val_correct / max(val_count, 1)
    # Step the scheduler on the epoch mean rather than on the last batch's
    # loss, and skip it entirely when there was nothing to validate on.
    if val_count:
        scheduler.step(mean_val_loss)
    print(f"Val Loss: {mean_val_loss}, Val Acc: {val_acc}, # Correct: {val_correct}")

    torch.save(model.state_dict(), f"/kaggle/working/Epoch{epoch}")