## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, WeightedRandomSampler, Dataset

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import cv2
import os

## Load the dataset

In [None]:
action_space = ['walking', 'running', 'jogging', 'boxing', 'handwaving', 'handclapping']
# Build the path from components (keeping the trailing separator) so the notebook
# is not tied to Windows-style backslashes.
root_folder = os.path.join(".", "dataset", "")

# Maps each action name to one array holding every decoded frame of that action
# (original frames plus their flipped copies), stored as float32 in [0, 1].
frames_dictionary = {}

for action in action_space:
    folder = os.path.join(root_folder, action)
    # List the directory once; its length only drives the progress display.
    filenames = os.listdir(folder)
    size = len(filenames)

    frames = []
    for file_num, filename in enumerate(filenames, start=1):
        print(f"loading {action} [{file_num:03}/{size:03}]", end="\r")
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue

        cap = cv2.VideoCapture(file_path)
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # The videos are treated as grayscale, so only the first channel is kept.
                # Normalize to [0, 1] and store as float32: the dataset class casts to
                # float32 anyway, and float64 would double the memory footprint.
                frame = (frame[:, :, 0] / 255.0).astype(np.float32)

                # Keep the original frame plus its horizontal and vertical mirror images.
                frames.append(frame)
                frames.append(np.fliplr(frame))
                frames.append(np.flipud(frame))
        finally:
            # Always free the decoder, even if reading a frame raises.
            cap.release()

    # Convert once per action. Doing this after every file would re-copy the whole
    # list each time, and an empty folder would leave a plain list without `.shape`.
    frames_dictionary[action] = np.array(frames)
    print(f"loaded {action:23}")

loaded walking                
loaded running                
loaded jogging                
loaded boxing                 
loaded handwaving             
loaded handclapping           


## A brief overview of our gathered data
Each action's array has shape (frames, height, width). The frame count includes the horizontally and vertically flipped copies added during loading.

In [14]:
# Print one line per action: the action name (padded to 9 characters) and the shape
# of its frame array. The label is built outside the f-string because reusing the
# enclosing quote character inside a replacement field is a SyntaxError before
# Python 3.12.
for action in action_space:
    label = action + ":"
    print(f"{label:9}\t{frames_dictionary[action].shape}")


walking: 	(3690, 120, 160)
running: 	(2100, 120, 160)
jogging: 	(2520, 120, 160)
boxing:  	(2250, 120, 160)
handwaving:	(3372, 120, 160)
handclapping:	(2784, 120, 160)


## Split the data into train, validation, and test sets

In [None]:
class ActionDataset(Dataset):
    """Map-style dataset that pairs images with integer class labels.

    Args:
        images: Indexable collection of images; its length is the dataset size.
        labels: Indexable collection of labels, aligned index-for-index with ``images``.
        transform: Optional callable applied to the image tensor before it is returned.
    """

    def __init__(self, images, labels, transform=None):
        self.images, self.labels, self.transform = images, labels, transform

    def __len__(self):
        """Return the number of images held by the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` at ``idx`` as a float32 tensor and an int64 tensor."""
        raw_image, raw_label = self.images[idx], self.labels[idx]

        image_tensor = torch.tensor(raw_image, dtype=torch.float32)
        label_tensor = torch.tensor(raw_label, dtype=torch.long)

        # Without a transform the freshly built tensor is returned untouched.
        if not self.transform:
            return image_tensor, label_tensor
        return self.transform(image_tensor), label_tensor


In [None]:
# One seed drives both splits and the sampler, so a fresh run reproduces the same
# partitions and the same training-batch order.
SEED = 42
BATCH_SIZE = 32

all_frames = []
all_actions = []

# Each action's integer label is its position in frames_dictionary (insertion order).
for label_index, action_frames in enumerate(frames_dictionary.values()):
    all_frames.extend(action_frames)
    all_actions.extend([label_index] * len(action_frames))

# float32 is what the dataset class emits anyway; storing float64 here would double
# the memory of `images` and of every split copied from it.
images = np.asarray(all_frames, dtype=np.float32)
labels = np.array(all_actions)

# NOTE(review): frames_dictionary holds every frame together with its flipped copies,
# and a per-frame random split can place those copies (and neighbouring frames of the
# same video) in different partitions. Consider splitting by video to avoid leakage
# between train, validation and test — confirm against the intended evaluation.

# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    images, labels,
    stratify=labels,
    test_size=0.3,
    random_state=SEED
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=SEED
)


train_dataset = ActionDataset(X_train, y_train, transform=None)
val_dataset = ActionDataset(X_val, y_val, transform=None)
test_dataset = ActionDataset(X_test, y_test, transform=None)

# Create sample weights based on training label frequency: every sample is weighted
# by the inverse of its class count, so each class is drawn about equally often.
class_sample_count = np.bincount(y_train)
class_weights = 1. / class_sample_count
sample_weights = class_weights[y_train]

# Seed the sampler explicitly; without a generator its draws come from the global
# torch RNG and change from run to run.
sampler_generator = torch.Generator().manual_seed(SEED)
sampler = WeightedRandomSampler(
    sample_weights,
    len(sample_weights),
    replacement=True,
    generator=sampler_generator,
)

# Dataloaders: training draws through the weighted sampler; validation and test
# iterate in fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## TODO

1. Can't load all images at once, so implement loading of images in batches.
2. AutoEncoder training
