# Model 2

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchsummary import summary

## Constants

In [2]:
# Multiplied with RES_NEXT_OUT to size the classifier input below.
# Presumably also the number of segments used for frame selection — TODO confirm.
NUM_SEGMENTS = 8
# Per-segment feature width; the name suggests a ResNeXt backbone output — TODO confirm.
RES_NEXT_OUT = 2048
# Not referenced in the cells visible here; presumably the number of training epochs.
NUM_EPOCHS = 20

In [3]:
LABELS_PATH = "jester-v1-labels.csv"

# Read every line of the labels file; the position of a line is its class index.
with open(LABELS_PATH) as labels_file:
    labels = labels_file.readlines()

# NOTE(review): readlines() keeps each line's terminator, so the label strings
# (and therefore the dictionary keys/values below) still end with "\n".
# Confirm downstream lookups expect that before stripping it.
labels_encode_dict = {label: index for index, label in enumerate(labels)}  # label -> class index
labels_decode_dict = dict(enumerate(labels))  # class index -> label

## Architecture

In [4]:
class GestureClassifier(nn.Module):
    """Fully connected classifier made of five linear layers.

    The hidden widths are derived from ``input_size`` by the fixed divisors
    1.5, 3, 6 and 12 (truncated with ``int``); the final layer produces
    ``num_classes`` raw scores. No softmax is applied in ``forward``.

    Args:
        input_size: Number of features in each input vector.
        num_classes: Number of output scores per input vector.
    """

    def __init__(self, input_size, num_classes):

        super().__init__()

        # define (widths are computed exactly as before so parameter shapes do not change)
        self.ln1 = nn.Linear(input_size, int(input_size/1.5))
        self.ln2 = nn.Linear(int(input_size/1.5), int(input_size/3))
        self.ln3 = nn.Linear(int(input_size/3), int(input_size/6))
        self.ln4 = nn.Linear(int(input_size/6), int(input_size/12))
        self.ln5 = nn.Linear(int(input_size/12), num_classes)

        # init: apply the same scheme to every layer; previously ln4 and ln5
        # were skipped and silently kept PyTorch's default initialization.
        for layer in (self.ln1, self.ln2, self.ln3, self.ln4, self.ln5):
            self.initialize_layer(layer)

    def forward(self, x):
        """Run ``x`` through the five layers, with ReLU after all but the last.

        Args:
            x: Input tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor of raw class scores with last dimension ``num_classes``.
        """
        x = torch.relu(self.ln1(x))
        x = torch.relu(self.ln2(x))
        x = torch.relu(self.ln3(x))
        x = torch.relu(self.ln4(x))
        x = self.ln5(x)
        return x

    @staticmethod
    def initialize_layer(layer):
        """Zero the bias and Kaiming-normal-initialize the weight of ``layer``.

        Attributes that are missing or ``None`` are skipped. A layer built
        with ``bias=False`` still has a ``bias`` attribute, set to ``None``,
        so a plain ``hasattr`` check is not enough.
        """
        bias = getattr(layer, "bias", None)
        if bias is not None:
            nn.init.zeros_(bias)
        weight = getattr(layer, "weight", None)
        if weight is not None:
            nn.init.kaiming_normal_(weight)

In [5]:
# Build the classifier: one RES_NEXT_OUT-wide feature vector per segment, concatenated.
model = GestureClassifier(input_size = RES_NEXT_OUT * NUM_SEGMENTS, num_classes=len(labels_encode_dict))
# Fall back to the CPU when no CUDA device is present instead of failing in model.to().
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
summary(model, input_size = (RES_NEXT_OUT*NUM_SEGMENTS,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 10922]     178,956,970
            Linear-2                 [-1, 5461]      59,650,503
            Linear-3                 [-1, 2730]      14,911,260
            Linear-4                 [-1, 1365]       3,727,815
            Linear-5                   [-1, 27]          36,882
Total params: 257,283,430
Trainable params: 257,283,430
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.06
Forward/backward pass size (MB): 0.16
Params size (MB): 981.46
Estimated Total Size (MB): 981.68
----------------------------------------------------------------


# Frame selection function

In [6]:
# Presumably passed as the `random` argument of _select_frames (True: sample a
# random frame per segment; False: take each segment's first frame) — TODO confirm.
# NOTE(review): this name shadows the stdlib `random` module if that is ever imported here.
random = True

In [7]:
@staticmethod
def _select_frames(list_of_frames, num_segments, random):
    n = len(list_of_frames)
    segment_boundaries = np.linspace(0, n, num_segments + 1, dtype=int)  # Define segment boundaries
    if not random:
        selected_indices = segment_boundaries[:-1]  # Take the first index of each segment
    else:
        selected_indices = [np.random.randint(segment_boundaries[i], segment_boundaries[i + 1]) 
                        for i in range(num_segments)]  # Sample 1 index per segment
    selected_frames = [list_of_frames[i] for i in selected_indices]  # Map indices to frames

    return selected_frames