In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch

### Annotaions

In [2]:
# Load the annotations
annotations_train_path = 'gg/processed_annotations_train.csv'
annotations_train = pd.read_csv(annotations_train_path)

annotations_val_path = 'gg/processed_annotations_valid.csv'
annotations_val = pd.read_csv(annotations_val_path)

In [3]:
print(annotations_train.head())

                                       attachment_id        text  \
0  gg/tensors\train_530cbaa0-c25d-4acd-a6d5-9dcf7...  MakDonalds   
1  gg/tensors\train_aca0a032-9b26-4eee-949e-981c6...  MakDonalds   
2  gg/tensors\train_ed83f161-e83e-4c92-8997-2efda...  MakDonalds   
3  gg/tensors\train_f06bef78-4143-44b1-8d96-741cf...  MakDonalds   
4  gg/tensors\train_e52ee302-7952-4cf6-be52-c80bc...  MakDonalds   

                            user_id  height  width  length  train  begin  end  
0  db573f94204e56e0cf3fc2ea000e5bdc    1280    720   126.0   True     22   78  
1  2d84da20c251acaeb3186642fcb04f2e    1920   1080    68.0   True      6   40  
2  0df9d6e419cb18069e696edaa170ba87    1920   1080   114.0   True     19   76  
3  95af8e702c909eee7145c6dc1a3d756b    1280    720    85.0   True      1   60  
4  0211b488644476dd0fec656ccb9b74fc    1920   1080   121.0   True     23   88  


In [4]:
print(annotations_val.head())

                                       attachment_id        text  \
0  gg/tensors\valid_08ba3f14-0d22-4c96-9e66-d1fed...  MakDonalds   
1  gg/tensors\valid_3d9ffa25-0346-48b4-afc4-16978...  MakDonalds   
2  gg/tensors\valid_3b2832fc-200c-43cc-aa17-3f3aa...  MakDonalds   
3  gg/tensors\valid_758a8d0c-69c7-4605-8884-ac0be...  MakDonalds   
4  gg/tensors\valid_95816840-b7fa-4e4a-b39d-f6e53...  MakDonalds   

                            user_id  height  width  length  train  begin  end  
0  d2b4042ec6d8505a41b809e64d5adb7c    1920   1080    55.0  False      3   35  
1  e4bd328bca8e6f51bd6f4f019692b666    1920   1080    73.0  False      6   45  
2  4299b8ccf39ace57287b463fbe4a489b    1920    960   101.0  False     14   66  
3  3018b64d2c938f5b6a0826dfdf486f2c    1920   1080   132.0  False     18   94  
4  e3e1fd4bbf07a0423ee20d5c9baa49cc    1920   1080    95.0  False     14   71  


### Definition of custom dateset class with padding

In [5]:
class PaddedSignLanguageDataset(Dataset):
    def __init__(self, annotations, transform=None, max_length=None):
        """
        Custom dataset for loading sign language video tensors with padding.
        Each video tensor is padded to a uniform length for consistent processing.

        :param annotations (DataFrame): DataFrame containing the annotations.
        :param transform (callable, optional): Optional transform to be applied on a sample.
        :param max_length (int, optional): Maximum length of the video tensors. If not provided, it will be calculated.
        """
        self.annotations = annotations
        self.transform = transform
        self.max_length = 64
        self.tensor_path = ""

        if self.max_length is None:
            # Calculate the maximum length among all tensors
            self.max_length = max(len(self.tensor_path + torch.load(row['attachment_id'], map_location=torch.device('cpu'))) for _, row in annotations.iterrows())

    def __len__(self):
        """
        Returns the number of samples in the dataset.
        """
        return len(self.annotations)

    def __getitem__(self, idx):
        """
        Returns the sample at the given index.

        :param idx (int): Index
        :return: Tuple of (video tensor, label)
        """
        tensor_path = self.annotations.iloc[idx]['attachment_id']
        label = self.annotations.iloc[idx]['text']
        
        # Load the tensor
        tensor = torch.load(self.tensor_path + tensor_path, map_location=torch.device('cpu'))

        # Pad the tensor to the maximum length
        padded_tensor = torch.zeros((self.max_length, *tensor[0].shape))
        padded_tensor[:len(tensor)] = torch.stack(tensor)
                
        # Apply transform if any
        if self.transform:
            padded_tensor = self.transform(padded_tensor)

        return padded_tensor, label

In [6]:
# Create the padded dataset and dataloader
padded_dataset_train = PaddedSignLanguageDataset(annotations_train)
padded_dataloader_train = DataLoader(padded_dataset_train, batch_size=16, shuffle=True)

padded_dataset_val = PaddedSignLanguageDataset(annotations_val)
padded_dataloader_val = DataLoader(padded_dataset_val, batch_size=16, shuffle=True)

In [7]:
# Display a sample from the padded dataset
for tensor, label in padded_dataloader_train:
    print("Sample tensor shape:", tensor.shape)
    print("Sample label:", label)
    break  # Display only the first batch

Sample tensor shape: torch.Size([16, 64, 3, 64, 64])
Sample label: ('Пока', 'Привет!', 'Привет!', 'MakDonalds', 'MakDonalds', 'С днем рождения', 'Добро пожаловать!', 'Привет!', 'Добро пожаловать!', 'Привет!', 'С днем рождения', 'С днем рождения', 'MakDonalds', 'Пока', 'С днем рождения', 'С днем рождения')


### Models definition

In [8]:
class TwoStream3DConvNet(nn.Module):
    def __init__(self, num_classes):
        super(TwoStream3DConvNet, self).__init__()
        """
        A two-stream 3D Convolutional Neural Network for video classification.
        This network processes spatial and temporal information separately and then combines them.

        :param num_classes (int): Number of classes for classification.
        """
        # Spatial Stream
        self.spatial_stream = nn.Sequential(
            nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2),
            nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2)
        )
        
        # Temporal Stream
        self.temporal_stream = nn.Sequential(
            nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2),
            nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2)
        )
        
        # Fully Connected Layers
        self.fc1 = nn.Linear(262144, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Forward pass of the network.

        :param x (Tensor): Input tensor of shape (batch_size, 3, max_length, 112, 112)
        :return: Output tensor of shape (batch_size, num_classes)
        """
        spatial_out = self.spatial_stream(x)
        temporal_out = self.temporal_stream(x)
        
        # Concatenate the outputs of the two streams
        combined = torch.cat((spatial_out, temporal_out), dim=1)
        
        combined = torch.flatten(combined, 1)
        combined = self.relu(self.fc1(combined))
        combined = self.fc2(combined)
        return combined

In [9]:
num_classes = len(set(annotations_train['text']))

models = {
#     'simple_3d_conv_net': Simple3DConvNet(num_classes),
    'two_stream_3d_conv_net': TwoStream3DConvNet(num_classes)
#     'resnet3d': ResNet3D(num_classes)
}

### Labels Mapping

In [10]:
# Initialize an empty set to collect unique labels
unique_labels_train = set()

# Iterate over your dataset to collect unique labels
for _, label_data in padded_dataloader_train:
    unique_labels_train.update(label_data)

# Sort the labels for consistency
sorted_labels = sorted(unique_labels_train)

# Create the label mapping
train_label_mapping = {label: idx for idx, label in enumerate(sorted_labels)}

# Print the label mapping
print("Label Mapping:", train_label_mapping)

Label Mapping: {'MakDonalds': 0, 'Добро пожаловать!': 1, 'Пока': 2, 'Привет!': 3, 'С днем рождения': 4}


In [11]:
# Initialize an empty set to collect unique labels
unique_labels_test = set()

# Iterate over your dataset to collect unique labels
for _, label_data in padded_dataloader_val:
    unique_labels_test.update(label_data)

# Sort the labels for consistency
sorted_labels = sorted(unique_labels_test)

# Create the label mapping
val_label_mapping = {label: idx for idx, label in enumerate(sorted_labels)}

# Print the label mapping
print("Label Mapping:", val_label_mapping)

Label Mapping: {'MakDonalds': 0, 'Добро пожаловать!': 1, 'Пока': 2, 'Привет!': 3, 'С днем рождения': 4}


### Train and Validation loops

### Plotting

In [12]:
# Loss function
criterion = nn.CrossEntropyLoss()

num_epochs = 25
results = {}

best_score = 0.28
# Train the models
# Training and Validation
for model_name, model in models.items():
    print(f"Training {model_name}")

    # Loss function and optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # List to store epoch-wise validation accuracies for the current model
    epoch_val_accuracies = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        # Training Phase
        model.train()
        loop = tqdm(enumerate(padded_dataloader_train, 0), total=len(padded_dataloader_train), desc=f"Epoch {epoch+1}/{num_epochs} - Training")
        for i, data in loop:
            inputs, label_data = data
            target_labels = torch.tensor([train_label_mapping[label] for label in label_data], dtype=torch.long)
#             print(target_labels)
            inputs = inputs.permute(0, 2, 1, 3, 4)
            labels = torch.zeros((len(target_labels), 5))
            for i, label in enumerate(target_labels):
                labels[i, label] = 1
            
#             print(target_labels[0], labels[0])
            optimizer.zero_grad()

            outputs = model(inputs)
            
#             print(outputs)
            loss = criterion(outputs, labels)
#             print(loss)
            loss.backward()

            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
#             print(predicted)
#             print(target_labels)
            correct_predictions += (predicted == target_labels).sum().item()

            # Update progress bar
            loop.set_description(f"Epoch {epoch+1}/{num_epochs} - Training")
            loop.set_postfix(loss = running_loss / (i+1), accuracy = correct_predictions / total_predictions)

        # Validation Phase
        model.eval()
        val_running_loss = 0.0
        val_correct_predictions = 0
        val_total_predictions = 0
        loop_val = tqdm(enumerate(padded_dataloader_val, 0), total=len(padded_dataloader_val), desc=f"Epoch {epoch+1}/{num_epochs} - Validation")
        for i, data in loop_val:
            inputs, label_data = data

            if len(inputs) == 0 or len(label_data) == 0:
                continue

#             labels = torch.tensor([val_label_mapping.get(label) for label in label_data], dtype=torch.long)
#             inputs = inputs.permute(0, 2, 1, 3, 4)
            target_labels = torch.tensor([train_label_mapping[label] for label in label_data], dtype=torch.long)
#             print(target_labels)
            inputs = inputs.permute(0, 2, 1, 3, 4)
            labels = torch.zeros((len(target_labels), 5))
            for i, label in enumerate(target_labels):
                labels[i, label] = 1
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_total_predictions += labels.size(0)
#             print((predicted == target_labels).sum().item())
            val_correct_predictions += (predicted == target_labels).sum().item()

            # Update progress bar
            loop_val.set_description(f"Epoch {epoch+1}/{num_epochs} - Validation")
            loop_val.set_postfix(loss = val_running_loss / (i+1), accuracy = val_correct_predictions / val_total_predictions)

        val_accuracy = val_correct_predictions / val_total_predictions
        epoch_val_accuracies.append(val_accuracy)
        
        if val_correct_predictions / val_total_predictions > best_score:
            print("new best")
            best_score = val_correct_predictions / val_total_predictions
            torch.save(model.state_dict(), model_name + ".pt")

    # Store the epoch-wise validation accuracies for this model
    results[model_name] = epoch_val_accuracies
    
print('Finished Training')


Training two_stream_3d_conv_net


  from .autonotebook import tqdm as notebook_tqdm
Epoch 1/25 - Training: 100%|██████████| 5/5 [00:25<00:00,  5.00s/it, accuracy=0.253, loss=197] 
Epoch 1/25 - Validation: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s, accuracy=0.28, loss=18.7]
Epoch 2/25 - Training: 100%|██████████| 5/5 [00:25<00:00,  5.01s/it, accuracy=0.36, loss=12.3] 
Epoch 2/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it, accuracy=0.32, loss=0.658]


new best


Epoch 3/25 - Training: 100%|██████████| 5/5 [00:27<00:00,  5.59s/it, accuracy=0.387, loss=0.833]
Epoch 3/25 - Validation: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s, accuracy=0.2, loss=0.363]  
Epoch 4/25 - Training: 100%|██████████| 5/5 [00:27<00:00,  5.49s/it, accuracy=0.52, loss=0.514] 
Epoch 4/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it, accuracy=0.24, loss=0.404]
Epoch 5/25 - Training: 100%|██████████| 5/5 [00:28<00:00,  5.68s/it, accuracy=0.627, loss=0.471] 
Epoch 5/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it, accuracy=0.24, loss=0.413] 
Epoch 6/25 - Training: 100%|██████████| 5/5 [00:29<00:00,  5.96s/it, accuracy=0.653, loss=0.374]
Epoch 6/25 - Validation: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s, accuracy=0.2, loss=0.378]  
Epoch 7/25 - Training: 100%|██████████| 5/5 [00:26<00:00,  5.30s/it, accuracy=0.84, loss=0.242]  
Epoch 7/25 - Validation: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s, accuracy=0.2, loss=0.459]  
Epoch 8/25 - Traini

new best


Epoch 11/25 - Training: 100%|██████████| 5/5 [00:28<00:00,  5.79s/it, accuracy=1, loss=0.0141] 
Epoch 11/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it, accuracy=0.4, loss=0.586]  


new best


Epoch 12/25 - Training: 100%|██████████| 5/5 [00:30<00:00,  6.08s/it, accuracy=1, loss=0.00678]
Epoch 12/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it, accuracy=0.36, loss=0.702] 
Epoch 13/25 - Training: 100%|██████████| 5/5 [00:30<00:00,  6.04s/it, accuracy=1, loss=0.00129] 
Epoch 13/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it, accuracy=0.32, loss=0.913] 
Epoch 14/25 - Training: 100%|██████████| 5/5 [00:28<00:00,  5.63s/it, accuracy=1, loss=0.000437]
Epoch 14/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it, accuracy=0.32, loss=0.995] 
Epoch 15/25 - Training: 100%|██████████| 5/5 [00:31<00:00,  6.40s/it, accuracy=1, loss=9.49e-5]
Epoch 15/25 - Validation: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it, accuracy=0.32, loss=1.1]   
Epoch 16/25 - Training: 100%|██████████| 5/5 [00:30<00:00,  6.06s/it, accuracy=1, loss=4.96e-5]
Epoch 16/25 - Validation: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s, accuracy=0.32, loss=1.12]  
Epoch 17/25 - Trai

Finished Training





In [53]:
hello_sign = annotations_val.query("text == 'MakDonalds'").sample(1)
hello_sign

Unnamed: 0,attachment_id,text,user_id,height,width,length,train,begin,end
3,gg/tensors\valid_758a8d0c-69c7-4605-8884-ac0be...,MakDonalds,3018b64d2c938f5b6a0826dfdf486f2c,1920,1080,132.0,False,18,94


In [54]:
hello_sign_dataset = PaddedSignLanguageDataset(hello_sign)
hello_sign_dataloader = DataLoader(hello_sign_dataset, shuffle=False)

In [55]:
model = TwoStream3DConvNet(num_classes)

model_path = "checkpoints/two_stream_3d_conv_net_val04.pt"
model.load_state_dict(torch.load(model_path)) 

<All keys matched successfully>

In [56]:
model

TwoStream3DConvNet(
  (spatial_stream): Sequential(
    (0): Conv3d(3, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): ReLU()
    (2): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (4): ReLU()
    (5): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (temporal_stream): Sequential(
    (0): Conv3d(3, 16, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): ReLU()
    (2): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv3d(16, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (4): ReLU()
    (5): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=262144, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=5, bias=True)
  (relu): ReLU()
)

In [57]:
model.eval()
window_size = 16
threshold = 0.5
frame_interval = 1
mean = [123.675, 116.28, 103.53]
std = [58.395, 57.12, 57.375]

In [58]:
print(hello_sign_dataset[0][1])

MakDonalds


In [59]:
print(len(hello_sign_dataloader))

1


In [60]:
inputs, label_data = hello_sign_dataset[0]
print(inputs.shape)
inputs = inputs.reshape((1, 3, 64, 64, 64))
print(inputs.shape)
out = model(inputs)
pred_class = torch.argmax(out)
print(f"Predicted class: {pred_class}, Target class: {label_data}")

torch.Size([64, 3, 64, 64])
torch.Size([1, 3, 64, 64, 64])
Predicted class: 0, Target class: MakDonalds
