In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy.stats import ttest_rel
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

### Annotations

In [2]:
# Read the pre-processed annotation tables, one CSV file per split.
annotations_train_path, annotations_val_path = (
    'data/processed_annotations_train.csv',
    'data/processed_annotations_valid.csv',
)
annotations_train, annotations_val = map(
    pd.read_csv, (annotations_train_path, annotations_val_path)
)

In [3]:
# Preview the first five rows of the training annotations.
print(annotations_train.head(n=5))

                                       attachment_id                  text  \
0  data/tensors\train_2590430a-cddf-460c-83dc-5a9...            аккуратный   
1  data/tensors\train_d988c0b8-8418-47e3-8f07-89a...  адаптивное поведение   
2  data/tensors\train_507d6f3c-f2b9-4411-8b0d-e6c...            аккуратный   
3  data/tensors\train_f7693961-c80f-4e38-afc9-5c3...            аккуратный   
4  data/tensors\train_e44625da-e950-41aa-91b2-d33...            аккуратный   

                            user_id  height  width  length  train  
0  db573f94204e56e0cf3fc2ea000e5bdc    1280    720    39.0   True  
1  db573f94204e56e0cf3fc2ea000e5bdc    1280    720    54.0   True  
2  0211b488644476dd0fec656ccb9b74fc    1920   1080    23.0   True  
3  db573f94204e56e0cf3fc2ea000e5bdc    1280    720    45.0   True  
4  185bd3a81d9d618518d10abebf0d17a8    1920   1080    25.0   True  


In [4]:
# Preview the first five rows of the validation annotations.
print(annotations_val.head(n=5))

                                       attachment_id                  text  \
0  data/tensors\valid_eed5b47e-8223-41e8-928b-981...            аккуратный   
1  data/tensors\valid_3c6d1fc6-52ac-4484-91c5-0d5...  адаптивное поведение   
2  data/tensors\valid_252de8df-ccf2-4cea-a784-335...            аккуратный   
3  data/tensors\valid_388017ed-bc9a-4c64-9c8c-036...            аккуратный   
4  data/tensors\valid_8ccbdea0-502a-48fc-b457-626...  адаптивное поведение   

                            user_id  height  width  length  train  
0  dbc8429e8eaae0ac020b062e02d00c47    1920   1080    36.0  False  
1  dbc8429e8eaae0ac020b062e02d00c47    1920   1080    81.0  False  
2  8fe40f6fd9510b901edd9e64641d8618    1280    720    55.0  False  
3  8e9000e00d96e04ae2be3a81390e42ce    1080   1920    28.0  False  
4  e4bd328bca8e6f51bd6f4f019692b666    1920   1080    52.0  False  


### Definition of custom dataset class with padding

In [5]:
class PaddedSignLanguageDataset(Dataset):
    """Dataset of stored sign-language clips, zero-padded to a fixed number of frames.

    Every annotation row names a file loadable with ``torch.load`` in its
    ``attachment_id`` column and the class name of the clip in its ``text`` column.
    """

    # Frame count used when the caller does not pass ``max_length``.
    DEFAULT_MAX_LENGTH = 132

    def __init__(self, annotations, transform=None, max_length=None):
        """
        Args:
        annotations (DataFrame): DataFrame containing the annotations
            (columns ``attachment_id`` and ``text`` are read).
        transform (callable, optional): Optional transform applied to the padded tensor.
        max_length (int, optional): Number of frames every sample is padded or
            truncated to. Defaults to ``DEFAULT_MAX_LENGTH`` (132) when omitted.

        Raises:
        ValueError: If ``max_length`` is not a positive number.
        """
        self.annotations = annotations
        self.transform = transform
        # Honour an explicit max_length; it used to be silently replaced by 132.
        self.max_length = self.DEFAULT_MAX_LENGTH if max_length is None else max_length

        if self.max_length <= 0:
            raise ValueError(f"max_length must be positive, got {self.max_length}")

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(padded_tensor, label)``.

        ``padded_tensor`` has ``max_length`` entries along its first axis; entries
        past the end of the clip are zeros, and clips longer than ``max_length``
        are cut off. An empty clip yields ``(None, label)``.
        NOTE(review): the default DataLoader collate function cannot batch ``None``,
        so empty clips need filtering or a custom ``collate_fn``.
        """
        # Fetch the row once instead of once per column.
        row = self.annotations.iloc[idx]
        tensor_path = row['attachment_id']
        label = row['text']

        # Presumably a list of per-frame tensors (it is stacked below) -- TODO confirm.
        frames = torch.load(tensor_path, map_location=torch.device('cpu'))

        # Check if the clip is empty or None
        if frames is None or len(frames) == 0:
            print(f"Empty tensor found at index {idx}.")
            return None, label

        # Drop surplus frames so the slice assignment below cannot overflow the buffer.
        frames = frames[:self.max_length]

        # Pad the clip to the maximum length
        padded_tensor = torch.zeros((self.max_length, *frames[0].shape))
        padded_tensor[:len(frames)] = torch.stack(list(frames))

        # Apply transform if any
        if self.transform:
            padded_tensor = self.transform(padded_tensor)

        return padded_tensor, label

In [6]:
# Create the padded datasets and dataloaders
padded_dataset_train = PaddedSignLanguageDataset(annotations_train)
# Training batches are shuffled so every epoch sees the samples in a new order.
padded_dataloader_train = DataLoader(padded_dataset_train, batch_size=4, shuffle=True)

padded_dataset_val = PaddedSignLanguageDataset(annotations_val)
# Evaluation gains nothing from shuffling; a fixed order keeps validation passes comparable.
padded_dataloader_val = DataLoader(padded_dataset_val, batch_size=4, shuffle=False)

In [7]:
# Peek at the first training batch only (nothing is printed if the loader is empty).
first_batch = next(iter(padded_dataloader_train), None)
if first_batch is not None:
    tensor, label = first_batch
    print("Sample tensor shape:", tensor.shape)
    print("Sample label:", label)

Sample tensor shape: torch.Size([4, 132, 3, 100, 100])
Sample label: ('адаптивное поведение', 'обучать', 'наружу', 'расписание')


### Models definition

In [8]:
class Simple3DConvNet(nn.Module):
    """Two-layer 3D CNN classifier for clips shaped (batch, 3, frames, height, width)."""

    def __init__(self, num_classes, input_shape=(132, 100, 100)):
        """
        Args:
        num_classes (int): Number of output logits.
        input_shape (tuple of int, optional): (frames, height, width) of each clip.
            The default reproduces the previously hard-coded 660000 input
            features of the first fully connected layer.
        """
        super(Simple3DConvNet, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool3d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        # Size the classifier head from the clip shape instead of a magic number.
        self.fc1 = nn.Linear(self._flat_features(input_shape), 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    @staticmethod
    def _flat_features(input_shape, out_channels=32, num_pools=2):
        """Return the flattened feature count after the conv/pool stack.

        The padded 3x3x3 convolutions keep every dimension; each 2x2x2 max-pool
        (stride 2) floor-halves it.
        """
        features = out_channels
        for size in input_shape:
            for _ in range(num_pools):
                size //= 2
            features *= size
        return features

    def forward(self, x):
        """Map a batch of clips to class logits of shape (batch, num_classes)."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [9]:
class SimpleLSTM(nn.Module):
    """LSTM classifier that predicts from the output of the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        Args:
        input_size (int): Feature size of every time step.
        hidden_size (int): Width of the LSTM hidden state.
        num_layers (int): Number of stacked LSTM layers.
        num_classes (int): Number of output logits.
        """
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch-first sequence tensor ``x``."""
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(state_shape).to(x.device),
            torch.zeros(state_shape).to(x.device),
        )

        sequence_out, _ = self.lstm(x, initial_state)
        # Only the final time step feeds the classifier.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [10]:
class TwoStream3DConvNet(nn.Module):
    """Two parallel 3D CNN streams whose feature maps are concatenated and classified.

    Both streams have the same architecture and receive the same input clip,
    shaped (batch, 3, frames, height, width).
    """

    def __init__(self, num_classes, input_shape=(132, 100, 100)):
        """
        Args:
        num_classes (int): Number of output logits.
        input_shape (tuple of int, optional): (frames, height, width) of each clip.
            The default reproduces the previously hard-coded 1320000 input
            features of the first fully connected layer.
        """
        super(TwoStream3DConvNet, self).__init__()

        # Spatial Stream
        self.spatial_stream = self._make_stream()

        # Temporal Stream
        self.temporal_stream = self._make_stream()

        # Fully Connected Layers
        self.fc1 = nn.Linear(self._flat_features(input_shape), 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    @staticmethod
    def _make_stream():
        """Build one conv -> ReLU -> pool -> conv -> ReLU -> pool stream."""
        return nn.Sequential(
            nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2),
            nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2, stride=2)
        )

    @staticmethod
    def _flat_features(input_shape, channels_per_stream=32, num_streams=2, num_pools=2):
        """Return the flattened feature count of the concatenated stream outputs.

        The padded convolutions keep every dimension; each stride-2 max-pool
        floor-halves it. Channels add up across streams.
        """
        features = channels_per_stream * num_streams
        for size in input_shape:
            for _ in range(num_pools):
                size //= 2
            features *= size
        return features

    def forward(self, x):
        """Map a batch of clips to class logits of shape (batch, num_classes)."""
        spatial_out = self.spatial_stream(x)
        temporal_out = self.temporal_stream(x)

        # Concatenate the outputs of the two streams along the channel axis
        combined = torch.cat((spatial_out, temporal_out), dim=1)

        combined = torch.flatten(combined, 1)
        combined = self.relu(self.fc1(combined))
        combined = self.fc2(combined)
        return combined

In [11]:
class ResNet3D(nn.Module):
    """Small 3D CNN with one residual block for clips shaped (batch, 3, frames, height, width)."""

    def __init__(self, num_classes, input_shape=(132, 100, 100)):
        """
        Args:
        num_classes (int): Number of output logits.
        input_shape (tuple of int, optional): (frames, height, width) of each clip.
            The default reproduces the previously hard-coded
            ``64 * 132 * 25 * 25`` input features of the classifier.
        """
        super(ResNet3D, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3))
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))

        # Residual Layers
        self.residual_layers = nn.Sequential(
            nn.Conv3d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv3d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        )

        # Size the classifier from the clip shape instead of a hard-coded product.
        self.fc = nn.Linear(self._flat_features(input_shape), num_classes)

    @staticmethod
    def _flat_features(input_shape, channels=64):
        """Return the flattened feature count reaching the classifier.

        The frame axis is untouched (temporal stride 1 everywhere). Height and
        width pass through two stride-2 stages (``conv1`` and ``maxpool``), each
        mapping a size ``d`` to ``(d - 1) // 2 + 1``.
        """
        frames, height, width = input_shape
        for _ in range(2):
            height = (height - 1) // 2 + 1
            width = (width - 1) // 2 + 1
        return channels * frames * height * width

    def forward(self, x):
        """Map a batch of clips to class logits of shape (batch, num_classes)."""
        x = self.relu(self.conv1(x))
        x = self.maxpool(x)

        # Residual Layers: skip connection around the two 3x3x3 convolutions.
        residual = x
        x = self.residual_layers(x)
        # Out-of-place add gives the same values without mutating the conv output.
        x = x + residual

        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [12]:
# One output logit per distinct class name in the training annotations.
num_classes = len({*annotations_train['text']})

# Candidate architectures, instantiated in this order and keyed by a short name.
models = dict(
    simple_3d_conv_net=Simple3DConvNet(num_classes),
    two_stream_3d_conv_net=TwoStream3DConvNet(num_classes),
    resnet3d=ResNet3D(num_classes),
)

### Labels Mapping

In [13]:
# Collect the unique class names straight from the annotation table. Iterating
# the DataLoader for this would load every video tensor from disk only to read
# labels that are already available in the `text` column.
unique_labels_train = set(annotations_train['text'])

# Sort the labels so the label -> index assignment is deterministic
sorted_labels = sorted(unique_labels_train)

# Create the label mapping
train_label_mapping = {label: idx for idx, label in enumerate(sorted_labels)}

# Print the label mapping
print("Label Mapping:", train_label_mapping)

Label Mapping: {'адаптивное поведение': 0, 'аккуратный': 1, 'много': 2, 'наружу': 3, 'обучать': 4, 'отчаянный': 5, 'переваривать': 6, 'расписание': 7, 'расслабление': 8, 'стоять': 9}


In [14]:
# Collect the unique validation class names from the annotation table (no need
# to load any video tensor for this).
unique_labels_test = set(annotations_val['text'])

# A class the model never saw during training has no output index; fail early
# with a clear message instead of producing misaligned targets later.
unseen_labels = sorted(unique_labels_test - set(train_label_mapping))
if unseen_labels:
    raise ValueError(f"Validation labels missing from the training set: {unseen_labels}")

# Sort the labels for consistency
sorted_labels = sorted(unique_labels_test)

# Reuse the training indices so a class has the same index in both splits, even
# when the validation split contains only a subset of the classes.
val_label_mapping = {label: train_label_mapping[label] for label in sorted_labels}

# Print the label mapping
print("Label Mapping:", val_label_mapping)

Label Mapping: {'адаптивное поведение': 0, 'аккуратный': 1, 'много': 2, 'наружу': 3, 'обучать': 4, 'отчаянный': 5, 'переваривать': 6, 'расписание': 7, 'расслабление': 8, 'стоять': 9}


### Train and Validation loops

In [15]:
def _run_epoch(model, dataloader, label_mapping, criterion, desc, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    The pass trains the model when ``optimizer`` is given; otherwise it
    evaluates with gradient tracking disabled.

    Args:
    model (nn.Module): Network fed with batches permuted to (batch, channels, frames, H, W).
    dataloader (DataLoader): Yields ``(inputs, label_names)`` batches.
    label_mapping (dict): Class name -> class index used to build the targets.
    criterion (callable): Loss applied to ``(outputs, labels)``.
    desc (str): Progress-bar description.
    optimizer (Optimizer, optional): When given, weights are updated after every batch.

    Raises:
    KeyError: If a batch contains a class name missing from ``label_mapping``.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    processed_batches = 0

    loop = tqdm(dataloader, total=len(dataloader), desc=desc)
    # Gradients are only needed while training; skipping them during
    # validation avoids building the autograd graph.
    with torch.set_grad_enabled(is_training):
        for inputs, label_data in loop:
            if len(inputs) == 0 or len(label_data) == 0:
                continue

            labels = torch.tensor([label_mapping[label] for label in label_data], dtype=torch.long)
            # The dataset yields (batch, frames, channels, H, W); Conv3d wants channels first.
            inputs = inputs.permute(0, 2, 1, 3, 4)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            processed_batches += 1
            predicted = outputs.detach().argmax(dim=1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            # Update progress bar
            loop.set_postfix(loss=running_loss / processed_batches, accuracy=correct_predictions / total_predictions)

    # Guard against an empty pass instead of dividing by zero.
    mean_loss = running_loss / processed_batches if processed_batches else 0.0
    accuracy = correct_predictions / total_predictions if total_predictions else 0.0
    return mean_loss, accuracy


# Loss function
criterion = nn.CrossEntropyLoss()

num_epochs = 10

# Training and Validation: best validation accuracy per model name
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}")

    # A fresh optimizer per model
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val_accuracy = 0

    for epoch in range(num_epochs):
        # Training Phase
        _run_epoch(
            model,
            padded_dataloader_train,
            train_label_mapping,
            criterion,
            desc=f"Epoch {epoch+1}/{num_epochs} - Training",
            optimizer=optimizer,
        )

        # Validation Phase: targets must use the training indices, because those
        # are the indices the model's output logits were trained against.
        _, val_accuracy = _run_epoch(
            model,
            padded_dataloader_val,
            train_label_mapping,
            criterion,
            desc=f"Epoch {epoch+1}/{num_epochs} - Validation",
        )

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy

    # Store the best validation accuracy for this model
    results[model_name] = best_val_accuracy

print('Finished Training')

Training simple_3d_conv_net


Epoch 1/10 - Training: 100%|██████████| 38/38 [02:48<00:00,  4.44s/it, accuracy=0.167, loss=348]    
Epoch 1/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s, accuracy=0.22, loss=2.44] 
Epoch 2/10 - Training: 100%|██████████| 38/38 [02:36<00:00,  4.12s/it, accuracy=0.453, loss=1.63]
Epoch 2/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.37it/s, accuracy=0.22, loss=2.6]  
Epoch 3/10 - Training: 100%|██████████| 38/38 [02:40<00:00,  4.23s/it, accuracy=0.547, loss=1.24]
Epoch 3/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.34it/s, accuracy=0.2, loss=3.06]  
Epoch 4/10 - Training: 100%|██████████| 38/38 [02:32<00:00,  4.02s/it, accuracy=0.567, loss=0.992]
Epoch 4/10 - Validation: 100%|██████████| 13/13 [00:08<00:00,  1.46it/s, accuracy=0.22, loss=3.17] 
Epoch 5/10 - Training: 100%|██████████| 38/38 [02:31<00:00,  3.98s/it, accuracy=0.74, loss=0.709] 
Epoch 5/10 - Validation: 100%|██████████| 13/13 [00:08<00:00,  1.46it/s, accuracy=0.26, loss=3.66] 
Epoch

Training two_stream_3d_conv_net


Epoch 1/10 - Training:  58%|█████▊    | 22/38 [13:01<19:20, 72.50s/it, accuracy=0.114, loss=843]     

In [None]:
# # Load Swin Transformer and MViTv2 models as generic models
# swin_transformer_model = torch.jit.load('models/swin32-2.pt')
# mvitv2_model = torch.jit.load('models/mvit32-2.pt')

In [None]:
# # Make sure to call eval() if you're using the models for inference
# swin_transformer_model.eval()
# mvitv2_model.eval()

RecursiveScriptModule(
  original_name=Recognizer3D
  (data_preprocessor): RecursiveScriptModule(original_name=ActionDataPreprocessor)
  (backbone): RecursiveScriptModule(
    original_name=SwinTransformer3D
    (patch_embed): RecursiveScriptModule(
      original_name=PatchEmbed3D
      (proj): RecursiveScriptModule(original_name=Conv3d)
      (norm): RecursiveScriptModule(original_name=LayerNorm)
    )
    (pos_drop): RecursiveScriptModule(original_name=Dropout)
    (layers): RecursiveScriptModule(
      original_name=ModuleList
      (0): RecursiveScriptModule(
        original_name=BasicLayer
        (blocks): RecursiveScriptModule(
          original_name=ModuleList
          (0): RecursiveScriptModule(
            original_name=SwinTransformerBlock3D
            (norm1): RecursiveScriptModule(original_name=LayerNorm)
            (attn): RecursiveScriptModule(
              original_name=WindowAttention3D
              (qkv): RecursiveScriptModule(original_name=Linear)
           

In [None]:
# additional_models = {
#     'swin_transformer': swin_transformer_model,
#     'mvitv2': mvitv2_model
# }

In [None]:
# def adjust_inputs_for_model(inputs):
#     adjusted_inputs = inputs.squeeze()
#     return adjusted_inputs

In [None]:
# for model_name, model in additional_models.items():
#     val_correct_predictions = 0
#     val_total_predictions = 0

#     for i, data in enumerate(padded_dataloader_val, 0):
#         inputs, label_data = data

#         if len(inputs) == 0 or len(label_data) == 0:
#             continue

#         labels = torch.tensor([val_label_mapping.get(label) for label in label_data], dtype=torch.long)

#         # Ensure the input is in the expected format: (N, C, T, H, W)
#         inputs = inputs.permute(0, 2, 1, 3, 4)
#         print("Original shape:", inputs.shape)

#         # Split the input tensor into smaller tensors with 32 frames each
#         num_splits = inputs.shape[2] // 32
#         input_splits = torch.split(inputs, 32, dim=2)[:num_splits]

#         for split_inputs in input_splits:
#             print("Split shape:", split_inputs.shape)

#             # Adjust the inputs tensor if necessary for the Swin Transformer model
#             if model_name == 'swin_transformer':
#                 split_inputs = adjust_inputs_for_model(split_inputs)

#             outputs = model(split_inputs)
#             _, predicted = torch.max(outputs.data, 1)
#             val_total_predictions += labels.size(0)
#             val_correct_predictions += (predicted == labels).sum().item()

#     val_accuracy = val_correct_predictions / val_total_predictions
#     results[model_name] = val_accuracy

### Plotting

In [None]:
# Horizontal bar chart comparing the stored validation accuracy of every model.
model_names = [*results]
accuracies = [*results.values()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(model_names, accuracies, color='skyblue')
ax.set_xlabel('Validation Accuracy')
ax.set_title('Model Comparison')
ax.set_xlim(0, 1)
ax.grid(axis='x')
plt.show()

In [None]:
# Statistical Test: paired t-test between the validation accuracies of every model pair.
# Rebuild the name list here so the cell does not depend on the plotting cell.
model_names = list(results.keys())

for i in range(len(model_names)):
    for j in range(i+1, len(model_names)):
        model_1_name = model_names[i]
        model_2_name = model_names[j]

        # `results` may hold a single accuracy per model; atleast_1d turns such a
        # scalar into a length-1 array so the size check below applies uniformly.
        accuracies_model_1 = np.atleast_1d(np.asarray(results[model_1_name], dtype=float))
        accuracies_model_2 = np.atleast_1d(np.asarray(results[model_2_name], dtype=float))

        # A paired test needs the same number (>= 2) of measurements for both models,
        # e.g. one accuracy per epoch or per fold; one number each cannot be tested.
        if accuracies_model_1.shape != accuracies_model_2.shape or accuracies_model_1.size < 2:
            print(f"Cannot compare {model_1_name} and {model_2_name}: a paired t-test needs at least two paired accuracy measurements per model.")
            continue

        t_stat, p_value = ttest_rel(accuracies_model_1, accuracies_model_2)

        if p_value < 0.05:
            print(f"{model_1_name} and {model_2_name} are significantly different (p={p_value:.3f}).")
        else:
            print(f"No significant difference between {model_1_name} and {model_2_name} (p={p_value:.3f}).")

In [None]:
# # Loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# num_epochs = 10

# for epoch in range(num_epochs):
#     running_loss = 0.0
#     correct_predictions = 0
#     total_predictions = 0

#     # Training Phase
#     model.train()
#     loop = tqdm(enumerate(padded_dataloader_train, 0), total=len(padded_dataloader_train), desc=f"Epoch {epoch+1}/{num_epochs} - Training")
#     for i, data in loop:
#         inputs, label_data = data
#         labels = torch.tensor([train_label_mapping[label] for label in label_data], dtype=torch.long)
#         inputs = inputs.permute(0, 2, 1, 3, 4)
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         _, predicted = torch.max(outputs.data, 1)
#         total_predictions += labels.size(0)
#         correct_predictions += (predicted == labels).sum().item()

#         # Update progress bar
#         loop.set_description(f"Epoch {epoch+1}/{num_epochs} - Training")
#         loop.set_postfix(loss = running_loss / (i+1), accuracy = correct_predictions / total_predictions)

#     # Validation Phase
#     model.eval()
#     val_running_loss = 0.0
#     val_correct_predictions = 0
#     val_total_predictions = 0
#     loop_val = tqdm(enumerate(padded_dataloader_val, 0), total=len(padded_dataloader_val), desc=f"Epoch {epoch+1}/{num_epochs} - Validation")
#     for i, data in loop_val:
#         inputs, label_data = data

#         if len(inputs) == 0 or len(label_data) == 0:
#             continue

#         labels = torch.tensor([val_label_mapping.get(label) for label in label_data], dtype=torch.long)
#         inputs = inputs.permute(0, 2, 1, 3, 4)
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         val_running_loss += loss.item()
#         _, predicted = torch.max(outputs.data, 1)
#         val_total_predictions += labels.size(0)
#         val_correct_predictions += (predicted == labels).sum().item()

#         # Update progress bar
#         loop_val.set_description(f"Epoch {epoch+1}/{num_epochs} - Validation")
#         loop_val.set_postfix(loss = val_running_loss / (i+1), accuracy = val_correct_predictions / val_total_predictions)

# print('Finished Training')

Epoch 1/10 - Training: 100%|██████████| 38/38 [02:52<00:00,  4.54s/it, accuracy=0.18, loss=232]    
Epoch 1/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.37it/s, accuracy=0.2, loss=2.27]  
Epoch 2/10 - Training: 100%|██████████| 38/38 [02:39<00:00,  4.19s/it, accuracy=0.3, loss=2.07]  
Epoch 2/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.37it/s, accuracy=0.18, loss=2.45] 
Epoch 3/10 - Training: 100%|██████████| 38/38 [02:44<00:00,  4.33s/it, accuracy=0.473, loss=1.72]
Epoch 3/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.35it/s, accuracy=0.22, loss=2.66] 
Epoch 4/10 - Training: 100%|██████████| 38/38 [02:42<00:00,  4.27s/it, accuracy=0.58, loss=1.21] 
Epoch 4/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.36it/s, accuracy=0.18, loss=6.17] 
Epoch 5/10 - Training: 100%|██████████| 38/38 [02:37<00:00,  4.13s/it, accuracy=0.76, loss=0.764] 
Epoch 5/10 - Validation: 100%|██████████| 13/13 [00:09<00:00,  1.35it/s, accuracy=0.2, loss=5.37]  
Epoch 6

Finished Training



