In [None]:
from itertools import combinations

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from scipy.stats import ttest_rel
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms

In [None]:
# Preprocessing pipeline shared by the train and validation datasets:
# resize to 224x224, convert to a tensor, then normalise each of the three
# channels (the mean/std values are the ones torchvision documents for its
# ImageNet-pretrained models).
# NOTE(review): ToTensor expects a PIL image or ndarray -- confirm that the
# objects returned by the dataset are not already tensors.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
class CustomDataset(Dataset):
    """Dataset of serialized samples described by an annotations dataframe.

    Each row must provide an ``attachment_id`` column (a path handed to
    ``torch.load``) and a ``text`` column (the class label name).

    Args:
        dataframe: Annotations table with ``attachment_id`` and ``text`` columns.
        transform: Optional callable applied to each loaded sample.
        label_mapping: Optional dict mapping label names to integer class
            indices. When omitted, it is built from the sorted unique values
            of ``dataframe['text']``, so two dataframes containing the same
            set of labels get the same mapping. Pass one shared mapping to
            every split when their label sets may differ.

    Labels missing from the mapping are encoded as ``-1``.
    """

    def __init__(self, dataframe, transform=None, label_mapping=None):
        self.dataframe = dataframe
        self.transform = transform

        if label_mapping is None:
            # Sorting makes the name -> index assignment deterministic.
            label_names = sorted(set(dataframe['text'].dropna()), key=str)
            label_mapping = {name: index for index, name in enumerate(label_names)}
        self.label_mapping = label_mapping

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load sample ``idx`` (positional) and return ``(image, label)``.

        Raises:
            ValueError: If the loaded object is a list.
        """
        # Positional access: DataLoader samplers yield 0..len-1, which only
        # matches label-based lookup when the frame has a default RangeIndex.
        data_path = self.dataframe['attachment_id'].iloc[idx]
        # NOTE(security): torch.load may unpickle arbitrary objects; only use
        # it on files from a trusted source.
        image = torch.load(data_path)

        if isinstance(image, list):
            raise ValueError(f"Image at index {idx} was a list, not a tensor or PIL Image.")

        label_name = self.dataframe['text'].iloc[idx]
        label = torch.tensor(self.label_mapping.get(label_name, -1), dtype=torch.long)

        if self.transform is not None:
            image = self.transform(image)

        return image, label


## Models for comparison

In [None]:
class ModelLSTM(nn.Module):
    """Stacked LSTM classifier that predicts from the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch-first sequence batch to one row of class logits per sequence."""
        sequence_features, _state = self.lstm(x)
        # Only the final time step's hidden output feeds the classifier.
        final_step = sequence_features[:, -1, :]
        return self.fc(final_step)

In [None]:
class Model3DConvNet(nn.Module):
    """Single 3D-convolution block followed by a linear classifier.

    Args:
        num_classes: Number of output logits.
        input_shape: ``(depth, height, width)`` of the clips the model will
            receive. It sizes the classifier, which sees 64 channels after
            each spatial/temporal extent has been halved by the pooling
            layer. The default ``(32, 32, 32)`` yields the ``64*64*64``
            features that used to be hard-coded.

    Raises:
        ValueError: If ``input_shape`` leaves no features after pooling.
    """

    def __init__(self, num_classes, input_shape=(32, 32, 32)):
        super().__init__()
        self.conv3d = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3d = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        depth, height, width = input_shape
        # The padded 3x3x3 convolution keeps the extents; the pool halves them.
        flat_features = 64 * (depth // 2) * (height // 2) * (width // 2)
        if flat_features <= 0:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small: every extent must be at least 2."
            )
        self.fc = nn.Linear(flat_features, num_classes)

    def forward(self, x):
        """Return class logits for a batch of 3-channel volumes.

        Raises:
            RuntimeError: If the flattened features do not match the width the
                classifier was built for.
        """
        x = self.pool3d(torch.relu(self.conv3d(x)))
        x = x.flatten(start_dim=1)
        if x.size(1) != self.fc.in_features:
            raise RuntimeError(
                f"Model3DConvNet expected {self.fc.in_features} flattened features "
                f"but got {x.size(1)}; construct the model with an input_shape "
                f"matching the data."
            )
        return self.fc(x)

In [None]:
class TwoStream3DConvNet(nn.Module):
    """Two independent ``Model3DConvNet`` streams whose logits are averaged.

    Args:
        num_classes: Number of output logits produced by each stream.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.spatial_stream = Model3DConvNet(num_classes)
        self.temporal_stream = Model3DConvNet(num_classes)

    def forward(self, spatial_data, temporal_data=None):
        """Average the logits of the spatial and temporal streams.

        Args:
            spatial_data: Input for the spatial stream.
            temporal_data: Input for the temporal stream. When omitted, the
                spatial input is fed to both streams so the model can be
                called with a single tensor like the other models.
        """
        if temporal_data is None:
            temporal_data = spatial_data
        spatial_out = self.spatial_stream(spatial_data)
        temporal_out = self.temporal_stream(temporal_data)
        return (spatial_out + temporal_out) / 2

In [None]:
# Imported under a distinct alias: the notebook later binds the name `models`
# to a dict of model instances, which would otherwise hide this module and
# break any later instantiation of ModelResNet2D.
import torchvision.models as tv_models


class ModelResNet2D(nn.Module):
    """ResNet-18 with its final fully connected layer resized to ``num_classes``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # NOTE(review): `pretrained=True` is deprecated in newer torchvision
        # releases in favour of the `weights=` argument -- confirm the
        # installed version before switching.
        self.resnet = tv_models.resnet18(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        """Return class logits for a batch of images."""
        return self.resnet(x)

In [None]:
class ModelSwinTransformer(nn.Module):
    """Hub-loaded Swin Transformer whose ``head`` layer is resized to ``num_classes``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # NOTE(review): confirm this hub repository and entrypoint exist;
        # the model is currently disabled in the comparison.
        backbone = torch.hub.load(
            'facebookresearch/swin-transformer',
            'swin_base_patch4_window7_224_in22k',
            pretrained=True,
        )
        head_width = backbone.head.in_features
        backbone.head = nn.Linear(head_width, num_classes)
        self.swin_transformer = backbone

    def forward(self, x):
        """Run the backbone (including the replaced head) on ``x``."""
        logits = self.swin_transformer(x)
        return logits

In [None]:
class ModelMViTv2(nn.Module):
    """Hub-loaded MViT model whose ``head`` layer is resized to ``num_classes``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        # NOTE(review): confirm this hub repository and entrypoint exist;
        # the model is currently disabled in the comparison.
        backbone = torch.hub.load(
            'facebookresearch/mvit',
            'mvit_base_16x4',
            pretrained=True,
        )
        head_width = backbone.head.in_features
        backbone.head = nn.Linear(head_width, num_classes)
        self.mvit_v2 = backbone

    def forward(self, x):
        """Run the backbone (including the replaced head) on ``x``."""
        logits = self.mvit_v2(x)
        return logits

In [None]:
# Annotation tables for the two splits (train first, then validation).
train_dataframe, val_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/processed_annotations_train.csv",
        "data/processed_annotations_valid.csv",
    )
)

In [None]:
# Sanity check of the annotation schema: CustomDataset reads the
# 'attachment_id' and 'text' columns from this table.
print(train_dataframe.columns)

In [None]:
# Wrap each annotation table in a CustomDataset using the shared preprocessing.
train_dataset, val_dataset = [
    CustomDataset(dataframe=annotations, transform=transform)
    for annotations in (train_dataframe, val_dataframe)
]

In [None]:
# Width of every model's output layer.
num_classes = 10

# Mini-batches of 4 samples; only the training split is reshuffled each epoch.
train_data_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
val_data_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=4)

In [None]:
# Candidate architectures, keyed by the display name used in the results.
# The dict is built in a single expression so that the name `models` is only
# rebound after every constructor has run.
models = dict([
    ("LSTM", ModelLSTM(input_size=224*224*3, hidden_size=512, num_layers=2, num_classes=num_classes)),
    ("3D-ConvNet", Model3DConvNet(num_classes=num_classes)),
    ("Two-Stream 3D-ConvNet", TwoStream3DConvNet(num_classes=num_classes)),
    ("ResNet2D", ModelResNet2D(num_classes=num_classes)),
])
# Currently disabled: "Swin Transformer" (ModelSwinTransformer) and
# "MViTv2" (ModelMViTv2).

# Loaders keyed by split name, as looked up by the training loop.
data_loaders = dict(train=train_data_loader, val=val_data_loader)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Final-epoch validation accuracy of each model, keyed by model name.
results = {}
num_epochs = 10


def _train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean per-sample loss."""
    model.train()
    running_loss = 0.0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        # criterion averages over the batch; re-weight by batch size so a
        # smaller final batch does not skew the epoch mean.
        running_loss += loss.item() * inputs.size(0)
    n_samples = len(loader.dataset)
    return running_loss / n_samples if n_samples else float("nan")


def _evaluate(model, loader, device):
    """Return the top-1 accuracy of ``model`` on ``loader`` (NaN if it yields no samples)."""
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = model(inputs).argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)
    return n_correct / n_seen if n_seen else float("nan")


for model_name, model in models.items():
    # Label the log so each model's epochs can be told apart.
    print(f'=== {model_name} ===')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Reset per model so a previous model's accuracy can never be recorded
    # for this one (e.g. when num_epochs is 0).
    val_accuracy = float("nan")
    for epoch in range(num_epochs):
        epoch_loss = _train_one_epoch(model, data_loaders['train'], criterion, optimizer, device)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

        val_accuracy = _evaluate(model, data_loaders['val'], device)
        print(f'Validation Accuracy: {val_accuracy*100:.2f}%')

    results[model_name] = val_accuracy

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of each model's recorded validation accuracy.
model_names, accuracies = list(results.keys()), list(results.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(model_names, accuracies, color='skyblue')
ax.set_xlabel('Validation Accuracy')
ax.set_title('Model Comparison')
ax.set_xlim(0, 1)  # accuracies are fractions in [0, 1]
ax.grid(axis='x')
plt.show()

In [None]:
# Pairwise paired t-tests between the models' validation accuracies.
# A paired test needs several matched measurements per model (for example one
# accuracy per fold or per run). When `results` holds a single number per
# model the comparison is reported as not computable rather than crashing or
# printing a meaningless p-value.
model_names = [name for name in models if name in results]
for model_1_name, model_2_name in combinations(model_names, 2):
    accuracies_model_1 = np.atleast_1d(np.asarray(results[model_1_name], dtype=float))
    accuracies_model_2 = np.atleast_1d(np.asarray(results[model_2_name], dtype=float))

    if accuracies_model_1.shape != accuracies_model_2.shape or accuracies_model_1.size < 2:
        print(
            f"Cannot run a paired t-test for {model_1_name} and {model_2_name}: "
            f"need at least two paired accuracy measurements per model."
        )
        continue

    t_stat, p_value = ttest_rel(accuracies_model_1, accuracies_model_2)

    if np.isnan(p_value):
        # Happens when the paired differences have zero variance.
        print(f"Paired t-test is undefined for {model_1_name} and {model_2_name} (p=nan).")
    elif p_value < 0.05:
        print(f"{model_1_name} and {model_2_name} are significantly different (p={p_value:.3f}).")
    else:
        print(f"No significant difference between {model_1_name} and {model_2_name} (p={p_value:.3f}).")