### Video Sequences data preparation

In [158]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import re

### Padded with the last frame

In [159]:
# class VideoSequenceDataset(Dataset):
#     def __init__(self, root_dir, transform=None, sequence_length=160):
#         self.root_dir = root_dir
#         self.transform = transform
#         self.sequence_length = sequence_length  # Set target sequence length
#         self.classes = os.listdir(root_dir)
#         self.data = self.load_sequences()

#     def load_sequences(self):
#         data = []
#         for class_idx, class_folder in enumerate(self.classes):
#             class_path = os.path.join(self.root_dir, class_folder)

#             # List and sort frames numerically
#             image_sequences = sorted(os.listdir(class_path), key=self.natural_sort_key)
#             frames = [os.path.join(class_path, img) for img in image_sequences]

#             data.append((frames, class_idx))
#         return data

#     def natural_sort_key(self, string):
#         # Extract numbers from the filename to ensure numeric sorting
#         return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', string)]

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         image_paths, label = self.data[idx]
#         images = [Image.open(img_path).convert('RGB') for img_path in image_paths]

#         # Pad the sequence to the target length (self.sequence_length)
#         if len(images) < self.sequence_length:
#             pad_size = self.sequence_length - len(images)
#             # Repeat the last frame or pad with black frames
#             last_image = images[-1]
#             pad_images = [last_image] * pad_size
#             images.extend(pad_images)
#         elif len(images) > self.sequence_length:
#             # Truncate the sequence if it exceeds the target length
#             images = images[:self.sequence_length]

#         if self.transform:
#             images = [self.transform(img) for img in images]

#         images = torch.stack(images, dim=0)  # Stack into a tensor

#         return images, label

### Padded with zeros

In [160]:
class VideoSequenceDataset(Dataset):
    """Fixed-length frame sequences, one sequence per class folder.

    ``root_dir`` must contain one sub-directory per class. Every file inside a
    class directory is treated as one frame of that class's sequence, ordered
    by the numbers embedded in the file names.

    Class indices follow the alphabetical order of the class directory names,
    so the label mapping does not depend on the order ``os.listdir`` happens
    to return on a given platform. Non-directory entries in ``root_dir`` are
    ignored.

    Args:
        root_dir: Directory holding one sub-directory per class.
        transform: Callable applied to each RGB ``PIL.Image`` frame. It must
            return a tensor, because frames are combined with ``torch.stack``.
        sequence_length: Number of frames in every returned sequence. Longer
            sequences are truncated; shorter ones are zero-padded at the end.
    """

    # Frame size used only when a class folder contains no frames at all, so
    # there is no real frame to copy the padded shape from.
    _FALLBACK_FRAME_SIZE = (224, 224)

    def __init__(self, root_dir, transform, sequence_length=160):
        self.root_dir = root_dir
        self.transform = transform
        self.sequence_length = sequence_length  # Target number of frames per sample
        # Sorted so class -> index is reproducible; files are skipped because
        # listing them as if they were class folders would raise.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.data = self.load_sequences()

    def load_sequences(self):
        """Return a list of ``(frame_paths, class_index)``, one per class folder."""
        data = []
        for class_idx, class_folder in enumerate(self.classes):
            class_path = os.path.join(self.root_dir, class_folder)

            # List and sort frames numerically (frame2 before frame10)
            image_sequences = sorted(os.listdir(class_path), key=self.natural_sort_key)
            frames = [os.path.join(class_path, img) for img in image_sequences]

            data.append((frames, class_idx))
        return data

    def natural_sort_key(self, string):
        """Split ``string`` into text and integer parts for numeric-aware sorting."""
        return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', string)]

    def __len__(self):
        return len(self.data)

    def _to_tensor(self, image):
        """Apply ``self.transform`` (if any) and check that the result is a tensor."""
        frame = self.transform(image) if self.transform is not None else image
        if not torch.is_tensor(frame):
            raise TypeError(
                "transform must convert each frame to a torch.Tensor, "
                f"got {type(frame).__name__}"
            )
        return frame

    def _load_frame(self, path):
        """Read one frame from disk as a transformed tensor, closing the file."""
        with Image.open(path) as img:
            rgb = img.convert('RGB')
        return self._to_tensor(rgb)

    def __getitem__(self, idx):
        """Return ``(frames, label)`` with ``frames`` stacked along dimension 0.

        ``frames`` is a float32 tensor whose first dimension is
        ``sequence_length``; the remaining dimensions are whatever
        ``transform`` produces for a single frame.
        """
        image_paths, label = self.data[idx]
        # Truncate before decoding so frames past the target length are never read
        image_paths = image_paths[:self.sequence_length]

        frames = [self._load_frame(path) for path in image_paths]

        pad_size = self.sequence_length - len(frames)
        if pad_size > 0:
            # Pad AFTER the transform: a black image pushed through Normalize
            # would come out as (0 - mean) / std, not as zeros.
            if frames:
                template = frames[0]
            else:
                template = self._to_tensor(Image.new('RGB', self._FALLBACK_FRAME_SIZE))
            frames.extend([torch.zeros_like(template)] * pad_size)

        # torch.stack already allocates a new tensor; only the dtype is enforced here
        return torch.stack(frames, dim=0).to(torch.float32), label

In [161]:
# Per-frame preprocessing: fixed 224x224 size, tensor conversion, then
# channel-wise normalisation with the ImageNet mean / std.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
# Set parameters
# NOTE(review): despite its name, `test_dir` points at the *train* folder.
test_dir = 'C:/Users/araya/Desktop/frame/train'
sequence_length = 160  # Set how many frames per sequence

# Initialize dataset and dataloader.
# `sequence_length` is passed explicitly so that editing the value above
# actually changes the dataset instead of silently using the class default.
dataset = VideoSequenceDataset(
    root_dir=test_dir,
    transform=transform,
    sequence_length=sequence_length,
)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Example of accessing the data
for batch_idx, (frames, labels) in enumerate(dataloader):
    print(f"Batch {batch_idx+1}")
    print("Frames shape:", frames.shape)  # Should be [batch_size, sequence_length, C, H, W]
    print("Labels:", labels)
    break  # Process just the first batch

In [None]:
# Notebook display: show the DataLoader object itself
dataloader

In [None]:
# Notebook display: show the dataset object itself
dataset

In [None]:
# Length of the first sample's stacked frame tensor along its first dimension
# (dataset[0] is a (frames, label) pair; [0] selects the frames)
len(dataloader.dataset[0][0])

In [None]:
# First frame of the first sample, after the dataset's transform has been applied
dataloader.dataset[0][0][0]

In [None]:
import matplotlib.pyplot as plt 

# Index order: [sample index][0 = frames tensor][frame index][colour channel]
# Shows one channel of frame 141 of the first sample.
plt.imshow(dataloader.dataset[0][0][141][0])

In [None]:
# Class folder names; a sample's label is the index of its folder in this list
dataloader.dataset.classes

In [None]:
# Print the frame count of the first four samples, one per line, followed by
# a reminder that sequence lengths must be padded to match before the model.
print(*(len(dataloader.dataset[sample][0]) for sample in range(4)), sep="\n")
print("เราต้อง padded lenght ให้เท่ากัน ก่อนเข้า model")

In [None]:
# Pull a single batch from the dataloader and print its contents.
# NOTE(review): the label says "Frames shape:" but the full tensor is printed
# here (not `frames.shape`) — confirm whether the values or the shape are wanted.
for batch_idx, (frames, labels) in enumerate(dataloader):
    print(f"Batch {batch_idx+1}")
    print("Frames shape:", frames)  # Expected layout: [batch_size, sequence_length, C, H, W]
    print("Labels:", labels)
    break  # Process just the first batch

### -------------------------------------------------------------------------------------------------------------------------
### cuda or cpu ?

In [None]:
# Everything runs on the CPU: the model and each batch are moved to `device`.
device = torch.device('cpu')
device

In [None]:
# Re-check the first sample's frame count (first dimension of its frames tensor)
len(dataloader.dataset[0][0])

### Model

In [173]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
import numpy as np 
from torchvision import models

In [174]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

class AttentionLayer(nn.Module):
    """Collapse a sequence of hidden states into one vector per batch item.

    A single bias-free linear layer scores every time step; the scores are
    soft-maxed over the sequence dimension and used as weights for a sum of
    the hidden states.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Maps each hidden state to one scalar score
        self.attention_weights = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, lstm_output):
        """Return ``(context, weights)``.

        Args:
            lstm_output: Tensor of shape (batch_size, sequence_length, hidden_size).

        Returns:
            context: (batch_size, hidden_size) weighted sum over the sequence.
            weights: (batch_size, sequence_length, 1) softmax weights.
        """
        scores = self.attention_weights(lstm_output)   # (batch, seq, 1)
        weights = scores.softmax(dim=1)                # normalise across time steps
        context = (lstm_output * weights).sum(dim=1)   # (batch, hidden)
        return context, weights

In [175]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

class LSTMModel(nn.Module):
    """Max-pool -> dropout -> LSTM -> attention -> two linear layers.

    Args:
        input_size: Feature size of each LSTM time step.
        hidden_size: LSTM hidden size (also the attention input size).
        num_layers: Number of stacked LSTM layers.
        num_classes: Size of the output logits.
        dropout: Dropout probability applied to the pooled input.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes,dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # 2x1 window with stride 1: max over adjacent pairs along the
        # second-to-last dimension, which shortens that dimension by one.
        self.pool = nn.MaxPool2d(kernel_size=(2, 1), stride=1)
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
        )
        self.attention = AttentionLayer(hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for ``x``.

        Assumes ``x`` is laid out so that, after pooling, it is a valid
        batch-first LSTM input whose last dimension is ``input_size`` —
        TODO confirm against the caller.
        """
        pooled = self.pool(x)

        # Initial hidden/cell states are left to the LSTM's defaults
        hidden_states, _ = self.lstm(self.dropout(pooled))

        # Attention weights are computed but only the pooled context is used
        context, _ = self.attention(hidden_states)

        return self.fc2(self.fc1(context))

In [176]:
# Earlier experiments kept for reference:
#   CNNModel(in_channels=3, num_classes=9, dropout=0.5)
#   SignLanguageCNN()
#   LSTMModel(..., num_classes=9, ...)
# Current model: 10-class LSTM with attention, placed on `device`.
model = LSTMModel(
    input_size=224,
    hidden_size=256,
    num_layers=2,
    num_classes=10,
    dropout=0.5,
)
model = model.to(device)

In [177]:
# spatial_features = model(dataloader.dataset[0][0])
# spatial_features

In [178]:
# spatial_features.shape

In [179]:
# plt.imshow(spatial_features.detach().numpy())

In [180]:
# Adam over every model parameter, trained against a cross-entropy loss
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [181]:
# x = dataloader.dataset[0][0]
# x = x[:, 0, :, :]  # Select the first channel (0 for R in RGB)
# x.shape

In [182]:
# NOTE(review): no visible code reads this variable; the DataLoader is created
# with a literal batch_size=4 — confirm whether the two should be linked.
batch_size = 4

In [None]:
# Training loop: for every epoch, run one optimisation step per batch and record
# the loss and the running accuracy.
# References : https://saturncloud.io/blog/calculating-the-accuracy-of-pytorch-models-every-epoch/#:~:text=In%20order%20to%20calculate%20the,tensor%20along%20a%20specified%20dimension
#
# NOTE(review): if the dataloader yields no batches, `loss` is never assigned
# and `total_samples` stays 0, so the per-epoch bookkeeping below would fail.

num_epochs = 900
loss_logger = []
accuracy_logger = []
# n_epochs = []
for epoch in range(num_epochs):
    total_correct = 0
    total_samples = 0
    model.train()
    for i, (sequences, labels) in enumerate(dataloader):
        # Move data to the device
        # labels = labels.type(torch.LongTensor)   # casting to long
        sequences = sequences.to(device)
        # NOTE(review): indexing the first axis with 0 keeps only the first
        # sample of the batch, while `labels` below still holds one label per
        # sample in the batch — the leading dimensions of `outputs` and `labels`
        # will likely disagree. Confirm the input layout the model expects.
        sequences = sequences[0,:,:,:,:]
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(sequences)
        _, predicted = torch.max(outputs, 1)  # index of the largest logit along dim 1
        loss = criterion(outputs, labels)

        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss_logger.append(loss.item())
    # Only the loss of the LAST batch of the epoch is recorded and printed
    loss_logger.append(loss.item())
    accuracy = 100 * total_correct /total_samples  # percentage over the whole epoch


    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    # Accuracy is stored for later inspection but not printed
    accuracy_logger.append(accuracy)
    # n_epochs.append(epoch)

In [None]:
# num_epochs = 100

# loss_logger = []

# for epoch in range(num_epochs):
#     for i, (data, targets) in enumerate(dataloader.dataset):
#         # print(data)
#         # print(targets)

#         data = data.to(device)
#         targets = targets

#         outputs = model(data)
#         loss = criterion(outputs, targets)

#         optimizer.zero_grad()
#         loss.backward()

#         optimizer.step()

#     loss_logger.append(loss.item())
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [17]:
import torch
# Ask PyTorch to release the GPU memory held in its CUDA cache
torch.cuda.empty_cache()