In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np



In [15]:
# Input geometry and task configuration shared by the cells below
img_height = 64      # frame height in pixels; adjust based on dataset
img_width = 64       # frame width in pixels; adjust based on dataset
num_classes = 10     # number of sign classes
sequence_length = 5  # number of frames per sequence

In [None]:
# CNN-LSTM Model
class CNNLSTM(nn.Module):
    """Per-frame CNN feature extractor followed by an LSTM sequence classifier.

    Every frame passes through two ``Conv2d -> ReLU -> MaxPool2d`` stages and
    is flattened; the per-frame feature vectors form a sequence that is fed to
    a single-layer LSTM. The LSTM's final hidden state goes through dropout
    and two linear layers to produce one score per class.

    The constructor reads the module-level ``img_height``, ``img_width`` and
    ``num_classes`` values to size the LSTM input and the output layer.

    ``forward`` returns raw (unnormalized) logits, which is what
    ``nn.CrossEntropyLoss`` expects because that loss applies log-softmax
    itself. Use ``predict_proba`` to obtain class probabilities.
    """

    def __init__(self):
        super(CNNLSTM, self).__init__()

        # CNN for feature extraction ('same' padding keeps the spatial size,
        # so only the two 2x2 poolings shrink it)
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding='same')
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding='same')
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()

        # Two stride-2 poolings divide each spatial dimension by 4;
        # conv2 emits 64 channels
        self.flattened_size = 64 * (img_height // 4) * (img_width // 4)

        # LSTM for temporal processing
        self.lstm = nn.LSTM(input_size=self.flattened_size, hidden_size=64, batch_first=True)
        self.dropout = nn.Dropout(0.5)

        # Fully connected classification head
        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, num_classes)

        # Not applied in forward(); used only by predict_proba()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Compute class logits for a batch of frame sequences.

        Args:
            x: Tensor indexed as (batch, frame, channel, height, width) with
                3 channels. The number of frames is taken from ``x`` itself.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        batch_size, num_frames = x.size(0), x.size(1)

        # Fold the time axis into the batch axis so the CNN runs once over
        # every frame instead of once per time step
        frames = x.flatten(0, 1)  # (batch * frames, 3, height, width)

        # CNN layers
        feats = self.pool1(self.relu(self.conv1(frames)))
        feats = self.pool2(self.relu(self.conv2(feats)))

        # Flatten each frame's feature map, then restore the time axis
        feats = feats.flatten(start_dim=1)
        cnn_output = feats.reshape(batch_size, num_frames, feats.size(1))

        # LSTM layer; only the final hidden state is needed
        _, (h_n, _) = self.lstm(cnn_output)

        # h_n is (num_layers, batch, hidden); take the last layer's state
        last_hidden = h_n[-1]

        # Dropout
        last_hidden = self.dropout(last_hidden)

        # Fully connected layers; no softmax here, CrossEntropyLoss applies
        # log-softmax internally and must receive raw logits
        fc_out = self.relu(self.fc1(last_hidden))
        return self.fc2(fc_out)

    def predict_proba(self, x):
        """Return class probabilities: softmax over the logits of ``forward``.

        Dropout is only disabled in eval mode, so call ``model.eval()`` first
        when deterministic predictions are required.
        """
        return self.softmax(self(x))



In [17]:
# Instantiate the CNN-LSTM classifier defined above
model = CNNLSTM()

# Show the layer summary (nn.Module's repr lists each registered submodule)
print(repr(model))



CNNLSTM(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ReLU()
  (lstm): LSTM(16384, 64, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)


In [18]:
# Training setup: loss function and optimizer (PyTorch has no "compile" step)
# nn.CrossEntropyLoss applies log-softmax internally, so it expects
# unnormalized logits as its input
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters; lr=1e-3 is Adam's default, spelled out here
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)