In [28]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import cv2
import os
from PIL import Image
from torchvision import transforms

In [29]:
# Variables
data_path = 'C:/Users/C25Thomas.Blalock/Coding/Data Competition Team/asl/data/asl_alphabet_test'
num_epochs = 10

In [38]:

# Define dataset class to load images
class FingerspellingDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.images = []
        self.labels = []
        
        # Loop through data directory and read in images and labels
        for root, dirs, files in os.walk(data_dir):
            for file in files:
                image_path = os.path.join(root, file)
                label = os.path.basename(root)
                
                image = Image.open(image_path) 
                image = image.convert('RGB') # makes channels first dim
                image = transforms.ToTensor()(image) 
                self.images.append(image)
                self.labels.append(label)

        # Create label maps
        self.label_map = {}
        self.label_text = []

        for label in self.labels:
            if label not in self.label_map:
                self.label_map[label] = len(self.label_map)
                self.label_text.append(label)
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        label_text = self.labels[idx]

        # Convert label to tensor
        label_idx = self.label_map[label_text]
        label_tensor = torch.tensor(label_idx)
        
        return image, label_tensor, label_text


# Matrix sizing for convolutional layers: https://www.baeldung.com/cs/convolutional-layer-size

Sizing calculator: https://madebyollin.github.io/convnet-calculator/

out_channels is filter count in the calculator

spacial extent is kernal_size

Input: Batch_size = N ; Number_of_Channels = C_in ; Height = H_in ; Width = W_in

Input: num_channels x heigt x width x Batch_Size

Conv Layers Params: in_channels = C_in ; out_channels = C_out ; kernel_size = K ; stride = S ; padding = P

Output of Conv Layer: Batch_size = N ; Number_of_Channels = C_out ; Height = H_out ; Width = W_out

Output of Conv Layer: N x C_out x H_out x W_out

H_out = (H_in + 2*P - K)/S + 1

W_out = (W_in + 2*P - K)/S + 1

ReLu: size_in = size_out

MaxPool Params: kernel_size = K ; stride = S

Output of MaxPool: Batch_size = N ; Number_of_Channels = C_out ; Height = H_out ; Width = W_out

H_out = (H_in - K)/S + 1

W_out = (W_in - K)/S + 1

In [31]:

# Define network architecture 
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        # Convolutional layer 1
        self.conv1 = nn.Sequential(
            torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            # add batch norm

            torch.nn.MaxPool2d(kernel_size=2, stride=2) 
        )

        # Convolutional layer 2
        self.conv2 = nn.Sequential(
            torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Convolutional layer 3
        self.conv3 = nn.Sequential(
            torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Resize layer so dense can process
        self.resize = nn.Sequential(
            torch.nn.Flatten()
        )

        # Dense layer 1
        self.dense1 = nn.Sequential(
            torch.nn.Linear(in_features=64*25*25, out_features=1024),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.3)
        )

        # Output layer
        self.output = nn.Sequential(
            torch.nn.Linear(in_features=1024, out_features=29),
            torch.nn.Softmax(dim=1)
        )

        # Output: 28x1xBatch_Size
        # 28 = 26 letters + 1 space + 1 nothing + 1 del

    def forward(self, x):
        # Forward pass
        x = self.conv1(x)
        # print(" conv1 Output Shape: ", x.shape) # conv1 Output Shape:  torch.Size([28, 16, 100, 100])
        x = self.conv2(x)
        # print(" conv2 Output Shape: ", x.shape) # conv2 Output Shape:  torch.Size([28, 32, 50, 50])
        x = self.conv3(x)
        # print(" conv3 Output Shape: ", x.shape) # conv3 Output Shape:  torch.Size([28, 64, 25, 25])
        x = self.resize(x)
        # print(" resize Output Shape: ", x.shape) # resize Output Shape:  torch.Size([28, 40000])
        x = self.dense1(x)
        # print(" dense1 Output Shape: ", x.shape) # dense1 Output Shape:  torch.Size([28, 1024])
        x = self.output(x)
        # print(" output Output Shape: ", x.shape) # output Output Shape:  torch.Size([28, 29])
        return x


In [39]:

# Create dataset and dataloader, test and train
dataset = FingerspellingDataset(data_path)

 # split into a train and test set
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)


In [43]:
# check to see the dataloader is working
for image, label_tensor, label_text in train_loader:
    print(image.shape)
    for i in range(5):
        print(label_text[i], label_tensor[i].item())
    break

torch.Size([32, 3, 200, 200])
D 3
F 6
L 12
del 4
del 4


In [64]:

# Instantiate model, loss function and optimizer
model = Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())


In [69]:

# Training loop
for epoch in range(num_epochs):
    for images, labels, _ in train_loader:
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Print loss  
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {100*torch.sum(labels==torch.argmax(outputs, dim=1))/len(labels):.4f}%')


Epoch [1/10], Loss: 3.1910, Accuracy: 22.7273%
Epoch [2/10], Loss: 3.0686, Accuracy: 45.4545%
Epoch [3/10], Loss: 3.0446, Accuracy: 40.9091%
Epoch [4/10], Loss: 2.9935, Accuracy: 45.4545%
Epoch [5/10], Loss: 3.0226, Accuracy: 36.3636%
Epoch [6/10], Loss: 3.0011, Accuracy: 40.9091%
Epoch [7/10], Loss: 2.9776, Accuracy: 40.9091%
Epoch [8/10], Loss: 2.9754, Accuracy: 40.9091%
Epoch [9/10], Loss: 2.9282, Accuracy: 50.0000%
Epoch [10/10], Loss: 2.9248, Accuracy: 45.4545%


In [None]:

# Test loop
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels, _ in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
    print(f'Accuracy: {correct/total*100:.2f}%')