In [None]:
import copy

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

#### Check if GPU (CUDA) is available or not

In [None]:
import torch

# Report whether PyTorch can see a CUDA device.
gpu_status = "GPU is available" if torch.cuda.is_available() else "GPU is not available"
print(gpu_status)

In [None]:
# Choose the compute device once; later cells move the model and tensors to it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)

#### Loading the dataset

In [None]:
# Load the gesture table. Later cells read its "Image" column (passed to
# cv2.imread as a file path) and its "Gesture" column (class name strings).
dataset = pd.read_csv("hand_coordinates.csv")

In [None]:
# Inspect which columns the CSV provides.
dataset.columns

In [None]:
# Image path stored under row label 0; used below as a quick visual sanity check.
test = dataset["Image"][0]

In [None]:
# cv2.imread returns pixels in BGR order (and None when the file cannot be
# read), whereas matplotlib interprets arrays as RGB. Convert before plotting
# so the preview shows the true colours, and fail loudly on a bad path.
img = cv2.imread(test)
if img is None:
    raise FileNotFoundError(f"Could not read image: {test}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)

In [None]:
def _sample_gesture(frame, gesture, n, seed=42):
    """Return ``n`` rows of ``frame`` whose "Gesture" equals ``gesture``.

    Rows are drawn with replacement using a fixed seed, and the result gets a
    fresh 0..n-1 index.

    Raises:
        ValueError: if ``frame`` has no rows for ``gesture``.
    """
    subset = frame[frame["Gesture"] == gesture]
    if subset.empty:
        raise ValueError(f"No rows found for gesture {gesture!r}")
    return subset.sample(n=n, random_state=seed, replace=True).reset_index(drop=True)


# NOTE(review): sampling with replacement can duplicate rows. The later
# train/validation split is taken from these resampled frames, so the same image
# may land in both splits and inflate validation accuracy.
backward = _sample_gesture(dataset, "backward", 70)
brightness = _sample_gesture(dataset, "brightness", 70)
forward = _sample_gesture(dataset, "forward", 70)
Next = _sample_gesture(dataset, "next", 70)
no_gesture = _sample_gesture(dataset, "NoGesture", 70)
pointer = _sample_gesture(dataset, "pointer", 70)
# NOTE(review): "previous" draws 68 rows while every other gesture draws 70.
# Kept as in the original; confirm whether the difference is intentional.
previous = _sample_gesture(dataset, "previous", 68)
volume = _sample_gesture(dataset, "volume", 70)

In [None]:
# Peek at the first rows of one resampled gesture frame.
volume.head()

In [None]:
# Stack the per-gesture samples into one frame, then shuffle the rows with a
# fixed seed and renumber the index from zero.
gesture_frames = [backward, brightness, forward, Next, no_gesture, pointer, previous, volume]
new_data = (
    pd.concat(gesture_frames, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
new_data.head()

#### Encoding categorical variables, in this case the dependent variable "Gesture"

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the gesture names with integer class ids. The fitted encoder is kept
# so predictions can be mapped back to names through label_encoder.classes_.
# Caution: this overwrites the "Gesture" column, so re-running the cell fits the
# encoder on the integer ids instead of the original names.
label_encoder = LabelEncoder()
gesture_codes = label_encoder.fit_transform(new_data["Gesture"])
new_data["Gesture"] = gesture_codes

In [None]:
def load_image(path):
    """Read the image at ``path`` and return it as an RGB array.

    Args:
        path: file path handed to ``cv2.imread``.

    Returns:
        The decoded image with channels converted from OpenCV's BGR order to RGB.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (``cv2.imread``
            signals this by returning ``None`` rather than raising).
    """
    img = cv2.imread(path)
    if img is None:
        # Without this check the failure would only surface inside cvtColor
        # as an opaque assertion error that does not mention the path.
        raise FileNotFoundError(f"Could not read image: {path}")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image

### To build the data pipeline, we proceed with the following steps
<br>
1. Defining the class to load the input data
    <br>|<br>
    --- read the image <br>
    --- preprocess it <br>
    --- return the image and label <br>

<br>
2. Define the variable to transform the data
    <br>|<br>
    ---  A dictionary that transforms training data, and a dictionary to transform validation data<br>
    ---  Resize the image<br>
    ---  Convert the data into tensor and then normalize the data<br>
<br>
3. Train-test-split<br>
<br>
4. Prepare training and validation dataset by loading the images from image path using above defined transformation methods<br>
<br>
5. Creating the training and validation batch using training and validation dataset

In [None]:
class HandGestureDataset(data.Dataset):
    """Dataset that yields ``(image, label)`` pairs from a gesture dataframe.

    Each row must provide an "Image" entry (a path understood by
    ``load_image``) and a "Gesture" entry (a value convertible to a long tensor).

    Args:
        dataframe: table with "Image" and "Gesture" columns.
        transform: optional callable applied to the loaded PIL image.
            Non-callable values (``None``, ``True``, ``False``) mean
            "no transform".
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        # Keep only a transform that can actually be called. The previous
        # default of ``True`` passed the truthiness test in __getitem__ and
        # then failed with "'bool' object is not callable".
        self.transform = transform if callable(transform) else None

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)``, or ``(None, None)`` if the image fails to load."""
        row = self.dataframe.iloc[idx]
        img_path = row['Image']
        try:
            image = Image.fromarray(load_image(img_path))
        except Exception as e:
            # Best effort: report the problem and hand back a placeholder that
            # the collate function is expected to filter out.
            print(f"Error loading image {img_path}: {e}")
            return None, None

        if self.transform is not None:
            image = self.transform(image)
        label = torch.tensor(row['Gesture'], dtype=torch.long)

        return image, label

In [None]:
# Per-channel mean / std passed to Normalize (these are the widely used
# ImageNet statistics).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _build_pipeline(*augmentations):
    """Resize to 224x224, apply ``augmentations``, convert to a tensor, normalise."""
    return transforms.Compose([
        transforms.Resize((224, 224)),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize(_NORM_MEAN, _NORM_STD),
    ])


# NOTE(review): a random horizontal flip mirrors the hand; if any gesture is
# direction-dependent this could blur class boundaries -- confirm it is wanted.
data_transforms = {
    "train": _build_pipeline(transforms.RandomHorizontalFlip()),
    "val": _build_pipeline(),
}

In [None]:
# Hold out 20% of the rows for validation, keeping the class proportions equal
# in both splits (stratified on the encoded gesture id).
# NOTE(review): new_data was built by sampling with replacement, so duplicated
# rows may appear on both sides of this split.
train_data, val_data = train_test_split(
    new_data,
    test_size=0.2,
    random_state=42,
    stratify=new_data["Gesture"],
)

In [None]:
# Wrap each split in a Dataset; only the training split gets the augmenting pipeline.
train_dataset = HandGestureDataset(train_data, transform=data_transforms["train"])
val_dataset = HandGestureDataset(val_data, transform=data_transforms["val"])

In [None]:
def collate_fn(batch):
    """Collate a batch while skipping samples whose image failed to load.

    Args:
        batch: list of ``(image, label)`` pairs; failed samples arrive as
            ``(None, None)``.

    Returns:
        The result of PyTorch's default collate on the remaining samples.

    Raises:
        RuntimeError: if no sample in the batch could be loaded. Passing an
            empty list to ``default_collate`` would otherwise fail with an
            uninformative ``IndexError``.
    """
    valid = [sample for sample in batch if sample[0] is not None]
    if not valid:
        raise RuntimeError(
            "Every sample in this batch failed to load; check the image paths."
        )
    return torch.utils.data.dataloader.default_collate(valid)

In [None]:
# Settings shared by both loaders: batches of 8, loading in the main process,
# and the collate function that drops samples that failed to load.
loader_kwargs = dict(batch_size=8, num_workers=0, collate_fn=collate_fn)

# Only the training data is reshuffled every epoch.
train_loader = data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)

Loading a sample batch from the training loader

In [None]:
# Pull a single batch from the training loader and unpack it into images and labels.
sample_batch = next(iter(train_loader))

in_img, label = sample_batch

In [None]:
# Copy the sample batch to the selected device and report its shapes.
# .to(device) returns new tensors; in_img and label themselves are left untouched.
images = in_img.to(device)
labels = label.to(device)
print("Images shape:", images.shape)
print("Labels shape:", labels.shape)

# Show the labels
print("Labels:", labels)

The shape above shows that the sample batch holds 8 images (the batch size), each with 3 channels and a resolution of 224 x 224 pixels.

In [None]:
# Same inspection with the batch on the CPU, which is what plotting needs.
images = in_img.cpu()
labels = label.cpu()
print("Images shape:", images.shape)
print("Labels shape:", labels.shape)

# Show the labels
print("Labels:", labels)

<p>Images stored on the GPU cannot be shown with plt or cv2. They must first be moved to the CPU.</p>

In [None]:
# Show a single channel (index 1) of the fourth image in the batch. The values
# are the normalised ones, so this is a diagnostic view rather than a true-colour image.
plt.imshow(in_img[3][1])

#### Defining the model architecture

In [None]:
class GestureRecognitionModel(nn.Module):
    """Small CNN classifier: three conv/ReLU/max-pool stages and a two-layer head.

    Every pooling stage halves the spatial size, and the first linear layer
    expects 128 * 28 * 28 features, i.e. 3-channel inputs of 224 x 224 pixels.

    Args:
        num_classes: size of the output layer (one logit per gesture class).
    """

    def __init__(self, num_classes):
        super().__init__()

        # (in_channels, out_channels) of the three convolution stages.
        stage_channels = ((3, 32), (32, 64), (64, 128))
        conv_layers = []
        for in_ch, out_ch in stage_channels:
            conv_layers.append(nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1))
            conv_layers.append(nn.ReLU(inplace=True))
            conv_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        # Layer order (and therefore the state_dict keys) must stay stable so
        # previously saved weights can still be loaded.
        self.features = nn.Sequential(*conv_layers)

        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(128 * 28 * 28, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch of images."""
        feature_maps = self.features(x)
        flat = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flat)


In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is created.
torch.cuda.empty_cache()

In [None]:
# One output logit per gesture class known to the fitted label encoder.
num_classes = len(label_encoder.classes_)
model = GestureRecognitionModel(num_classes=num_classes).to(device)

In [None]:
# Cross-entropy on the raw logits, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=25):
    """Train ``model`` and return it holding the weights of its best validation epoch.

    Every epoch runs a training pass over ``train_loader`` followed by an
    evaluation pass over ``val_loader``. Whenever validation accuracy improves,
    a snapshot of the weights is taken and also written to
    'hand_gesture_model_wts.pth' in the working directory.

    Args:
        model: network to train; batches are moved to the device of its parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        num_epochs: number of train + validation rounds.

    Returns:
        The same ``model`` object, loaded with the best recorded weights.
    """
    # Use the device the model already lives on instead of a notebook global.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    # state_dict() returns references to the live parameter tensors, so a real
    # copy is needed; otherwise later optimizer steps overwrite the "best" weights.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training phase followed by a validation phase.
        for phase, dataloader in (('train', train_loader), ('val', val_loader)):
            is_training = phase == 'train'
            model.train(is_training)  # train mode for 'train', eval mode for 'val'

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for inputs, labels in dataloader:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)
                batch_size = inputs.size(0)

                if is_training:
                    optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size.
                running_loss += loss.item() * batch_size
                running_corrects += (preds == labels).sum().item()
                samples_seen += batch_size

            # Average over the samples actually processed: the collate step may
            # drop unreadable images, and an empty loader must not divide by zero.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Snapshot and persist the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, 'hand_gesture_model_wts.pth')

        print()

    print(f'Best val Acc: {best_acc:.4f}')

    # Restore the best snapshot before handing the model back.
    model.load_state_dict(best_model_wts)
    return model


#### Train the Model

In [None]:
# model = train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=25)

In [None]:
# model.eval()

#### Saving the trained model

In [None]:
# torch.save(model, 'trained_model.pth')

#### Load the saved model

In [None]:
# 'trained_model.pth' is expected to hold the whole pickled model object (the
# commented-out save cell above uses torch.save(model, ...)), not just a state dict.
# - map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# - weights_only=False is required for whole-model pickles on PyTorch versions
#   whose default only permits tensors / state dicts.
# SECURITY: unpickling can execute arbitrary code; only load checkpoints you trust.
model = torch.load('trained_model.pth', map_location=device, weights_only=False)
model.eval()

##### Inference on sample data

In [None]:
# Reminder of the columns available for picking a sample.
new_data.columns

In [None]:
# Pick one row of the shuffled data as an inference example.
sample_image_path = new_data["Image"][56]
# The "Gesture" column was label-encoded earlier, so this is the integer class id.
corr_label = new_data["Gesture"][56]
# load_image converts OpenCV's BGR output to RGB; plotting the raw cv2.imread
# result would show the image with red and blue swapped.
sample_image = load_image(sample_image_path)
plt.imshow(sample_image)
print("Correlated Label (Gesture): ", corr_label)

In [None]:
import torch
from torchvision import transforms
from PIL import Image

# Inference must use exactly the preprocessing applied to validation images, so
# reuse that pipeline instead of keeping a second copy that could drift.
data_transform = data_transforms["val"]

# Load and preprocess the sample image.
# To run on a different image, set sample_image_path to its file path first.
sample_image = Image.open(sample_image_path).convert('RGB')
input_tensor = data_transform(sample_image)
# The model expects a batch, so add a leading batch dimension of size 1.
input_batch = input_tensor.unsqueeze(0)



In [None]:
# Preview the first channel of the preprocessed image. Plot from input_tensor
# (which stays on the CPU) rather than input_batch, so the cell still works when
# it is re-run after input_batch has been moved to the GPU.
plt.imshow(input_tensor[0])

# Move the input batch to the device (GPU if available)
input_batch = input_batch.to(device)

# Perform inference without tracking gradients
with torch.no_grad():
    output = model(input_batch)

# Turn the logits of the single sample into class probabilities, then pick the
# most likely class and map its index back to the gesture name.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
predicted_class_index = torch.argmax(probabilities).item()
predicted_class = label_encoder.classes_[predicted_class_index]

print("Predicted class: ", predicted_class_index)
print(f"Predicted Gesture: {predicted_class}")
print(f"Confidence: {probabilities[predicted_class_index].item():.4f}")


