In [None]:
""" IN THE NAME OF  ALLAH , THE MOST GRACIOUS, THE MOST MERCIFUL. """

In [1]:
import sys
import cv2
import time
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch import Tensor
from torch.utils.data import DataLoader, random_split

modules_path = '../external_modules/'
dataset_path = '../../Dataset/mp'
save_path = "../model"

sys.path.insert(1, modules_path)
from video_dataset import VideoFrameDataset, ImglistToTensor
from global_defines import GESTURE_MAP_NUM2STR

if False and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

  warn(


cpu


In [2]:
# Load dataset
#img_w = 640
#img_h = 480
#frames_per_video = 7
batch_size = 8
num_workers = 1

dataset = VideoFrameDataset(
    root_path= f"{dataset_path}",
    annotationfile_path=f"{dataset_path}/annotations.txt",
    num_segments=8,
    frames_per_segment=1,
    imagefile_template='{:01d}.jpg',
    transform=ImglistToTensor(),
    test_mode=False
)
class_map = {0:"down", 1:"left", 2:"right", 3:"up"}

train_dataset, valid_dataset, test_dataset = random_split(dataset, [0.7,0.2,0.1])

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, num_workers=num_workers, shuffle=True)

In [15]:
# Define preprocessings
import mediapipe as mp

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.35,
    min_tracking_confidence=0.35
)
mp_drawing = mp.solutions.drawing_utils

def get_hand_landmarks_frame(frame, istransform=False):
    if istransform==True:
        frame = 255*frame
        frame = frame.transpose(2,0).numpy().astype('uint8')
    results = hands.process(frame)

    hand_landmarks = []
    if results.multi_hand_landmarks:
        for hand_landmark in results.multi_hand_landmarks[0].landmark:
            hand_landmarks.append([hand_landmark.x, hand_landmark.y])
        hand_landmarks = np.array(hand_landmarks).flatten()
    else:
        hand_landmarks = np.zeros(21*2)

    return torch.from_numpy(hand_landmarks.astype('float32'))


In [25]:
# Define model
class NN_LSTM(torch.nn.Module):
    """
    """
    def __init__(self, layer_sizes):
        super(NN_LSTM, self).__init__()
        self.linear1 = torch.nn.Linear(layer_sizes[0],layer_sizes[1])
        self.relu1 = torch.nn.ReLU()
        self.lstm = torch.nn.LSTM(layer_sizes[1],layer_sizes[2],batch_first=True)
        self.linear2 = torch.nn.Linear(layer_sizes[2],layer_sizes[3])

    def forward(self, x):
        out1 = self.linear1(x)
        out2 = self.relu1(out1)
        out3 = self.lstm(out2)
        out4 = self.linear2(out3[0])
        return out4

torch.set_default_dtype(torch.float32)
model = NN_LSTM([21*2,10,5,5]).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_function = torch.nn.CrossEntropyLoss()

In [27]:
# Define training loop
def train(model, data):
    model.train()
    for i, (x, y) in enumerate(data):
        print("\r"+str(i), end="")
            
        y = y.to(device)
        hand_landmarks = torch.zeros(batch_size, 8, 21*2)

        for i, video in enumerate(x):    # x is batch
            for j, frame in enumerate(video):
                hand_landmarks[i][j] = get_hand_landmarks_frame(frame, True)
        
        y_hat = model(hand_landmarks)
        y_hat = y_hat.transpose(0,1)[-1]
        loss = loss_function(y_hat, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

def test(model, data, label=""):
    correct = 0
    model.eval()
    with torch.no_grad():
        for k, (x,y) in enumerate(data):
            hand_landmarks = torch.zeros(batch_size, 8, 21*2)

            for i, video in enumerate(x):    # x is batch
                for j, frame in enumerate(video):
                    hand_landmarks[i][j] = get_hand_landmarks_frame(frame, True)

            y_hat = model(hand_landmarks)
            y_hat = y_hat.transpose(0,1)[-1]
            _, y_hat = torch.max(y_hat,1)
            correct += (y_hat==y).sum()
    print(label+"accuracy:", round(float(correct/(k+1)/batch_size), 3))

In [31]:
%%time

# Train
NumEpochs=1 
for epoch in range(NumEpochs):
    loss = train(model, train_loader)
    print("  Epoch:",epoch, " Loss:", round(loss,3))

212

ValueError: Expected input batch_size (8) to match target batch_size (2).

In [None]:
torch.save(model, save_path+"/model_lstm.model")

In [None]:
model = torch.load(save_path+"/model_lstm.model", map_location=torch.device('cpu')).to(device)

In [None]:
%%time

# Test
test(model, train_loader, "Training")
test(model, valid_loader, "Vailidation")

In [None]:
def predict(model, data, label=""):
    correct = 0
    model.eval()
    with torch.no_grad():
        for k, x in enumerate(data):
            hand_landmarks = torch.zeros(batch_size, 8, 63)

            for i, video in enumerate(x):    # x is batch
                for j, frame in enumerate(video):
                    hand_landmarks[i][j] = get_hand_landmarks_frame(frame)

            y_hat = model(hand_landmarks)
            y_hat = y_hat.transpose(0,1)[-1]
            confidence, y_hat = torch.max(y_hat,1)
            print(y_hat)
            if confidence[0].item() >= 0.6:
                print(GESTURE_MAP_NUM2STR[y_hat[0].item()], confidence[0].item())
            else:
                print("None", confidence[0].item())

def LiveVideoTest():
    # Captures videos
    num_batches = 10
    batch_size = 1
    frames_per_video = 8
    frame_rate = 10

    for n in range(num_batches):
        batch = []
        for v in range(batch_size):
            video = []
            for f in range(frames_per_video):
                ret, frame = cam.read()
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                video.append(frame)

                # Draw hand landmarks on the frame
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        self.mp_drawing.draw_landmarks(
                            frame, hand_landmarks, self.mp_hands.HAND_CONNECTIONS)
                cv2.imshow('Hand Gesture Recognition', frame)
                    
                time.sleep(1/frame_rate)
                
            batch.append(video)
        predict(model,[batch])
    cam.release()
    cv2.destroyAllWindows()

cam = cv2.VideoCapture(0)
try:
    LiveVideoTest()
except Exception as e:
    print("Error:", e)
    cam.release()
    cv2.destroyAllWindows()

In [None]:
cam = cv2.VideoCapture(0)
LiveVideoTest()

In [None]:
cam.release()
cv2.destroyAllWindows()