Installing and importing relevant dependencies 

In [None]:
! pip install opencv-python numpy scikit-image scikit-learn matplotlib pandas tensorflow mediapipe

In [5]:
import cv2
import mediapipe as mp
import numpy as np
import os
import random
import datetime
import matplotlib.pyplot as plt
from skimage import io 
import glob  
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import threading
import time
from collections import deque
import tensorflow as tf
from tensorflow import keras 
import pandas as pd

## Collecting input data to train models

Collecting data for Static Gestures

In [7]:
# MediaPipe hand-tracking module and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Folder that receives the static-gesture CSV files (created if missing).
SAVE_DIR = "gesture_data"
os.makedirs(SAVE_DIR, exist_ok=True)

# Ask the operator what is being recorded and how many rows are wanted.
gesture_type = input("Enter gesture type (S for Static): ").strip().upper()
gesture_label = input("Enter gesture label (A, B, C): ").strip().upper()
num_samples = int(input("Enter number of continuous samples to collect: "))
print(f"Collecting {num_samples} continuous samples for {gesture_type}-{gesture_label}")

# One CSV per (type, label) pair; an existing file is left untouched here.
csv_path = os.path.join(SAVE_DIR, f"{gesture_type}_{gesture_label}.csv")

# A new file starts with a header: the label column, then x/y/z for indices 0..20.
if not os.path.exists(csv_path):
    coordinate_columns = []
    for landmark_index in range(21):
        coordinate_columns += [
            f"x{landmark_index}",
            f"y{landmark_index}",
            f"z{landmark_index}",
        ]
    with open(csv_path, "w") as header_file:
        header_file.write(",".join(["gesture_label", *coordinate_columns]) + "\n")

def save_landmarks(landmarks):
    """Append one CSV row describing a single hand pose.

    The row begins with the module-level ``gesture_label`` and continues with
    the ``x``, ``y`` and ``z`` attributes of every item in ``landmarks``, in
    iteration order. It is appended to the file at the module-level
    ``csv_path``.
    """
    fields = [str(gesture_label)]
    for point in landmarks:
        fields += [str(point.x), str(point.y), str(point.z)]

    with open(csv_path, "a") as out_file:
        out_file.write(",".join(fields) + "\n")

def classify_static(hand_landmarks):
    """Heuristically label a pose from two pairs of landmarks.

    The index finger is treated as raised when landmark 8 has a smaller ``y``
    than landmark 6; the middle finger likewise with landmarks 12 and 10.

    Returns:
        "A" when only the index finger is raised, "B" when both are raised,
        "C" when neither is raised, and ``None`` when only the middle finger
        is raised.
    """
    index_raised = bool(hand_landmarks[8].y < hand_landmarks[6].y)
    middle_raised = bool(hand_landmarks[12].y < hand_landmarks[10].y)

    # (index raised, middle raised) -> label; the missing combination yields None.
    label_by_finger_state = {
        (True, False): "A",
        (True, True): "B",
        (False, False): "C",
    }
    return label_by_finger_state.get((index_raised, middle_raised))

# --- Static gesture capture loop ---------------------------------------------
# Shows the webcam feed with the heuristic label from classify_static() and,
# once 's' is pressed, appends landmark rows to `csv_path` via save_landmarks()
# until `num_samples` rows have been written. 'q' stops at any time.
cap = cv2.VideoCapture(0)
sample_count = 0
prev_landmarks = None  # never read in this cell; kept so the name stays defined
collecting = False
finished = False  # becomes True once the requested number of rows is written

try:
    if not cap.isOpened():
        print("Could not open camera 0; nothing was recorded.")
    else:
        with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7) as hands:
            print("Press 'q' to quit. Press 's' to start data collection.")

            while cap.isOpened() and not finished:
                ret, frame = cap.read()
                if not ret:
                    continue

                # The frame is converted from BGR to RGB before being handed to MediaPipe.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(frame_rgb)

                detected_gesture = None

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                        landmarks = hand_landmarks.landmark

                        if gesture_type == "S":
                            detected_gesture = classify_static(landmarks)

                        color = (0, 255, 0) if detected_gesture else (0, 0, 255)
                        cv2.putText(frame, f"Gesture: {detected_gesture if detected_gesture else 'None'}",
                                    (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

                        if collecting and sample_count < num_samples:
                            save_landmarks(landmarks)
                            sample_count += 1
                            if sample_count % 50 == 0:
                                print(f"{sample_count}/{num_samples} frames collected...")

                # Setting `finished` ends the outer loop; a bare `break` inside the
                # per-hand loop would only leave that inner loop.
                if collecting and sample_count >= num_samples:
                    collecting = False
                    finished = True
                    print(f"Finished collecting {num_samples} samples for {gesture_type}-{gesture_label}")

                cv2.imshow('Hand Gesture Collection', frame)

                # Poll the keyboard exactly once per frame so no key press is
                # consumed by one check and missed by another, and so 's' works
                # even when no hand is currently detected.
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                if key == ord('s') and not collecting and not finished:
                    print("Starting in 2 seconds...")
                    time.sleep(2)
                    collecting = True
                    sample_count = 0
                    print(f"Collecting {num_samples} continuous samples...")
finally:
    # Always free the camera and close the preview window, even on an error.
    cap.release()
    cv2.destroyAllWindows()

# Report the number of rows actually written (may be fewer than requested on an early quit).
print(f"Data collection complete. Saved {sample_count} samples in {csv_path}.")

Enter gesture type (S for Static, D for Dynamic):  S
Enter gesture label (A, B, C, D, E, etc.):  C
Enter number of continuous samples to collect:  1000


Collecting 1000 continuous samples for S-C
Press 'q' to quit. Press 's' to start data collection.
Starting in 2 seconds...
Collecting 1000 continuous samples...
50/1000 frames collected...
100/1000 frames collected...
150/1000 frames collected...
200/1000 frames collected...
250/1000 frames collected...
300/1000 frames collected...
350/1000 frames collected...
400/1000 frames collected...
450/1000 frames collected...
500/1000 frames collected...
550/1000 frames collected...
600/1000 frames collected...
650/1000 frames collected...
700/1000 frames collected...
750/1000 frames collected...
800/1000 frames collected...
850/1000 frames collected...
900/1000 frames collected...
950/1000 frames collected...
1000/1000 frames collected...
Finished collecting 1000 samples for S-C
Data collection complete. Saved 1000 samples in gesture_data\S_C.csv.


Collecting data for dynamic gesture recognition. Using 30 frames as one sequence.

In [None]:
# MediaPipe hand-tracking module and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Folder that receives the dynamic-gesture CSV files (created if missing).
SAVE_DIR = "gesture_sequences"
os.makedirs(SAVE_DIR, exist_ok=True)

# Ask the operator which gesture is recorded and how much data is wanted.
gesture_label = input("Enter dynamic gesture label (D, E, etc.): ").strip().upper()
num_sequences = int(input("Enter number of sequences to collect: "))
sequence_length = int(input("Enter number of frames per sequence: "))
print(f"Collecting {num_sequences} sequences of {sequence_length} frames for '{gesture_label}' gesture.")

# One CSV per gesture label; an existing file is left untouched here.
csv_path = os.path.join(SAVE_DIR, f"{gesture_label}.csv")

# A new file starts with a header: sequence id, frame id, then x/y/z for indices 0..20.
if not os.path.exists(csv_path):
    header = ["sequence_id", "frame_id"]
    for i in range(21):
        header.append(",".join((f"x{i}", f"y{i}", f"z{i}")))
    with open(csv_path, "w") as f:
        f.write(",".join(header) + "\n")

def save_sequence(sequence_id, frame_id, landmarks):
    """Append one frame of a gesture sequence to the CSV at ``csv_path``.

    The row holds ``sequence_id``, ``frame_id`` and then the ``x``, ``y`` and
    ``z`` attributes of every item in ``landmarks``, in iteration order.
    """
    fields = [str(sequence_id), str(frame_id)]
    for point in landmarks:
        fields += [str(point.x), str(point.y), str(point.z)]

    with open(csv_path, "a") as out_file:
        out_file.write(",".join(fields) + "\n")

# --- Dynamic gesture capture loop --------------------------------------------
# Waits for 's', then records `num_sequences` sequences of `sequence_length`
# frames each via save_sequence(). 'q' stops at any time.
cap = cv2.VideoCapture(0)
aborted = False          # set when the user presses 'q' or the camera is unavailable
recorded_sequences = 0   # sequences fully written during this run

# Rows are appended to an existing CSV, so continue numbering after the highest
# stored sequence_id instead of restarting at 0 (which would merge sequences
# from different runs under the same id).
first_sequence_id = 0
if os.path.exists(csv_path):
    stored_ids = pd.read_csv(csv_path, usecols=["sequence_id"])["sequence_id"].dropna()
    if not stored_ids.empty:
        first_sequence_id = int(stored_ids.max()) + 1

try:
    if not cap.isOpened():
        print("Could not open camera 0; nothing was recorded.")
        aborted = True

    with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7) as hands:
        if not aborted:
            print("Press 's' to start collecting data. Press 'q' to quit.")

        # Phase 1: show the preview until the user starts or quits.
        while not aborted:
            ret, frame = cap.read()
            if not ret:
                continue

            cv2.putText(frame, "Press 's' to start recording", (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.imshow('Dynamic Gesture Collection', frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('s'):
                print("Starting in 2 seconds...")
                time.sleep(2)
                break
            elif key == ord('q'):
                # A flag is used instead of exit(), which would shut down the notebook kernel.
                aborted = True

        # Phase 2: record the requested sequences.
        sequence_id = first_sequence_id

        while not aborted and recorded_sequences < num_sequences:
            frame_count = 0

            while frame_count < sequence_length:
                ret, frame = cap.read()
                if not ret:
                    continue
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(frame_rgb)
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    # Store exactly one row per video frame (the first detected hand),
                    # so a second hand can neither double the rows nor push a
                    # sequence past `sequence_length`.
                    landmarks = results.multi_hand_landmarks[0].landmark
                    save_sequence(sequence_id, frame_count, landmarks)
                    frame_count += 1

                cv2.putText(frame, f"Recording Sequence {recorded_sequences + 1}/{num_sequences}",
                            (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow('Dynamic Gesture Collection', frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    aborted = True
                    break

            if aborted:
                break

            sequence_id += 1
            recorded_sequences += 1
finally:
    # Always free the camera and close the preview window, even on an error.
    cap.release()
    cv2.destroyAllWindows()

if aborted:
    print(f"Stopped early after {recorded_sequences} complete sequences for {gesture_label}.")
else:
    print(f"Finished collecting {num_sequences} sequences for {gesture_label}.")
print(f"Data collection complete. Data saved in {csv_path}.")

## Training models 

Loading the saved csv file, and creating the Train-Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Load the three static-gesture recordings and merge them into one frame.
df_A = pd.read_csv("gesture_data/S_A.csv")
df_B = pd.read_csv("gesture_data/S_B.csv")
df_C = pd.read_csv("gesture_data/S_C.csv")

df = pd.concat([df_A, df_B, df_C], ignore_index=True)
# Seeded shuffle so that re-running the cell reproduces the same row order.
df = df.sample(frac=1, random_state=3).reset_index(drop=True)

# Letter label -> class index used by the network.
label_mapping = {"A": 0, "B": 1, "C": 2}
class_ids = df.iloc[:, 0].map(label_mapping)
if class_ids.isna().any():
    # .map() turns labels missing from the mapping into NaN; fail loudly
    # instead of passing NaN on to the one-hot encoding.
    unknown = sorted(df.iloc[:, 0][class_ids.isna()].astype(str).unique())
    raise ValueError(f"Unexpected gesture labels in data: {unknown}")
class_ids = class_ids.astype("int64").to_numpy()

y = class_ids
X = df.iloc[:, 1:].values

num_landmarks = 21
num_coordinates = 3

expected_features = num_landmarks * num_coordinates
actual_features = X.shape[1]

if actual_features != expected_features:
    raise ValueError(f"Expected {expected_features} features, but got {actual_features}")

# (samples, 63) -> (samples, 21, 3): one row of x/y/z per landmark.
X = X.reshape(X.shape[0], num_landmarks, num_coordinates)

# One-hot targets for the categorical cross-entropy loss.
y = keras.utils.to_categorical(y, num_classes=len(label_mapping))

# Stratify on the integer class ids so both splits keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3, stratify=class_ids
)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (3000, 21, 3)
y shape: (3000, 3)


Used a CNN to exploit the spatial relation between the 21 points. Saved model locally.

In [5]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam


# 1-D convolutions slide over the 21 landmarks, each carrying 3 coordinates.
# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first Conv1D, which emits a warning.
model = Sequential([
    Input(shape=(21, 3)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')  # one output per static gesture class
])

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): X_test doubles as the validation set here, so the reported
# val_* metrics are not an independent hold-out estimate.
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

model.save("gesture_cnn_model.h5")
print("Model saved successfully!")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4815 - loss: 1.0030 - val_accuracy: 0.8433 - val_loss: 0.4742
Epoch 2/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8560 - loss: 0.4242 - val_accuracy: 0.8867 - val_loss: 0.2669
Epoch 3/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9038 - loss: 0.2586 - val_accuracy: 0.9200 - val_loss: 0.2037
Epoch 4/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9174 - loss: 0.2043 - val_accuracy: 0.9367 - val_loss: 0.1652
Epoch 5/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9400 - loss: 0.1570 - val_accuracy: 0.9267 - val_loss: 0.1903
Epoch 6/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9320 - loss: 0.1706 - val_accuracy: 0.9417 - val_loss: 0.1473
Epoch 7/20
[1m75/75[0m [32m━━━━━━━━━━



Model saved successfully!


Loading the locally saved .csv to dataframe and creating Train-Test Split

In [7]:
# Imported here so this cell does not depend on another cell having run first.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
DATA_DIR = "gesture_sequences"

sequence_length = 30
num_landmarks = 21 * 3  # values per frame: 21 landmarks x (x, y, z)

X, y = [], []
gesture_labels = []
skipped_sequences = 0  # groups whose row count differs from sequence_length

# sorted() makes the processing order independent of the file system.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith(".csv"):
        continue

    label = file.split(".")[0]
    df = pd.read_csv(os.path.join(DATA_DIR, file))

    sequences_before = len(X)
    for _, sequence in df.groupby("sequence_id"):
        if len(sequence) != sequence_length:
            skipped_sequences += 1
            continue
        # Drop the two id columns; keep only the landmark coordinates.
        X.append(sequence.iloc[:, 2:].values)
        y.append(label)

    # Only count a label as a class if it actually contributed sequences.
    if len(X) > sequences_before:
        gesture_labels.append(label)

if not X:
    raise ValueError(
        f"No sequences of exactly {sequence_length} frames were found in '{DATA_DIR}'."
    )
if skipped_sequences:
    print(f"Skipped {skipped_sequences} sequences whose length was not {sequence_length} frames.")

print(df.head())

X = np.array(X)
y = np.array(y)

if X.shape[2] != num_landmarks:
    raise ValueError(f"Expected {num_landmarks} features per frame, but got {X.shape[2]}")

# String labels -> integer class ids (for sparse categorical cross-entropy).
encoder = LabelEncoder()
y = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

   sequence_id  frame_id        x0        y0            z0        x1  \
0            0         0  0.194402  0.697508  3.612226e-07  0.253598   
1            0         1  0.183029  0.705346  4.700153e-07  0.252113   
2            0         2  0.178611  0.707601  4.641686e-07  0.251979   
3            0         3  0.175116  0.709159  4.515502e-07  0.248226   
4            0         4  0.174221  0.710845  4.316040e-07  0.248648   

         y1        z1        x2        y2  ...       z17       x18       y18  \
0  0.687416 -0.010835  0.301478  0.611149  ... -0.039869  0.274102  0.400163   
1  0.690928 -0.014209  0.304526  0.618035  ... -0.032517  0.270675  0.402734   
2  0.691607 -0.013351  0.304803  0.620390  ... -0.032343  0.273874  0.399957   
3  0.691573 -0.013077  0.302036  0.622272  ... -0.033773  0.274288  0.395755   
4  0.691815 -0.011558  0.300989  0.621751  ... -0.034071  0.273905  0.397119   

        z18       x19       y19       z19       x20       y20       z20  
0 -0.061841 

Using an LSTM because dynamic gesture recognition needs to keep track of past frames.

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Size the output layer from the fitted encoder: `encoder.classes_` is also
# what gets saved below as the label lookup table, so the two always agree.
num_classes = len(encoder.classes_)

# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first LSTM, which emits a warning.
model = Sequential([
    Input(shape=(sequence_length, num_landmarks)),
    LSTM(64, return_sequences=True),  # passes the full sequence to the next LSTM
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Targets are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): X_test doubles as the validation set here, so the reported
# val_* metrics are not an independent hold-out estimate.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)

model.save("gesture_lstm_model.h5")
# Persist the index -> label table used to decode predictions later.
np.save("gesture_labels.npy", encoder.classes_)

print("Training complete! Model saved as 'gesture_lstm_model.h5'.")

Epoch 1/20


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 82ms/step - accuracy: 0.5386 - loss: 0.7112 - val_accuracy: 0.6000 - val_loss: 0.6605
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6565 - loss: 0.6448 - val_accuracy: 0.7750 - val_loss: 0.5586
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7813 - loss: 0.5383 - val_accuracy: 0.8750 - val_loss: 0.3823
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9605 - loss: 0.3199 - val_accuracy: 1.0000 - val_loss: 0.0982
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9715 - loss: 0.1168 - val_accuracy: 1.0000 - val_loss: 0.0254
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9870 - loss: 0.0599 - val_accuracy: 1.0000 - val_loss: 0.0089
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



Training complete! Model saved as 'gesture_lstm_model.h5'.


## Setting up live gesture classification

Firstly, loading up both models.

In [7]:
# Restore the two trained classifiers from their saved .h5 files.
cnn_model, lstm_model = (
    tf.keras.models.load_model(model_path)
    for model_path in ("gesture_cnn_model.h5", "gesture_lstm_model.h5")
)
# Array indexed by the LSTM's predicted class to recover the gesture label.
gesture_labels = np.load("gesture_labels.npy")



Setting up variables and mapping to actual gestures

In [9]:
# MediaPipe hand-tracking module and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# CNN class index -> static gesture letter.
label_mapping = dict(enumerate("ABC"))
# Gesture letter -> text shown on screen.
display_mapping = dict(A=";)", B="FIST", C="PEACE")
dynamic_display_mapping = dict(D="BYE", E="WAVES")

# Rolling buffers: the two most recent landmark sets and the last 30 frame vectors.
landmarks_queue = deque(maxlen=2)
dynamic_gesture_sequence = deque(maxlen=30)

# Mutable state shared by the functions defined in the following cells.
running = True
gesture_type = "STATIC"
gesture_label = "No Gesture"

# Cooldown that stops the classifier from flickering between static and dynamic.
dynamic_cooldown = 15
cooldown_counter = 0

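Defining functions to process the 21 landmarks and their 63 coordinates (21*3): for static gestures they are reshaped into a (1, 21, 3) NumPy array, and for dynamic gestures the flat 63-value array is appended to the frame sequence.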

In [11]:
def preprocess_landmarks(landmarks):
    """Pack landmark coordinates into an array of shape ``(1, 21, 3)``.

    Reads the ``x``, ``y`` and ``z`` attributes of every item in
    ``landmarks``. Returns ``None`` unless exactly 63 values were collected.
    """
    coords = np.array([(point.x, point.y, point.z) for point in landmarks])
    if coords.size != 63:
        return None
    return coords.reshape(1, 21, 3)

def preprocess_dynamic_landmarks(landmarks):
    """Append the current frame's coordinates to ``dynamic_gesture_sequence``.

    The ``x``, ``y`` and ``z`` attributes of every item in ``landmarks`` are
    flattened into one vector. The vector is appended only when it holds
    exactly 63 values; otherwise nothing happens.
    """
    frame_vector = np.array([[point.x, point.y, point.z] for point in landmarks]).reshape(-1)
    if frame_vector.shape[0] != 63:
        return
    dynamic_gesture_sequence.append(frame_vector)

Defining a function to classify the gesture as Static or Dynamic.
This is done by setting a threshold for allowed movement and computing the "movement" as the average Euclidean distance between each of the 21 current landmarks and the corresponding previous landmark.

In [13]:
def classify_gesture(threshold=0.02):
    """Update the global ``gesture_type`` from the last two landmark sets.

    While ``cooldown_counter`` is positive it is decremented and nothing else
    happens. Otherwise, if ``landmarks_queue`` holds two entries, the mean
    Euclidean distance between paired landmarks is compared with
    ``threshold``: above it the type becomes "DYNAMIC" and the cooldown is
    re-armed with ``dynamic_cooldown``; otherwise the type becomes "STATIC".
    """
    global gesture_type, cooldown_counter

    # During the cooldown only count down; the current type is kept.
    if cooldown_counter > 0:
        cooldown_counter -= 1
        return

    # Two snapshots are needed to measure movement.
    if len(landmarks_queue) < 2:
        return

    previous, current = landmarks_queue
    displacements = []
    for now, before in zip(current, previous):
        delta = np.array([now.x, now.y, now.z]) - np.array([before.x, before.y, before.z])
        displacements.append(np.linalg.norm(delta))
    movement = np.mean(displacements)

    if movement > threshold:
        gesture_type = "DYNAMIC"
        cooldown_counter = dynamic_cooldown
        return
    gesture_type = "STATIC"

Uses the cnn model to classify static gestures

In [15]:
def classify_static_gesture(landmarks):
    """Set the global ``gesture_label`` from the CNN's prediction.

    ``landmarks`` is passed through ``preprocess_landmarks``; when that
    returns ``None`` the label is left unchanged. Otherwise the index of the
    highest ``cnn_model`` output is looked up in ``label_mapping``.
    """
    global gesture_label
    cnn_input = preprocess_landmarks(landmarks)
    if cnn_input is None:
        return
    class_scores = cnn_model.predict(cnn_input, verbose=0)
    best_class = np.argmax(class_scores)
    gesture_label = label_mapping[best_class]

Uses the lstm model to classify dynamic gesture

In [17]:
def classify_dynamic_gesture():
    """Set the global ``gesture_label`` from the LSTM's prediction.

    Nothing happens until ``dynamic_gesture_sequence`` holds 30 frames. The
    buffer is then reshaped to ``(1, 30, 63)``, the index of the highest
    ``lstm_model`` output is looked up in ``gesture_labels``, and the result
    is translated through ``dynamic_display_mapping`` (falling back to
    "No Gesture").
    """
    global gesture_label
    if len(dynamic_gesture_sequence) >= 30:
        window = np.array(dynamic_gesture_sequence).reshape(1, 30, 63)
        class_scores = lstm_model.predict(window, verbose=0)
        predicted = gesture_labels[np.argmax(class_scores)]
        gesture_label = dynamic_display_mapping.get(predicted, "No Gesture")

Capturing live video

In [19]:
def capture_video():
    """Run the live recognition loop on the default camera until stopped.

    Each frame is passed to MediaPipe Hands. When a hand is found, the first
    detected hand feeds ``landmarks_queue`` and the dynamic sequence buffer,
    ``classify_gesture()`` decides between static and dynamic, and the
    matching classifier updates the global ``gesture_label``. The label is
    drawn on the frame (green for static, red for dynamic).

    The loop ends when the global ``running`` becomes False or 'q' is pressed;
    the camera and the window are released in all cases.
    """
    global running, gesture_label
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Could not open camera 0.")
            return

        with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7) as hands:
            print("Press 'q' to quit.")
            while running:
                ret, frame = cap.read()
                if not ret:
                    continue
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(frame_rgb)
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Track a single hand. Feeding every detected hand into the
                    # two-slot landmarks_queue would make classify_gesture()
                    # compare two different hands from the same frame, and would
                    # add two entries per frame to the dynamic sequence buffer.
                    tracked = results.multi_hand_landmarks[0].landmark
                    landmarks_queue.append(tracked)
                    preprocess_dynamic_landmarks(tracked)

                    classify_gesture()
                    if gesture_type == "STATIC" and cooldown_counter == 0:
                        classify_static_gesture(tracked)
                    elif gesture_type == "DYNAMIC":
                        classify_dynamic_gesture()
                else:
                    gesture_label = "No Gesture"
                    # Forget the last pose so that a hand re-entering the frame is
                    # not compared with where it was before it disappeared.
                    landmarks_queue.clear()
                display_text = display_mapping.get(gesture_label, gesture_label)
                cv2.putText(frame, f"Gesture: {display_text}", (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0) if gesture_type == "STATIC" else (0, 0, 255), 2)
                cv2.imshow('Gesture Recognition', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    running = False
                    break
    finally:
        # Release the camera and close the window even if an error occurred.
        cap.release()
        cv2.destroyAllWindows()

Note: Green text -> Static gesture & Red text -> Dynamic gesture

In [21]:
def start_video():
    """Start ``capture_video`` on a daemon thread and return immediately."""
    threading.Thread(target=capture_video, daemon=True).start()

In [23]:
def reset_variables():
    """Restore the shared recognition state to its initial values.

    Sets ``running`` to True, ``cooldown_counter`` to 0, ``gesture_label`` to
    "No Gesture" and ``gesture_type`` to "STATIC", and empties both landmark
    buffers, so a new capture run does not start from the previous run's state.
    """
    global running, cooldown_counter, gesture_label, gesture_type
    running = True
    cooldown_counter = 0
    gesture_label = "No Gesture"
    # Without this, a run that ended on "DYNAMIC" would leak into the next one.
    gesture_type = "STATIC"
    # The deques are mutated in place, so no `global` declaration is needed.
    landmarks_queue.clear()
    dynamic_gesture_sequence.clear()

Run this cell to start

In [27]:
# Reset the shared state first, then launch the capture loop.
reset_variables()
start_video()

Press 'q' to quit.
