# Develop a hand gesture recognition model that can accurately identify and classify different hand gestures from image or video data, enabling intuitive human-computer interaction and gesture-based control systems.

# Setup and Dependencies

In [1]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from collections import deque

# Data Collection and Preprocessing

In [2]:
class GestureDataCollector:
    def __init__(self, gesture_labels, num_samples=500, img_size=(224, 224)):
        self.gesture_labels = gesture_labels
        self.num_samples = num_samples
        self.img_size = img_size
        self.hands = mp.solutions.hands.Hands(
            static_image_mode=True,
            max_num_hands=1,
            min_detection_confidence=0.7)
        self.dataset = []
        self.labels = []
        
    def capture_gestures(self):
        cap = cv2.VideoCapture(0)
        
        for label_idx, gesture_name in enumerate(self.gesture_labels):
            print(f"Collecting samples for {gesture_name}. Press 'c' to start...")
            while True:
                ret, frame = cap.read()
                if not ret:
                    continue
                    
                cv2.putText(frame, f"Press 'c' to collect {gesture_name} gestures", 
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                cv2.imshow('Data Collection', frame)
                
                if cv2.waitKey(1) & 0xFF == ord('c'):
                    break
                    
            print(f"Collecting {self.num_samples} samples...")
            collected = 0
            
            while collected < self.num_samples:
                ret, frame = cap.read()
                if not ret:
                    continue
                    
                # Process frame
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = self.hands.process(rgb)
                
                if results.multi_hand_landmarks:
                    # Extract hand region
                    hand_landmarks = results.multi_hand_landmarks[0]
                    x_coords = [lm.x * frame.shape[1] for lm in hand_landmarks.landmark]
                    y_coords = [lm.y * frame.shape[0] for lm in hand_landmarks.landmark]
                    
                    x_min, x_max = int(min(x_coords)), int(max(x_coords))
                    y_min, y_max = int(min(y_coords)), int(max(y_coords))
                    
                    # Add padding and crop
                    padding = 30
                    x_min = max(0, x_min - padding)
                    y_min = max(0, y_min - padding)
                    x_max = min(frame.shape[1], x_max + padding)
                    y_max = min(frame.shape[0], y_max + padding)
                    
                    hand_crop = frame[y_min:y_max, x_min:x_max]
                    hand_crop = cv2.resize(hand_crop, self.img_size)
                    
                    # Store data
                    self.dataset.append(hand_crop)
                    self.labels.append(label_idx)
                    collected += 1
                    
                    # Display
                    cv2.putText(frame, f"Collected: {collected}/{self.num_samples}", 
                               (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                
                cv2.imshow('Data Collection', frame)
                cv2.waitKey(1)
                
        cap.release()
        cv2.destroyAllWindows()
        
        # Convert to numpy arrays
        self.dataset = np.array(self.dataset, dtype='float32') / 255.0
        self.labels = np.array(self.labels)
        
        return self.dataset, self.labels

# Model Architecture

In [3]:
def build_gesture_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model

# Training Pipeline

In [4]:
def train_gesture_model():
    # Define gestures
    gesture_labels = ['open_hand', 'fist', 'point', 'peace', 'thumbs_up']
    
    # Collect data
    collector = GestureDataCollector(gesture_labels, num_samples=300)
    X, y = collector.capture_gestures()
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    # Build model
    model = build_gesture_model(X_train.shape[1:], len(gesture_labels))
    
    # Train
    history = model.fit(X_train, y_train,
                       epochs=20,
                       batch_size=32,
                       validation_data=(X_test, y_test))
    
    # Save model
    model.save('gesture_recognition.h5')
    
    return model, gesture_labels

#  Real-Time Gesture Recognition

In [5]:
class GestureRecognizer:
    def __init__(self, model_path, gesture_labels):
        self.hands = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5)
        self.model = tf.keras.models.load_model(model_path)
        self.gesture_labels = gesture_labels
        self.smoother = GestureSmoother(window_size=5)
        
    def recognize_gestures(self):
        cap = cv2.VideoCapture(0)
        
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                continue
                
            # Process frame
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = self.hands.process(rgb)
            
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Extract hand region
                    x_coords = [lm.x * frame.shape[1] for lm in hand_landmarks.landmark]
                    y_coords = [lm.y * frame.shape[0] for lm in hand_landmarks.landmark]
                    
                    x_min, x_max = int(min(x_coords)), int(max(x_coords))
                    y_min, y_max = int(min(y_coords)), int(max(y_coords))
                    
                    # Add padding and crop
                    padding = 30
                    x_min = max(0, x_min - padding)
                    y_min = max(0, y_min - padding)
                    x_max = min(frame.shape[1], x_max + padding)
                    y_max = min(frame.shape[0], y_max + padding)
                    
                    hand_crop = frame[y_min:y_max, x_min:x_max]
                    hand_crop = cv2.resize(hand_crop, (224, 224))
                    hand_crop = np.expand_dims(hand_crop, axis=0) / 255.0
                    
                    # Predict gesture
                    pred = self.model.predict(hand_crop)[0]
                    smoothed_pred = self.smoother.smooth(pred)
                    gesture_idx = np.argmax(smoothed_pred)
                    confidence = smoothed_pred[gesture_idx]
                    
                    # Draw results
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    cv2.putText(frame, 
                               f"{self.gesture_labels[gesture_idx]} ({confidence:.2f})",
                               (x_min, y_min - 10), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
            
            cv2.imshow('Gesture Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
                
        cap.release()
        cv2.destroyAllWindows()

class GestureSmoother:
    def __init__(self, window_size=5):
        self.window = deque(maxlen=window_size)
        
    def smooth(self, current_pred):
        self.window.append(current_pred)
        return np.mean(self.window, axis=0)

# Main Execution

In [1]:
if __name__ == "__main__":
    # First train the model (uncomment these lines)
    print("Training new model...")
    model, gesture_labels = train_gesture_model()
    
    # Then create recognizer with the trained model
    recognizer = GestureRecognizer('gesture_recognition.h5', gesture_labels)
    
    # Run recognition
    print("Starting gesture recognition...")
    recognizer.recognize_gestures()

Training new model...


NameError: name 'train_gesture_model' is not defined

In [4]:
'c'

'c'

# Re-running of all code to correct error

In [None]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from collections import deque

# 1. Define the GestureDataCollector class
class GestureDataCollector:
    def __init__(self, gesture_labels, num_samples=500, img_size=(224, 224)):
        self.gesture_labels = gesture_labels
        self.num_samples = num_samples
        self.img_size = img_size
        self.hands = mp.solutions.hands.Hands(
            static_image_mode=True,
            max_num_hands=1,
            min_detection_confidence=0.7)
        self.dataset = []
        self.labels = []
        
    def capture_gestures(self):
        cap = cv2.VideoCapture(0)
        
        for label_idx, gesture_name in enumerate(self.gesture_labels):
            print(f"Collecting samples for {gesture_name}. Press 'c' to start...")
            while True:
                ret, frame = cap.read()
                if not ret:
                    continue
                    
                cv2.putText(frame, f"Press 'c' to collect {gesture_name} gestures", 
                           (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                cv2.imshow('Data Collection', frame)
                
                if cv2.waitKey(1) & 0xFF == ord('c'):
                    break
                    
            print(f"Collecting {self.num_samples} samples...")
            collected = 0
            
            while collected < self.num_samples:
                ret, frame = cap.read()
                if not ret:
                    continue
                    
                # Process frame
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = self.hands.process(rgb)
                
                if results.multi_hand_landmarks:
                    # Extract hand region
                    hand_landmarks = results.multi_hand_landmarks[0]
                    x_coords = [lm.x * frame.shape[1] for lm in hand_landmarks.landmark]
                    y_coords = [lm.y * frame.shape[0] for lm in hand_landmarks.landmark]
                    
                    x_min, x_max = int(min(x_coords)), int(max(x_coords))
                    y_min, y_max = int(min(y_coords)), int(max(y_coords))
                    
                    # Add padding and crop
                    padding = 30
                    x_min = max(0, x_min - padding)
                    y_min = max(0, y_min - padding)
                    x_max = min(frame.shape[1], x_max + padding)
                    y_max = min(frame.shape[0], y_max + padding)
                    
                    hand_crop = frame[y_min:y_max, x_min:x_max]
                    hand_crop = cv2.resize(hand_crop, self.img_size)
                    
                    # Store data
                    self.dataset.append(hand_crop)
                    self.labels.append(label_idx)
                    collected += 1
                    
                    # Display
                    cv2.putText(frame, f"Collected: {collected}/{self.num_samples}", 
                               (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                
                cv2.imshow('Data Collection', frame)
                cv2.waitKey(1)
                
        cap.release()
        cv2.destroyAllWindows()
        
        # Convert to numpy arrays
        self.dataset = np.array(self.dataset, dtype='float32') / 255.0
        self.labels = np.array(self.labels)
        
        return self.dataset, self.labels

# 2. Define the model building function
def build_gesture_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model

# 3. Define the training function
def train_gesture_model():
    # Define your gesture classes
    gesture_labels = ['open_hand', 'fist', 'point', 'peace', 'thumbs_up']  # Add/change as needed
    
    # Initialize data collector
    collector = GestureDataCollector(gesture_labels, num_samples=300)  # 300 samples per gesture
    
    # Collect data - this will open your webcam
    print("Starting data collection...")
    X, y = collector.capture_gestures()
    
    # Split data into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    # Build model
    print("Building model...")
    model = build_gesture_model(X_train.shape[1:], len(gesture_labels))
    
    # Train model
    print("Training model...")
    history = model.fit(X_train, y_train,
                       epochs=20,
                       batch_size=32,
                       validation_data=(X_test, y_test))
    
    # Save model
    model.save('gesture_recognition.h5')
    print("Model saved as 'gesture_recognition.h5'")
    
    return model, gesture_labels

# 4. Define the GestureRecognizer class
class GestureRecognizer:
    def __init__(self, model_path, gesture_labels):
        self.hands = mp.solutions.hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5)
        self.model = tf.keras.models.load_model(model_path)
        self.gesture_labels = gesture_labels
        self.smoother = GestureSmoother(window_size=5)
        
    def recognize_gestures(self):
        cap = cv2.VideoCapture(0)
        
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                continue
                
            # Process frame
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = self.hands.process(rgb)
            
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Extract hand region
                    x_coords = [lm.x * frame.shape[1] for lm in hand_landmarks.landmark]
                    y_coords = [lm.y * frame.shape[0] for lm in hand_landmarks.landmark]
                    
                    x_min, x_max = int(min(x_coords)), int(max(x_coords))
                    y_min, y_max = int(min(y_coords)), int(max(y_coords))
                    
                    # Add padding and crop
                    padding = 30
                    x_min = max(0, x_min - padding)
                    y_min = max(0, y_min - padding)
                    x_max = min(frame.shape[1], x_max + padding)
                    y_max = min(frame.shape[0], y_max + padding)
                    
                    hand_crop = frame[y_min:y_max, x_min:x_max]
                    hand_crop = cv2.resize(hand_crop, (224, 224))
                    hand_crop = np.expand_dims(hand_crop, axis=0) / 255.0
                    
                    # Predict gesture
                    pred = self.model.predict(hand_crop)[0]
                    smoothed_pred = self.smoother.smooth(pred)
                    gesture_idx = np.argmax(smoothed_pred)
                    confidence = smoothed_pred[gesture_idx]
                    
                    # Draw results
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    cv2.putText(frame, 
                               f"{self.gesture_labels[gesture_idx]} ({confidence:.2f})",
                               (x_min, y_min - 10), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
            
            cv2.imshow('Gesture Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
                
        cap.release()
        cv2.destroyAllWindows()

# 5. Define the GestureSmoother class
class GestureSmoother:
    def __init__(self, window_size=5):
        self.window = deque(maxlen=window_size)
        
    def smooth(self, current_pred):
        self.window.append(current_pred)
        return np.mean(self.window, axis=0)

# 6. Main execution
if __name__ == "__main__":
    # First train the model
    print("Training new model...")
    model, gesture_labels = train_gesture_model()
    
    # Then create recognizer with the trained model
    recognizer = GestureRecognizer('gesture_recognition.h5', gesture_labels)
    
    # Run recognition
    print("Starting gesture recognition...")
    recognizer.recognize_gestures()

Training new model...
Starting data collection...
Collecting samples for open_hand. Press 'c' to start...
Collecting 300 samples...
Collecting samples for fist. Press 'c' to start...
Collecting 300 samples...
Collecting samples for point. Press 'c' to start...
Collecting 300 samples...
Collecting samples for peace. Press 'c' to start...
Collecting 300 samples...
Collecting samples for thumbs_up. Press 'c' to start...
Collecting 300 samples...
Building model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model...
Epoch 1/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 932ms/step - accuracy: 0.5262 - loss: 1.4952 - val_accuracy: 0.9867 - val_loss: 0.0494
Epoch 2/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 765ms/step - accuracy: 0.9604 - loss: 0.1263 - val_accuracy: 0.9933 - val_loss: 0.0262
Epoch 3/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 745ms/step - accuracy: 0.9863 - loss: 0.0480 - val_accuracy: 0.9967 - val_loss: 0.0092
Epoch 4/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 717ms/step - accuracy: 0.9916 - loss: 0.0297 - val_accuracy: 0.9967 - val_loss: 0.0089
Epoch 5/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 771ms/step - accuracy: 0.9965 - loss: 0.0108 - val_accuracy: 1.0000 - val_loss: 0.0043
Epoch 6/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 878ms/step - accuracy: 0.9965 - loss: 0.0076 - val_accuracy: 1.0000 - val_loss: 9.8355e-04




Model saved as 'gesture_recognition.h5'




Starting gesture recognition...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━

In [None]:
'c' 