Hand Gesture Recognition with Machine Learning and MediaPipe
This notebook implements a hand gesture recognition system using machine learning models and real-time hand tracking with MediaPipe.

# Import Necessary Libraries

In [None]:
import cv2
import mediapipe as mp
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Load and Preprocess the Dataset

In [None]:
# Load the landmark dataset; every column except 'label' is a feature.
df = pd.read_csv('hand_landmarks_data2 .csv')
X = df.drop(columns=['label'])

# Keep only the columns whose name does not start with 'z' (presumably the
# depth coordinates -- confirm against the CSV header). The dropped columns
# are kept in X_zs for reference but are not used for training.
X_noZ = X[[col for col in X.columns if not col.startswith('z')]]
X_zs = X.drop(columns=X_noZ.columns)

# Encode the gesture labels as integer class ids. The fitted encoder is needed
# later to map predictions back to the original label values.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
y = df['label']

# Split BEFORE fitting the scaler. Fitting MinMaxScaler on the full dataset
# would let the min/max of the test rows influence the training features
# (data leakage) and make the hold-out metrics optimistic. The split depends
# only on y, the row count and random_state, so the same rows land in each
# partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_noZ, y, test_size=0.1, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_noZ.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_noZ.columns,
    index=X_test_raw.index,
)

# Full scaled feature table, kept under its original names for any code that
# still refers to X_noZ_scaled / X.
X_noZ_scaled = pd.DataFrame(scaler.transform(X_noZ), columns=X_noZ.columns)
X = X_noZ_scaled.copy()

# Define and Train Models Using GridSearchCV

In [None]:
# Candidate estimators, each paired with the hyper-parameter grid that
# GridSearchCV will explore. Every value is a (estimator, param_grid) tuple;
# param_grid may be a dict or a list of dicts (GridSearchCV accepts both).
models = {
    # The SVM grid is split per kernel family so that parameters a kernel
    # ignores are not crossed with it: 'degree' is only used by the poly
    # kernel and 'gamma' is not used by the linear kernel. The same distinct
    # models are searched with 52 candidates instead of 128.
    'SVM': (SVC(), [
        {
            'kernel': ['linear'],
            'C': [0.01, 0.1, 1, 10],
            'class_weight': [None],
        },
        {
            'kernel': ['rbf', 'sigmoid'],
            'C': [0.01, 0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'class_weight': [None],
        },
        {
            'kernel': ['poly'],
            'C': [0.01, 0.1, 1, 10],
            'degree': [2, 3, 4, 5],
            'gamma': ['scale', 'auto'],
            'class_weight': [None],
        },
    ]),
    # Seeded so repeated runs of the search give the same forests; 42 matches
    # the seed used for the train/test split.
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'subsample': [0.5, 0.7, 1.0],
        'colsample_bytree': [0.5, 0.7, 1.0],
    }),
}

# Perform Grid Search and Evaluate Models

In [None]:
# For every candidate: run a 3-fold, accuracy-scored grid search on the
# training split, keep the best estimator, write the full CV table to CSV and
# print weighted metrics measured on the held-out test split.
best_models = {}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
        verbose=3,
    )
    grid_search.fit(X_train, y_train)

    tuned_estimator = grid_search.best_estimator_
    best_models[model_name] = tuned_estimator

    cv_table = pd.DataFrame(grid_search.cv_results_)
    cv_table.to_csv(f'{model_name}_gridsearch_results.csv', index=False)

    # Hold-out evaluation; the three class-aware metrics are weighted by
    # class support.
    y_pred = tuned_estimator.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1, precision, recall = (
        metric(y_test, y_pred, average='weighted')
        for metric in (f1_score, precision_score, recall_score)
    )

    print(f"{model_name} Best Params: {grid_search.best_params_}")
    print(f"{model_name} - Accuracy: {acc:.4f}, F1-score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

# Save the Best Model

In [None]:
# Score each tuned model once on the held-out test split, then keep the most
# accurate one (ties go to the first model in insertion order).
# NOTE(review): choosing the winner by test accuracy makes that accuracy an
# optimistic estimate for the selected model.
test_accuracy = {
    name: accuracy_score(y_test, estimator.predict(X_test))
    for name, estimator in best_models.items()
}
best_model_name = max(test_accuracy, key=test_accuracy.get)
best_model = best_models[best_model_name]

# Persist everything needed to reuse the model outside this session.
joblib.dump(best_model, 'best_hand_gesture_modelLASTT.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
# The model was trained on MinMax-scaled features, so the fitted scaler is
# part of the model's input contract and is saved next to it.
joblib.dump(scaler, 'scaler.pkl')
print(f"Best model saved: {best_model_name}")

# Load the Best Model

In [None]:
# Restore the saved classifier and its label encoder from disk.
# NOTE: joblib files are pickles; only load artifacts from a trusted source.
best_model, label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in ('best_hand_gesture_modelLASTT.pkl', 'label_encoder.pkl')
)

# Initialize MediaPipe and Webcam, Then Run Real-Time Gesture Prediction

In [None]:
# Real-time loop: grab webcam frames, detect hand landmarks with MediaPipe,
# classify the first detected hand with the loaded model and overlay the
# predicted label on the frame. Press 'q' in the window to stop.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Column names the model was fitted with, if it recorded any. Passing the
# features back under the same names avoids a "X does not have valid feature
# names" warning on every frame.
feature_names = getattr(best_model, 'feature_names_in_', None)

cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Convert the BGR camera frame to RGB for MediaPipe. The array is
            # marked read-only while it is processed (MediaPipe's own examples
            # do this), then made writable again for drawing.
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = hands.process(image)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            predicted_label = "No Hand Detected"
            if results.multi_hand_landmarks:
                # Only the first detected hand is classified; features are the
                # landmarks' (x, y) pairs flattened in landmark order.
                landmarks_xy = []
                for landmark in results.multi_hand_landmarks[0].landmark:
                    landmarks_xy.extend([landmark.x, landmark.y])
                input_features = np.array(landmarks_xy).reshape(1, -1)

                # NOTE(review): the model was trained on MinMax-scaled dataset
                # columns, but the landmark coordinates are passed here without
                # that scaling. Confirm the dataset's coordinate units and
                # apply the same preprocessing if they differ.
                if feature_names is not None and len(feature_names) == input_features.shape[1]:
                    input_features = pd.DataFrame(input_features, columns=feature_names)

                y_preds = best_model.predict(input_features)
                predicted_label = label_encoder.inverse_transform(y_preds)[0]

                # Every detected hand is drawn, not just the classified one.
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            cv2.putText(image, predicted_label, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
            cv2.imshow('Hand Gesture Detection', image)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised or
    # the kernel was interrupted; otherwise the device stays locked.
    cap.release()
    cv2.destroyAllWindows()