# The first pipeline we have used - direct mediapipe detection.

In [4]:
import cv2
import numpy as np
import mediapipe as mp
import time
from collections import deque

# Initialize MediaPipe hand detection and its landmark drawing helper
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# One container of strokes per pen colour (blue, green, red, yellow).
# The main loop appends a deque of (x, y) pixel points for every stroke.
bpoints, gpoints, rpoints, ypoints = [deque(maxlen=1024) for _ in range(4)]
blue_index, green_index, red_index, yellow_index = 0, 0, 0, 0

# Colour palette (OpenCV BGR tuples), selected through colorIndex
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)]
colorIndex = 0

# Setup Paint Window
# The canvas is 8-bit on purpose: cv2.imshow multiplies floating-point images
# by 255, so a float canvas holding 0..255 values saturates every non-zero
# pixel (anti-aliased text edges included). uint8 is displayed as-is.
paintWindow = np.full((471, 636, 3), 255, dtype=np.uint8)

# Toolbar buttons as (left x, caption, outline colour); each one is drawn
# 100 px wide between y=1 and y=65.
buttons = [(40, "CLEAR", (0, 0, 0)), (160, "BLUE", (255, 0, 0)),
           (275, "GREEN", (0, 255, 0)), (390, "RED", (0, 0, 255)),
           (505, "YELLOW", (0, 255, 255))]

for x, text, color in buttons:
    cv2.rectangle(paintWindow, (x, 1), (x + 100, 65), color, 2)
    cv2.putText(paintWindow, text, (x + 10, 33), cv2.FONT_HERSHEY_SIMPLEX,
                0.5, (0, 0, 0), 2, cv2.LINE_AA)

cv2.namedWindow('Paint', cv2.WINDOW_AUTOSIZE)

# Initialize Mediapipe Hands (track a single hand) and open the default camera
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.7)
cap = cv2.VideoCapture(0)

prev_time = 0  # Timestamp of the previous frame, for FPS calculation
writing_enabled = False  # Track if writing is active

def distance(a, b):
    """Return the straight-line distance between two points in the x/y plane.

    Both arguments must expose ``x`` and ``y`` attributes; any other
    attribute they carry is ignored.
    """
    delta = np.array([a.x - b.x, a.y - b.y])
    # Square root of the dot product of the offset with itself, i.e. its 2-norm.
    return np.sqrt(delta.dot(delta))

def classify_hand(landmarks):
    """Classify a hand pose from its landmark list.

    Returns a ``(label, colour)`` pair:

    * ``("Writing Hand - Continue", green)`` when the index fingertip is above
      the wrist and the index finger's mean segment length exceeds 1.5x the
      mean fingertip-segment length of the other three fingers;
    * ``("Open Palm - Stop Writing", red)`` when all four fingertips are
      above the wrist;
    * ``(None, white)`` otherwise.

    "Above" means a smaller ``y`` value.
    """
    wrist_y = landmarks[0].y
    # Fingertip heights: index, middle, ring, pinky
    tip_heights = [landmarks[tip].y for tip in (8, 12, 16, 20)]

    # Lengths of the three index-finger segments (8-7, 7-6, 6-5)
    index_segments = [distance(landmarks[j], landmarks[j - 1]) for j in (8, 7, 6)]
    index_mean = np.mean(index_segments)

    # Length of the last segment of the middle, ring and pinky fingers
    other_tip_segments = [distance(landmarks[j], landmarks[j - 1]) for j in (12, 16, 20)]
    others_mean = np.mean(other_tip_segments)

    index_raised = landmarks[8].y < wrist_y
    if index_raised and index_mean > others_mean * 1.5:
        return "Writing Hand - Continue", (0, 255, 0)  # Green

    any_tip_below_wrist = any(not (height < wrist_y) for height in tip_heights)
    if not any_tip_below_wrist:
        return "Open Palm - Stop Writing", (0, 0, 255)  # Red

    return None, (255, 255, 255)

# Per-colour stroke containers in palette order (same order as `colors`).
# They are only ever cleared in place, so this tuple stays valid after CLEAR.
strokes_by_color = (bpoints, gpoints, rpoints, ypoints)
pen_down = False  # True when the previous frame added a point to a stroke

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # Retrying here would spin forever without ever reaching
            # cv2.waitKey, leaving no way to quit; stop the session instead.
            break

        frame = cv2.flip(frame, 1)

        # Preprocessing (only displayed in its own window; detection below
        # runs on the colour frame)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)

        # Convert to RGB for Mediapipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        # Draw UI
        for x, text, color in buttons:
            cv2.rectangle(frame, (x, 1), (x + 100, 65), color, 2)
            cv2.putText(frame, text, (x + 10, 33), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 0, 0), 2, cv2.LINE_AA)

        drew_this_frame = False

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Gesture classification
                label, color = classify_hand(hand_landmarks.landmark)
                if label:
                    cv2.putText(frame, label, (50, 50), cv2.FONT_HERSHEY_SIMPLEX,
                                1, color, 2, cv2.LINE_AA)

                # Update writing status; an unrecognised pose keeps the last state
                if label == "Writing Hand - Continue":
                    writing_enabled = True
                elif label == "Open Palm - Stop Writing":
                    writing_enabled = False

                # Index fingertip (landmark 8) scaled to pixel coordinates
                index_finger = (int(hand_landmarks.landmark[8].x * frame.shape[1]),
                                int(hand_landmarks.landmark[8].y * frame.shape[0]))

                cv2.circle(frame, index_finger, 5, (0, 255, 0), -1)

                if index_finger[1] <= 65:
                    # Fingertip is inside the toolbar strip. Hit-test against
                    # the same 100 px wide rectangles that are drawn above so
                    # the clickable area matches what the user sees.
                    for button_number, (left, _name, _outline) in enumerate(buttons):
                        if left <= index_finger[0] <= left + 100:
                            if button_number == 0:  # Clear Button
                                for strokes in strokes_by_color:
                                    strokes.clear()
                                paintWindow[67:, :, :] = 255
                            else:
                                # Buttons 1..4 map onto colors[0..3]
                                colorIndex = button_number - 1
                            break

                elif writing_enabled:  # Only draw when writing is enabled
                    strokes = strokes_by_color[colorIndex]
                    # Start a fresh stroke whenever the pen was up on the
                    # previous frame; otherwise the last point before the
                    # pause would be joined to the first point after it.
                    if not pen_down or not strokes:
                        strokes.append(deque(maxlen=512))
                    strokes[-1].appendleft(index_finger)
                    drew_this_frame = True

        # Pen counts as lifted if nothing was drawn this frame (no hand,
        # writing disabled, or fingertip in the toolbar).
        pen_down = drew_this_frame

        # Draw lines
        for strokes, color in zip(strokes_by_color, colors):
            for stroke in strokes:
                # Copy once: indexing into the middle of a deque is O(n)
                pts = list(stroke)
                for start, end in zip(pts, pts[1:]):
                    cv2.line(frame, start, end, color, 2)
                    cv2.line(paintWindow, start, end, color, 2)

        # Calculate FPS (guard against a zero interval on coarse clocks)
        curr_time = time.time()
        elapsed = curr_time - prev_time
        fps = int(1 / elapsed) if prev_time and elapsed > 0 else 0
        prev_time = curr_time

        cv2.putText(frame, f"FPS: {fps}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow("Air Canvas", frame)
        cv2.imshow("Paint", paintWindow)
        cv2.imshow("Preprocessed Grayscale", gray)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the MediaPipe graph and the windows, even if
    # the loop is interrupted by an exception.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


# The second pipeline: train a gesture-classification model and then integrate it into the air canvas

In [None]:

import pandas as pd
import numpy as np
import cv2
import mediapipe as mp

mp_hands = mp.solutions.hands

cap = cv2.VideoCapture(0)
data = []  # One row per recorded hand: x0, y0, x1, y1, ..., label

# Keys that record a sample, mapped to the label stored in the last column
label_for_key = {ord('o'): 0,   # Open palm
                 ord('w'): 1}   # Writing hand

try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # Stop instead of retrying: a retry loop never reaches
                # cv2.waitKey and therefore could not be quit.
                break

            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(rgb_frame)

            # Show the preview and poll the keyboard on every frame, not only
            # while a hand is detected, so the window stays live and 'q'
            # always works.
            cv2.imshow("Hand Tracking", frame)
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):  # Quit and save
                break

            label = label_for_key.get(key)
            if label is not None and result.multi_hand_landmarks:
                for hand_landmarks in result.multi_hand_landmarks:
                    landmarks = [coord for landmark in hand_landmarks.landmark
                                 for coord in (landmark.x, landmark.y)]
                    data.append(landmarks + [label])
finally:
    cap.release()
    cv2.destroyAllWindows()

# Save however the loop ended (no exit(), which would kill the notebook
# kernel). Skip the write when nothing was recorded so an empty session does
# not overwrite an existing dataset.
if data:
    df = pd.DataFrame(data)
    df.to_csv("hand_data.csv", index=False)
    print(f"Saved {len(data)} samples to hand_data.csv")
else:
    print("No samples recorded; hand_data.csv was not written")


In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the recorded samples: every column except the last is a landmark
# coordinate, the last column is the gesture label.
df = pd.read_csv("hand_data.csv")
label_column = df.columns[-1]
X = df.drop(columns=label_column).values
y = df[label_column].values

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training split
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the fitted model for the live-prediction cells
joblib.dump(model, "hand_model.pkl")

# Report accuracy on the held-out rows
accuracy = model.score(X_test, y_test)
print(f"Model trained with {accuracy * 100:.2f}% accuracy")


Model trained with 100.00% accuracy


In [None]:

import cv2
import mediapipe as mp
import joblib
import numpy as np

# Load the classifier saved by the training cell.
# NOTE(review): joblib.load unpickles the file, so only load model files you
# produced yourself or otherwise trust.
model = joblib.load("hand_model.pkl")

mp_hands = mp.solutions.hands
cap = cv2.VideoCapture(0)

try:
    # Track a single hand: every caption is drawn at the same position, so
    # two detected hands would print overlapping labels.
    with mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.5,
                        min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # Stop instead of retrying: a retry loop never reaches
                # cv2.waitKey and therefore could not be quit.
                break

            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result = hands.process(rgb_frame)

            if result.multi_hand_landmarks:
                for hand_landmarks in result.multi_hand_landmarks:
                    # Flatten to a single row x0, y0, x1, y1, ... for predict()
                    landmarks = np.array([coord for landmark in hand_landmarks.landmark
                                          for coord in (landmark.x, landmark.y)]).reshape(1, -1)
                    prediction = model.predict(landmarks)[0]

                    if prediction == 0:
                        cv2.putText(frame, "Open Palm - Stop Writing", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    else:
                        cv2.putText(frame, "Writing Hand - Continue", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            cv2.imshow("AI Hand Detection", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if the loop raises
    cap.release()
    cv2.destroyAllWindows()


In [None]:
import cv2
import pandas as pd
import numpy as np

# Read the recorded landmark dataset (last column = gesture label)
df = pd.read_csv("hand_data.csv")

# Take the first recorded row of each gesture (0 = open palm, 1 = writing
# hand) and reshape it into one (x, y) pair per landmark
gesture_labels = df.iloc[:, -1]
landmark_columns = df.iloc[:, :-1]
open_palm_samples = landmark_columns[gesture_labels == 0].values[0].reshape(-1, 2)
writing_hand_samples = landmark_columns[gesture_labels == 1].values[0].reshape(-1, 2)

# White square canvases, one per gesture
image_size = 500
open_palm_img = np.full((image_size, image_size, 3), 255, dtype=np.uint8)
writing_hand_img = np.full((image_size, image_size, 3), 255, dtype=np.uint8)

def draw_landmarks(image, landmarks, color):
    """Draw a filled radius-5 dot on ``image`` for every (x, y) pair in ``landmarks``.

    Each coordinate is multiplied by the module-level ``image_size`` and
    truncated to an int to obtain the pixel position.
    """
    for norm_x, norm_y in landmarks:
        center = (int(norm_x * image_size), int(norm_y * image_size))
        cv2.circle(image, center, 5, color, -1)

# Plot each sample on its own canvas: red dots for the open palm,
# blue dots for the writing hand
for canvas, sample, dot_color in (
    (open_palm_img, open_palm_samples, (0, 0, 255)),
    (writing_hand_img, writing_hand_samples, (255, 0, 0)),
):
    draw_landmarks(canvas, sample, dot_color)

# Display both canvases and wait for any key before closing them
for window_title, canvas in (
    ("Open Palm Gesture", open_palm_img),
    ("Writing Hand Gesture", writing_hand_img),
):
    cv2.imshow(window_title, canvas)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [5]:
import cv2
import numpy as np
import mediapipe as mp
import joblib
from collections import deque

# Load the gesture classifier saved by the training cell
model = joblib.load("hand_model.pkl")

# MediaPipe hand tracker (single hand) and its landmark drawing helper
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.7)

# Paint state: one stroke container per colour plus the selected colour
bpoints, gpoints, rpoints, ypoints = (deque(maxlen=1024) for _ in range(4))
blue_index = green_index = red_index = yellow_index = 0
kernel = np.ones((5, 5), np.uint8)
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)]
colorIndex = 0

# White 8-bit paint canvas and the toolbar buttons as
# (caption, top-left corner, bottom-right corner)
paintWindow = np.full((471, 636, 3), 255, dtype=np.uint8)
buttons = [
    ("CLEAR", (40, 1), (140, 65)),
    ("BLUE", (160, 1), (255, 65)),
    ("GREEN", (275, 1), (370, 65)),
    ("RED", (390, 1), (485, 65)),
    ("YELLOW", (505, 1), (600, 65)),
]

# Outline and caption every button on the canvas
for caption, top_left, bottom_right in buttons:
    cv2.rectangle(paintWindow, top_left, bottom_right, (0, 0, 0), 2)
    caption_origin = (top_left[0] + 10, 33)
    cv2.putText(paintWindow, caption, caption_origin,
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2, cv2.LINE_AA)

cv2.namedWindow('Paint', cv2.WINDOW_AUTOSIZE)

# Open the default webcam
cap = cv2.VideoCapture(0)

# Per-colour stroke containers in palette order (same order as `colors`).
# They are only ever cleared in place, so this tuple stays valid after CLEAR.
# The newest stroke is always the last deque of its container, so the
# *_index counters are not needed here.
strokes_by_color = (bpoints, gpoints, rpoints, ypoints)
pen_down = False  # True when the previous frame added a point to a stroke

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Draw buttons
        for text, start, end in buttons:
            cv2.rectangle(frame, start, end, (0, 0, 0), 2)
            cv2.putText(frame, text, (start[0] + 10, 33),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2, cv2.LINE_AA)

        # Process hand landmarks
        result = hands.process(framergb)

        drew_this_frame = False

        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                # Flatten to a single row x0, y0, x1, y1, ... for predict()
                landmarks = np.array([coord for lm in handslms.landmark for coord in (lm.x, lm.y)]).reshape(1, -1)
                prediction = model.predict(landmarks)[0]  # Predict gesture

                # Drawing landmarks
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

                # Index fingertip (landmark 8) scaled to pixel coordinates
                fore_finger = (int(handslms.landmark[8].x * frame.shape[1]), int(handslms.landmark[8].y * frame.shape[0]))

                if prediction == 0:  # Open Palm - Stop Writing
                    cv2.putText(frame, "Open Palm - Stop Writing", (50, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    continue

                # Writing Hand - Continue Drawing
                cv2.putText(frame, "Writing Hand - Continue", (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                if fore_finger[1] <= 65:  # Fingertip is in the toolbar strip
                    for button_number, (_caption, (left, _top), (right, _bottom)) in enumerate(buttons):
                        if left <= fore_finger[0] <= right:
                            if button_number == 0:  # Clear button
                                for strokes in strokes_by_color:
                                    strokes.clear()
                                paintWindow[67:, :, :] = 255
                            else:
                                # Buttons 1..4 map onto colors[0..3]
                                colorIndex = button_number - 1
                            break
                else:
                    strokes = strokes_by_color[colorIndex]
                    # Start a fresh stroke whenever the pen was up on the
                    # previous frame; otherwise the last point before an
                    # open-palm pause would be joined to the first point
                    # after it by a straight line.
                    if not pen_down or not strokes:
                        strokes.append(deque(maxlen=512))
                    strokes[-1].appendleft(fore_finger)
                    drew_this_frame = True

        # Pen counts as lifted if nothing was drawn this frame (no hand,
        # open palm, or fingertip in the toolbar).
        pen_down = drew_this_frame

        # Draw painting strokes
        for strokes, stroke_color in zip(strokes_by_color, colors):
            for stroke in strokes:
                # Copy once: indexing into the middle of a deque is O(n)
                pts = list(stroke)
                for seg_start, seg_end in zip(pts, pts[1:]):
                    cv2.line(frame, seg_start, seg_end, stroke_color, 2)
                    cv2.line(paintWindow, seg_start, seg_end, stroke_color, 2)

        # Show output
        cv2.imshow("Output", frame)
        cv2.imshow("Paint", paintWindow)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the MediaPipe graph and the windows, even if
    # the loop is interrupted by an exception.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


# The observations from comparing the two pipelines:

The first pipeline (classifying gestures directly from the MediaPipe hand landmarks) is much better in many respects than the second pipeline (training a Random Forest model on MediaPipe landmarks).

We have seen that the "open palm - stop writing" and "writing hand - continue writing" gestures work much more efficiently and quickly in pipeline 1 than in pipeline 2.

This is because pipeline 2 has to obtain a prediction from the model for every frame, which takes a bit of time, whereas in pipeline 1 the classification is almost instantaneous.

There were still some problems that we could not resolve, such as the camera's low specifications causing the detected hand or the MediaPipe landmark indexes to fluctuate, which leads to errors in hand detection.