In [1]:

import os, cv2, time, pickle
import numpy as np
import mediapipe as mp

# ========= USER SETTINGS =========
# Which stage the main block runs: data collection, training, or live inference.
MODE = "collect"   # choose: "collect", "train", "infer"
# Gesture class names; each gets its own sub-folder of DATA_DIR, and its
# 1-based position in this list is the number key that selects it when collecting.
GESTURES = ["palm", "fist", "thumbs_up", "peace", "okay"]
# Root folder holding one sub-folder of .npy landmark samples per gesture.
DATA_DIR = "gesture_data"
# Path the trained Keras model is saved to and later loaded from.
MODEL_FILE = "gesture_mlp.h5"
# Path of the pickled LabelEncoder that maps class indices back to gesture names.
ENCODER_FILE = "label_encoder.pkl"
# =================================


# =========================================================
# (1) DATA COLLECTION
# =========================================================
def collect_data():
    """Record hand-landmark samples from the webcam into per-gesture folders.

    Creates ``DATA_DIR/<gesture>/`` for every entry in ``GESTURES``, opens
    camera 0 and shows a mirrored preview with any detected hand drawn on it.
    While recording is on, every frame in which a hand is detected is saved as
    ``<gesture>_<index>.npy`` containing the flattened ``x, y, z`` values of
    the first detected hand's landmarks.

    Keys:
        1-9  Pressing the key of the gesture currently being recorded stops
             recording. Pressing any other gesture key starts recording that
             gesture (the previous recording, if any, is ended first).
        q    Quit.

    The camera and the preview window are released even if an error occurs.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    for g in GESTURES:
        os.makedirs(os.path.join(DATA_DIR, g), exist_ok=True)

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    # Built once instead of on every frame. Only single-digit keys exist on the
    # keyboard, so at most the first nine gestures can be selected this way.
    selectable = GESTURES[:9]
    key_to_gesture = {ord(str(i + 1)): g for i, g in enumerate(selectable)}

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Could not open the webcam (device 0).")
            return

        with mp_hands.Hands(static_image_mode=False,
                            max_num_hands=1,
                            min_detection_confidence=0.6,
                            min_tracking_confidence=0.6) as hands:
            recording = False
            label = None
            # Continue numbering after whatever is already stored on disk.
            sample_count = {g: len(os.listdir(os.path.join(DATA_DIR, g))) for g in GESTURES}

            print("Keys:", {str(i + 1): g for i, g in enumerate(selectable)})
            print("Press number key to start/stop recording that gesture. Press 'q' to quit.")

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.flip(frame, 1)  # mirror so the preview behaves like a mirror
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                res = hands.process(rgb)

                if res.multi_hand_landmarks:
                    for lm in res.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(frame, lm, mp_hands.HAND_CONNECTIONS)

                cv2.putText(frame, f"Recording: {recording} Label: {label}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                cv2.imshow("Collect Gestures", frame)

                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break

                pressed = key_to_gesture.get(key)
                if pressed is not None:
                    if recording and pressed == label:
                        # Same key again: stop the current recording.
                        recording = False
                        print(f"Stopped recording: {label}. Samples so far: {sample_count[label]}")
                    else:
                        # A different gesture key while recording switches
                        # gestures; report the one that actually ended.
                        if recording:
                            print(f"Stopped recording: {label}. Samples so far: {sample_count[label]}")
                        label = pressed
                        recording = True
                        print(f"Started recording: {label}")

                if recording and res.multi_hand_landmarks:
                    lm = res.multi_hand_landmarks[0]
                    coords = []
                    for l in lm.landmark:
                        coords.extend([l.x, l.y, l.z])
                    coords = np.array(coords)
                    fname = os.path.join(DATA_DIR, label, f"{label}_{sample_count[label]:05d}.npy")
                    np.save(fname, coords)
                    sample_count[label] += 1
    finally:
        cap.release()
        cv2.destroyAllWindows()


# =========================================================
# (2) TRAINING
# =========================================================
def train_model():
    """Train an MLP gesture classifier on the saved landmark vectors.

    Loads every ``.npy`` file found in the sub-directories of ``DATA_DIR``
    (the sub-directory name is the class label), one-hot encodes the labels,
    holds out 20% of the samples (stratified by class, fixed seed) for
    validation, trains a small dense network for 40 epochs, then writes the
    model to ``MODEL_FILE`` and the fitted ``LabelEncoder`` to
    ``ENCODER_FILE``.

    Raises:
        ValueError: if no ``.npy`` samples are found under ``DATA_DIR``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.utils import to_categorical

    X, y = [], []
    # Only sub-directories are gesture classes; stray files in DATA_DIR
    # (e.g. OS metadata files) would otherwise make os.listdir(folder) fail.
    gestures = sorted(
        d for d in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, d))
    )
    for g in gestures:
        folder = os.path.join(DATA_DIR, g)
        # Sorted so the sample order, and therefore the seeded split below,
        # does not depend on the filesystem's listing order.
        for file in sorted(os.listdir(folder)):
            if file.endswith(".npy"):
                v = np.load(os.path.join(folder, file))
                X.append(v)
                y.append(g)

    if not X:
        raise ValueError(f"No .npy samples found under '{DATA_DIR}'; collect data before training.")

    X = np.array(X)
    y = np.array(y)
    print("Dataset:", X.shape, len(set(y)))

    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    # Size everything from the classes that actually have samples: an empty
    # gesture folder must not add an output unit with no matching label column.
    num_classes = len(le.classes_)
    y_cat = to_categorical(y_enc, num_classes=num_classes)

    # Stratify on the 1-D integer labels rather than the one-hot matrix.
    X_train, X_test, y_train, y_test = train_test_split(X, y_cat,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y_enc)

    model = Sequential([
        Dense(256, input_shape=(X.shape[1],), activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=40, batch_size=32, validation_data=(X_test, y_test))

    model.save(MODEL_FILE)
    with open(ENCODER_FILE, "wb") as f:
        pickle.dump(le, f)

    print("Training complete. Model saved.")


# =========================================================
# (3) REALTIME INFERENCE
# =========================================================
def infer_realtime():
    """Classify the hand gesture seen by the webcam, frame by frame.

    Loads the model from ``MODEL_FILE`` and the label encoder from
    ``ENCODER_FILE``, opens camera 0 and, for every frame with a detected
    hand, flattens the first hand's landmark ``x, y, z`` values into one
    feature row, predicts the class and overlays ``"<label> (<prob>)"`` on a
    mirrored preview. Frames without a hand show ``"No hand"``.
    Press 'q' to quit. The camera and the preview window are released even if
    an error occurs.
    """
    from tensorflow.keras.models import load_model
    model = load_model(MODEL_FILE)
    # NOTE: unpickling executes code embedded in the file; only load an
    # encoder file you produced yourself with train_model().
    with open(ENCODER_FILE, "rb") as f:
        le = pickle.load(f)

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Could not open the webcam (device 0).")
            return

        with mp_hands.Hands(static_image_mode=False,
                            max_num_hands=1,
                            min_detection_confidence=0.6,
                            min_tracking_confidence=0.6) as hands:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.flip(frame, 1)  # mirror so the preview behaves like a mirror
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                res = hands.process(rgb)

                label_text = "No hand"
                if res.multi_hand_landmarks:
                    lm = res.multi_hand_landmarks[0]
                    coords = []
                    for l in lm.landmark:
                        coords.extend([l.x, l.y, l.z])
                    coords = np.array(coords).reshape(1, -1)  # single-row batch
                    probs = model.predict(coords, verbose=0)[0]
                    idx = int(np.argmax(probs))
                    label = le.inverse_transform([idx])[0]
                    prob = probs[idx]
                    label_text = f"{label} ({prob:.2f})"
                    mp_drawing.draw_landmarks(frame, lm, mp_hands.HAND_CONNECTIONS)

                cv2.putText(frame, label_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("Realtime Gesture", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()


# =========================================================
# MAIN
# =========================================================
if __name__ == "__main__":
    # Pair every supported MODE value with the pipeline stage it launches,
    # then run the first (and only) stage whose name matches.
    for mode_name, run_stage in (
        ("collect", collect_data),
        ("train", train_model),
        ("infer", infer_realtime),
    ):
        if MODE == mode_name:
            run_stage()
            break
    else:
        # No stage matched the configured MODE.
        print("Invalid MODE. Choose from: collect, train, infer")


Keys: {'1': 'palm', '2': 'fist', '3': 'thumbs_up', '4': 'peace', '5': 'okay'}
Press number key to start/stop recording that gesture. Press 'q' to quit.


In [2]:
!pip install mediapipe

