In [None]:
%%sh
# Requires python 3.12
pip install matplotlib mediapipe opencv-python scikit-learn

In [1]:
import os

import cv2
import matplotlib.pyplot as plt

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

In [None]:
DATA_DIR = './Datasets/asl-alphabet/asl_alphabet_train/asl_alphabet_train'
TEST_DIR = './Datasets/asl-alphabet/asl_alphabet_test/asl_alphabet_test'

# configuration for different methods
GROUP = True
VALIDATE = True

landmark_dataset = []
labels = []

# load options for hand landmark detection
BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Group letters into classifications based on the number of fingers raised
# 0 is for a closed fist
grouped_classifications = {
    'A': 0,
    'B': 0,
    'C': 0,
    'D': 1,
    'E': 0,
    'F': 3,
    'G': 2,
    'H': 0,
    'I': 1,
    'J': 1,
    'K': 2,
    'L': 2,
    'M': 0,
    'N': 0,
    'O': 0,
    'P': 2,
    'Q': 2,
    'R': 2,
    'S': 0,
    'T': 0,
    'U': 2,
    'V': 2,
    'W': 3,
    'X': 1,
    'Y': 2,
    'Z': 1,
}

individual_classifications = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'H': 7,
    'I': 8,
    'J': 9,
    'K': 10,
    'L': 11,
    'M': 12,
    'N': 13,
    'O': 14,
    'P': 15,
    'Q': 16,
    'R': 17,
    'S': 18,
    'T': 19,
    'U': 20,
    'V': 21,
    'W': 22,
    'X': 23,
    'Y': 24,
    'Z': 25,
}

classifications = None
if GROUP:
    classifications = grouped_classifications
else:
    classifications = individual_classifications

options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='./hand_landmarker.task'),
    running_mode=VisionRunningMode.IMAGE,
    min_hand_detection_confidence=0,
    min_hand_presence_confidence=0,
    min_tracking_confidence=0)

with HandLandmarker.create_from_options(options) as landmarker:
    total_images = 0
    no_landmarks = 0
    for letter in os.listdir(DATA_DIR):
        if letter in ['del', 'nothing', 'space']:
            continue
        for i, img_path in enumerate(os.listdir(os.path.join(DATA_DIR, letter))):
            if VALIDATE and i % 20 != 0:
                continue
            total_images += 1
            hand_landmarks = []

            # Load image from file
            mp_image = mp.Image.create_from_file(os.path.join(DATA_DIR, letter, img_path))

            results = landmarker.detect(mp_image)

            if results.hand_landmarks:
                for landmark in results.hand_landmarks[0]:
                    hand_landmarks.append(landmark.x)
                    hand_landmarks.append(landmark.y)

                landmark_dataset.append(hand_landmarks)

                classification = classifications[letter]
                labels.append(classifications[letter])
            else:
                no_landmarks += 1

        print(f'Letter: {letter}, Classification: {classification}')
        # print landmark detection rate
        print(f'Landmark detection rate: {1-(no_landmarks / total_images)}')

Letter: A, Classification: 0
Landmark detection rate: 1.0
Letter: B, Classification: 0
Landmark detection rate: 1.0
Letter: C, Classification: 0
Landmark detection rate: 1.0
Letter: D, Classification: 1
Landmark detection rate: 1.0
Letter: E, Classification: 0
Landmark detection rate: 1.0
Letter: F, Classification: 3
Landmark detection rate: 1.0
Letter: G, Classification: 2
Landmark detection rate: 1.0
Letter: H, Classification: 0
Landmark detection rate: 1.0
Letter: I, Classification: 1
Landmark detection rate: 1.0
Letter: J, Classification: 1
Landmark detection rate: 1.0
Letter: K, Classification: 2
Landmark detection rate: 1.0
Letter: L, Classification: 2
Landmark detection rate: 1.0
Letter: M, Classification: 0
Landmark detection rate: 1.0
Letter: N, Classification: 0
Landmark detection rate: 1.0
Letter: O, Classification: 0
Landmark detection rate: 1.0
Letter: P, Classification: 2
Landmark detection rate: 1.0
Letter: Q, Classification: 2
Landmark detection rate: 1.0
Letter: R, Cla

In [5]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Convert the data to numpy arrays
landmark_array = np.array(landmark_dataset)
label_array = np.array(labels)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(landmark_array, label_array, test_size=0.2, shuffle=True, stratify=labels)

model = RandomForestClassifier()

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

accuracy = accuracy_score(y_predict, y_test)

print('Model accuracy: {}%'.format(accuracy * 100))

Model accuracy: 83.84615384615385%


In [7]:
# example showing model inference and visualization

# pick random image in dataset
# create array of letters

letters = [letter for letter in os.listdir(TEST_DIR)]

# pick random letter
letter = np.random.choice(letters)

# pick random image
img_path = np.random.choice(os.listdir(os.path.join(TEST_DIR, letter)))

# Load image from file
mp_image = mp.Image.create_from_file(os.path.join(TEST_DIR, letter, img_path))

with HandLandmarker.create_from_options(options) as landmarker:

    results = landmarker.detect(mp_image)
    if results.hand_landmarks:
        hand_landmarks = []
        for landmark in results.hand_landmarks[0]:
            hand_landmarks.append(landmark.x)
            hand_landmarks.append(landmark.y)

        prediction = model.predict([hand_landmarks])
        print(f'Predicted class: {prediction[0]}, Actual: {classifications[letter]}')

        img = cv2.imread(os.path.join(DATA_DIR, letter, img_path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # draw landmarks with small radius
        for landmark in results.hand_landmarks[0]:
            cv2.circle(img, (int(landmark.x * img.shape[1]), int(landmark.y * img.shape[0])), 2, (0, 255, 0), -1)


        plt.imshow(img)
        plt.show()
    else:
        print('No hand landmarks detected')


NotADirectoryError: [WinError 267] The directory name is invalid: './Datasets/asl-alphabet/asl_alphabet_test/asl_alphabet_test\\J_test.jpg'

In [60]:
# save model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
from sklearn import svm

distances = []
for hand in landmark_array:
    distances.append([
        np.linalg.norm(hand[0:2] - hand[8:10]),
        np.linalg.norm(hand[0:2] - hand[12:14]),
        np.linalg.norm(hand[0:2] - hand[16:18]),
        np.linalg.norm(hand[0:2] - hand[20:22]),
    ])

distances = np.array(distances).reshape(len(landmark_array), -1)

x_train, x_test, y_train, y_test = train_test_split(distances, label_array, test_size=0.2, shuffle=True, stratify=labels)

svm_model = svm.SVC()

svm_model.fit(x_train, y_train)

y_predict = svm_model.predict(x_test)

accuracy = accuracy_score(y_predict, y_test)

print('Model accuracy: {}%'.format(accuracy * 100))



Model accuracy: 62.69230769230769%
