In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models, Input, optimizers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences
import mediapipe as mp
import cv2
import uuid
import os
import json

In [9]:
# Short aliases for the MediaPipe solution modules used by the cells below.
mp_holistic = mp.solutions.holistic
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Landmark indices, presumably the face-mesh points outlining the lips
# (inferred from the name only -- TODO confirm against the face-mesh topology).
lip_marks = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 78, 191, 80, 81, 82, 13, 312, 311, 310,
    415, 308, 95, 88, 178, 87, 14, 317, 402, 318,
    324, 146, 91, 181, 84, 17, 314, 405, 321, 375,
]

In [10]:
# Utilities

def extract_coordinates(results):
    """Stack the (x, y, z) coordinates of every landmark group into one array.

    The groups are concatenated along axis 0 in the order
    face, left hand, pose, right hand. A group that is missing (falsy) on
    ``results`` is replaced by an all-zero placeholder of 468, 21, 33 or 21
    rows respectively, so the result has 543 rows whenever each detected
    group carries as many landmarks as its placeholder.

    Args:
        results: Object exposing ``face_landmarks``, ``left_hand_landmarks``,
            ``pose_landmarks`` and ``right_hand_landmarks``; each is either
            falsy or has a ``landmark`` iterable of points with ``x``, ``y``
            and ``z`` attributes.

    Returns:
        A 2-D NumPy array with three columns (x, y, z).
    """
    def _xyz_or_zeros(group, placeholder_rows):
        # Missing detections become zero rows so the row layout stays fixed.
        if not group:
            return np.zeros((placeholder_rows, 3))
        return np.array([[point.x, point.y, point.z] for point in group.landmark])

    face_xyz = _xyz_or_zeros(results.face_landmarks, 468)
    pose_xyz = _xyz_or_zeros(results.pose_landmarks, 33)
    left_xyz = _xyz_or_zeros(results.left_hand_landmarks, 21)
    right_xyz = _xyz_or_zeros(results.right_hand_landmarks, 21)
    return np.concatenate((face_xyz, left_xyz, pose_xyz, right_xyz))

def draw(image, results):
    """Overlay the face, pose and both hand landmark sets from ``results`` on ``image``.

    Each landmark set is passed to ``mp_drawing.draw_landmarks`` together with
    its connection set and two ``DrawingSpec`` objects: a coloured one
    (thickness 3, radius 3) followed by a black one whose thickness/radius
    vary per set.

    Args:
        image: Image handed to ``mp_drawing.draw_landmarks`` as the drawing target.
        results: Object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    black = (0, 0, 0)
    # One row per overlay:
    # (landmarks, connections, first-spec colour, second-spec thickness, second-spec radius)
    overlays = (
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, (0, 0, 255), 1, 0),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (0, 150, 0), 2, 2),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (200, 56, 12), 2, 2),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (250, 56, 12), 2, 2),
    )
    for landmarks, connections, colour, second_thickness, second_radius in overlays:
        first_spec = mp_drawing.DrawingSpec(color=colour, thickness=3, circle_radius=3)
        second_spec = mp_drawing.DrawingSpec(color=black, thickness=second_thickness,
                                             circle_radius=second_radius)
        mp_drawing.draw_landmarks(image, landmarks, connections, first_spec, second_spec)

In [11]:
# Helper Function

def load_json_file(json_path):
    """Read a JSON file from disk and return the decoded Python object.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin UTF-8: without an explicit encoding open() falls back to the platform
    # locale, so non-ASCII keys could be decoded differently across machines.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class CFG:
    """Namespace for notebook-wide configuration constants."""

    # Directory prefix (trailing slash included) that is concatenated with
    # dataset file names.
    data_dir = "asl-signs/"

    # Number of landmark rows per frame.
    rows_per_frame = 543

    # Not referenced by any code visible in this part of the notebook.
    sequence_length = 12

# Number of landmark rows that make up one frame in a landmark parquet file.
ROWS_PER_FRAME = 543
# Every path handed to load_relevant_data_subset is appended here.
# NOTE(review): module-level state that grows on each call; kept because cells
# outside this view may read it -- confirm whether it is still needed.
sequence = []
def load_relevant_data_subset(pq_path):
    """Load the x/y/z columns of a landmark parquet file as a 3-D float32 array.

    Args:
        pq_path: Path of the parquet file to read.

    Returns:
        ``np.float32`` array of shape ``(n_frames, ROWS_PER_FRAME, 3)``.

    Raises:
        ValueError: If the number of rows is not a multiple of ROWS_PER_FRAME.

    Side effects:
        Prints the length of ``pq_path`` and appends ``pq_path`` to the
        module-level ``sequence`` list.
    """
    data_columns = ['x', 'y', 'z']
    # NOTE(review): this prints the length of the path string, not of the data;
    # looks like leftover debugging output -- confirm before removing.
    print('pq_path len: ', len(pq_path))
    sequence.append(pq_path)
    data = pd.read_parquet(pq_path, columns=data_columns)
    # Integer arithmetic instead of int(len / N): no float round-trip, and a
    # ragged file is reported explicitly rather than through a reshape error.
    n_frames, leftover_rows = divmod(len(data), ROWS_PER_FRAME)
    if leftover_rows:
        raise ValueError(
            f"{pq_path!r} has {len(data)} rows, which is not a multiple of "
            f"ROWS_PER_FRAME ({ROWS_PER_FRAME})"
        )
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)
    
# Parse the sign -> prediction-index mapping once; both lookup tables below are
# derived from this single dict instead of re-reading the file for each of them.
sign_map = load_json_file(CFG.data_dir + 'sign_to_prediction_index_map.json')
train_data = pd.read_csv(CFG.data_dir + 'train.csv')

# Lower-cased sign name -> prediction index.
s2p_map = {sign.lower(): index for sign, index in sign_map.items()}
# Prediction index -> sign name, spelled exactly as in the JSON file.
p2s_map = {index: sign for sign, index in sign_map.items()}


def encoder(x):
    """Return the prediction index for sign name ``x`` (case-insensitive), or None if unknown."""
    return s2p_map.get(x.lower())


def decoder(x):
    """Return the sign name for prediction index ``x``, or None if unknown."""
    return p2s_map.get(x)

# Processing the video

In [17]:
# TFLite file selected by each recognised value of predict_asl's `model` argument.
_MODEL_PATHS = {
    'ann': 'models/ann/model.tflite',
    'top-01': 'models/1st-place-solution-by-hoyso48/model.tflite',
    'cnn': 'models/cnn/model.tflite',
    'cnn+3trans': 'models/cnn+3trans/model.tflite',
    'lstm': 'models/lstm/model.tflite',
    'transformer': 'models/transformer/model.tflite',
}
# Used for every other `model` value, including the default ''.
_DEFAULT_MODEL_PATH = 'models/distance-angle-based-features-using-keras/model.tflite'


def _open_capture(mode, video_path):
    """Open webcam 0 when ``mode == 0``, otherwise the source at ``video_path``."""
    return cv2.VideoCapture(0) if mode == 0 else cv2.VideoCapture(video_path)


def predict_asl(mode=0, video_path='', model=''):
    """Run a TFLite sign classifier over a video stream and print the winning sign.

    Frames are read from the capture, converted to landmark coordinates with
    MediaPipe Holistic and ``extract_coordinates``, and buffered. Every time
    the buffer holds 15 frames it is passed to the model's ``serving_default``
    signature; the first 10 buffered frames are then dropped, so consecutive
    windows overlap by 5 frames. A window's top class is recorded when its
    score passes the threshold, and the most frequently recorded sign is
    printed at the end (or "Couldn't Predict" when none was recorded).

    The loop stops when a frame cannot be read, when ESC is pressed in the
    preview window, or when the capture position reaches the reported frame
    count.

    Args:
        mode: ``0`` reads from webcam 0; any other value reads ``video_path``.
        video_path: Source passed to ``cv2.VideoCapture`` when ``mode != 0``.
        model: Key selecting the TFLite file (see ``_MODEL_PATHS``); any other
            value selects ``_DEFAULT_MODEL_PATH``.

    Returns:
        None. The result is printed.
    """
    window_size = 15  # frames per inference call
    stride = 10       # frames discarded after each inference call

    interpreter = tf.lite.Interpreter(model_path=_MODEL_PATHS.get(model, _DEFAULT_MODEL_PATH))
    interpreter.allocate_tensors()
    print('Initilized Tensors')
    prediction_fn = interpreter.get_signature_runner("serving_default")

    coordinates = []
    res = []

    cap = _open_capture(mode, video_path)
    # try/finally guarantees the capture device and preview window are freed
    # even when frame processing or inference raises.
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while not cap.isOpened():
                print('Capture is Not Open')
                # Drop the stale handle before reopening so retries do not
                # accumulate capture objects.
                cap.release()
                cap = _open_capture(mode, video_path)
                cv2.waitKey(1000)

            print('Starting Prediction')
            while True:
                flag, image = cap.read()
                if not flag:
                    # No frame available (end of stream or read failure): stop.
                    break

                # MediaPipe expects RGB. Marking the converted array read-only
                # lets it be passed by reference; the BGR frame is kept
                # untouched for display.
                rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                rgb.flags.writeable = False
                results = holistic.process(rgb)
                coordinates.append(extract_coordinates(results))

                if len(coordinates) == window_size:
                    prediction = prediction_fn(inputs=np.array(coordinates).astype(np.float32))
                    # argmax without an axis indexes the flattened array, so
                    # flattening first makes the score lookup valid for both
                    # flat and single-row batched outputs, whatever the model.
                    outputs = np.ravel(prediction["outputs"])
                    sign = np.argmax(outputs)
                    # Accepts raw scores above 5.
                    # NOTE(review): unreachable if a model emits softmax
                    # probabilities -- confirm the outputs are logits.
                    if outputs[sign] * 10 > 50:
                        res.append(decoder(sign))
                    cv2.putText(image, f"Prediction:    {decoder(sign)}", (3, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                    coordinates = coordinates[stride:]

                cv2.imshow('Prediction', image)

                # ESC pressed in the preview window.
                if cv2.waitKey(10) & 0xFF == 27:
                    break
                # Every frame the capture reports has been consumed.
                if cap.get(cv2.CAP_PROP_POS_FRAMES) == cap.get(cv2.CAP_PROP_FRAME_COUNT):
                    break
        print(max(res, key=res.count) if len(res)>0 else "Couldn't Predict")
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [20]:
# mode=1 reads frames from video_path rather than the webcam.
predict_asl(
    mode=1,
    video_path='datasets/downloaded/animal/animal.mp4',
    model='cnn+3trans',
)

Initilized Tensors
Starting Prediction
apple
