# Preprocess

In this notebook, I will preprocess and perform some basic EDA on the hand landmarks extracted in the data_collection.ipynb notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the pickled landmark table and display it.
landmarks_pickle = "data/hand_landmarks.pkl"
df = pd.read_pickle(landmarks_pickle)
df

Unnamed: 0,image_path,label,landmarks
0,data/raw_data/asl_alphabet_train/asl_alphabet_...,R,"[0.7500099539756775, 0.8092923760414124, 8.129..."
1,data/raw_data/asl_alphabet_train/asl_alphabet_...,R,"[0.5013842582702637, 0.1256781816482544, -1.00..."
2,data/raw_data/asl_alphabet_train/asl_alphabet_...,R,"[0.33940061926841736, 0.9188684225082397, 1.87..."
3,data/raw_data/asl_alphabet_train/asl_alphabet_...,R,"[0.3688064515590668, 0.9296466112136841, 1.295..."
4,data/raw_data/asl_alphabet_train/asl_alphabet_...,R,"[0.6972802877426147, 0.852705717086792, 9.3129..."
...,...,...,...
60346,data/raw_data/asl_alphabet_train/asl_alphabet_...,W,"[0.7270431518554688, 0.5372360944747925, 5.106..."
60347,data/raw_data/asl_alphabet_train/asl_alphabet_...,W,"[0.14381271600723267, 0.7350218892097473, 1.27..."
60348,data/raw_data/asl_alphabet_train/asl_alphabet_...,W,"[0.3868051767349243, 0.8602805733680725, 1.084..."
60349,data/raw_data/asl_alphabet_train/asl_alphabet_...,W,"[0.392451673746109, 0.875036895275116, 1.08737..."


Let's get rid of the image_path column, which will not be useful to us going forward. We had saved it initially for troubleshooting purposes.

In [3]:
# Keep only the landmarks and label columns (this drops image_path).
# The explicit .copy() makes the result an independent frame, so adding new
# columns to it later cannot raise pandas' SettingWithCopyWarning.
df = df[["landmarks","label"]].copy()

In [4]:
# Display the frame to confirm that only the landmarks and label columns remain.
df

Unnamed: 0,landmarks,label
0,"[0.7500099539756775, 0.8092923760414124, 8.129...",R
1,"[0.5013842582702637, 0.1256781816482544, -1.00...",R
2,"[0.33940061926841736, 0.9188684225082397, 1.87...",R
3,"[0.3688064515590668, 0.9296466112136841, 1.295...",R
4,"[0.6972802877426147, 0.852705717086792, 9.3129...",R
...,...,...
60346,"[0.7270431518554688, 0.5372360944747925, 5.106...",W
60347,"[0.14381271600723267, 0.7350218892097473, 1.27...",W
60348,"[0.3868051767349243, 0.8602805733680725, 1.084...",W
60349,"[0.392451673746109, 0.875036895275116, 1.08737...",W


Let's separate each landmark axis (X, Y, Z) into its own dedicated column, named after the associated landmark type (which part of the hand), to help when we train our model and for interpretability.

In [5]:
def createCols(x):
    """Wrap one sequence of values in a pandas Series.

    When this is passed to ``Series.apply``, pandas expands each returned
    Series into one column per element, turning a column of sequences into
    a wide DataFrame.

    Parameters
    ----------
    x : sequence
        The values for a single row.

    Returns
    -------
    pandas.Series
        The values of ``x`` under a default 0..n-1 integer index.
    """
    expanded = pd.Series(x)
    return expanded

In [6]:
# The landmark types, in the order their coordinates appear in each
# "landmarks" entry.
landmark_names = [
    "WRIST",
    "THUMB_CMC", "THUMB_MCP", "THUMB_IP", "THUMB_TIP",
    "INDEX_FINGER_MCP", "INDEX_FINGER_PIP", "INDEX_FINGER_DIP", "INDEX_FINGER_TIP",
    "MIDDLE_FINGER_MCP", "MIDDLE_FINGER_PIP", "MIDDLE_FINGER_DIP", "MIDDLE_FINGER_TIP",
    "RING_FINGER_MCP", "RING_FINGER_PIP", "RING_FINGER_DIP", "RING_FINGER_TIP",
    "PINKY_MCP", "PINKY_PIP", "PINKY_DIP", "PINKY_TIP",
]
axes = ("X", "Y", "Z")

# One column per (landmark, axis) pair, grouped by landmark:
# WRIST_X, WRIST_Y, WRIST_Z, THUMB_CMC_X, ... (21 landmarks * 3 axes = 63).
landmark_columns = [
    name + "_" + axis
    for name in landmark_names
    for axis in axes
]

# Expand every landmarks entry into its own row of coordinate columns.
df[landmark_columns] = df["landmarks"].apply(createCols)

In [7]:
# Display the frame to check the new per-landmark X/Y/Z coordinate columns.
df

Unnamed: 0,landmarks,label,WRIST_X,WRIST_Y,WRIST_Z,THUMB_CMC_X,THUMB_CMC_Y,THUMB_CMC_Z,THUMB_MCP_X,THUMB_MCP_Y,...,PINKY_MCP_Z,PINKY_PIP_X,PINKY_PIP_Y,PINKY_PIP_Z,PINKY_DIP_X,PINKY_DIP_Y,PINKY_DIP_Z,PINKY_TIP_X,PINKY_TIP_Y,PINKY_TIP_Z
0,"[0.7500099539756775, 0.8092923760414124, 8.129...",R,0.750010,0.809292,8.129081e-07,0.825599,0.765941,-0.038960,0.853383,0.673123,...,-0.029627,0.687461,0.600749,-0.072303,0.713414,0.643698,-0.068106,0.720230,0.686469,-0.052947
1,"[0.5013842582702637, 0.1256781816482544, -1.00...",R,0.501384,0.125678,-1.001989e-07,0.477497,0.132820,-0.015184,0.458090,0.141702,...,-0.037660,0.535183,0.023749,-0.040141,0.539358,0.027979,-0.037461,0.542468,0.035578,-0.036354
2,"[0.33940061926841736, 0.9188684225082397, 1.87...",R,0.339401,0.918868,1.879405e-06,0.409356,0.861666,-0.147752,0.402551,0.734529,...,-0.050533,0.130029,0.587779,-0.172358,0.164479,0.689234,-0.182736,0.214064,0.752133,-0.158354
3,"[0.3688064515590668, 0.9296466112136841, 1.295...",R,0.368806,0.929647,1.295784e-06,0.440474,0.855415,-0.130700,0.439925,0.721818,...,-0.046239,0.186575,0.594483,-0.161201,0.219543,0.684305,-0.169308,0.264434,0.747629,-0.143825
4,"[0.6972802877426147, 0.852705717086792, 9.3129...",R,0.697280,0.852706,9.312934e-07,0.777088,0.810766,-0.034454,0.801544,0.719615,...,-0.042457,0.634076,0.641588,-0.093173,0.655176,0.688718,-0.087716,0.664871,0.734331,-0.069009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60346,"[0.7270431518554688, 0.5372360944747925, 5.106...",W,0.727043,0.537236,5.106251e-07,0.773778,0.479102,-0.024153,0.783326,0.403973,...,-0.020840,0.660787,0.336614,-0.048157,0.683137,0.358557,-0.051691,0.699192,0.391754,-0.051478
60347,"[0.14381271600723267, 0.7350218892097473, 1.27...",W,0.143813,0.735022,1.276441e-06,0.234342,0.696472,-0.069900,0.275684,0.588910,...,-0.060184,0.076499,0.446228,-0.132334,0.086836,0.501884,-0.157363,0.088181,0.554310,-0.164947
60348,"[0.3868051767349243, 0.8602805733680725, 1.084...",W,0.386805,0.860281,1.084899e-06,0.457215,0.836140,-0.070152,0.491328,0.739052,...,-0.047496,0.324855,0.582516,-0.105268,0.334513,0.636539,-0.116996,0.338271,0.691184,-0.115579
60349,"[0.392451673746109, 0.875036895275116, 1.08737...",W,0.392452,0.875037,1.087373e-06,0.464097,0.833508,-0.049260,0.486289,0.747190,...,-0.048260,0.319105,0.626391,-0.103453,0.335918,0.670237,-0.118006,0.350229,0.717290,-0.120714


Now let's get rid of the landmarks column, as we don't need it anymore.

In [8]:
# Remove the landmarks column and display the resulting frame.
df = df.drop(columns=['landmarks'])
df

Unnamed: 0,label,WRIST_X,WRIST_Y,WRIST_Z,THUMB_CMC_X,THUMB_CMC_Y,THUMB_CMC_Z,THUMB_MCP_X,THUMB_MCP_Y,THUMB_MCP_Z,...,PINKY_MCP_Z,PINKY_PIP_X,PINKY_PIP_Y,PINKY_PIP_Z,PINKY_DIP_X,PINKY_DIP_Y,PINKY_DIP_Z,PINKY_TIP_X,PINKY_TIP_Y,PINKY_TIP_Z
0,R,0.750010,0.809292,8.129081e-07,0.825599,0.765941,-0.038960,0.853383,0.673123,-0.045357,...,-0.029627,0.687461,0.600749,-0.072303,0.713414,0.643698,-0.068106,0.720230,0.686469,-0.052947
1,R,0.501384,0.125678,-1.001989e-07,0.477497,0.132820,-0.015184,0.458090,0.141702,-0.034510,...,-0.037660,0.535183,0.023749,-0.040141,0.539358,0.027979,-0.037461,0.542468,0.035578,-0.036354
2,R,0.339401,0.918868,1.879405e-06,0.409356,0.861666,-0.147752,0.402551,0.734529,-0.202084,...,-0.050533,0.130029,0.587779,-0.172358,0.164479,0.689234,-0.182736,0.214064,0.752133,-0.158354
3,R,0.368806,0.929647,1.295784e-06,0.440474,0.855415,-0.130700,0.439925,0.721818,-0.171432,...,-0.046239,0.186575,0.594483,-0.161201,0.219543,0.684305,-0.169308,0.264434,0.747629,-0.143825
4,R,0.697280,0.852706,9.312934e-07,0.777088,0.810766,-0.034454,0.801544,0.719615,-0.042737,...,-0.042457,0.634076,0.641588,-0.093173,0.655176,0.688718,-0.087716,0.664871,0.734331,-0.069009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60346,W,0.727043,0.537236,5.106251e-07,0.773778,0.479102,-0.024153,0.783326,0.403973,-0.028428,...,-0.020840,0.660787,0.336614,-0.048157,0.683137,0.358557,-0.051691,0.699192,0.391754,-0.051478
60347,W,0.143813,0.735022,1.276441e-06,0.234342,0.696472,-0.069900,0.275684,0.588910,-0.088541,...,-0.060184,0.076499,0.446228,-0.132334,0.086836,0.501884,-0.157363,0.088181,0.554310,-0.164947
60348,W,0.386805,0.860281,1.084899e-06,0.457215,0.836140,-0.070152,0.491328,0.739052,-0.091306,...,-0.047496,0.324855,0.582516,-0.105268,0.334513,0.636539,-0.116996,0.338271,0.691184,-0.115579
60349,W,0.392452,0.875037,1.087373e-06,0.464097,0.833508,-0.049260,0.486289,0.747190,-0.062568,...,-0.048260,0.319105,0.626391,-0.103453,0.335918,0.670237,-0.118006,0.350229,0.717290,-0.120714


In [None]:
# Draw a small sample of rows; the seed is fixed so that the same rows are
# selected every time this cell is run on the same frame.
sample_size = 10
sample_seed = 44
sample = df.sample(n=sample_size, random_state=sample_seed)
sample