In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np

In [3]:
import glob, os
import re

## File paths

# If you use the pickle file, skip ahead to the cell that loads it

In [None]:
os.chdir("../LeapMotion/Leap_Motion_Data/")

# helper: collapse a list of lists into a single flat list
flatten = lambda l: [item for sublist in l for item in sublist]

# one alphabetically sorted CSV listing per subject folder (0-24), then flattened
# Change slashes if on windows to \\
glob_list = flatten([sorted(glob.glob(str(subject) + "/[A-Z0-9]*.csv"))
                     for subject in range(25)])

In [None]:
# pair up consecutive entries of the sorted listing as (left file, right file)
lr_pairs = list(zip(glob_list[0::2], glob_list[1::2]))

# pairs that are excluded: the "Blue" pair of every subject folder,
# plus the "Bad" pair of subject 19
# Change slashes if on windows to \\
excluded_pairs = [(str(i) + '/Blue_Left.csv', str(i) + '/Blue_Right.csv')
                  for i in range(25)]
excluded_pairs.append(('19/Bad_Left.csv', '19/Bad_Right.csv'))

for excluded in excluded_pairs:
    # list.remove raises ValueError when an expected pair is not present
    lr_pairs.remove(excluded)
print(lr_pairs)

## Create list of DataFrames

In [None]:
df_list = []

for left_path, right_path in lr_pairs:
    # read each hand, drop the saved index column, and tag the columns by hand
    df_left = (pd.read_csv(left_path, index_col=None)
                 .drop(['Unnamed: 0'], axis=1)
                 .add_prefix('left'))
    df_right = (pd.read_csv(right_path, index_col=None)
                  .drop(['Unnamed: 0'], axis=1)
                  .add_prefix('right'))

    # join the two hands on their timestamps and keep a single time column
    df = pd.merge(df_left, df_right, left_on='leftTime', right_on='rightTime')
    df = df.drop('rightTime', axis=1)

    # convert the first column to a time object (its last 3 characters are trimmed first)
    df['leftTime'] = pd.to_datetime(df['leftTime'].str[:-3], format='%H:%M:%S.%f')

    # row-to-row differences; the first row has no predecessor, so it is dropped
    df = df.diff().iloc[1:]
    df['leftTime'] = df['leftTime'].dt.total_seconds()
    df = df.rename(columns={'leftTime': 'time'})

    # subject and sign come from the file name: everything before the first
    # underscore, split on the path separator
    # Change slashes if on windows to \\
    name_stem = re.findall('^[^_]+(?=_)', left_path)[0]
    subject_sign = re.split(r'/', name_stem)
    print(subject_sign)
    # inserting at position 0 twice leaves the columns ordered Sign, Subject, time, ...
    df.insert(loc=0, column='Subject', value=subject_sign[0])
    df.insert(loc=0, column='Sign', value=subject_sign[1])

    df_list.append(df)

# Pickle the dataframe list

In [None]:
import pickle

# persist the list of frames so a later session can load it instead of rebuilding it
with open('sign_frames.pkl', 'wb') as pkl_file:
    pickle.dump(df_list, pkl_file)

# Load the dataframe from pickle instead of remaking the list

In [12]:
import pickle
import timeit
# This line may need to be commented/uncommented as needed
#os.chdir("../LeapMotion/Leap_Motion_Data/")
start_time = timeit.default_timer()

# timed section: unpickle the list of DataFrames
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you produced yourself
with open('sign_frames.pkl', 'rb') as pkl_file:
    df_list = pickle.load(pkl_file)

elapsed = timeit.default_timer() - start_time

In [13]:
# seconds measured around the pickle load in the previous cell
elapsed

46.70063448600001

## Hand Classification

In [None]:
#list of signs included in data set
# NOTE(review): relative chdir -- the directory this lands in depends on which
# earlier cells changed the working directory, and re-running this cell moves
# up one more level each time
os.chdir("..")
hands_used  = pd.read_csv("signs_f.csv")

In [None]:
# For every recording: (|sum of the per-column means of the left-hand columns|, sign label).
# The first three columns are skipped positionally before selecting the
# columns whose name contains 'left'.
# The label is read positionally (first row) rather than via index label 1,
# so it does not depend on how the frame happens to be indexed and matches
# how the rest of the notebook reads the sign of a recording.
threshold = [(abs(df.iloc[:, 3:].filter(regex='left').mean().sum()), df['Sign'].iloc[0])
             for df in df_list]

In [None]:
# cutoff compared against the first element of each `threshold` tuple
thresh = 2.5e-5

In [None]:
# Signs with at least one recording whose score falls under the cutoff.
# Sorted so the resulting class order (used later for label binarization and
# per-class plots) is identical on every run: iteration order of a set of
# strings can differ between interpreter sessions.
one_hand = sorted({sign for score, sign in threshold if score < thresh})

In [None]:
# every listed sign that was not classified as one-handed, in file order
one_hand_lookup = set(one_hand)
two_hand = [sign for sign in hands_used.Sign.values if sign not in one_hand_lookup]

In [None]:
class hand_selection:
    """Pick the recordings that belong to a given set of signs.

    drop_left: when True, columns whose name contains 'left' are removed
    from every selected frame.
    """

    def __init__(self, drop_left=False):
        self.drop_left = drop_left

    def transform(self, df_list, hand_list):
        """Return the frames of df_list whose first 'Sign' value is in hand_list.

        Without drop_left the original frame objects are returned as-is;
        with drop_left each selected frame is replaced by a copy lacking
        the 'left' columns.
        """
        matching = [frame for frame in df_list
                    if frame.Sign.values[0] in hand_list]

        if self.drop_left:
            return [frame.drop(frame.filter(regex='left').columns, axis=1)
                    for frame in matching]
        return matching

## Feature Extraction

In [None]:
class extraction:
    """Collect a label plus per-column mean and standard deviation of one frame.

    Results accumulate in the `features` dict; `extract_features` runs all
    steps in order.
    """

    def __init__(self, df):
        self.df = df
        self.features = dict()

    def label(self):
        """Store the first 'Sign' value as the label, then drop the first two columns."""
        sign_column = self.df['Sign']
        self.features['label'] = sign_column.iloc[0]
        # everything from the third column onward is treated as feature data
        self.df = self.df.iloc[:, 2:]

    def mean(self):
        """Add '<column> mean' for every remaining column."""
        self.features.update({col + ' mean': self.df[col].mean() for col in self.df})

    def stdev(self):
        """Add '<column> stdev' for every remaining column."""
        self.features.update({col + ' stdev': self.df[col].std() for col in self.df})

    def extract_features(self):
        """Run label, mean and stdev, in that order."""
        for step in (self.label, self.mean, self.stdev):
            step()

In [None]:
from sklearn.preprocessing import StandardScaler

def return_features(df_list, hand_list, drop_left):
    """Build a scaled feature matrix and label vector for a set of signs.

    df_list:   list of per-recording DataFrames
    hand_list: signs whose recordings should be included
    drop_left: forwarded to hand_selection (remove 'left' columns when True)

    Returns (X, y): X is the StandardScaler-transformed feature array,
    y is the 'label' column of the feature table.

    NOTE(review): the scaler is fitted on every selected recording, i.e.
    before any train/test split performed by the caller.
    """
    selected = hand_selection(drop_left).transform(df_list, hand_list)

    def _features_of(frame):
        # one feature dict (label + summary statistics) per recording
        extractor = extraction(frame)
        extractor.extract_features()
        return extractor.features

    feat_df = pd.DataFrame([_features_of(frame) for frame in selected])

    y = feat_df.label
    X = StandardScaler().fit_transform(feat_df.drop(['label'], axis=1))
    return X, y

In [None]:
# one-handed signs: left-hand columns are dropped before feature extraction
X_one_hand, y_one_hand = return_features(df_list=df_list, 
                                         hand_list=one_hand, 
                                         drop_left=True)

# two-handed signs: columns of both hands are kept
X_two_hand, y_two_hand = return_features(df_list=df_list, 
                                         hand_list=two_hand, 
                                         drop_left=False)

## Model Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### Two Hand

In [None]:
import warnings
warnings.filterwarnings('ignore')

# one list of test accuracies per classifier
lda_accuracy, qda_accuracy, knn_accuracy = [], [], []
rf_accuracy, nb_accuracy, svm_accuracy = [], [], []

# (constructor, score list) pairs, listed in the order the models are fitted
two_hand_candidates = [
    (lambda: LinearDiscriminantAnalysis(), lda_accuracy),
    (lambda: SVC(decision_function_shape='ovo', kernel='linear', C=1, gamma=1), svm_accuracy),
    (lambda: QuadraticDiscriminantAnalysis(), qda_accuracy),
    (lambda: KNeighborsClassifier(n_neighbors=15), knn_accuracy),
    (lambda: RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0), rf_accuracy),
    (lambda: GaussianNB(), nb_accuracy),
]

# 100 rounds, each with a fresh stratified 75/25 split (the split is unseeded)
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X_two_hand, y_two_hand, stratify=y_two_hand, test_size=0.25)

    for build_model, scores in two_hand_candidates:
        model = build_model()
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

In [None]:
# score lists gathered in plotting order: QDA, Naive Bayes, k-nn, Random Forest, LDA, SVM
accuracy_list = [
    qda_accuracy,
    nb_accuracy,
    knn_accuracy,
    rf_accuracy,
    lda_accuracy,
    svm_accuracy,
]

In [None]:
# box plot of the per-model accuracy distributions, y axis fixed to [0, 1]
fig, ax = plt.subplots(figsize=(5.5, 5.5))
ax.set_ylim(0, 1)
ax.boxplot(accuracy_list,
           labels=['QDA', 'Naive Bayes', 'k-nn', 'Random Forest', 'LDA', 'SVM'])
ax.set_title('Two-Handed Model Performance')
ax.set_ylabel('Accuracy')
fig.savefig('plots/two_hand_model_performance_svm_ovo_linear_kernel.png', dpi=500)
plt.show()

In [None]:
# best single-round accuracy of LDA, then of the SVM
for scores in (lda_accuracy, svm_accuracy):
    print(max(scores))

### One Hand

In [None]:
# Repeated hold-out comparison of six classifiers on the one-handed signs.
# Each of the 100 rounds draws a fresh stratified 75/25 split (unseeded, so
# the scores vary from run to run) and records exactly one test accuracy
# per model.
lda_accuracy = []
qda_accuracy = []
knn_accuracy = []
rf_accuracy = []
nb_accuracy = []
svm_accuracy = []

# (constructor, score list) pairs, listed in the order the models are fitted
one_hand_candidates = [
    (lambda: LinearDiscriminantAnalysis(), lda_accuracy),
    (lambda: SVC(decision_function_shape='ovo', kernel='linear', C=1, gamma=1), svm_accuracy),
    (lambda: QuadraticDiscriminantAnalysis(), qda_accuracy),
    (lambda: KNeighborsClassifier(n_neighbors=15), knn_accuracy),
    (lambda: RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0), rf_accuracy),
    (lambda: GaussianNB(), nb_accuracy),
]

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X_one_hand,
                                                        y_one_hand,
                                                        stratify=y_one_hand,
                                                        test_size=0.25)

    for build_model, scores in one_hand_candidates:
        clf = build_model()
        clf.fit(X_train, y_train)
        # one score per model per round: the SVM score was previously
        # appended twice, leaving svm_accuracy with 200 duplicated entries
        # while every other list had 100
        scores.append(clf.score(X_test, y_test))

In [None]:
# best single-round accuracy of LDA, then of the SVM
for scores in (lda_accuracy, svm_accuracy):
    print(max(scores))

In [None]:
# score lists gathered in plotting order: QDA, Naive Bayes, k-nn, Random Forest, LDA, SVM
accuracy_list = [
    qda_accuracy,
    nb_accuracy,
    knn_accuracy,
    rf_accuracy,
    lda_accuracy,
    svm_accuracy,
]

In [None]:
# box plot of the per-model accuracy distributions, y axis fixed to [0, 1]
fig, ax = plt.subplots(figsize=(5.5, 5.5))
ax.set_ylim(0, 1)
ax.boxplot(accuracy_list,
           labels=['QDA', 'Naive Bayes', 'k-nn', 'Random Forest', 'LDA', 'SVM'])
ax.set_title('One-Handed Model Performance')
ax.set_ylabel('Accuracy')
fig.savefig('plots/one_hand_model_performance.png', dpi=500)
plt.show()

## Model

### Two Hand

In [None]:
import warnings
warnings.filterwarnings('ignore')

# LDA test accuracy over 100 independent stratified 75/25 splits (two-handed signs)
accuracy_two_hand = []

for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X_two_hand, y_two_hand, stratify=y_two_hand, test_size=0.25)

    # fit() returns the fitted estimator, so construction and fitting are chained
    clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
    #clf.scalings_
    accuracy_two_hand.append(clf.score(X_test, y_test))

### One Hand

In [None]:
# LDA test accuracy over 100 independent stratified 75/25 splits (one-handed signs)
accuracy_one_hand = []

for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X_one_hand, y_one_hand, stratify=y_one_hand, test_size=0.25)

    # fit() returns the fitted estimator, so construction and fitting are chained
    clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
    #clf.scalings_
    accuracy_one_hand.append(clf.score(X_test, y_test))

# Accuracy Plot

In [None]:
# overlaid, semi-transparent histograms of the two LDA accuracy distributions
fig, ax = plt.subplots()
ax.hist(accuracy_one_hand, bins=10, label='One hand', alpha=0.5)
ax.hist(accuracy_two_hand, bins=10, label='Two hand', alpha=0.5)
ax.set_xlabel("Classification Accuracy")
ax.set_ylabel("Count")
ax.set_title("Model Accuracy")
ax.legend(loc='upper right')
fig.savefig('plots/lda_both_hands.png', dpi=750)
plt.show()

## Confusion

In [None]:
# single seeded split so the misclassification tables below are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_two_hand, y_two_hand,
    stratify=y_two_hand,
    test_size=0.25,
    random_state=42)

clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

# (actual, predicted) label for every held-out recording
two_hand_predicted = clf.predict(X_test)
two_hand_pred = pd.DataFrame(zip(y_test, two_hand_predicted),
                             columns=['actual', 'predicted'])

clf.score(X_test, y_test)

In [None]:
# single seeded split so the misclassification tables below are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_one_hand, y_one_hand,
    stratify=y_one_hand,
    test_size=0.25,
    random_state=42)

clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

# (actual, predicted) label for every held-out recording
one_hand_predicted = clf.predict(X_test)
one_hand_pred = pd.DataFrame(zip(y_test, one_hand_predicted),
                             columns=['actual', 'predicted'])

clf.score(X_test, y_test)

In [None]:
# stack the one-hand and two-hand prediction tables; each keeps its own row
# index, so index labels repeat in the combined frame
combined_pred = pd.concat([one_hand_pred, two_hand_pred])

In [None]:
# keep only the rows where the predicted sign differs from the actual sign
is_wrong = combined_pred.actual != combined_pred.predicted
misclass = combined_pred[is_wrong]

In [None]:
# number of occurrences of each (actual, predicted) confusion
misclass_groupby = (misclass
                    .groupby(['actual', 'predicted'])[['predicted']]
                    .agg('count'))

# show only the confusions that happened more than once
seen_repeatedly = misclass_groupby > 1
misclass_groupby[seen_repeatedly].dropna()

Number of times a sign is misclassified (False Negative):

In [None]:
# misclassified rows counted per actual sign, ten largest counts first
missed_per_sign = misclass.groupby(['actual'])[['predicted']].agg('count')['predicted']
pd.DataFrame(missed_per_sign.sort_values(ascending=False)).iloc[:10]

Number of times a sign is incorrectly assigned (False Positive):

In [None]:
# misclassified rows counted per predicted sign, ten largest counts first
wrongly_assigned_per_sign = misclass.groupby(['predicted'])[['predicted']].agg('count')['predicted']
pd.DataFrame(wrongly_assigned_per_sign.sort_values(ascending=False)).iloc[:10]

## One Versus All ROC Curves

### Two Hand

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
from sklearn.preprocessing import label_binarize

# One-vs-rest LDA scores for the two-handed signs, used for the ROC curves.
# The labels are binarized once (one indicator column per entry of two_hand)
# and the same matrix serves as both target and stratification key.
y_two_hand_bin = label_binarize(y_two_hand, classes=two_hand)

X_train, X_test, y_train, y_test = train_test_split(X_two_hand,
                                                    y_two_hand_bin,
                                                    stratify=y_two_hand_bin,
                                                    test_size=0.25,
                                                    random_state = 42)

# fit once: the model used to be fitted a second time on the same data
# just to call decision_function
clf = OneVsRestClassifier(LinearDiscriminantAnalysis())
clf.fit(X_train, y_train)

# per-class decision scores for the held-out recordings
y_score = clf.decision_function(X_test)

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt

labels = two_hand
n_classes = len(labels)

# ROC curve and area under it for each class (column i of y_test / y_score)
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    false_pos, true_pos, _ = roc_curve(y_test[:, class_idx], y_score[:, class_idx])
    fpr[class_idx], tpr[class_idx] = false_pos, true_pos
    roc_auc[class_idx] = auc(false_pos, true_pos)

# one figure per class, saved under plots/<sign>.png
for class_idx, sign in enumerate(labels):
    fig, ax = plt.subplots()
    ax.plot(fpr[class_idx], tpr[class_idx],
            label='ROC curve (area = %0.2f)' % roc_auc[class_idx])
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC ' + sign)
    ax.legend(loc="lower right")
    fig.savefig('plots/' + sign + '.png', dpi=750)
    plt.show()

### One Hand

In [None]:
# One-vs-rest LDA scores for the one-handed signs, used for the ROC curves.
# The labels are binarized once (one indicator column per entry of one_hand)
# and the same matrix serves as both target and stratification key.
y_one_hand_bin = label_binarize(y_one_hand, classes=one_hand)

X_train, X_test, y_train, y_test = train_test_split(X_one_hand,
                                                    y_one_hand_bin,
                                                    stratify=y_one_hand_bin,
                                                    test_size=0.25,
                                                    random_state = 42)

# fit once: the model used to be fitted a second time on the same data
# just to call decision_function
clf = OneVsRestClassifier(LinearDiscriminantAnalysis())
clf.fit(X_train, y_train)

# per-class decision scores for the held-out recordings
y_score = clf.decision_function(X_test)

In [None]:
labels = one_hand
n_classes = len(labels)

# ROC curve and area under it for each class (column i of y_test / y_score)
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    false_pos, true_pos, _ = roc_curve(y_test[:, class_idx], y_score[:, class_idx])
    fpr[class_idx], tpr[class_idx] = false_pos, true_pos
    roc_auc[class_idx] = auc(false_pos, true_pos)

# one figure per class, saved under plots/<sign>.png
for class_idx, sign in enumerate(labels):
    fig, ax = plt.subplots()
    ax.plot(fpr[class_idx], tpr[class_idx],
            label='ROC curve (area = %0.2f)' % roc_auc[class_idx])
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC ' + sign)
    ax.legend(loc="lower right")
    fig.savefig('plots/' + sign + '.png', dpi=750)
    plt.show()