In [4]:
# Pre-Processing imports

import pandas as pd
import numpy as np
import glob
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact
%matplotlib inline

In [5]:
# ML imports

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier

In [6]:
# Options

# np.set_printoptions(threshold=np.inf)

In [17]:
# Functions

# Train new model
def train_new_model(path, model):
    
    # Will grab any subfolders from path and their csv files
    all_files = glob.glob(path + "/*/*.csv")

    # Holder list
    li = []

    # Regex changes the target class creation
    # 'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+'

    for filename in all_files:
        # Reads individual csv files
        df = pd.read_csv(filename, index_col=None, header=0)
        # Matches specifics from the filename using regex (subject to change depending on filenaming convention)    
        match  = re.findall('[a-zA-Z]+(?=_[0-9]+_data)', filename)
        y = (''.join(match))
        # Adds target column for classification
        df['y'] = y
        # Appends the dataframe to the list
        li.append(df)

    # Concats all data into one dataframe for training/testing
    frame = pd.concat(li, axis=0, ignore_index=True)

    # Target column is y
    y_string = frame['y']

    # Changes target from string to numeric
    le = LabelEncoder().fit(y_string.ravel())
    y = le.transform(y_string.ravel())

    # Sets the X data
    X = frame.drop(['y','arrival_time'],axis=1).to_numpy()

    # Splits data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # random_state=42
    
    clf = model.fit(X_train, y_train)
    print("Model: ",model,"\nScore: ", clf.score(X_test, y_test))

# Abstracted funtion for training

In [18]:
# Model Options are: 
# DecisionTreeClassifier()
# RandomForestClassifier()
# SVC()

# Path example:
# r'C:\Users\conno\Desktop\Pastebles\data'

In [None]:
# Example of training a new model with its associted testing score
train_new_model(r'C:\Users\conno\Desktop\Pastebles\data', RandomForestClassifier())

# Quick prototyping

In [11]:
# Path to data directory

# Change to desired path
path = r'C:\Users\conno\Desktop\Pastebles\data'
# Will grab any subfolders from path and their csv files
all_files = glob.glob(path + "/*/*.csv") 

In [12]:
# Reads in all data and adds target column for classification

# Holder list
li = []

# Regex changes the target class creation
# 'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+'

for filename in all_files:
    # Reads individual csv files
    df = pd.read_csv(filename, index_col=None, header=0)
    # Matches specifics from the filename using regex (subject to change depending on filenaming convention)    
    match  = re.findall('[a-zA-Z]+(?=_[0-9]+_data)', filename)
    y = (''.join(match))
    # Adds target column for classification
    df['y'] = y
    # Appends the dataframe to the list
    li.append(df)

# Concats all data into one dataframe for training/testing
frame = pd.concat(li, axis=0, ignore_index=True)

In [13]:
# Target column is y
y_string = frame['y']

# Changes target from string to numeric
le = LabelEncoder().fit(y_string.ravel())
y = le.transform(y_string.ravel())

# Sets the X data
X = frame.drop(['y','arrival_time'],axis=1).to_numpy()

# Splits data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # random_state=42

In [14]:
# Searching for best parameters
# parameters = {'criterion':('gini', 'entropy'), 'n_estimators':[100, 200, 300, 400, 500]}

# clf = GridSearchCV(RandomForestClassifier(), parameters)
# clf = clf.fit(X_train, y_train)
# clf.score(X_test, y_test)

In [15]:
clf_dc = DecisionTreeClassifier().fit(X_train, y_train)
clf_dc.score(X_test, y_test)

0.8635458167330677

In [366]:
clf_rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
clf_rfc.score(X_test, y_test)

0.927808764940239

In [367]:
# Function abstraction
train_new_model(r'C:\Users\conno\Desktop\Pastebles\data', RandomForestClassifier(n_estimators=200))

Model:  RandomForestClassifier(n_estimators=200) 
Score:  0.9277290836653387


In [347]:
# Scoring indivdual actions for the whole dataset

actions = frame['y'].unique()
scores_entire = []

for action in actions:
    # Get the data and select only the wanted action for scoring
    data = frame.loc[frame['y'] == action]
    # Get the y data
    y_string = data['y']
    # Convert the y to numeric classes
    y_data = le.transform(y_string.ravel())
    # Get the X data
    X_data = data.drop(['y','arrival_time'],axis=1).to_numpy()
    
    score = clf.score(X_data, y_data)
    scores_entire.append(score)
    print(action, score)
    
# Not quite and exact average because different actions may have more data points, but it should be really close.
print("Total average for entire dataset: ", np.mean(scores_entire, axis=0))

Posture 0.9975402233961047
BicepCurls 0.9876971608832807
SideLunges 0.9717471227621484
Sitting 0.9967969250480462
Squats 0.9761509254864736
Standing 0.9948174134906713
Total average for entire dataset:  0.9874582951777874


In [348]:
# Scoring indivdual actions for the testing dataset

actions = frame['y'].unique()
scores_testing = []
for action in actions:
    # Get y testing data
    y_frame = pd.DataFrame(le.inverse_transform(y_test))
    # Select only the wanted action for scoring
    y_string = y_frame.where(y_frame == action).dropna()
    # Get X testing data from the corresponding y indecies
    X_data = X_test[y_string.index]
    # Convert the y data back to numeric classes
    y_data = le.transform(y_string.to_numpy().ravel())
    # Reset y index (not needed currently)
    # y_data = y_data.reset_index(drop=True)
    
    score = clf.score(X_data, y_data)
    scores_testing.append(score)
    print(action, score)

# Not quite and exact average because different actions may have more data points, but it should be really close.
print("Total average for testing dataset: ", np.mean(scores_testing, axis=0))

Posture 0.9875459371171907
BicepCurls 0.9384979302188055
SideLunges 0.8605797673042792
Sitting 0.983753046303818
Squats 0.881648675171737
Standing 0.9740103958416634
Total average for testing dataset:  0.937672625326249


In [287]:
# Manually testing individual points

index = 102
x_pred = frame.drop(['arrival_time', "y"],axis=1).iloc[index].to_numpy().reshape(1,-1)
x_pred = X_test[index].reshape(1, -1)
y_pred = y_test[index].ravel()

out = clf.predict(x_pred)
# list(le.inverse_transform(out))
# list(le.inverse_transform(y_test[0].ravel()))

print("Predicted:", le.inverse_transform(out), "\nActual:",le.inverse_transform(y_pred))

Predicted: ['SideLunges'] 
Actual: ['BicepCurls']


In [16]:
frame

Unnamed: 0,arrival_time,acc_x,acc_y,acc_z,rot_x,rot_y,rot_z,ir_dist,y
0,2021-06-22 18:38:26.834683,0.960205,0.399536,0.059570,-9.033203,-3.967285,-7.446289,12,Posture
1,2021-06-22 18:38:26.896499,1.013672,0.388306,0.128784,23.559570,16.479492,-9.887695,13,Posture
2,2021-06-22 18:38:26.958310,0.945923,0.352295,0.144043,56.823730,13.916016,-2.319336,12,Posture
3,2021-06-22 18:38:27.019907,0.907959,0.367676,0.103760,31.616211,-7.751465,-0.183105,12,Posture
4,2021-06-22 18:38:27.081720,0.886719,0.404663,0.116699,-1.953125,-26.245117,-2.319336,12,Posture
...,...,...,...,...,...,...,...,...,...
125492,2021-06-30 16:11:56.917636,0.973633,0.137695,0.215698,-1.098633,-7.141113,-1.098633,114,Squats
125493,2021-06-30 16:11:56.980679,0.977783,0.142212,0.211060,-0.366211,-7.324219,-1.342773,114,Squats
125494,2021-06-30 16:11:57.041932,0.984375,0.135132,0.210815,-1.037598,-7.873535,-1.220703,114,Squats
125495,2021-06-30 16:11:57.103245,0.988770,0.135742,0.209595,-2.746582,-9.277344,-1.525879,124,Squats
