In [36]:
# Processing imports

import pandas as pd
import numpy as np
import glob
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact
from pathlib import Path
import joblib
from sklearn.pipeline import make_pipeline

%matplotlib inline

In [37]:
# ML imports

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.svm import LinearSVC

from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [38]:
# Options

# np.set_printoptions(threshold=np.inf)

In [51]:
# Functions

# Train new model
def train_new_model(path, model, test_size=0.20, random_state=None,
                    model_path='actions_model.pkl'):
    """Train ``model`` on every CSV found under ``path`` and save the fitted pipeline.

    Each CSV is read into a DataFrame and given a target column ``y`` whose
    value is extracted from the file path with a regex (the letters directly
    before ``_<digits>_data``). All frames are concatenated, the string labels
    are encoded as integers, the ``arrival_time`` column is dropped, and the
    remaining columns are used as features.

    The data is split into train/test sets, a ``StandardScaler`` + ``model``
    pipeline is fitted on the training part, and accuracy, macro recall and
    macro precision on the test part are printed. The fitted pipeline is then
    written to ``model_path`` with joblib.

    Note: only the pipeline is saved; the LabelEncoder that maps the integer
    predictions back to action names is not persisted.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory searched recursively for ``*.csv`` files.
    model : estimator
        Unfitted scikit-learn classifier placed at the end of the pipeline.
    test_size : float, default 0.20
        Fraction of rows held out for testing.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``; ``None`` keeps the split random.
    model_path : str, default 'actions_model.pkl'
        File the fitted pipeline is dumped to.

    Raises
    ------
    ValueError
        If no CSV file is found under ``path``.
    """
    # Regex changes the target class creation (subject to the file naming convention).
    # Alternative pattern:
    # 'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+'
    label_pattern = re.compile('[a-zA-Z]+(?=_[0-9]+_data)')

    # One labelled DataFrame per CSV file
    frames = []

    # rglob also descends into subfolders; sorting makes the row order (and
    # therefore a seeded split) independent of filesystem listing order
    for filename in sorted(Path(path).rglob('*.csv')):
        df = pd.read_csv(filename, index_col=None, header=0)
        # All regex matches in the path are joined into a single label string
        df['y'] = ''.join(label_pattern.findall(str(filename)))
        frames.append(df)

    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate"
    if not frames:
        raise ValueError(f"No CSV files found under {path!r}")

    # Concats all data into one dataframe for training/testing
    frame = pd.concat(frames, axis=0, ignore_index=True)

    # Changes target from string to numeric.
    # to_numpy() replaces Series.ravel(), which is deprecated in recent pandas.
    y_string = frame['y'].to_numpy()
    le = LabelEncoder().fit(y_string)
    y = le.transform(y_string)

    # Every column except the target and the timestamp is a feature
    X = frame.drop(columns=['y', 'arrival_time']).to_numpy()

    # Splits data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Pipeline from data pre-processing to model training; the scaler is
    # fitted on the training rows only
    pipe_model = make_pipeline(StandardScaler(), model)
    pipe_model.fit(X_train, y_train)

    # Model predictions and test-set metrics
    y_pred = pipe_model.predict(X_test)

    accuracy = pipe_model.score(X_test, y_test)
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')

    # Save model (scaler + estimator together)
    joblib.dump(pipe_model, model_path)

    print("Model:", model, "\nAccuracy:", accuracy, "\nRecall:", recall, "\nPrecision:", precision)

# Abstracted function for training

In [52]:
# Model Options are: 
# DecisionTreeClassifier()
# RandomForestClassifier()
# SVC()

# Path example:
# r'C:\Users\conno\Desktop\Pastebles\data'

In [53]:
# Example of training a new model with its associated testing score.
# Besides printing the metrics, this call writes the fitted pipeline to
# 'actions_model.pkl' in the current working directory.
train_new_model(r'C:\Users\conno\Desktop\Pastebles\data', RandomForestClassifier())

Model: RandomForestClassifier() 
Accuracy: 0.9252191235059761 
Recall: 0.9338279722751749 
Precision: 0.9340218456554473


# Quick prototyping

In [29]:
# Path to data directory

# Change to desired path
path = r'C:\Users\conno\Desktop\Pastebles\data'
# Recursively collects every csv file under path, using the same discovery
# rule as train_new_model (Path.rglob) so both see the same data set.
# Paths are kept as strings because the labelling regex below expects str,
# and sorted so the row order does not depend on filesystem listing order.
all_files = sorted(str(csv_path) for csv_path in Path(path).rglob('*.csv'))

In [30]:
# Load every csv file, tag its rows with the action label, and stack the results

# The regex decides how the target class is derived from the file path
# (subject to change depending on the file naming convention). Alternative pattern:
# 'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+'

# One DataFrame per file; the 'y' column holds all regex matches joined into one string
li = [
    pd.read_csv(filename, index_col=None, header=0).assign(
        y=''.join(re.findall('[a-zA-Z]+(?=_[0-9]+_data)', filename))
    )
    for filename in all_files
]

# Single dataframe used for training/testing, with a fresh 0..n-1 index
frame = pd.concat(li, axis=0, ignore_index=True)

In [31]:
# Target column is y
y_string = frame['y']

# Changes target from string to numeric.
# to_numpy() replaces Series.ravel(), which is deprecated in recent pandas
# (a Series is already one-dimensional, so the resulting array is the same).
le = LabelEncoder().fit(y_string.to_numpy())
y = le.transform(y_string.to_numpy())

# Sets the X data: every column except the target and the timestamp
X = frame.drop(columns=['y', 'arrival_time']).to_numpy()

# Splits data into training and testing (unseeded, so each run gives a different split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # random_state=42

In [32]:
# Searching for best parameters
# parameters = {'criterion':('gini', 'entropy'), 'n_estimators':[100, 200, 300, 400, 500]}

# clf = GridSearchCV(RandomForestClassifier(), parameters)
# clf = clf.fit(X_train, y_train)
# clf.score(X_test, y_test)

In [33]:
# clf_dc = DecisionTreeClassifier().fit(X_train, y_train)
# clf_dc.score(X_test, y_test)

In [34]:
# Reload the pipeline written by train_new_model and evaluate it on the current test split.
# joblib.load unpickles the file, so only load model files you created yourself.
loaded_model = joblib.load('actions_model.pkl')

# Predict once and derive all three metrics from the same predictions
y_pred = loaded_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
# For a classifier, .score(X_test, y_test) is the accuracy of predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy, "\nRecall:", recall, "\nPrecision:", precision)

# The accuracy is this high because the train/test split here differs from the one
# used when the saved model was trained: rows in this test set may have been part of
# that model's training data, which inflates the score.
# This is not a problem for actual application.

Accuracy: 0.9839043824701196 
Recall: 0.9857151033847505 
Precision: 0.9858603945648308


In [35]:
# Standardisation is part of the classifier pipeline instead of being applied to
# X_train / X_test in place. This keeps the cell idempotent (re-running it no longer
# scales already-scaled arrays) and lets clf_rfc be scored on unscaled features,
# which is what the later cells pass in (columns taken directly from `frame`).
clf_rfc = make_pipeline(StandardScaler(), RandomForestClassifier()).fit(X_train, y_train)

# Fitted StandardScaler, still available under its previous name
scaler = clf_rfc.steps[0][1]

y_pred = clf_rfc.predict(X_test)

accuracy = clf_rfc.score(X_test, y_test)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy, "\nRecall:", recall, "\nPrecision:", precision)

Accuracy: 0.9231474103585657 
Recall: 0.9326212137433462 
Precision: 0.9320199194925465


In [96]:
# Scoring individual actions for the whole dataset
# (this includes rows the classifier was trained on, so scores are optimistic)

actions = frame['y'].unique()
scores_entire = []

for action in actions:
    # Get the data and select only the wanted action for scoring
    data = frame.loc[frame['y'] == action]
    # Get the y data
    y_string = data['y']
    # Convert the y to numeric classes.
    # to_numpy() replaces Series.ravel(), which is deprecated in recent pandas.
    y_data = le.transform(y_string.to_numpy())
    # Get the X data.
    # NOTE(review): these features come straight from `frame` and are unscaled, so
    # clf_rfc has to handle standardisation itself (e.g. be a pipeline) - confirm.
    X_data = data.drop(columns=['y', 'arrival_time']).to_numpy()

    accuracy = clf_rfc.score(X_data, y_data)
    scores_entire.append(accuracy)
    print(action +" - "+ "Accuracy:", accuracy)

# Not quite an exact average because different actions may have more data points, but it should be really close.
print("Average for entire dataset: ", np.mean(scores_entire, axis=0))

Posture - Accuracy: 0.9963304971974677
BicepCurls - Accuracy: 0.9873422712933754
SideLunges - Accuracy: 0.9721867007672634
Sitting - Accuracy: 0.9968770019218449
Squats - Accuracy: 0.9736987818383167
Standing - Accuracy: 0.9931430393876575
Average for entire dataset:  0.9865963820676543


In [97]:
# Scoring individual actions for the testing dataset

actions = frame['y'].unique()
scores_testing = []

for action in actions:
    # Numeric class the encoder assigned to this action
    class_id = le.transform([action])[0]
    # Rows of the test split that belong to this action; comparing the encoded
    # labels directly avoids decoding y_test to strings on every iteration
    mask = y_test == class_id
    # Matching X and y test rows (same row order as in X_test / y_test)
    X_data = X_test[mask]
    y_data = y_test[mask]

    accuracy = clf_rfc.score(X_data, y_data)
    scores_testing.append(accuracy)
    print(action +" - "+ "Accuracy:", accuracy)

# Not quite an exact average because different actions may have more data points, but it should be really close.
print("Average for testing dataset: ", np.mean(scores_testing, axis=0))

Posture - Accuracy: 0.9818977521384523
BicepCurls - Accuracy: 0.9364608076009501
SideLunges - Accuracy: 0.8609668397922493
Sitting - Accuracy: 0.9841784989858012
Squats - Accuracy: 0.8677669516802545
Standing - Accuracy: 0.9658865529551766
Average for testing dataset:  0.9328595671921475


In [86]:
# Function abstraction: the same training routine with a different estimator.
# NOTE: this call writes 'actions_model.pkl' again, replacing the pipeline saved by
# the earlier RandomForestClassifier run.
train_new_model(r'C:\Users\conno\Desktop\Pastebles\data', DecisionTreeClassifier())

Model: DecisionTreeClassifier() 
Accuracy: 0.8654581673306773 
Recall: 0.8827407774427171 
Precision: 0.880705116506292


In [18]:
# Manually testing individual points from the test split

index = 102
# Single test row reshaped to (1, n_features), the 2-D shape predict() expects
x_pred = X_test[index].reshape(1, -1)
# Ground-truth class for that row as a 1-element array (inverse_transform needs an array)
y_actual = y_test[index].ravel()

out = clf_rfc.predict(x_pred)

print("Predicted:", le.inverse_transform(out), "\nActual:",le.inverse_transform(y_actual))

Predicted: ['Squats'] 
Actual: ['SideLunges']
