In [1]:
# Do the necessary imports

import time
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print('setup complete')

setup complete


In [2]:
df = pd.read_csv('fe_still.csv')

In [4]:
df.target = df.target.replace({'Still':0, 'Walking':1, 'Car':2, 'Bus':3, 'Train':4})

In [6]:
df.speed_mean.nunique()

5

In [22]:
df = df.drop(['speed_mean'], axis = 1)

In [23]:
df.head()

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.050413,0.056351,9.758895,51.199707,,0,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,0.999981,0.999999,7.707437,82.40989,89.065143,2,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.610456,0.610456,9.804817,55.501802,,0,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,0.998112,0.998112,7.659674,95.664309,87.470377,2,0.036372,21.848949
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,0.322242,0.378193,8.965621,156.795909,89.770732,2,0.059035,24.343749


In [24]:
print(len(df))
df.nunique()

5893


id                                                5893
user                                                13
android_sensor_gyroscope_mean                     5018
android_sensor_accelerometer_std                  5861
android_sensor_gyroscope_std                      4926
android_sensor_gyroscope_uncalibrated_mean        5000
android_sensor_accelerometer_max                  5838
android_sensor_linear_acceleration_mean           4972
android_sensor_rotation_vector_mean               5002
android_sensor_rotation_vector_max                4991
android_sensor_accelerometer_min                  5833
android_sensor_magnetic_field_uncalibrated_min    4707
sound_min                                         2306
target                                               5
acc_gyro                                          5124
Lin_speed                                         4972
dtype: int64

### We need to separate users from train and test. (we need atleast 1180 rows of data thats not in train)

#### Drop few users data such that the train data is 80% of original data

In [25]:
df['user'].unique().tolist()

['Luca',
 'andrea',
 'Federica',
 'michelangelo',
 'Damiano',
 'Claudio',
 'Vincenzo',
 'Serena',
 'Pierpaolo',
 'IvanHeibi',
 'AndreaCarpineti',
 'Elena',
 'Riccardo']

In [26]:
#TRAIN DATA

#drop these users from train data so that the model will never see them in test

luca_index = df[df['user'] == 'Luca'].index
Ivan_index = df[df['user'] == 'IvanHeibi'].index
Pier_index = df[df['user'] == 'Pierpaolo'].index
Riccardo_index = df[df['user'] == 'Riccardo'].index

# Delete these row indexes from dataFrame
train_df = df.drop(luca_index)
train_df = train_df.drop(Ivan_index)
train_df = train_df.drop(Pier_index)
train_df = train_df.drop(Riccardo_index)

print(len(train_df))
train_df.head()

4454


Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,0.322242,0.378193,8.965621,156.795909,89.770732,2,0.059035,24.343749
5,13480,Federica,,0.103857,,,9.520044,,,,9.149029,,57.361127,2,,
6,18786,michelangelo,0.0041,0.013489,0.001801,0.042991,9.816197,0.027441,0.201198,0.201318,9.758751,170.521377,65.158369,4,0.000825,0.952804
8,343,andrea,0.041554,0.255053,0.030074,0.065754,10.027802,0.278997,0.780795,0.786845,8.996134,134.611517,89.808441,3,0.032445,1.49781
9,4650,andrea,0.037451,0.267791,0.021391,0.030491,10.068528,0.255172,0.968082,0.968806,9.014786,140.174223,89.815738,4,0.036255,8.860111


In [47]:
for name, group in train_df.groupby('user'):
    print(name,len(group))

AndreaCarpineti 227
Claudio 90
Damiano 531
Elena 79
Federica 214
Serena 167
Vincenzo 279
andrea 2470
michelangelo 397


In [48]:
for name, group in test_df.groupby('user'):
    print(name,len(group))

IvanHeibi 271
Luca 820
Pierpaolo 331
Riccardo 17


In [44]:
#split the features and target in train data
print(train_df.user.unique())
print(test_df.user.unique())

X = train_df.drop(['target', 'id','user'], axis = 1)
y = train_df.target

print(len(X))
print(len(y))

['andrea' 'Federica' 'michelangelo' 'Damiano' 'Claudio' 'Vincenzo'
 'Serena' 'AndreaCarpineti' 'Elena']
['Luca' 'Pierpaolo' 'IvanHeibi' 'Riccardo']
4454
4454


In [28]:
X.head()

Unnamed: 0,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,acc_gyro,Lin_speed
4,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,0.322242,0.378193,8.965621,156.795909,89.770732,0.059035,24.343749
5,,0.103857,,,9.520044,,,,9.149029,,57.361127,,
6,0.0041,0.013489,0.001801,0.042991,9.816197,0.027441,0.201198,0.201318,9.758751,170.521377,65.158369,0.000825,0.952804
8,0.041554,0.255053,0.030074,0.065754,10.027802,0.278997,0.780795,0.786845,8.996134,134.611517,89.808441,0.032445,1.49781
9,0.037451,0.267791,0.021391,0.030491,10.068528,0.255172,0.968082,0.968806,9.014786,140.174223,89.815738,0.036255,8.860111


In [29]:
#TEST DATA

andrea_index = df[df['user'] == 'andrea'].index
federica_index = df[df['user'] == 'Federica'].index
michel_index = df[df['user'] == 'michelangelo'].index
damiano_index = df[df['user'] == 'Damiano'].index
claudio_index = df[df['user'] == 'Claudio'].index
vincenzo_index = df[df['user'] == 'Vincenzo'].index
serena_index = df[df['user'] == 'Serena'].index
andreacarpi_index = df[df['user'] == 'AndreaCarpineti'].index
elena_index = df[df['user'] == 'Elena'].index


# Delete these row indexes from dataFrame
test_df = df.drop(andrea_index)
test_df = test_df.drop(federica_index)
test_df = test_df.drop(michel_index)
test_df = test_df.drop(damiano_index)
test_df = test_df.drop(claudio_index)
test_df = test_df.drop(vincenzo_index)
test_df = test_df.drop(serena_index)
test_df = test_df.drop(andreacarpi_index)
test_df = test_df.drop(elena_index)


print(len(test_df))
test_df.head()

1439


Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.050413,0.056351,9.758895,51.199707,,0,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,0.999981,0.999999,7.707437,82.40989,89.065143,2,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.610456,0.610456,9.804817,55.501802,,0,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,0.998112,0.998112,7.659674,95.664309,87.470377,2,0.036372,21.848949
7,15268,Luca,0.140902,1.044684,0.119667,0.142145,14.202603,1.00352,0.929187,0.941615,6.431646,58.646911,,3,0.130924,5.38745


In [30]:
#split the features and target in test data


X_test = test_df.drop(['target', 'id','user'], axis = 1)
y_test = test_df.target

print(len(X_test))
print(len(y_test))

1439
1439


In [31]:
X_test.head()

Unnamed: 0,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,acc_gyro,Lin_speed
0,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.050413,0.056351,9.758895,51.199707,,8.3e-05,0.003784
1,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,0.999981,0.999999,7.707437,82.40989,89.065143,0.036325,12.906844
2,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.610456,0.610456,9.804817,55.501802,,0.000931,0.006153
3,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,0.998112,0.998112,7.659674,95.664309,87.470377,0.036372,21.848949
7,0.140902,1.044684,0.119667,0.142145,14.202603,1.00352,0.929187,0.941615,6.431646,58.646911,,0.130924,5.38745


In [41]:
print(X.nunique())
print(y.nunique())

android_sensor_gyroscope_mean                     3586
android_sensor_accelerometer_std                  4428
android_sensor_gyroscope_std                      3499
android_sensor_gyroscope_uncalibrated_mean        3567
android_sensor_accelerometer_max                  4405
android_sensor_linear_acceleration_mean           3613
android_sensor_rotation_vector_mean               3596
android_sensor_rotation_vector_max                3593
android_sensor_accelerometer_min                  4400
android_sensor_magnetic_field_uncalibrated_min    3366
sound_min                                         1942
acc_gyro                                          3692
Lin_speed                                         3613
dtype: int64
5


In [35]:
#numerical columns in the train data

num_vars  = list(X.columns)
num_vars

['android_sensor_gyroscope_mean',
 'android_sensor_accelerometer_std',
 'android_sensor_gyroscope_std',
 'android_sensor_gyroscope_uncalibrated_mean',
 'android_sensor_accelerometer_max',
 'android_sensor_linear_acceleration_mean',
 'android_sensor_rotation_vector_mean',
 'android_sensor_rotation_vector_max',
 'android_sensor_accelerometer_min',
 'android_sensor_magnetic_field_uncalibrated_min',
 'sound_min',
 'acc_gyro',
 'Lin_speed']

In [36]:
# Pipeline for Tree models

num_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(missing_values=np.nan ,strategy='mean')),
  ('Normalizer',RobustScaler())
  
])


tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro

In [37]:
#import the tress models

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
#from xgboost               import XGBClassifier
#from lightgbm              import LGBMClassifier
#from catboost              import CatBoostClassifier

In [38]:
#create a dict with the models that we want to train on

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":ExtraTreesClassifier(),
  "Random Forest":RandomForestClassifier(),
  "AdaBoost":AdaBoostClassifier(),
  "Skl GBM":GradientBoostingClassifier(),
  "Skl HistGBM":GradientBoostingClassifier()
  #"XGBoost":XGBClassifier(),
  #"LightGBM":LGBMClassifier(),
  #"CatBoost":CatBoostClassifier()
}

In [39]:
#pass the pipeline to everymodel

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
#tree_classifiers

In [40]:
#fit the data to the model

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    stratify = y,   
    random_state=37
)

for model_name, model in tree_classifiers.items():

    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
    
    pred = model.predict(X_test)  
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

    
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Random Forest,67.685893,65.140432,1.226024
2,Decision Tree,65.948575,66.322481,0.064695
3,Skl GBM,55.872133,59.475851,10.447299
4,Skl HistGBM,55.872133,59.143131,10.819283
5,Extra Trees,53.99583,53.113378,0.653138
6,AdaBoost,52.883947,48.087626,0.54919


In [46]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, pred))

[[ 19 288   1   0   0]
 [ 20 432   9   3   0]
 [  0   0 130   2  13]
 [  0   9 275  47   1]
 [  0   2  12   0 176]]


In [None]:
import pickle 

# save the model to disk
filename = 'final_model_v1.sav'
pickle.dump(model, open(filename, 'wb'))
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)