In [172]:
# Do the necessary imports

import time
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print('setup complete')

setup complete


In [173]:
df = pd.read_csv('feature_eng.csv')

In [174]:
df.head()

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.180379,0.050413,0.056351,9.758895,51.199707,,Still,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,14.679876,0.999981,0.999999,7.707437,82.40989,89.065143,Car,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.180379,0.610456,0.610456,9.804817,55.501802,,Still,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,14.679876,0.998112,0.998112,7.659674,95.664309,87.470377,Car,0.036372,21.848949
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,14.679876,0.322242,0.378193,8.965621,156.795909,89.770732,Car,0.059035,24.343749


In [175]:
print(len(df))
df.nunique()

5893


id                                                5893
user                                                13
android_sensor_gyroscope_mean                     5018
android_sensor_accelerometer_std                  5861
android_sensor_gyroscope_std                      4926
android_sensor_gyroscope_uncalibrated_mean        5000
android_sensor_accelerometer_max                  5838
android_sensor_linear_acceleration_mean           4972
speed_mean                                           5
android_sensor_rotation_vector_mean               5002
android_sensor_rotation_vector_max                4991
android_sensor_accelerometer_min                  5833
android_sensor_magnetic_field_uncalibrated_min    4707
sound_min                                         2306
target                                               5
acc_gyro                                          5124
Lin_speed                                         4972
dtype: int64

### We need to separate users from train and test. (we need atleast 1180 rows of data thats not in train)

#### Drop few users data such that the train data is 80% of original data

In [176]:
df['user'].unique().tolist()

['Luca',
 'andrea',
 'Federica',
 'michelangelo',
 'Damiano',
 'Claudio',
 'Vincenzo',
 'Serena',
 'Pierpaolo',
 'IvanHeibi',
 'AndreaCarpineti',
 'Elena',
 'Riccardo']

In [177]:
#TRAIN DATA

#drop these users from train data so that the model will never see them in test

luca_index = df[df['user'] == 'Luca'].index
Ivan_index = df[df['user'] == 'IvanHeibi'].index
Pier_index = df[df['user'] == 'Pierpaolo'].index
Riccardo_index = df[df['user'] == 'Riccardo'].index

# Delete these row indexes from dataFrame
train_df = df.drop(luca_index)
train_df = train_df.drop(Ivan_index)
train_df = train_df.drop(Pier_index)
train_df = train_df.drop(Riccardo_index)

print(len(train_df))
train_df.head()

4454


Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,14.679876,0.322242,0.378193,8.965621,156.795909,89.770732,Car,0.059035,24.343749
5,13480,Federica,,0.103857,,,9.520044,,14.679876,,,9.149029,,57.361127,Car,,
6,18786,michelangelo,0.0041,0.013489,0.001801,0.042991,9.816197,0.027441,34.722137,0.201198,0.201318,9.758751,170.521377,65.158369,Train,0.000825,0.952804
8,343,andrea,0.041554,0.255053,0.030074,0.065754,10.027802,0.278997,5.368554,0.780795,0.786845,8.996134,134.611517,89.808441,Bus,0.032445,1.49781
9,4650,andrea,0.037451,0.267791,0.021391,0.030491,10.068528,0.255172,34.722137,0.968082,0.968806,9.014786,140.174223,89.815738,Train,0.036255,8.860111


In [178]:
#TEST DATA

andrea_index = df[df['user'] == 'andrea'].index
federica_index = df[df['user'] == 'Federica'].index
michel_index = df[df['user'] == 'michelangelo'].index
damiano_index = df[df['user'] == 'Damiano'].index
claudio_index = df[df['user'] == 'Claudio'].index
vincenzo_index = df[df['user'] == 'Vincenzo'].index
serena_index = df[df['user'] == 'Serena'].index
andreacarpi_index = df[df['user'] == 'AndreaCarpineti'].index
elena_index = df[df['user'] == 'Elena'].index


# Delete these row indexes from dataFrame
test_df = df.drop(andrea_index)
test_df = test_df.drop(federica_index)
test_df = test_df.drop(michel_index)
test_df = test_df.drop(damiano_index)
test_df = test_df.drop(claudio_index)
test_df = test_df.drop(vincenzo_index)
test_df = test_df.drop(serena_index)
test_df = test_df.drop(andreacarpi_index)
test_df = test_df.drop(elena_index)


print(len(test_df))
test_df.head()

1439


Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.180379,0.050413,0.056351,9.758895,51.199707,,Still,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,14.679876,0.999981,0.999999,7.707437,82.40989,89.065143,Car,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.180379,0.610456,0.610456,9.804817,55.501802,,Still,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,14.679876,0.998112,0.998112,7.659674,95.664309,87.470377,Car,0.036372,21.848949
7,15268,Luca,0.140902,1.044684,0.119667,0.142145,14.202603,1.00352,5.368554,0.929187,0.941615,6.431646,58.646911,,Bus,0.130924,5.38745


In [179]:
print(train_df.user.nunique())
print(test_df.user.nunique())

9
4


In [180]:
#split the features and target in train data

train_df.target = train_df.target.replace({'Still':0, 'Walking':1, 'Car':2, 'Bus':3, 'Train':4})

X = train_df.drop(['target', 'id','user'], axis = 1)
y = train_df.target
#X_train.head()

In [181]:
print(len(X_train))
print(len(y_train))

4454
3563


In [182]:
#split the features and target in test data

test_df.target = test_df.target.replace({'Still':0, 'Walking':1, 'Car':2, 'Bus':3, 'Train':4})

X_test = test_df.drop(['target', 'id','user'], axis = 1)
y_test = test_df.target

In [183]:
#numerical columns in the train data

num_vars  = list(X_train.columns)
num_vars

['android_sensor_gyroscope_mean',
 'android_sensor_accelerometer_std',
 'android_sensor_gyroscope_std',
 'android_sensor_gyroscope_uncalibrated_mean',
 'android_sensor_accelerometer_max',
 'android_sensor_linear_acceleration_mean',
 'speed_mean',
 'android_sensor_rotation_vector_mean',
 'android_sensor_rotation_vector_max',
 'android_sensor_accelerometer_min',
 'android_sensor_magnetic_field_uncalibrated_min',
 'sound_min',
 'acc_gyro',
 'Lin_speed']

In [184]:
# Pipeline for Tree models

num_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(missing_values=np.nan ,strategy='mean')),
  ('Normalizer',RobustScaler())
  
])


tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro

In [185]:
#import the tress models

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
#from xgboost               import XGBClassifier
#from lightgbm              import LGBMClassifier
#from catboost              import CatBoostClassifier

In [186]:
#create a dict with the models that we want to train on

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":ExtraTreesClassifier(),
  "Random Forest":RandomForestClassifier(),
  "AdaBoost":AdaBoostClassifier(),
  "Skl GBM":GradientBoostingClassifier(),
  "Skl HistGBM":GradientBoostingClassifier()
  #"XGBoost":XGBClassifier(),
  #"LightGBM":LGBMClassifier(),
  #"CatBoost":CatBoostClassifier()
}

In [187]:
#pass the pipeline to everymodel

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [203]:
X

Unnamed: 0,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,acc_gyro,Lin_speed
4,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,14.679876,0.322242,0.378193,8.965621,156.795909,89.770732,0.059035,24.343749
5,,0.103857,,,9.520044,,14.679876,,,9.149029,,57.361127,,
6,0.004100,0.013489,0.001801,0.042991,9.816197,0.027441,34.722137,0.201198,0.201318,9.758751,170.521377,65.158369,0.000825,0.952804
8,0.041554,0.255053,0.030074,0.065754,10.027802,0.278997,5.368554,0.780795,0.786845,8.996134,134.611517,89.808441,0.032445,1.497810
9,0.037451,0.267791,0.021391,0.030491,10.068528,0.255172,34.722137,0.968082,0.968806,9.014786,140.174223,89.815738,0.036255,8.860111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5885,0.071227,0.591102,0.066071,0.041725,10.912657,0.573006,14.679876,0.582695,0.585233,8.359974,69.566084,79.070362,0.041503,8.411653
5886,0.019471,0.080109,0.012894,0.044579,10.275756,0.107402,34.722137,0.988151,0.988755,10.010278,145.476903,89.744465,0.019240,3.729221
5887,0.100751,0.253208,0.012566,0.119229,10.113405,0.722498,34.722137,0.882573,0.882723,9.218854,187.269949,82.684818,0.088920,25.086663
5891,0.003652,0.013485,0.001301,0.048900,9.874450,0.030764,0.180379,0.276348,0.276354,9.807347,270.328540,,0.001009,0.005549


In [189]:
y.head()

4    2
5    2
6    4
8    3
9    4
Name: target, dtype: int64

In [190]:
len(X)

4454

In [191]:
len(y)

4454

In [192]:
#fit the data to the model

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    stratify = y,   
    random_state=37
)

for model_name, model in tree_classifiers.items():

    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
    
    pred = model.predict(X_test)  
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Decision Tree,100.0,100.0,0.056041
2,Skl GBM,100.0,100.0,12.852138
3,Skl HistGBM,100.0,100.0,11.55636
4,Random Forest,99.652536,99.728376,1.030408
5,Extra Trees,99.374566,99.551268,0.409085
6,AdaBoost,44.683808,60.0,0.802875


In [None]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, pred))

In [193]:
X.isna().sum()

android_sensor_gyroscope_mean                     720
android_sensor_accelerometer_std                    0
android_sensor_gyroscope_std                      721
android_sensor_gyroscope_uncalibrated_mean        799
android_sensor_accelerometer_max                    0
android_sensor_linear_acceleration_mean           494
speed_mean                                          0
android_sensor_rotation_vector_mean               721
android_sensor_rotation_vector_max                721
android_sensor_accelerometer_min                    0
android_sensor_magnetic_field_uncalibrated_min    799
sound_min                                         348
acc_gyro                                          721
Lin_speed                                         494
dtype: int64