## Start with imports and loading the data

In [24]:
# do imports here
import time
import numpy    as np
import pandas   as pd

from sklearn import pipeline
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn import impute
from sklearn import compose
from sklearn import set_config
set_config(display='diagram') # Useful to display the pipeline
from sklearn.preprocessing import RobustScaler
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix

from sklearn.preprocessing import MinMaxScaler,Normalizer,PowerTransformer,QuantileTransformer, RobustScaler,StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.tree          import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression    
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix,roc_auc_score
from sklearn import set_config
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report,confusion_matrix
import sklearn

# Report the library versions this notebook was executed with.
for label, lib in (("Numpy  ", np), ("Pandas  ", pd), ("Sklearn ", sklearn)):  # Sklearn: try to use 0.24
    print(label, lib.__version__)

print('setup complete')

Numpy   1.19.5
Pandas   1.2.4
Sklearn  0.24.2
setup complete


In [2]:
# Read the engineered-feature table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='feature_eng.csv')
data.head(n=5)

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.180379,0.050413,0.056351,9.758895,51.199707,,Still,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,14.679876,0.999981,0.999999,7.707437,82.40989,89.065143,Car,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.180379,0.610456,0.610456,9.804817,55.501802,,Still,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,14.679876,0.998112,0.998112,7.659674,95.664309,87.470377,Car,0.036372,21.848949
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,14.679876,0.322242,0.378193,8.965621,156.795909,89.770732,Car,0.059035,24.343749


## Separate data into Train and Test based on users

There are 13 users in the data, and each user contributes many rows. If the same users appear in both train and test, the model can pick up user-specific patterns, and the test score will overstate how well it generalizes. Hence, split the data so that the test set contains only users the model has never seen before.

In [3]:
# Build the test set from all rows of four held-out users.
df = data.copy()

held_out_users = ['Pierpaolo', 'IvanHeibi', 'AndreaCarpineti', 'Elena']

# One slice per user, concatenated in the order listed above.
df_test = pd.concat([df[df['user'] == user_name] for user_name in held_out_users])

# Show which users ended up in the test set.
df_test['user'].unique()

array(['Pierpaolo', 'IvanHeibi', 'AndreaCarpineti', 'Elena'], dtype=object)

In [4]:
# Create the train data: every row whose user is NOT one of the test users.
#
# The split is defined by the `user` column, so filter on it directly.
# The previous approach (left-merging `df` with `df_test` on all columns and
# keeping the `left_only` rows) only worked as long as every column,
# including floats and NaNs, compared exactly equal between the two frames.
test_user_names = df_test['user'].unique()
df_train = df[~df['user'].isin(test_user_names)].copy()

# Sanity checks: train and test partition the data and share no user.
assert len(df_train) + len(df_test) == len(df), "train/test do not partition the data"
assert not set(df_train['user']) & set(df_test['user']), "a user appears in both train and test"

# Show which users ended up in the train set.
df_train.user.unique()

array(['Luca', 'andrea', 'Federica', 'michelangelo', 'Damiano', 'Claudio',
       'Vincenzo', 'Serena', 'Riccardo'], dtype=object)

## Drop the columns which are not useful

In [5]:
# Preview the first rows of the training frame before any column is dropped.
df_train.head()

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.180379,0.050413,0.056351,9.758895,51.199707,,Still,8.3e-05,0.003784
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,14.679876,0.999981,0.999999,7.707437,82.40989,89.065143,Car,0.036325,12.906844
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.180379,0.610456,0.610456,9.804817,55.501802,,Still,0.000931,0.006153
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,14.679876,0.998112,0.998112,7.659674,95.664309,87.470377,Car,0.036372,21.848949
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,14.679876,0.322242,0.378193,8.965621,156.795909,89.770732,Car,0.059035,24.343749


In [6]:
# Drop the columns that should not be used as features:
#   - id, user   : not useful for the model
#   - speed_mean : it only contains 5 unique values, just like the target,
#                  and it is leaking the label
#
# errors='ignore' makes this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising a KeyError.
unimp = ['id', 'user']
leaky = ['speed_mean']
to_drop = unimp + leaky

df_train = df_train.drop(columns=to_drop, errors='ignore')
df_test = df_test.drop(columns=to_drop, errors='ignore')

In [7]:
# Preview the first rows of the test frame after the column drop.
df_test.head()

Unnamed: 0,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target,acc_gyro,Lin_speed
33,0.615399,1.283204,0.655962,0.778458,14.028126,1.721314,0.955202,0.955202,9.764513,36.378866,77.064227,Walking,0.58783,1.928226
54,0.447802,0.361961,0.116564,0.334574,9.963264,0.9582,0.983561,0.989576,8.712177,77.496043,74.178611,Bus,0.44044,5.144149
102,0.069503,0.070605,0.04814,0.078626,9.18734,0.868719,0.78757,0.802811,9.012243,91.951478,6.0206,Walking,0.054738,0.973144
121,0.045041,0.451773,0.027367,0.038644,9.563967,0.958079,0.993876,0.996117,8.24622,70.436446,72.5596,Bus,0.044765,5.143499
172,1.734294,4.272022,1.400249,1.331566,17.743662,8.823918,0.985658,0.993053,3.332163,90.118737,73.702289,Walking,1.709421,9.884602


## Separate the data into features(X) and labels(y)

In [8]:
# Training users: feature matrix and label vector.
X = df_train.drop(columns='target')
y = df_train['target']

# Held-out users: same separation into features and labels.
X_test = df_test.drop(columns='target')
y_test = df_test['target']

In [9]:
# Print "<features shape> <labels shape>" for the train and test data, one line each.
for features, labels in ((X, y), (X_test, y_test)):
    print(features.shape, labels.shape)

(4985, 13) (4985,)
(908, 13) (908,)


## Creating the Pipeline

In [15]:
# Every feature column is numerical (we don't have any categorical features).
num_vars = X.columns.tolist()

In [33]:
# Preprocessing for the numerical features: fill missing values with the
# column median, then min-max scale.
num_attribs = df_train.drop(columns='target').columns.to_list()

num_pip = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scalar', MinMaxScaler()),
])

# Apply the numerical pipeline to the columns listed in num_vars.
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_pip, num_vars),
])

# Display the transformer.
tree_prepro

In [21]:
# Bare estimators keyed by display name (only the best classifier is kept).
base_models = {"LGBMClassifier": LGBMClassifier()}

# Put the shared preprocessing step in front of every model.
tree_classifiers = {
    label: pipeline.make_pipeline(tree_prepro, estimator)
    for label, estimator in base_models.items()
}

# Display the resulting pipeline.
tree_classifiers["LGBMClassifier"]

In [26]:
# Fit every pipeline on a stratified 80/20 train/validation split and
# tabulate validation accuracy, balanced accuracy (both in percent) and
# fit time (in seconds).

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=37
)

# Collect one record per model and build the DataFrame once at the end.
# Growing a frame with DataFrame.append inside a loop copies it on every
# iteration, and DataFrame.append is deprecated (removed in pandas 2.0).
records = []

for model_name, model in tree_classifiers.items():

    # perf_counter is a monotonic clock, so the elapsed time cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(x_train, y_train)
    total_time = time.perf_counter() - start_time

    pred = model.predict(x_val)

    records.append({"Model":    model_name,
                    "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                    "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                    "Time":     total_time})

# Explicit columns keep the layout stable even if no model was evaluated.
results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

# Best model first, ranked from 1.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

KeyboardInterrupt: 

In [30]:
# Stratified 80/20 train/validation split of the training users, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=37
)

In [29]:
# Final model: the shared preprocessing step followed by a LightGBM classifier.
model = Pipeline(steps=[
    ('preprocess', tree_prepro),
    ('classification', LGBMClassifier()),
])

In [39]:
# Train the final pipeline on the training part of the train/validation split.
model.fit(x_train,y_train)


KeyboardInterrupt: 

In [None]:
# Predict on the validation split.
# The previous code passed `x_test`, a name that is never defined in this
# notebook (the held-out frame is `X_test`). The predictions are scored
# against `y_val` here and in the report cell, so they must come from `x_val`.
pred_test_1 = model.predict(x_val)

# Validation accuracy in percent. The trailing comma was removed: it turned
# the displayed value into a 1-tuple.
accuracy_score(y_val, pred_test_1) * 100

In [38]:
# Print the per-class report, a blank line, then the confusion matrix.
print(
    classification_report(y_val, pred_test_1),
    confusion_matrix(y_val, pred_test_1),
    sep="\n\n",
)

              precision    recall  f1-score   support

         Bus       0.94      0.93      0.94       203
         Car       0.94      0.96      0.95       229
       Still       1.00      1.00      1.00       220
       Train       0.97      0.96      0.96       219
     Walking       0.99      0.99      0.99       126

    accuracy                           0.97       997
   macro avg       0.97      0.97      0.97       997
weighted avg       0.97      0.97      0.97       997


[[189   9   0   5   0]
 [  8 219   0   2   0]
 [  0   0 220   0   0]
 [  2   6   0 210   1]
 [  1   0   0   0 125]]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3e7089f3-11a5-48ad-89c4-39a166311a14' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>