In [3]:
import numpy as np
import pandas as pd
import sys
import os

In [4]:
# Info of original dataset
FilePath = os.path.join(os.getcwd(), 'Windows_Data/dataset.csv')
dataset = pd.read_csv(FilePath)
print(dataset.describe())
fall_set = dataset.loc[dataset['label'] == 1]
normal_set = dataset.loc[dataset['label'] == 0]
print('Number of fall-samples: {}'.format(fall_set.shape[0]))
print('Number of normal-samples: {}'.format(normal_set.shape[0]))
print('Falls / Normal = {}'.format(fall_set.shape[0] / normal_set.shape[0]))

            max_smv       min_smv      duration         label
count  56917.000000  56917.000000  56917.000000  56917.000000
mean      17.747558      5.445815     -7.426305      0.032855
std        6.408102      3.562756    347.463460      0.178258
min        9.547117      0.030987   -995.000000      0.000000
25%       10.075159      2.505617   -205.000000      0.000000
50%       19.697805      4.566719    -14.000000      0.000000
75%       23.092690      9.627163    181.000000      0.000000
max       32.885551     10.015043    996.000000      1.000000
Number of fall-samples: 1870
Number of normal-samples: 55047
Falls / Normal = 0.0339709702617763


In [5]:
dataset_no_dups = dataset.drop_duplicates(subset=None, keep=False)
print(dataset_no_dups.describe())
fall_set_np = dataset_no_dups.loc[dataset['label'] == 1]
normal_set_np = dataset_no_dups.loc[dataset['label'] == 0]
print('Number of fall-samples (without duplicates): {}'.format(fall_set_np.shape[0]))
print('Number of normal-samples (without duplicates): {}'.format(normal_set_np.shape[0]))
print('Falls / Normal = {} (without duplicates)'.format(fall_set_np.shape[0] / normal_set_np.shape[0]))

            max_smv       min_smv      duration         label
count  18071.000000  18071.000000  18071.000000  18071.000000
mean      17.107593      5.734681    -11.069780      0.007028
std        6.298308      3.560391    446.908952      0.083539
min        9.547117      0.030987   -995.000000      0.000000
25%       10.026761      2.760565   -270.000000      0.000000
50%       18.882865      5.023431    -14.000000      0.000000
75%       22.572992      9.689584    242.000000      0.000000
max       32.505287     10.014622    996.000000      1.000000
Number of fall-samples (without duplicates): 127
Number of normal-samples (without duplicates): 17944
Falls / Normal = 0.0070775746767721805 (without duplicates)


In [6]:
fall_set_np.describe()

Unnamed: 0,max_smv,min_smv,duration,label
count,127.0,127.0,127.0,127.0
mean,25.221958,4.570352,0.031496,1.0
std,2.918982,1.621344,90.295453,0.0
min,17.128696,0.838504,-262.0,1.0
25%,23.068876,3.245285,-48.5,1.0
50%,25.201343,4.558423,17.0,1.0
75%,27.438482,5.794097,49.0,1.0
max,32.505287,7.880321,432.0,1.0


In [7]:
normal_set_np.describe()

Unnamed: 0,max_smv,min_smv,duration,label
count,17944.0,17944.0,17944.0,17944.0
mean,17.050163,5.742922,-11.14835,0.0
std,6.278558,3.569031,448.422953,0.0
min,9.547117,0.030987,-995.0,0.0
25%,10.025897,2.751745,-273.0,0.0
50%,18.818034,5.034214,-14.0,0.0
75%,22.518249,9.6927,245.0,0.0
max,32.430852,10.014622,996.0,0.0


In [43]:
TrnSet = dataset_no_dups.drop(dataset_no_dups[(dataset_no_dups.label == 0) & (dataset_no_dups.max_smv < 17)].index)
print(TrnSet.loc[dataset['label'] == 1].describe())
print(TrnSet.loc[dataset['label'] == 0].describe())
X = TrnSet[['max_smv', 'min_smv', 'duration']].values
y = TrnSet[['label']].values

          max_smv     min_smv    duration  label
count  127.000000  127.000000  127.000000  127.0
mean    25.221958    4.570352    0.031496    1.0
std      2.918982    1.621344   90.295453    0.0
min     17.128696    0.838504 -262.000000    1.0
25%     23.068876    3.245285  -48.500000    1.0
50%     25.201343    4.558423   17.000000    1.0
75%     27.438482    5.794097   49.000000    1.0
max     32.505287    7.880321  432.000000    1.0
            max_smv       min_smv      duration    label
count  10013.000000  10013.000000  10013.000000  10013.0
mean      22.275489      2.864994    -17.926396      0.0
std        2.709789      1.802601    477.441668      0.0
min       17.000608      0.030987   -993.000000      0.0
25%       20.264987      1.092221   -365.000000      0.0
50%       22.135603      3.060025    -20.000000      0.0
75%       24.083629      4.234462    293.000000      0.0
max       32.430852      9.237574    989.000000      0.0


In [9]:
# 5-fold cross validation (GridsearchCV) & apply undersampling and oversampling on each fold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [19]:
def imb_pipeline(clf_name, clf, X, y, params):
    
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    
    if clf_name == 'LinearSVC':
        normalize = StandardScaler()
        steps = [
        ('normalization', normalize),
        ('oversampling', over),
        ('undersampling', under),
        ('classification', clf)]
    else:
        steps = [
        ('oversampling', over),
        ('undersampling', under),
        ('classification', clf)]
    
    model = Pipeline(steps=steps)
    
    score={'AUC':'roc_auc', 
           'RECALL':'recall',
           'PRECISION':'precision',
           'F1':'f1'}
    
    gcv = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring=score, n_jobs=12, refit='F1',
                       return_train_score=True)
    gcv.fit(X, y)

    return gcv



In [29]:
setting_LogisticRegression = {'classification__penalty': ('l1', 'l2'), 
                              'classification__C': (0.001, 0.01, 0.1, 1.0, 10, 100)}

setting_SVC = {'classification__C': [0.01, 0.1, 1, 10, 100], 
               'classification__gamma': [1e-1, 1e-2, 1e-3, 1e-4]}

setting_RandomForest = {'classification__n_estimators': (100, 100, 200, 300, 400, 500), 
                        'classification__max_depth': (10, 20, 30, 40, 50, 60, 70, 80, None),
                        'classification__min_samples_leaf':[1, 2, 4],
                        'classification__min_samples_split':[2, 5, 10]}

params = [setting_LogisticRegression, setting_SVC, setting_RandomForest]

In [12]:
classifiers = [
 ('Logistic Regression', LogisticRegression(random_state=0)),
 ('SVC', SVC(random_state=0)),
 ('Random Forst',RandomForestClassifier(bootstrap=True,random_state=0))]


In [42]:
for param, classifier in zip(params, classifiers):
    print("Working on {}...".format(classifier[0]))
    clf = imb_pipeline(classifier[0], classifier[1], X, y, param) 
    print("Best parameter for {} is {}".format(classifier[0], clf.best_params_))
    print("Best `F1` for {} is {}".format(classifier[0], clf.best_score_))
    print('-'*100)
    print('\n')

Working on Logistic Regression...


        nan 0.89503283        nan 0.89071903        nan 0.89401569]
        nan 0.90127083        nan 0.90124749        nan 0.9012374 ]
        nan 0.66215385        nan 0.65415385        nan 0.63076923]
        nan 0.66334692        nan 0.66338575        nan 0.65542613]
        nan 0.08795939        nan 0.08184611        nan 0.08564773]
        nan 0.08674725        nan 0.08702412        nan 0.08799876]
        nan 0.15205213        nan 0.14357098        nan 0.1478634 ]
        nan 0.15335097        nan 0.15377182        nan 0.15501858]


Best parameter for Logistic Regression is {'classification__C': 0.001, 'classification__penalty': 'l2'}
Best `F1` for Logistic Regression is 0.19409962425395033
----------------------------------------------------------------------------------------------------


Working on SVC...
Best parameter for SVC is {'classification__C': 100, 'classification__gamma': 0.0001}
Best `F1` for SVC is 0.23963170898556063
----------------------------------------------------------------------------------------------------


Working on Random Forst...
Best parameter for Random Forst is {'classification__max_depth': 60, 'classification__min_samples_leaf': 2, 'classification__min_samples_split': 5, 'classification__n_estimators': 100}
Best `F1` for Random Forst is 0.334628873535307
----------------------------------------------------------------------------------------------------


