# Ensemble Classifiers

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydotplus
from sklearn import tree
from IPython.display import Image
from collections import defaultdict
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Data Preparation

In [None]:
# Load the train/test design matrices and labels, the training subject ids and the
# feature-name table. Every file is whitespace-separated and has no header row.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which newer
# pandas releases deprecate, so the cell keeps working without a FutureWarning.
X_train = pd.read_csv("X_train.txt", header=None, sep=r"\s+")
y_train = pd.read_csv("y_train.txt", header=None, sep=r"\s+")
X_test = pd.read_csv("X_test.txt", header=None, sep=r"\s+")
y_test = pd.read_csv("y_test.txt", header=None, sep=r"\s+")
#subject_test = pd.read_csv("subject_test.txt", header=None, sep=r"\s+")
subject_train = pd.read_csv("subject_train.txt", header=None, sep=r"\s+")
features = pd.read_csv("features.txt", header=None, sep=r"\s+")

In [None]:
# Keep everything except the first column (label 0) of the feature table.
# The drop is done on a copy: `features` is left untouched and the cell can be
# re-run safely (the previous in-place drop through an alias raised KeyError on
# a second execution because column 0 was already gone).
feature = features.drop(columns=0)

In [None]:
# Attach the feature names as column headers of both design matrices.
# The names are read straight from the first remaining column of `feature`
# instead of transposing the table and indexing a hard-coded 561 positions,
# so the cell follows however many features the file actually lists.
lista = feature.iloc[:, 0].tolist()
X_test.columns = lista
X_train.columns = lista

In [None]:
# Remove the columns holding the mad() estimate, since it is almost equal to
# the standard deviation.
stringa = "mad()"
for frame in (X_train, X_test):
    # collect the matching headers first, then drop them from the frame in one call
    mad_columns = [name for name in frame.columns if stringa in name]
    frame.drop(columns=mad_columns, inplace=True)
#for col in features:
#    if (stringa in col):
#        features.drop(labels=col,axis=1,inplace=True)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

In [None]:
# Baseline random forest: 400 entropy trees, log2-sized feature subsets per split.
# random_state is pinned so the printed scores, the feature importances and the
# exported tree are identical on every run (the forest was previously unseeded).
clf = RandomForestClassifier(max_depth=None, min_samples_leaf= 1, min_samples_split=4,
                             n_estimators=400, max_features='log2', n_jobs=-1, criterion ='entropy',
                             random_state=0)
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Overall accuracy, per-class F1 (average=None) and the full per-class report
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

In [1]:
# Feature importance: horizontal bars for the highest-scoring features of the forest
nbr_features = 15

tree_feature_importances = clf.feature_importances_
# argsort is ascending, so the last `nbr_features` positions are the largest scores
sorted_idx = np.argsort(tree_feature_importances)[-nbr_features:]

y_ticks = np.arange(len(sorted_idx))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
ax.set_yticks(y_ticks)
ax.set_yticklabels(np.array(X_train.columns)[sorted_idx])
ax.set_title("Random Forest Feature Importances (MDI)", size=13)
plt.show()

In [None]:
# Plot tree 0 of the forest, truncated at depth 4, as a PNG rendered inline
activity_names = ["WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS", "SITTING", "STANDING", "LAYING" ]
first_tree = clf.estimators_[0]
dot_data = tree.export_graphviz(
    first_tree,
    out_file=None,
    feature_names=X_train.columns,
    class_names=activity_names,
    max_depth=4,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

### Tuning the hyper-parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the random forest grid search.
# 'auto' was dropped from max_features: for classifiers it meant exactly 'sqrt',
# so it only repeated every fit of the 'sqrt' branch, and newer scikit-learn
# releases reject the value altogether.
param_list = {'max_depth': [None],
             'min_samples_split': [2, 4, 7, 10, 15],
             'min_samples_leaf': [1, 2, 3, 5, 10],
             'criterion': ['entropy'],
             'max_features': ['sqrt', 'log2'],
             'n_estimators' : [5,20,50,100,200,300,400]
             }

In [None]:
# Exhaustive 5-fold cross-validated search over param_list, using all cores
grid_search = GridSearchCV(estimator=clf, param_grid=param_list, n_jobs=-1, cv=5)
train_labels = y_train.values.ravel()
grid_search.fit(X_train, train_labels)
# keep a handle on the winning configuration
clf_gs = grid_search.best_estimator_

In [None]:
# Retrieve the best estimator found by the grid search and display it as the cell output.
clf_gs = grid_search.best_estimator_
clf_gs

In [None]:
# Show the hyper-parameter combination selected by the random forest grid search.
grid_search.best_params_

### Roc Curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from yellowbrick import ROCAUC

In [None]:
# Binarize the labels for the ROC analysis.
# Be careful: afterwards the classes are no longer encoded the way they were before.
activity_ids = [1, 2, 3, 4, 5, 6]
y_train1 = label_binarize(y_train, classes=activity_ids)
y_test1 = label_binarize(y_test, classes=activity_ids)

In [None]:
# One-vs-rest wrapper around clf: learn to predict each class against the others
classifier = OneVsRestClassifier(clf).fit(X_train, y_train1)
# class scores first, then the hard predictions
y_pred_proba = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

In [None]:
# Compute ROC curve and ROC area for each class
CLASS_LABELS = [ "WALKING" , "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS", "SITTING", "STANDING", "LAYING"]

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(CLASS_LABELS)):
    # roc_curve needs a continuous score to sweep the decision threshold.
    # The hard 0/1 predictions used before collapse every curve to a single
    # operating point, so the predicted probabilities are used instead.
    fpr[i], tpr[i], _ = roc_curve(y_test1[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# Plot one ROC curve per class together with the chance diagonal
lw = 2
colors = cycle(["aqua", "darkorange", "cornflowerblue","navy", "deeppink", "gold"])
fig, ax = plt.subplots()
for i in range(len(CLASS_LABELS)):
    color = next(colors)
    # legend shows the 1-based class id and its area under the curve
    curve_label = "ROC curve of class {0} (area = {1:0.2f})".format(i+1, roc_auc[i])
    ax.plot(fpr[i], tpr[i], color=color, lw=lw, label=curve_label)

ax.plot([0, 1], [0, 1], "k--", lw=lw)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC curve multiclass Random Forest")
ax.legend(loc="lower right")
plt.show()

## Bagging

In [None]:
from sklearn.svm import SVC
# Base estimator con DecTree, SVC o Rand forest

In [None]:
# BaggingClassifier was used without ever being imported, so this cell failed
# with NameError on a fresh kernel; import it where it is first needed.
from sklearn.ensemble import BaggingClassifier

# Bagging of 100 decision trees
clf1 = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=0)
clf1.fit(X_train, np.ravel(y_train))

y_pred = clf1.predict(X_test)

# Overall accuracy, per-class F1 (average=None) and the full per-class report
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

In [None]:
# Bagging of 100 SVC(C=1000) base learners, trained on all cores
svc_base = SVC(C=1000)
clf = BaggingClassifier(base_estimator=svc_base, n_estimators=100, n_jobs=-1, random_state=0)
clf.fit(X_train, y_train.values.ravel())

y_pred = clf.predict(X_test)

# compute the scores first, then report them
bagging_accuracy = accuracy_score(y_test, y_pred)
bagging_f1 = f1_score(y_test, y_pred, average=None)
print('Accuracy %s' % bagging_accuracy)
print('F1-score %s' % bagging_f1)
print(classification_report(y_test, y_pred))

## Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost over 10 decision trees.
# AdaBoostClassifier has no n_jobs parameter (boosting rounds are sequential),
# so passing n_jobs=-1 raised TypeError at construction; the argument is removed.
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=0)
clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)

# Overall accuracy, per-class F1 (average=None) and the full per-class report
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

### Tuning the hyper-parameters

In [None]:
# Grid search for AdaBoost, boosting a 400-tree random forest
forest_base = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf= 1,
    max_features='log2',
)
abc = AdaBoostClassifier(base_estimator=forest_base, random_state=1)

# number of boosting rounds x learning rate
parameters = {
    'n_estimators': list(range(40, 400, 20)),
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

gs = GridSearchCV(estimator=abc, param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=5)
gs.fit(X_train, y_train.values.ravel())

In [None]:
# Report the hyper-parameter combination selected by the AdaBoost grid search.
print("Optimal hyperparameter combination:", gs.best_params_)

In [None]:
# AdaBoost with 220 rounds and learning rate 0.1 over a seeded decision tree
seeded_tree = DecisionTreeClassifier(random_state=42)
clf = AdaBoostClassifier(
    base_estimator=seeded_tree,
    n_estimators=220,
    learning_rate = 0.1,
    random_state=0,
)
clf.fit(X_train, y_train.values.ravel())

y_pred = clf.predict(X_test)

# compute the scores first, then report them
ada_accuracy = accuracy_score(y_test, y_pred)
ada_f1 = f1_score(y_test, y_pred, average=None)
print('Accuracy %s' % ada_accuracy)
print('F1-score %s' % ada_f1)
print(classification_report(y_test, y_pred))

##  Gradient Boosting

In [None]:
#FEATURE IMPORTANCES + CLASSIFICATION REPORT
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient boosting model
baseline = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
baseline.fit(X_train,np.ravel(y_train[0]))
predictors=list(X_train)
# ten largest importances, highest first
feat_imp = pd.Series(baseline.feature_importances_, predictors).sort_values(ascending=False)
feat_imp[:10].plot(kind='barh', title='Importance of Features')

# Predict with the baseline BEFORE printing any metric. Accuracy and F1 used to be
# computed from `y_pred`, which still held the previous cell's AdaBoost
# predictions, so they described a different model than the report below them.
pred=baseline.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, pred))
print('F1-score %s' % f1_score(y_test, pred, average=None))
print(classification_report(y_test, pred))

### Tuning the hyper-parameters

In [None]:
# Search space for the gradient boosting grid search.
# 'auto' was dropped from max_features: for the classifier it meant exactly
# 'sqrt', so it only repeated every fit of the 'sqrt' branch, and newer
# scikit-learn releases reject the value altogether.
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500],
          'max_features': ['sqrt', 'log2']
           }

# Estimator handed to the grid search; n_estimators is overridden by the grid
model = GradientBoostingClassifier(n_estimators=100, random_state=100)

In [None]:
# 3-fold cross-validated grid search over `params` for the gradient boosting model
model_grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, n_jobs=-1)
gb_train_labels = y_train[0].values.ravel()
model_grid_search.fit(X_train, gb_train_labels)

In [None]:
# Show the hyper-parameter combination selected by the gradient boosting grid search.
model_grid_search.best_params_

In [None]:
# Gradient boosting with 1000 depth-3 trees and log2-sized feature subsets
clf = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    max_features='log2',
    random_state=0,
)
clf.fit(X_train, y_train[0])

y_pred = clf.predict(X_test)

# compute the scores first, then report them
gb_accuracy = accuracy_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred, average=None)
print('Accuracy %s' % gb_accuracy)
print('F1-score %s' % gb_f1)
print(classification_report(y_test, y_pred))

## HistGradientBoostingClassifier


In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Histogram-based gradient boosting: estimator plus the grid explored for it
model = HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42)

param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    max_leaf_nodes=[3,10,15,30],
    max_depth=[3,6,10],
    max_bins=[50,100,150],
)
# 3-fold cross-validated search using all cores
model_grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
hgb_train_labels = y_train[0].values.ravel()
model_grid_search.fit(X_train, hgb_train_labels)

In [None]:
# Show the hyper-parameter combination selected by the histogram gradient boosting grid search.
model_grid_search.best_params_

In [None]:
# Histogram gradient boosting: 100 iterations of shallow (depth 3, 3-leaf) trees
hgb_settings = {
    'max_iter': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'max_bins': 150,
    'random_state': 0,
    'max_leaf_nodes': 3,
    'loss': 'categorical_crossentropy',
}
clf = HistGradientBoostingClassifier(**hgb_settings)
clf.fit(X_train, y_train[0].values.ravel())

y_pred = clf.predict(X_test)

# compute the scores first, then report them
hgb_accuracy = accuracy_score(y_test, y_pred)
hgb_f1 = f1_score(y_test, y_pred, average=None)
print('Accuracy %s' % hgb_accuracy)
print('F1-score %s' % hgb_f1)
print(classification_report(y_test, y_pred))

## XGBoost

In [None]:
#!pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
import xgboost as xgb
# TUNING -- note that the labels must start from 0, which is why the "new"
# column of y_train / y_test is used here.
#
# No cell of the notebook ever created that column, so on a fresh kernel this
# cell failed with KeyError: 'new'. It is derived here from the activity ids in
# column 0 (binarized with classes 1..6 in the ROC section) by shifting them
# down by one. The guard keeps the cell re-runnable and leaves an already
# existing "new" column alone.
# NOTE(review): once "new" exists, earlier cells calling np.ravel(y_train) or
# scoring against the whole y_test frame see two columns -- re-run those from a
# fresh kernel only.
if "new" not in y_train.columns:
    y_train["new"] = y_train[0] - 1
if "new" not in y_test.columns:
    y_test["new"] = y_test[0] - 1

#learning_rate [0.05, 0.1]
#max_depth= [3,6,10]
#min_child_weight = 1, 3, 6
#gamma = 0 : A smaller value like 0.1-0.2 can also be chosen for starting.
#colsample_bytree = 0.8 : This is a commonly used start value. Typical values range between 0.5-0.9.
## After setting these parameters for tuning, look at regularisation against overfitting

#reg_alpha [1e-5, 1e-2, 0.1, 1, 100] -- left at its default in our case

#objective='multi:softmax' for multiclass prediction

xgb_model = xgb.XGBClassifier(booster = 'gbtree', random_state=0, objective='multi:softmax',
                              gamma = 0.1, max_depth=3,  min_child_weight=1,
                              use_label_encoder=False, eta=0.4)

xgb_model.fit(X_train, np.ravel(y_train["new"]), eval_metric='mlogloss')
y_pred = xgb_model.predict(X_test)

#Check overfitting
#print('Training set score: {:.4f}'.format(xgb_model.score(X_train, y_train["new"])))
#print('Test set score: {:.4f}'.format(xgb_model.score(X_test, y_test["new"])))

# Metrics are computed against the same zero-based labels the model was trained on
print('Accuracy %s' % accuracy_score(y_test["new"], y_pred))
print('F1-score %s' % f1_score(y_test["new"], y_pred, average=None))
print(classification_report(y_test["new"], y_pred))

## LightGBM

In [None]:
from lightgbm import LGBMClassifier
#bisogna levare caretteri speciali dai nomi delle features altrimenti si rompe
import re
# Strip every character outside [A-Za-z0-9_] from the column names of both frames
_disallowed_chars = re.compile('[^A-Za-z0-9_]+')
X_train = X_train.rename(columns=lambda name: _disallowed_chars.sub('', name))
X_test = X_test.rename(columns=lambda name: _disallowed_chars.sub('', name))


In [None]:
# LightGBM multiclass model trained on the "new" label column
lgbm_settings = {
    'boosting_type': 'gbdt',      # alternatives: 'goss', 'dart'
    'max_depth': -1,              # no limit
    'num_leaves': 31,
    'n_estimators': 100,
    'subsample_for_bin': 200000,
    'objective': 'multiclass',
    'reg_alpha': 0.0,             # L1 regularization term on weights
    'reg_lambda': 0.0,            # L2 regularization term on weights
    'random_state': 42,
}
clf = LGBMClassifier(**lgbm_settings)
clf.fit(X_train, y_train["new"].values.ravel())

y_pred = clf.predict(X_test)

# compute the scores first, then report them
lgbm_accuracy = accuracy_score(y_test["new"], y_pred)
lgbm_f1 = f1_score(y_test["new"], y_pred, average=None)
print('Accuracy %s' % lgbm_accuracy)
print('F1-score %s' % lgbm_f1)
print(classification_report(y_test["new"], y_pred))

In [None]:
import lightgbm as lgb
# LightGBM with default settings, fitted on the label column 0
clf = LGBMClassifier()
train_target = y_train[0]
clf.fit(X_train, train_target)
y_pred = clf.predict(X_test)

### Tuning the hyper-parameters

In [None]:
#1 num_leaves : This is the main parameter to control the complexity of the tree model. Ideally, the value of num_leaves should be less than or equal to 2^(max_depth). Value more than this will result in overfitting.

#2 min_data_in_leaf : Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset.

#3 max_depth : We also can use max_depth to limit the tree depth explicitly.

#FOR FASTER SPEED
#Use bagging by setting bagging_fraction and bagging_freq.
#Use feature sub-sampling by setting feature_fraction.
#Use small max_bin.
#Use save_binary to speed up data loading in future learning.

#FOR BETTER ACCURACY
#Use large max_bin (may be slower).
#Use small learning_rate with large num_iterations
#Use large num_leaves(may cause over-fitting)
#Use bigger training data
#Try dart
#Try to use categorical feature directly

#TO DEAL WITH OVER FITTING
#Use small max_bin
#Use small num_leaves
#Use min_data_in_leaf and min_sum_hessian_in_leaf
#Use bagging by set bagging_fraction and bagging_freq
#Use feature sub-sampling by set feature_fraction
#Use bigger training data
#Try lambda_l1, lambda_l2 and min_gain_to_split to regularization
#Try max_depth to avoid growing deep tree

In [None]:
#!pip install scikit_optimize

# Hand-tuned LightGBM configuration
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    num_class=6,
    max_depth=3,
    learning_rate=0.2,
    n_estimators=150,
    max_bins=8,
    min_split_gain=0.2,
    min_child_samples=600,
    ## REDUCE OVERFITTING
    # min_data_in_leaf=[50],    # first parameter to reduce overfitting
    # colsample_bytree=[0.0],   # three more parameters to reduce overfitting
    # min_split_gain=[0.0],
    # subsample=[1],
    # reg_lambda=0.5,
    # reg_alpha=1,              # regularisation factor, again against overfitting
)
# When there is not much difference in score between train and test,
# we can adjust the max_depth and num_leaves parameter to reduce overfitting.

clf = LGBMClassifier(**params)

clf.fit(X_train, y_train[0])
y_pred = clf.predict(X_test)

# Check OVERFITTING --> if the model's accuracy differs a lot between test and train, we are overfitting
#print('Training set score: {:.4f}'.format(clf.score(X_train, y_train[0])))
#print('Test set score: {:.4f}'.format(clf.score(X_test, y_test[0])))

# compute the scores first, then report them
tuned_accuracy = accuracy_score(y_test[0], y_pred)
tuned_f1 = f1_score(y_test[0], y_pred, average=None)
print('Accuracy %s' % tuned_accuracy)
print('F1-score %s' % tuned_f1)
print(classification_report(y_test[0], y_pred))

In [None]:
# PLOT CHECK OVERFITTING (here as a function of the tree depth, but any other
# hyper-parameter could be swept the same way).
# The loop that fills `values`, `train_scores` and `test_scores` had been
# commented out while the plotting calls were still live, so the cell failed
# with NameError. The loop is restored so the plot has data to draw.
from matplotlib import pyplot

# lists collecting the train / test accuracy of each model
train_scores, test_scores = list(), list()
# tree depths to evaluate
values = [i for i in range(1, 21)]
# evaluate one LightGBM model per depth
for i in values:
    # configure and fit the model on the training dataset
    model = LGBMClassifier(max_depth=i)
    model.fit(X_train, np.ravel(y_train[0]))
    # evaluate on the train dataset
    train_yhat = model.predict(X_train)
    train_acc = accuracy_score(y_train[0], train_yhat)
    train_scores.append(train_acc)
    # evaluate on the test dataset
    test_yhat = model.predict(X_test)
    test_acc = accuracy_score(y_test[0], test_yhat)
    test_scores.append(test_acc)
    # summarize progress
    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))
# plot of train and test scores vs tree depth
pyplot.plot(values, train_scores, '-o', label='Train')
pyplot.plot(values, test_scores, '-o', label='Test')
pyplot.legend()
pyplot.show()