# <u>CLASSIFIER TESTS NOTEBOOK</u>

# Global libraries and variables

In [None]:
import pandas as pd
import numpy as np
import pyts
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix, cohen_kappa_score, roc_auc_score
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [None]:
# Module-level switches read by data_split() and fit_classifier().
# When true, label 2 is relabelled as 0 so only two classes remain.
BINARY_CLASSIFICATION = True
# When true, only the dataset ids listed in RESPONDERS are considered.
RESPONDERS_ONLY = True
RESPONDERS = [8,15,18,22,23,24]
# Passed as `animal_split`: split by whole dataset files instead of pooled rows.
ANIMAL_LEVEL_SPLIT = True
CATCH_22_FEATURES = False # if False, then default hand-crafted features are used
# Default for pycombat batch-effect correction of the training features.
BATCH = False
TEST_ON_TRAIN_DATA = False # Test classifiers on training data to check for overfitting

# Fitting classifiers

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.pipeline import Pipeline

def resampler(X, y, ratio=0.9, resampling_type = 'over'):
    """Rebalance the classes of a dataset.

    Args:
        X: Feature matrix accepted by imblearn's ``fit_resample``.
        y: Class labels aligned with ``X``.
        ratio: ``sampling_strategy`` handed to SMOTE when oversampling.
            Not used for ``'under'``; TomekLinks is built without it.
        resampling_type: ``'over'`` for SMOTE oversampling or ``'under'``
            for Tomek-links undersampling.

    Returns:
        The resampled ``(X, y)`` pair.

    Raises:
        ValueError: If ``resampling_type`` is neither ``'over'`` nor
            ``'under'``.
    """
    if resampling_type == 'over':
        # Honour the caller's ratio; it used to be hard-coded to 0.9.
        sampler = SMOTE(sampling_strategy=ratio)
    elif resampling_type == 'under':
        sampler = TomekLinks(n_jobs=-1)
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError(
            f"resampling_type must be 'over' or 'under', got {resampling_type!r}"
        )
    X, y = sampler.fit_resample(X, y)
    return X, y

In [None]:
## NEW DATASET INITIALISATION ##

from os import listdir
from os.path import isfile, join
import os
import pickle5 as pickle
from collections import Counter
from combat.pycombat import pycombat


def data_split(animal_split = False, resampling = False, resampling_ratio = 0.9, mode = 'delta_features_dataset'):
    """Load the per-dataset feature files and return a train/test split.

    Reads the module-level switches RESPONDERS_ONLY, RESPONDERS,
    CATCH_22_FEATURES, BINARY_CLASSIFICATION, BATCH and TEST_ON_TRAIN_DATA.

    Args:
        animal_split: If true, train and test rows come from disjoint
            dataset files. Otherwise all rows are pooled and split at random.
        resampling: If true, rebalance the final training set with
            ``resampler``.
        resampling_ratio: ``ratio`` forwarded to ``resampler``.
        mode: Filename prefix; files named ``{mode}_{id}.csv`` are loaded
            unless CATCH_22_FEATURES is set.

    Returns:
        ``X_train, X_test, y_train, y_test``. When TEST_ON_TRAIN_DATA is
        set, the training data is returned in place of the test data.
    """
    valid_channels = [str(i) for i in range(8,25,1)]
    if RESPONDERS_ONLY:
        valid_channels = RESPONDERS
    cwd = os.getcwd()
    files = {f for f in listdir(cwd) if isfile(join(cwd, f))}
    if CATCH_22_FEATURES:
        valid_ids = [ref for ref in valid_channels if f'c22_dataset{ref}.csv' in files]
    else:
        valid_ids = [ref for ref in valid_channels if f'{mode}_{ref}.csv' in files]

    def unpacker(file_references, batch_effect_elimination=BATCH):
        """Read and stack the feature/label tables for the given ids."""
        feature_frames = []
        label_frames = []
        for ref in file_references:
            if CATCH_22_FEATURES:
                x = pd.read_csv(f'c22_labelled{ref}.csv').drop(columns=['Unnamed: 0'])
                labels = np.load(f'raw_dataset_y_{ref}.npy')
                y = pd.DataFrame({'y': labels})
            else:
                df = pd.read_csv(f'{mode}_{ref}.csv').drop(columns=['Unnamed: 0'])
                if mode == 'hf_features_dataset':
                    df = df.drop(columns=['Unnamed: 0.1'])
                y = df[['y']]
                x = df.drop(columns=['y'])
            feature_frames.append(x)
            label_frames.append(y)

        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        X = pd.concat(feature_frames, ignore_index=True) if feature_frames else pd.DataFrame()
        Y = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
        if batch_effect_elimination:
            X = pycombat(X.drop(columns=['batch']).transpose(), X['batch']).transpose()
        elif 'batch' in X.columns:
            X = X.drop(columns=['batch'])
        return X.reset_index(drop=True), Y.reset_index(drop=True)

    data_train, data_test = train_test_split(valid_ids, test_size=0.1)
    X_train, y_train = unpacker(data_train)
    # Batch correction is never applied to the held-out files.
    X_test, y_test = unpacker(data_test, False)
    y_train, y_test = np.array(y_train['y']), np.array(y_test['y'])

    if BINARY_CLASSIFICATION:
        y_train[y_train==2] = 0
        y_test[y_test==2] = 0

    if not(animal_split):
        # Pool every row and split at random, ignoring file boundaries.
        X = pd.concat([X_train, X_test])
        y = np.array(list(y_train) + list(y_test))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

    # Resample after the final split. It used to run before the random split,
    # which then overwrote the resampled training set.
    if resampling:
        X_train, y_train = resampler(X_train, y_train, ratio=resampling_ratio)

    print(f'label distribution: {Counter(y_train)}')
    if TEST_ON_TRAIN_DATA:
        return X_train, X_train, y_train, y_train
    return X_train, X_test, y_train, y_test

In [None]:
from mlxtend.plotting import plot_confusion_matrix

def fit_classifier(clf, num_trials, animal_split=False, resampling=False, resampling_ratio=0.9):
    """Fit and score a classifier over several freshly drawn data splits.

    Each trial draws a new split from ``data_split``, refits ``clf``, prints
    accuracy, Cohen's kappa and (in binary mode) ROC AUC, and shows a
    confusion matrix. A bar chart of the ten largest feature importances is
    shown when the classifier exposes ``feature_importances_``.
    ``plot_confusion_matrix`` here is the mlxtend function imported above.

    Args:
        clf: Estimator with ``fit`` and ``predict``.
        num_trials: Number of split/fit/score repetitions.
        animal_split: Forwarded to ``data_split``.
        resampling: Forwarded to ``data_split``.
        resampling_ratio: Forwarded to ``data_split``.

    Returns:
        Mean ROC AUC over the trials when BINARY_CLASSIFICATION is set,
        otherwise mean accuracy.
    """
    scores = []
    roc_scores = []
    clf_name = clf.__class__.__name__
    print(f'Fitting {clf_name}...')

    if BINARY_CLASSIFICATION:
        class_names = ['no decrease', 'decrease']
    else:
        class_names = ['stable', 'decrease', 'increase']

    for _ in range(num_trials):

        X_train, X_test, y_train, y_test = data_split(animal_split=animal_split, resampling=resampling, resampling_ratio = resampling_ratio)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        score = np.mean((y_pred==y_test))
        kappa = cohen_kappa_score(y_pred, y_test)
        roc_score = None
        if BINARY_CLASSIFICATION:
            try:
                roc_score = roc_auc_score(y_test, y_pred)
            except ValueError:
                # AUC could not be computed for this split; count the trial
                # as chance level.
                roc_score = 0.5
            # Record the value that is printed, including the fallback, so
            # the aggregate below is never the mean of an empty list.
            roc_scores.append(roc_score)

        print(f'y_pred: {y_pred}, actual: {y_test}, score: {score}, kappa score: {kappa}, roc_auc: {roc_score}')

        # Confusion Matrix
        try:
            cm = confusion_matrix(y_test, y_pred)
            plot_confusion_matrix(cm,
                                  show_absolute=True,
                                  colorbar=True,
                                  class_names=class_names)
            plt.show()

            if hasattr(clf, 'feature_importances_'):
                feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)
                # Keep the ten most important features, not the first ten
                # columns, then order them ascending for the bar chart.
                feat_importances = feat_importances.sort_values(ascending=False)[:10]
                feat_importances.sort_values(inplace=True)
                feat_importances.plot(kind='barh')
                plt.xlabel('Fractional importance')
                plt.ylabel('Features')
                plt.show()
        except AssertionError as ae:
            print(f'Warning: No confusion matrix possible - {ae}')

        scores.append(score)

    print('_____________________________________________________________________________________ \n')
    print(f'{clf_name} accuracy: {np.mean(scores)} (+/- {2*np.std(scores)})')
    if BINARY_CLASSIFICATION:
        # ROC AUC is only collected in binary mode.
        print(f'{clf_name} roc_auc_accuracy: {np.mean(roc_scores)} (+/- {2*np.std(roc_scores)})')
    print('_____________________________________________________________________________________ \n')
    if BINARY_CLASSIFICATION:
        return np.mean(roc_scores)
    print(f'SCORING:: {scores}')
    return np.mean(scores)

# Model architectures

## Extra Trees Classifier

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

from hyperopt import space_eval

TEST_ON_TRAIN_DATA = False

# Hyperopt search space for the Extra Trees hyper-parameters.
space = {'criterion': hp.choice('criterion', ['entropy']),
        'max_depth': hp.choice('max_depth', [3*i for i in range(1,40,1)]),
        # 'sqrt' is what 'auto' selected for classifiers; recent scikit-learn
        # releases no longer accept 'auto'.
        'max_features': hp.choice('max_features', ['sqrt']),
        'min_samples_leaf': hp.uniform ('min_samples_leaf', 0, 0.1),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 0.5),
        'n_estimators' : hp.choice('n_estimators', [2, 3, 5, 7, 9]) #[50, 100, 500,1000,1500,2000,5000])
    }

def objective(space):
    """Score one sampled Extra Trees configuration for hyperopt.

    Args:
        space: Mapping of hyper-parameter names to sampled values.

    Returns:
        A hyperopt result dict whose ``loss`` is the negated score from
        ``fit_classifier`` (mean ROC AUC in binary mode, else mean accuracy).
    """
    model = ExtraTreesClassifier(criterion = space['criterion'],
                                 max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'],
                                 n_jobs = -1
                                 #class_weight = 'balanced'
                                 )
    score = fit_classifier(model, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)

    # hyperopt minimises the loss, so the score is returned negated.
    return {'loss': -score, 'status': STATUS_OK }

trials = Trials()
best_params = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 150,
            trials= trials)


print(best_params)
# fmin reports hp.choice entries as list indices (e.g. 'n_estimators': 3);
# resolve them to the actual hyper-parameter values as well.
print(space_eval(space, best_params))
'''
HANDPICKED NON-RESPONDERS best loss: -0.6107236043297425]
{'criterion': 0, 'max_depth': 15, 'max_features': 0, 'min_samples_leaf': 0.012750661927616893, 'min_samples_split': 0.007730493536295527, 'n_estimators': 3}
NON-RESPONDERS:  best loss: -0.6327777777777778]
{'criterion': 0, 'max_depth': 21, 'max_features': 0, 'min_samples_leaf': 0.013918505198977742, 'min_samples_split': 0.03295160386254649, 'n_estimators': 5}
CATCH 22 NON-RESPONDERS best loss: -0.566 (AUC-ROC)
{'criterion': 0, 'max_depth': 5, 'max_features': auto, 'min_samples_leaf': 0.07257100898991854, 'min_samples_split': 0.3850081744499732, 'n_estimators': 0}
CATCH 22 NON-REPONDERS BALANCED, best loss: -0.6425280033975685]
{'criterion': 0, 'max_depth': 26, 'max_features': 0, 'min_samples_leaf': 0.05946724515424022, 'min_samples_split': 0.2661228299401958, 'n_estimators': 4}
CATCH 22 NON-REPONDERS batch, best loss: -0.5494117647058824]
{'criterion': 0, 'max_depth': 21, 'max_features': 0, 'min_samples_leaf': 0.057393489718521554, 'min_samples_split': 0.4989641086718188, 'n_estimators': 2}
CATCH 22 NON-RESPONDERS 3 CLASS, best loss: -0.46224270353302616]
{'criterion': 0, 'max_depth': 3, 'max_features': 0, 'min_samples_leaf': 0.08828795232641648, 'min_samples_split': 0.29518178292497504, 'n_estimators': 0}
HF_NON-RESPONDERS best loss: -0.7184294871794872, 2.05s/trial] (+/- 0.2716701152961734)
{'criterion': 0, 'max_depth': 11, 'max_features': 0, 'min_samples_leaf': 0.0006541652174175058, 'min_samples_split': 0.39663870497638093, 'n_estimators': 0}
HF COMBINED NON-RESPONDERS BALANCED, best loss: -0.7067857142857142] (+/- 0.15999299827053376) 3.46s/trial
{'criterion': 0, 'max_depth': 25, 'max_features': 0, 'min_samples_leaf': 0.023040599674842968, 'min_samples_split': 0.22824212025886656, 'n_estimators': 3}
*HF NON-REPONDERS UNDERSAMPLED, best loss: -0.596160372194855]
{'criterion': 0, 'max_depth': 6, 'max_features': 0, 'min_samples_leaf': 0.048762422145537894, 'min_samples_split': 0.22400537438735946, 'n_estimators': 1}
HF NON-REPSONDES OVERSAMPLED, best loss: -0.6126798201798201]
{'criterion': 0, 'max_depth': 5, 'max_features': 0, 'min_samples_leaf': 0.009882907429288226, 'min_samples_split': 0.493444397300387, 'n_estimators': 4}
HF NON-RESPONDERS THREE CLASS, best loss: -0.4469868698129568 
{'criterion': 0, 'max_depth': 17, 'max_features': 0, 'min_samples_leaf': 0.058645900085625126, 'min_samples_split': 0.058978447877741494, 'n_estimators': 0}
HF NON-RESPONDERS BATCH best loss: -0.6337142857142858]
{'criterion': 0, 'max_depth': 2, 'max_features': 0, 'min_samples_leaf': 0.01035078623841787, 'min_samples_split': 0.1790916569652849, 'n_estimators': 4}
DELTA NON-RESPONDERS, 3.53s/trial, best loss: -0.5671292759528054 (+/- 0.1128157009808595) 
{'criterion': 0, 'max_depth': 9, 'max_features': 0, 'min_samples_leaf': 0.03183344667522384, 'min_samples_split': 0.08817323561060117, 'n_estimators': 2}

RESPONDERS best loss: -0.6736842105263159 (AUC-ROC)
{'criterion': 0, 'max_depth': 45, 'max_features': 0, 'min_samples_leaf': 0.025281111352832085, 'min_samples_split': 0.002607202794544773, 'n_estimators': 1000}
RESPONDERS CATCH22, best loss: -0.7131578947368421 (+/- 0.237950622220154) 6.09s/trial]
{'criterion': 0, 'max_depth': 14, 'max_features': 0, 'min_samples_leaf': 0.017197968066777133, 'min_samples_split': 0.07833535344969539, 'n_estimators': 2}
RESPONDERS CATCH22 3 CLASS, best loss: -0.3772993311036789]
{'criterion': 0, 'max_depth': 20, 'max_features': 0, 'min_samples_leaf': 0.08720910431480049, 'min_samples_split': 0.46357970569054646, 'n_estimators': 4}
RESPONDERS CATCH22 UNDERSAMPLED, best loss: -0.6078947368421053]
{'criterion': 0, 'max_depth': 20, 'max_features': 0, 'min_samples_leaf': 0.040662203381250255, 'min_samples_split': 0.1989744013792977, 'n_estimators': 0}
RESPONDERS CATCH22 OVERSAMPLED, best loss: -0.6028070175438597]
{'criterion': 0, 'max_depth': 26, 'max_features': 0, 'min_samples_leaf': 0.06021740136878643, 'min_samples_split': 0.24842274951983306, 'n_estimators': 3}
RESPONDERS SMALL N_ESTIMATORS CATCH 22, best loss: -0.5947368421052632]
{'criterion': 0, 'max_depth': 29, 'max_features': 0, 'min_samples_leaf': 0.06063902001742095, 'min_samples_split': 0.3786259278204676, 'n_estimators': 3}
RESPONDERS CATCH22 batch: best loss: -0.575]
{'criterion': 0, 'max_depth': 10, 'max_features': 0, 'min_samples_leaf': 0.017880637004093616, 'min_samples_split': 0.11491483983622097, 'n_estimators': 4}
RESPONDERS DELTA: best loss: -0.6103695513293037]
{'criterion': 0, 'max_depth': 15, 'max_features': 0, 'min_samples_leaf': 0.05163641743321325, 'min_samples_split': 0.19279648497935287, 'n_estimators': 2}
HF REPONDERS, 3.22s/trial, best loss: -0.65] (+/- 0.18708286933869706)
{'criterion': 0, 'max_depth': 13, 'max_features': 0, 'min_samples_leaf': 0.07992646992678645, 'min_samples_split': 0.38119863891953715, 'n_estimators': 2}
HF RESPONDERS BALANCED,  best loss: -0.6326315789473684]
{'criterion': 0, 'max_depth': 37, 'max_features': 0, 'min_samples_leaf': 0.0622173004424529, 'min_samples_split': 0.20279857485589373, 'n_estimators': 3}
HF COMBINED RESPONDERS BALANCED, 2.89s/trial best loss: -0.6637719298245613] (+/- 0.29022979401386984)
{'criterion': 0, 'max_depth': 32, 'max_features': 0, 'min_samples_leaf': 0.051975911316321094, 'min_samples_split': 0.2249282596912486, 'n_estimators': 0}
HF RESPONDERS THREE CLASS, best loss: -0.4869565217391304]
{'criterion': 0, 'max_depth': 35, 'max_features': 0, 'min_samples_leaf': 0.00570874876573191, 'min_samples_split': 0.0825394982567266, 'n_estimators': 0}
HF RESPONDERS BATCH, best loss: -0.6008333333333333 
{'criterion': 0, 'max_depth': 21, 'max_features': 0, 'min_samples_leaf': 0.02745919221866434, 'min_samples_split': 0.12459145824798322, 'n_estimators': 1}
'''

## Random Forest classifier

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from hyperopt import space_eval

TEST_ON_TRAIN_DATA = False

# Hyperopt search space for the Random Forest hyper-parameters.
space = {'criterion': hp.choice('criterion', ['entropy']),
        'max_depth': hp.choice('max_depth', [3*i for i in range(1,20,1)]),
        # 'sqrt' is what 'auto' selected for classifiers; recent scikit-learn
        # releases no longer accept 'auto'.
        'max_features': hp.choice('max_features', ['sqrt']),
        'min_samples_leaf': hp.uniform ('min_samples_leaf', 0, 0.1),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 0.5),
        'n_estimators' : hp.choice('n_estimators',  [1500,2000])# [2, 3, 5, 7, 9])  # [500,1000,1500,2000,5000]
    }

def objective(space):
    """Score one sampled Random Forest configuration for hyperopt.

    Args:
        space: Mapping of hyper-parameter names to sampled values.

    Returns:
        A hyperopt result dict whose ``loss`` is the negated score from
        ``fit_classifier`` (mean ROC AUC in binary mode, else mean accuracy).
    """
    model = RandomForestClassifier(criterion = space['criterion'],
                                   max_depth = space['max_depth'],
                                   max_features = space['max_features'],
                                   min_samples_leaf = space['min_samples_leaf'],
                                   min_samples_split = space['min_samples_split'],
                                   n_estimators = space['n_estimators'],
                                   n_jobs = -1
                                   )
    score = fit_classifier(model, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)

    # hyperopt minimises the loss, so the score is returned negated.
    return {'loss': -score, 'status': STATUS_OK }

trials = Trials()
best_params = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 100,
            trials= trials)
print(best_params)
# fmin reports hp.choice entries as list indices; resolve them to the actual
# hyper-parameter values as well.
print(space_eval(space, best_params))

'''-
[2, 3, 5, 7, 9])   [500,1000,1500,2000,5000]
{'criterion': 0, 'max_depth': 2, 'max_features': 0, 'min_samples_leaf': 0.05503690384288085, 'min_samples_split': 0.031076491313923316, 'n_estimators': 3}
HANDPICKED NON RESPONDERS, best loss: -0.6274437229437231 (+/- 0.15661955752871673) 15.87s/trial]
{'criterion': 0, 'max_depth': 15, 'max_features': 0, 'min_samples_leaf': 0.01625764740377511, 'min_samples_split': 0.2964956982880342, 'n_estimators': 3}
HANDPICKED NON RESPONDERS SMALL FOREST 2.10s/trial, best loss: -0.5914296814296816]
{'criterion': 0, 'max_depth': 12, 'max_features': 0, 'min_samples_leaf': 0.06535997213876292, 'min_samples_split': 0.24805911299808792, 'n_estimators': 2}
HANDPICKED RESPONDERS (Overprediction of decrease), 16.90s/trial, best loss: -0.5]
{'criterion': 0, 'max_depth': 0, 'max_features': 0, 'min_samples_leaf': 0.00705758017682645, 'min_samples_split': 0.266104692034989, 'n_estimators': 4}
SMALL FOREST CATCH22 RESPONDERS, best loss: -0.55]
{'criterion': 0, 'max_depth': 14, 'max_features': 0, 'min_samples_leaf': 0.056538518170784285, 'min_samples_split': 0.10820256145213225, 'n_estimators': 4} 
HF RESPONDERS, best loss: -0.6325000000000001 (+/- 0.13605554421305702) 1.71s/trial
{'criterion': 0, 'max_depth': 17, 'max_features': 0, 'min_samples_leaf': 0.07110229538963608, 'min_samples_split': 0.1450084663114631, 'n_estimators': 0}
HF RESPONDERS 3 CLASS, best loss: -0.3646739130434783]
{'criterion': 0, 'max_depth': 17, 'max_features': 0, 'min_samples_leaf': 0.05770846252235319, 'min_samples_split': 0.4814812254247135, 'n_estimators': 0}
HF NON-RESPONDERS, best loss: -0.6318333333333334 (+/- 0.39230430365555086) 2.10s/trial
{'criterion': 0, 'max_depth': 10, 'max_features': 0, 'min_samples_leaf': 0.07659940578952626, 'min_samples_split': 0.0026034165908682128, 'n_estimators': 3}
HF NON-RESPONDERS 3 CLASS best loss: -0.5205396825396825]
{'criterion': 0, 'max_depth': 8, 'max_features': 0, 'min_samples_leaf': 0.046450367468097054, 'min_samples_split': 0.2687114323925206, 'n_estimators': 0}
CATCH_22 RESPONDER, best loss: -0.5625438596491228]
{'criterion': 0, 'max_depth': 15, 'max_features': 0, 'min_samples_leaf': 0.02501477632438393, 'min_samples_split': 0.17515365082535356, 'n_estimators': 0}
CATCH 22 RESPONDER 3 CLASS, best loss: -0.40652173913043477]
{'criterion': 0, 'max_depth': 0, 'max_features': 0, 'min_samples_leaf': 0.07574179915967928, 'min_samples_split': 0.49858316089971866, 'n_estimators': 0}
CATCH 22 NON-RESPONDERS best loss: -0.5439655172413793]
{'criterion': 0, 'max_depth': 9, 'max_features': 0, 'min_samples_leaf': 0.07276937585678855, 'min_samples_split': 0.29796310219729616, 'n_estimators': 1}
CATCH 22 NON-RESPONDERS 3 CLASS, best loss: -0.4521061106873464]
{'criterion': 0, 'max_depth': 18, 'max_features': 0, 'min_samples_leaf': 0.023363672552956134, 'min_samples_split': 0.40566774286886687, 'n_estimators': 1}

'''

## XGBoost classifier

In [None]:
## XGBOOST ##

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import chi2

# 0.50 +- 0.07
# XGBoost's names are `learning_rate` and `colsample_bytree`. The former
# `n_learning_rate` / `sample_bytree` spellings, and the `score_func`, `k`
# and `class_weight` arguments, are not XGBoost parameters, so those settings
# never took effect.
# NOTE(review): the result quoted above was obtained before this correction.
# NOTE(review): if class balancing is wanted, XGBoost's own mechanism is
# `scale_pos_weight`; it is not set here.
XGB_clf = xgb.XGBClassifier(n_estimators=50, learning_rate=0.001, max_depth=5, colsample_bytree=0.1, gamma=0.0)
fit_classifier(XGB_clf, num_trials = 5, animal_split = True, resampling=False)
# NOTE(review): X_metrics and y are not defined in the visible part of this
# notebook; this line needs them to exist from an earlier session or cell.
scores = cross_val_score(XGB_clf, X_metrics, y, scoring='accuracy', cv=10)   
print("Accuracy: %0.2f (+/- %0.2f) [XGB Classifier]" % (scores.mean(), scores.std()))

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

from hyperopt import space_eval

TEST_ON_TRAIN_DATA = False
# NOTE: rebinding this module-level switch also affects every cell run later.
RESPONDERS_ONLY = False

# Hyperopt search space; the keys are kept as-is and mapped onto the real
# XGBoost parameter names inside objective().
space = {'max_depth': hp.choice('max_depth', [3*i for i in range(1,40,1)]),
        'sample_bytree': hp.uniform ('sample_bytree', 0, 0.5), 
        'n_estimators' : hp.choice('n_estimators',  [2, 3, 5, 7, 9]), # [50, 100, 500,1000,1500,2000,5000])
         'n_learning_rate': hp.uniform ('n_learning_rate', 0, 0.2)
    }

def objective(space):
    """Score one sampled XGBoost configuration for hyperopt.

    Args:
        space: Mapping of hyper-parameter names to sampled values.

    Returns:
        A hyperopt result dict whose ``loss`` is the negated score from
        ``fit_classifier`` (mean ROC AUC in binary mode, else mean accuracy).
    """
    # Pass the sampled values under the names XGBoost recognises. The former
    # `n_learning_rate` / `sample_bytree` keyword arguments were not XGBoost
    # parameters, so the search over them had no effect on the model.
    model = xgb.XGBClassifier(n_estimators=space['n_estimators'],
                              learning_rate=space['n_learning_rate'],
                              max_depth=space['max_depth'],
                              colsample_bytree=space['sample_bytree'],
                              gamma=0.0)
    score = fit_classifier(model, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)

    # hyperopt minimises the loss, so the score is returned negated.
    return {'loss': -score, 'status': STATUS_OK }

trials = Trials()
best_params = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 150,
            trials= trials)


print(best_params)
# fmin reports hp.choice entries as list indices; resolve them to the actual
# hyper-parameter values as well.
print(space_eval(space, best_params))
#BEST: 0.4913157894736842 (+/- 0.2189473684210526

# Dataset imports and preprocessing

## File Converter and Transformer

In [None]:
import numpy as np
import pandas as pd
from sktime.transformations.panel.catch22 import Catch22
from os.path import isfile, join
from os import listdir, getcwd

# Dataset ids whose raw recordings are present in the working directory.
valid_nums = list(range(9, 25))
files = [name for name in listdir(getcwd()) if isfile(join(getcwd(), name))]
valid_files = [n for n in valid_nums if f'raw_dataset_{n}.npy' in files]

# Only this one dataset id is converted by the cell.
num = 12

X = np.load(f'raw_dataset_{num}.npy', allow_pickle=True)
y = np.load(f'raw_datasety_{num}.npy', allow_pickle=True)
trunc = 599999
print('Truncating raw data...')
print('X shape: ', X.shape)

# Cut every series to its first `trunc` samples; series that are too short
# stay as zero rows and are remembered so they can be dropped below.
X_new = np.zeros((len(X),trunc))
invalid_arrays = []
for row_idx, series in enumerate(X):
    if len(series) < trunc:
        invalid_arrays.append(row_idx)
        continue
    X_new[row_idx] = series[:trunc]

# Removing invalid subarrays
print(' Invalid (shorter than required for uniform dataset) indices in raw_dataset: ', invalid_arrays)
dropped_rows = set(invalid_arrays)
X = np.array([row for row_idx, row in enumerate(X_new) if row_idx not in dropped_rows])
print('Final X shape: ', X.shape)
y = np.delete(y, invalid_arrays)
np.save(f'raw_dataset_y_{num}.npy', y)

# Catch22 transformation
print('Transforming...')
catch22 = Catch22(n_jobs=-1)
catch22.fit(X)
X_t = catch22.transform(X)
X_t.to_csv(f'c22_dataset{num}.csv')
print('Transformed file saved.')

## High frequency data processing

In [None]:
## HIGH FREQUENCY DATA PROCESSING SPIKE FREQUENCY

from os.path import isfile, join
from os import listdir, getcwd
from delta_feature_extractor import *

valid_nums = [i for i in range(8, 25, 1)]
files = [f for f in listdir(getcwd()) if isfile(join(getcwd(), f))]
valid_files = [num for num in valid_nums if f'hf_features_dataset_{num}.csv' in files]
MODE = 'CATCH22' # HANDPICKED or CATCH22
features_to_extract = [ Mean , Max , Min , MAV , Var , StD , WL , Energy_from_fft , Kurtosis , \
                           Skewness , Signal_Power , Signal_Energy , Min_max_difference , Wilson_Amplitude , \
                           Root_Mean_Square , V3  , DABS , Maximum_fractal_length , Myopulse_percentage_rate , \
                           Mean_Frequency ]
for num in valid_files:
    df = pd.read_csv(f'hf_features_dataset_{num}.csv')

    # Row positions whose 'change_label' is present; each one closes a window.
    glucose_indices = [i for i, v in enumerate(df['change_label']) if not np.isnan(v)]
    print(glucose_indices)
    output_array = []
    labels = []
    window_start = 0
    for v in glucose_indices:
        # NOTE(review): the slice stops before row v-1, so the row directly
        # preceding the labelled row is left out - confirm this is intended.
        # max() keeps a label on row 0 from turning the stop into -1, which
        # would select almost the whole frame.
        window_stop = max(v - 1, 0)
        windowed_neural_df = df.iloc[window_start:window_stop]

        cleaned_array = [x for x in windowed_neural_df['fr'] if not np.isnan(x)]
        output_array.append(cleaned_array)
        labels.append(df.iloc[v]['change_label'])
        window_start = v + 1

    if MODE == 'CATCH22':
        catch22 = Catch22(n_jobs=-1)
        # Transform the windows that were just built; this used to transform
        # the unrelated global X left over from another cell.
        # NOTE(review): windows can differ in length, so this array may be
        # ragged - confirm Catch22 accepts it.
        windowed_series = np.array(output_array)
        catch22.fit(windowed_series)
        X_t = catch22.transform(windowed_series)
        X_t.to_csv(f'hf_c22_dataset{num}.csv')
    else:
        final_df = Feature_extraction_windowed(output_array, features_to_extract)
        final_df['y'] = labels
        final_df.to_csv(f'hf_dataset_{num}.csv')

In [None]:
## HIGH FREQUENCY DATA PROCESSING SPIKE FREQUENCY and AMPLITUDE

from os.path import isfile, join
from os import listdir, getcwd
from feature_extractor_hf import *

valid_nums = [i for i in range(8, 25, 1)]
files = [f for f in listdir(getcwd()) if isfile(join(getcwd(), f))]
# NOTE(review): availability is checked against 'hf_features_dataset_{num}.csv'
# but the loop reads 'hf_{num}.csv' - confirm both files always exist together.
valid_files = [num for num in valid_nums if f'hf_features_dataset_{num}.csv' in files]
MODE = 'HANDPICKED' # HANDPICKED or CATCH22
features_to_extract = [ Mean , Max , Min , MAV , Var , StD , WL , Energy_from_fft , Kurtosis , \
                           Skewness , Signal_Power , Signal_Energy , Min_max_difference , Wilson_Amplitude , \
                           Root_Mean_Square , V3  , DABS , Maximum_fractal_length , Myopulse_percentage_rate , \
                           Mean_Frequency ]
for num in valid_files:
    df = pd.read_csv(f'hf_{num}.csv')

    # Row positions whose 'change_label' is present; each one closes a window.
    glucose_indices = [i for i, v in enumerate(df['change_label']) if not np.isnan(v)]
    print(glucose_indices)
    output_fr_array = []
    output_amp_array = []
    labels = []
    window_start = 0
    for v in glucose_indices:
        # NOTE(review): the slice stops before row v-1, so the row directly
        # preceding the labelled row is left out - confirm this is intended.
        # max() keeps a label on row 0 from turning the stop into -1.
        window_stop = max(v - 1, 0)
        windowed_neural_df = df.iloc[window_start:window_stop]

        cleaned_fr_array = [x for x in windowed_neural_df['fr'] if not np.isnan(x)]
        cleaned_amp_array = [x for x in windowed_neural_df['amp'] if not np.isnan(x)]
        output_fr_array.append(cleaned_fr_array)
        output_amp_array.append(cleaned_amp_array)
        labels.append(df.iloc[v]['change_label'])
        window_start = v + 1

    if MODE == 'CATCH22':
        catch22 = Catch22(n_jobs=-1)
        # This branch referenced `output_array` and `X`, neither of which is
        # built in this cell; use the frequency windows collected above.
        # NOTE(review): the amplitude windows are not used in this mode.
        windowed_series = np.array(output_fr_array)
        catch22.fit(windowed_series)
        X_t = catch22.transform(windowed_series)
        X_t.to_csv(f'hf_c22_dataset{num}.csv')
    else:
        final_df = Feature_extraction_windowed(output_fr_array, output_amp_array, features_to_extract)
        final_df['y'] = labels
        # Drop the columns that carry the bare feature names.
        final_df = final_df.drop(columns=['Mean', 'Max', 'Min', 'MAV', 'Var', 'StD', 'WL',
       'Energy_from_fft', 'Kurtosis', 'Skewness', 'Signal_Power',
       'Signal_Energy', 'Min_max_difference', 'Wilson_Amplitude',
       'Root_Mean_Square', 'V3', 'DABS', 'Maximum_fractal_length',
       'Myopulse_percentage_rate', 'Mean_Frequency'])
        final_df.to_csv(f'hf_combined_dataset_{num}.csv')

In [None]:
# Quick look at one generated combined dataset.
num = 9
#X = np.load(f'hf_windowedX_{num}.npy')
# Use `num` in the filename; the f-string used to hard-code 9 and ignore it.
X = pd.read_csv(f'hf_combined_dataset_{num}.csv')
print(X)

## Delta data processing

In [None]:
## DELTA DATASETS GENERATION ##

import numpy as np
import pandas as pd
#from sktime.transformations.panel.catch22 import Catch22
from os.path import isfile, join
from os import listdir, getcwd

# Dataset ids whose raw recordings are present in the working directory.
valid_nums = list(range(8, 25))
files = [name for name in listdir(getcwd()) if isfile(join(getcwd(), name))]
valid_files = [n for n in valid_nums if f'raw_dataset_{n}.npy' in files]

for num in valid_files:
    X = np.load(f'raw_dataset_{num}.npy', allow_pickle=True)
    y = np.load(f'raw_datasety_{num}.npy', allow_pickle=True)
    window = 1000
    print('Truncating raw data...')
    print('X shape: ', X.shape)

    # Delta = last `window` samples minus first `window` samples. Series
    # shorter than two windows stay as zero rows and are dropped below.
    X_new = np.zeros((len(X),window))
    invalid_arrays = []
    for row_idx, series in enumerate(X):
        if len(series) < 2*window:
            invalid_arrays.append(row_idx)
            continue
        X_new[row_idx] = series[-window:] - series[:window]

    # Removing invalid subarrays
    print(' Invalid (shorter than required for uniform dataset) indices in raw_dataset: ', invalid_arrays)
    dropped_rows = set(invalid_arrays)
    X = np.array([row for row_idx, row in enumerate(X_new) if row_idx not in dropped_rows])
    print('Final X shape: ', X.shape)
    y = np.delete(y, invalid_arrays)
    np.save(f'raw_delta_dataset{num}.npy', X)
    np.save(f'raw_delta_dataset_y_{num}.npy', y)

## Handpicked features processing

In [None]:
import math
import os
import sys
import pickle
import numpy as np
import scipy as sp
import pandas as pd
from datetime import date

# Add my module to python path
sys.path.append("../")

from delta_feature_extractor import *
import pickle
from os.path import isfile, join
from os import listdir, getcwd

# Dataset ids for which a windowed input array exists in the working directory.
valid_nums = list(range(8, 25))
files = [name for name in listdir(getcwd()) if isfile(join(getcwd(), name))]
valid_files = [n for n in valid_nums if f'hf_windowedX_{n}.npy' in files]

for num in valid_files:

    # Windowed inputs and their labels for this dataset id.
    X = np.load(f'hf_windowedX_{num}.npy')
    y = np.load(f'hf_windowedy_{num}.npy')

    # Feature functions handed to Feature_extraction_windowed.
    features_to_extract = [
        Mean, Max, Min, MAV, Var, StD, WL, Energy_from_fft, Kurtosis,
        Skewness, Signal_Power, Signal_Energy, Min_max_difference, Wilson_Amplitude,
        Root_Mean_Square, V3, DABS, Maximum_fractal_length, Myopulse_percentage_rate,
        Mean_Frequency,
    ]

    # One feature table per dataset id, with the labels attached as column 'y'.
    final_df = Feature_extraction_windowed(X, features_to_extract)
    final_df['y'] = y

    print(final_df)
    final_df.to_csv(f'hf_features_dataset_{num}.csv')

## Batch labelling

In [None]:
# Batch id assigned to each dataset id; several dataset ids share a batch.
animal = {8:1, 9:2, 10:2, 11:3, 12:4, 13:4, 14:5, 15:6, 16:7, 17:8, 18:9, 19:10, 20:11, 21:11, 22:12, 23:12, 24:13}
# Dataset ids for which the 'hf_dataset' file is not processed.
ids_without_hf = {22, 23, 24}

for ref in range(8,25,1):
    batch_id = animal[ref]

    # Catch22 features: drop the stored index column, tag, and save.
    (pd.read_csv(f'c22_dataset{ref}.csv')
        .drop(columns=['Unnamed: 0'], axis=1)
        .assign(batch=batch_id)
        .to_csv(f'c22_labelled{ref}.csv'))

    # Delta features: tag and save under the '_dataset_' name.
    (pd.read_csv(f'delta_features_{ref}.csv')
        .assign(batch=batch_id)
        .to_csv(f'delta_features_dataset_{ref}.csv'))

    if ref in ids_without_hf:
        continue
    # High-frequency features: tag and save under the '_features_dataset_' name.
    (pd.read_csv(f'hf_dataset_{ref}.csv')
        .assign(batch=batch_id)
        .to_csv(f'hf_features_dataset_{ref}.csv'))
    

## Batch correction

In [None]:
## PYCOMBAT TESTS ##

from combat.pycombat import pycombat

# Stack every labelled Catch22 table. DataFrame.append was removed in
# pandas 2.0, so the frames are concatenated in one call.
X = pd.concat([pd.read_csv(f'c22_labelled{ref}.csv') for ref in range(8,25,1)])
Y = pd.DataFrame()
# pycombat receives the table transposed, plus the batch id of each row of X.
# NOTE(review): the 'Unnamed: 0' index column is still part of the corrected
# matrix and is only removed for printing - confirm this is intended.
corrected_df = pycombat(X.drop(columns=['batch']).transpose(), X['batch']).transpose()
print(corrected_df.drop(columns=['Unnamed: 0']))

# Alternate feature transforms

In [None]:
from tsfresh.utilities.dataframe_functions import impute
import tsfresh
# Load the stored feature table, impute it with tsfresh, and drop its last row.
# NOTE(review): the labels below are selected through a column header that
# looks like a data value, so presumably the first label became the header -
# verify that feature rows and labels still line up after [:-1].
extracted_features = impute(pd.read_csv('featuresX200.csv'))[:-1]
#extracted_features.drop(index=extracted_features.index[0], axis=0, inplace=True)
# NOTE(review): reset_index() without drop=True keeps the old index as an
# extra 'index' column - confirm it is meant to be counted as a feature.
extracted_features = extracted_features.reset_index() #788 features
y = pd.read_csv('featuresy200.csv')['1.000000000000000000e+00']
print(len(extracted_features.columns))

In [None]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
# Build tsfresh's per-feature relevance table for the loaded features and
# labels, treating the task as multiclass classification.
relevance_table = calculate_relevance_table(extracted_features, y, ml_task = 'classification',multiclass=True, n_jobs=2, show_warnings=True)
#relevance_table = relevance_table[relevance_table.relevant]
#relevance_table.sort_values("p_value", ascending =False, inplace=True)
print(relevance_table)

In [None]:
# Show only the rows whose 'relevant' flag is set.
print(relevance_table[relevance_table.relevant])

In [None]:
## Loading data ##
from collections import Counter

X = np.load('filtered_windowed_datasetX.npy')
y = np.load('filtered_windowed_datasety.npy')
Trunc = 300000 # max: 599999


THINNING_FACTOR = 1

# Keep the samples from position `Trunc` onward of every series. Rows are
# Trunc-1 wide, so each series is expected to hold 2*Trunc-1 samples.
X_new = np.zeros((len(X),Trunc-1))
for i, series in enumerate(X):
    X_new[i] = series[Trunc:]

# Random 85/15 split with integer labels.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.15)
y_train, y_test = y_train.astype(int), y_test.astype(int)
X = X_new
print(len(X_new[0]))


In [None]:
## SHAPELET TRANSFORM ## ! OUT OF MEMORY ERROR NOT FEASIBLE?

from pyts.transformation import ShapeletTransform

# Fit a pyts ShapeletTransform on the windowed series X_new and labels y.
transform = ShapeletTransform(sort = True, n_jobs = 6)
transform.fit(X_new,y)

# Alternate classifier models

In [None]:
TEST_ON_TRAIN_DATA = False

## METRICS ENSEMBLE CLASSIFIERS (AVERAGING MODELS) ##

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Settings shared by every tree ensemble in this cell.
tree_settings = dict(n_estimators=100, max_depth=5, n_jobs=-1, class_weight='balanced')

XT_clf = ExtraTreesClassifier(**tree_settings)
RF_clf = RandomForestClassifier(**tree_settings)
# Bagging wrapped around a separate Extra Trees instance.
BC_clf = BaggingClassifier(ExtraTreesClassifier(**tree_settings), n_jobs=-1)

# Evaluate each model with the same trial count and split strategy.
fit_classifier(XT_clf, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)
fit_classifier(RF_clf, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)
fit_classifier(BC_clf, num_trials = 5, animal_split = ANIMAL_LEVEL_SPLIT, resampling=False)

In [None]:
## RANDOM TREES EMBEDDING -> EXTRA TREES CLASSIFIER ##
# The embedding output is fed to an ExtraTreesClassifier (not to logistic
# regression, despite the LogisticRegression import kept below).

# ExtraTreesClassifier is imported here so the cell runs on its own instead of
# relying on an earlier cell having imported it.
from sklearn.ensemble import ExtraTreesClassifier, RandomTreesEmbedding
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

random_tree_embedding = RandomTreesEmbedding(
    n_estimators=10000, n_jobs=-1
)

rt_model = make_pipeline(random_tree_embedding, ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, class_weight='balanced'))

fit_classifier(rt_model, num_trials = 5, animal_split = True, resampling=False)
#rt_model.fit(X_train, y_train)
#print(rt_model.score(X_test, y_test))
# roc_auc_accuracy: 0.4966666666666667 (+/- 0.013333333333333286)

In [None]:
## STACKING CLASSIFIERS ##
# Stacks a small random forest and a scaled linear SVM under a class-balanced
# logistic-regression meta-learner, then evaluates it through fit_classifier.

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Base learners.
_forest = RandomForestClassifier(n_estimators=10, random_state=42, class_weight='balanced')
_scaled_svm = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
estimators = [('rf', _forest), ('svr', _scaled_svm)]

# Meta-learner combining the base learners' outputs.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(class_weight='balanced'),
)
#scores = cross_val_score(stacking_clf, X_metrics, y, scoring='accuracy', cv=5)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
fit_classifier(stacking_clf, num_trials=5, animal_split=True, resampling=False)

In [None]:
## XGB GRIDSEARCH OPTIMIZATION ##
# Grid-searches a feature-selection + XGBoost pipeline with 10-fold CV on the
# training split, then reports the best parameters and the test-set score.
import xgboost as xgb
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
# Aliased so the imblearn `Pipeline` imported earlier in the notebook is not shadowed.
from sklearn.pipeline import Pipeline as SklearnPipeline

# The grid addresses parameters as '<step>__<param>', so the estimator has to
# be a pipeline with steps named 'fs' and 'clf'; a bare XGBClassifier makes
# GridSearchCV reject every 'clf__*' / 'fs__*' key as an invalid parameter.
# NOTE(review): chi2 only accepts non-negative feature values -- confirm that
# X_train satisfies this, or swap the score function.
clf = SklearnPipeline([
    ('fs', SelectKBest()),
    ('clf', xgb.XGBClassifier()),  # 0.47 +- 0.7
])
search_space = [
  {
    'clf__n_estimators': [50, 200], #50
    'clf__learning_rate': [0.01],
    'clf__max_depth': range(5, 10), #5
    'clf__colsample_bytree': [i/10.0 for i in range(1, 3)], # 0.1
    'clf__gamma': [i/10.0 for i in range(3)], # 0
    'fs__score_func': [chi2],
    'fs__k': [10]
  }
]

# Define cross validation
kfold = KFold(n_splits=10)

# Accuracy as score
scoring = {'Accuracy':make_scorer(accuracy_score)}

# Define grid search; the best configuration is refitted on the whole training split.
grid = GridSearchCV(
  clf,
  param_grid=search_space,
  cv=kfold,
  scoring=scoring,
  refit='Accuracy',
  verbose=1,
  n_jobs=-1
)

# Fit grid search
model = grid.fit(X_train, y_train)
print(grid.best_params_)
print(model.score(X_test, y_test))

In [None]:
## TSFRESH FEATURES -> RANDOM FOREST ##
# Pipeline that extracts tsfresh features ("efficient" parameter set) with sktime
# and classifies them with a default random forest; scored on the held-out split.
from sktime.transformations.panel.tsfresh import TSFreshFeatureExtractor
from sklearn.ensemble import RandomForestClassifier

classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    RandomForestClassifier(),
)
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))

In [None]:
## HISTOGRAM GRADIENT BOOSTING ##
from sklearn.ensemble import GradientBoostingClassifier
# The experimental flag must be imported before HistGradientBoostingClassifier.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

#clf = GradientBoostingClassifier(learning_rate=0.001,n_estimators=10000).fit(X_train, y_train) # AVERAGE: 0.49
# loss='auto' lets scikit-learn pick binary or categorical cross-entropy from
# the number of classes. A hard-coded 'categorical_crossentropy' is rejected
# for two-class targets, which is the case when BINARY_CLASSIFICATION is True.
HGB_clf = HistGradientBoostingClassifier(loss='auto', learning_rate=0.01, max_iter=10000) # AVERAGE: 0.45 (consistent)
fit_classifier(HGB_clf, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## ROCKET TRANSFORM ## ! RANDOM
# Fits the pyts ROCKET transform on the training split only, then transforms
# both splits into X_train_R / X_test_R for the downstream linear classifiers.

from pyts.transformation import ROCKET

rocket = ROCKET()
rocket.fit(X_train)
X_train_R = rocket.transform(X_train)
X_test_R = rocket.transform(X_test)

In [None]:
## RIDGECLASSIFIERCV FITTING ##
# Ridge classifier with built-in CV over 10 log-spaced alphas (1e-3 .. 1e3),
# evaluated through fit_classifier (5 trials, animal-level split, no resampling).

from sklearn.linear_model import RidgeClassifierCV

classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
fit_classifier(classifier, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## CATCH22 CLASSIFIER ##
# Only constructs the sktime Catch22Classifier (random-forest back end, 60 trees);
# fitting and scoring happen in a separate cell.

from sktime.classification.feature_based import Catch22Classifier
from sklearn.ensemble import RandomForestClassifier

classifier = Catch22Classifier(
    estimator=RandomForestClassifier(n_estimators=60, n_jobs=-1), #0.63 best, 0.56 average
    outlier_norm=True,
    n_jobs = -1
)

In [None]:
## SGD CLASSIFIER ON ROCKET FEATURES ##
# Linear model with hinge loss and L2 penalty, trained on the ROCKET-transformed
# training split (X_train_R).
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier(loss="hinge", penalty="l2", max_iter=10000)
classifier.fit(X_train_R, y_train)

In [None]:
## RESULTS ##
# Scores the current `classifier` on the ROCKET-transformed test split.

print(classifier.score(X_test_R, y_test))

In [None]:
## MINIROCKET TRANSFORM ## !
# Fits and applies sktime's MiniRocket to the whole dataset X (not just the
# training split); the result X_MR is cross-validated in a later cell.

from sklearn.linear_model import RidgeClassifierCV
from sktime.transformations.panel.rocket import MiniRocket

minirocket = MiniRocket(n_jobs=-1) 
minirocket.fit(X)
X_MR = minirocket.transform(X)

In [None]:
## RIDGECLASSIFIERCV FITTING ## !SGDCLASSIFIER USED FOR LARGER DATASETS (RECOMMENDED BY CREATORS)
# 5-fold cross-validated accuracy of a ridge classifier on the MiniRocket features X_MR.
# RidgeClassifierCV is not imported here; it must already be in scope from an earlier cell.

from sklearn.model_selection import cross_val_score

clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
#print(classifier.score(X_test_transform, y_test))
scores = cross_val_score(clf, X_MR, y, scoring='accuracy', cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [None]:
## PROXIMITY FORESTS ## ! OUT OF MEMORY ERROR NOT FEASIBLE?
# sktime ProximityForest with 10 estimators, evaluated through fit_classifier.
from sktime.classification.distance_based import ProximityForest

classifier = ProximityForest(n_estimators=10, n_jobs=-1)
fit_classifier(classifier, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## MRSEQL CLASSIFIER ## 
# sktime MrSEQL in classifier mode using both SAX and SFA symbolic
# representations, evaluated through fit_classifier.

from sktime.classification.shapelet_based import MrSEQLClassifier

classifier = MrSEQLClassifier(seql_mode='clf', symrep=['sax', 'sfa'])
fit_classifier(classifier, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## RESULTS ##
# Scores whichever model is currently bound to `classifier` on X_test_R.
# NOTE(review): X_test_R holds ROCKET features -- confirm `classifier` was
# trained on X_train_R before trusting this number.

print(classifier.score(X_test_R, y_test))

In [None]:
# NOTE(review): this positional (X, y, classifier, trials) call differs from the
# keyword form fit_classifier(clf, num_trials=..., animal_split=..., resampling=...)
# used by other cells -- confirm it matches the current fit_classifier signature.
fit_classifier(X, y, classifier, 3)

In [None]:
# Train the current `classifier` on the training split and print its test-set score.
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))

In [None]:
## VOTING ##
# Tunes k for a k-nearest-neighbours model with 5-fold grid search and reports
# its test score. The random-forest / voting-ensemble part is kept below as a
# disabled string block.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate model and the n_neighbors values to try (1..49).
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 50)}

# Exhaustive search over n_neighbors on the training split.
knn_gs = GridSearchCV(estimator=knn, param_grid=params_knn, cv=5)
knn_gs.fit(X_train, y_train)

# Best model found by the search, fitted once more on the training split.
knn_best = knn_gs.best_estimator_
knn_best.fit(X_train, y_train)

print(knn_best.score(X_test, y_test))
print(knn_gs.best_params_)

'''
rf = RandomForestClassifier()#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)#fit model to training data
rf_gs.fit(X_train, y_train)
#save best model
rf_best = rf_gs.best_estimator_#check best n_estimators value
print(rf_gs.best_params_)

estimators=[('knn', knn_best), ('rf', rf_best)]#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard', n_jobs=8)

classifier = Catch22Classifier(
    estimator=ensemble, 
    outlier_norm=True,
    n_jobs = 8
)
'''

In [None]:
# Train the current `classifier` on the training split and print its test-set score.
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))

In [None]:
## HIVECOTEV1 ##
# sktime HIVE-COTE v1 trained on the training split and scored on the test split.

from sktime.classification.hybrid import HIVECOTEV1

clf = HIVECOTEV1(verbose = 1, n_jobs = -1)
clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))

In [None]:
## HIVECOTEV2 ##
# sktime HIVE-COTE v2 with a 50-minute time limit, evaluated through fit_classifier.

from sktime.classification.hybrid import HIVECOTEV2

clf = HIVECOTEV2(time_limit_in_minutes = 50, verbose = 1, n_jobs = -1)
fit_classifier(clf, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## KNN with DTW ## ! 0.433
# tslearn k-nearest-neighbours time-series classifier with default settings,
# trained on the training split and scored on the test split.

from tslearn.neighbors import KNeighborsTimeSeriesClassifier

clf = KNeighborsTimeSeriesClassifier(n_jobs=-1)
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test))


In [None]:
## SVM with GAK ##
# tslearn time-series SVC with its default kernel, evaluated through fit_classifier.
# NOTE(review): the original heading read "JAK"; presumably the Global Alignment
# Kernel (GAK) was meant -- confirm.

from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC

# Min-max scaling of the splits is currently disabled.
#X_train_T = TimeSeriesScalerMinMax().fit_transform(X_train)
#X_test_T = TimeSeriesScalerMinMax().fit_transform(X_test)


clf = TimeSeriesSVC(n_jobs=-1, verbose=1)
fit_classifier(clf, num_trials = 5, animal_split = True, resampling=False)

In [None]:
## LEARNING TIME-SERIES SHAPELETS ## ! TOO MUCH MEMORY REQUIRED
# pyts LearningShapelets trained on the training split and scored on the test split.

from pyts.classification import LearningShapelets

clf = LearningShapelets(verbose=1, n_jobs=-1)
# Fit on the training split only: fitting on the full (X, y) would include the
# test rows and make the score below an optimistic, leaked estimate.
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [None]:
## TIME SERIES FOREST ## ! Random
# pyts TimeSeriesForest with default settings, trained on the training split
# and scored on the test split.

from pyts.classification import TimeSeriesForest

clf = TimeSeriesForest(n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [None]:
## WEASEL ##
# Bag-of-patterns features from the pyts WEASEL transform, classified with an SVC.

from pyts.transformation import WEASEL
# SVC is imported here because no earlier cell brings it into scope.
from sklearn.svm import SVC

# WEASEL transformation (fitted on the training split only)
weasel = WEASEL(word_size=2, n_bins=2, window_sizes=[12, 36], sparse=False)
X_weasel = weasel.fit_transform(X_train, y_train)

# Classifier
classifier = SVC()
classifier.fit(X_weasel, y_train)
# The test split goes through the already-fitted transform before scoring.
print(classifier.score(weasel.transform(X_test), y_test))

In [None]:
## SAXVSM ##
# pyts SAX-VSM classifier (5 bins, 'normal' binning strategy), trained on the
# training split and scored on the test split.

from pyts.classification import SAXVSM

clf = SAXVSM(window_size=0.5, word_size=0.5, n_bins=5, strategy='normal')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

In [None]:
## EXTRACTED ALL HANDPICKED METRICS ## ! TODO: PLOT ROC CURVES, FEATURE IMPORTANCES
# Loads the hand-picked feature table (dropping the 'Unnamed: 0' column that
# the CSV carries) and reloads the label vector.

X_metrics = pd.read_csv('X_filtered_features.csv').drop(['Unnamed: 0'], axis=1)
y = np.load('filtered_windowed_datasety.npy')

In [None]:
## STACKED GENERALISER ##
# Blends three large tree ensembles with a logistic-regression model using the
# external StackedGeneralizer helper, then evaluates on the test split.

from stacked_generalizer import StackedGeneralizer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

VERBOSE = True
N_FOLDS = 5

# Base models: identical size and parallelism, differing in type and split criterion.
_forest_kwargs = dict(n_estimators=5000, n_jobs=-1)
base_models = [
    RandomForestClassifier(criterion='gini', **_forest_kwargs),
    RandomForestClassifier(criterion='entropy', **_forest_kwargs),
    ExtraTreesClassifier(criterion='gini', **_forest_kwargs),  # AVERAGE: 0.50
]

# Blending model applied on top of the base models.
blending_model = LogisticRegression()

# Multi-stage model, fitted on the training split.
sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
sg.fit(X_train, y_train)

# Each prediction is reduced to the index of its largest entry.
pred = sg.predict(X_test)
pred_classes = list(map(np.argmax, pred))

_ = sg.evaluate(y_test, pred_classes)

In [None]:
## METRICS ENSEMBLE CLASSIFIERS (BOOSTING MODELS) ##
# AdaBoost over linear-kernel SVCs (probability estimates enabled), 1000 rounds.

from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# n_estimators is passed by keyword: scikit-learn treats every argument after
# the base estimator as keyword-only, so the positional form fails on newer releases.
AB_clf = AdaBoostClassifier(SVC(probability=True, kernel='linear'), n_estimators=1000)
# NOTE(review): this positional (X, y, classifier, trials) call differs from the
# keyword form fit_classifier(clf, num_trials=..., animal_split=..., resampling=...)
# used by other cells -- confirm it matches the current fit_classifier signature.
fit_classifier(X_metrics, y, AB_clf, 5)

In [None]:
## HARD VOTING CLASSIFIER ##
# Majority-vote ensemble of logistic regression, random forest and Gaussian
# naive Bayes; prints 5-fold accuracy for each member and for the ensemble.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='hard',
)

# (label, model) pairs in reporting order; the ensemble comes last.
_labelled_models = (
    ('Logistic Regression', clf1),
    ('Random Forest', clf2),
    ('naive Bayes', clf3),
    ('Ensemble', eclf),
)
for label, clf in _labelled_models:
    scores = cross_val_score(clf, X_metrics, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

fit_classifier(X_metrics, y, eclf, 5)

In [None]:
## SOFT VOTING CLASSIFIER ##
# Probability-averaging ensemble of a shallow decision tree, k-NN and an RBF
# SVC (k-NN weighted double); prints 5-fold accuracy for each member and for
# the ensemble.

from itertools import product
from sklearn import datasets
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Member classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(
    estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
    voting='soft',
    weights=[1, 2, 1],
)

# (label, model) pairs in reporting order; the ensemble comes last.
_labelled_models = (
    ('DecisionTree', clf1),
    ('KNN', clf2),
    ('SVC', clf3),
    ('Ensemble', eclf),
)
for label, clf in _labelled_models:
    scores = cross_val_score(clf, X_metrics, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

fit_classifier(X_metrics, y, eclf, 5)

In [None]:
# 5-fold cross-validated accuracy of `model` on the hand-picked metrics.
# `model` is not defined in this cell; it must already be in scope (the XGB
# grid-search cell assigns it from grid.fit).
scores = cross_val_score(model, X_metrics, y, scoring='accuracy', cv=5)   
print("Accuracy: %0.2f (+/- %0.2f) [XGB Classifier]" % (scores.mean(), scores.std()))