# In [None]:
## Import pandas and scikit libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import svm, preprocessing, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Import libraries for time
from datetime import datetime
from datetime import date
import json

## Classifier to be used with different sklearn Estimators
## Performs LOOCV for testing, 10-fold CV for hyperparameter tuning
## Uses ROC AUC as score metric
def _json_default(value):
    """Convert NumPy scalars and arrays into built-in types for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(value).__name__)
    )


def loo_cv(Estimator, parameters, filename, data_path="data_preproc.csv",
           n_features=4, scoring=None):
    """Evaluate an estimator with leave-one-out CV and an inner grid search.

    For every leave-one-out split the features are standardized using the
    training rows only, a ``GridSearchCV`` (10-fold) is fit on the training
    rows, and the refit best model predicts the single held-out row.

    Side effects: prints progress information and writes three files:

    * ``<filename>.json``      -- best parameters and inner CV score per split
    * ``<filename>_pred.csv``  -- one row per sample: predicted label, then the
      ``predict_proba`` value of column 1 (or the predicted label again when
      the estimator exposes no ``predict_proba``); no header, no index
    * ``<filename>_info.csv``  -- run time, date, test AUCs and the grid

    Args:
        Estimator: scikit-learn estimator instance passed to ``GridSearchCV``.
        parameters: parameter grid passed to ``GridSearchCV``.
        filename: prefix (path without extension) of the output files.
        data_path: CSV file to read; it must contain a ``class`` column,
            which is used as the target.
        n_features: number of feature columns to use, taken starting at
            column position 1 (column 0 is skipped).
        scoring: scorer forwarded to ``GridSearchCV``. ``None`` keeps the
            historical default ``make_scorer(roc_auc_score)``, which scores
            the hard labels returned by ``predict``; pass ``"roc_auc"`` to
            tune on continuous scores instead.

    Returns:
        None.
    """
    t1 = datetime.now()  # Start time

    df = pd.read_csv(data_path)  # data file
    Xr_t = df.iloc[:, 1:1 + n_features]
    yr = df['class']
    print(Xr_t.head())  # Check file

    # Plain arrays so that the split indices are always positional,
    # whatever index the CSV produced.
    x_all = Xr_t.values
    y_all = yr.values

    if scoring is None:
        scoring = metrics.make_scorer(metrics.roc_auc_score)

    loo = LeaveOneOut()
    best_params = []  # Best params (plus inner CV score) of each split
    y_pred = []       # Predicted label of each held-out sample
    y_pred_prob = []  # Predicted probability of each held-out sample
    print("\n\nLongitud del dataset: " + str(loo.get_n_splits(x_all)))  # Number of splits

    for train_idx, test_idx in loo.split(x_all, y_all):
        # Fit the scaler on the training rows only: fitting it on the whole
        # dataset would leak the held-out sample into the training data.
        scaler = preprocessing.StandardScaler().fit(x_all[train_idx])
        train_set_x = scaler.transform(x_all[train_idx])
        test_set_x = scaler.transform(x_all[test_idx])
        train_set_y = y_all[train_idx]

        cv_model = GridSearchCV(Estimator, parameters,
                                scoring=scoring,
                                n_jobs=-1, cv=10)
        cv_model.fit(train_set_x, train_set_y)  # Fit to train data

        # Copy so the fitted search object's own dict is not modified.
        param = dict(cv_model.best_params_)
        param['score'] = cv_model.best_score_
        best_params.append(param)

        # predict() returns a length-1 array here; keep the scalar so the
        # output columns hold plain values instead of arrays.
        yhat = cv_model.predict(test_set_x)[0]
        y_pred.append(yhat)
        try:
            yhat_prob = cv_model.predict_proba(test_set_x)
        except AttributeError:
            y_pred_prob.append(yhat)  # Estimator doesn't return probabilities
        else:
            y_pred_prob.append(yhat_prob[0, 1])

    y_pred_df = pd.DataFrame({'bin': y_pred, 'prob': y_pred_prob})
    print(y_pred_df.head())
    print('\nScorer: ' + str(cv_model.scorer_))

    # Write to files
    filename1 = filename + '.json'
    filename2 = filename + '_pred.csv'
    filename3 = filename + '_info.csv'
    y_pred_df.to_csv(filename2, index=False, header=False)
    with open(filename1, 'w') as outfile:
        # The default hook lets NumPy-typed grid values be serialized.
        json.dump(best_params, outfile, default=_json_default)

    t2 = datetime.now()
    delta = str(t2 - t1)
    print('\nTiempo de ejecución: ' + delta)
    score_test = metrics.roc_auc_score(y_all, y_pred_df['bin'])
    print('\nScore (AUC) de test binario: ', score_test)
    score_test_prob = metrics.roc_auc_score(y_all, y_pred_df['prob'])
    print('\nScore (AUC) de test proba: ', score_test_prob)

    # The report contains non-ASCII text, so fix the encoding explicitly.
    with open(filename3, 'w', encoding='utf-8') as f:
        f.write('Tiempo de ejecución: ' + delta)
        f.write('\nFecha: ' + str(date.today()))
        f.write('\nScore (AUC) de test proba: ' + str(score_test_prob))
        f.write('\nScore (AUC) de test binario: ' + str(score_test))
        f.write('\n')
        f.write(str(parameters))