In [485]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split,  KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

from sklearn.metrics import confusion_matrix, \
                  classification_report, accuracy_score,  precision_score, recall_score, f1_score

from IPython.core.interactiveshell import InteractiveShell

pd.set_option('display.precision', 3)

In [486]:
import warnings
warnings.filterwarnings('ignore')

np.random.seed(3451341) # for reproducibility

## Suport functions

In [487]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

In [488]:
ILDS = pd.read_csv("log_ILDS_train_X.csv", delimiter=',')
ILDS.columns = ['Age','TP','ALB','AR','DBratio','logTB','logDB','logAlkphos','logSgpt','logSgot','Female', 'Target']

ILDS.head()

Unnamed: 0,Age,TP,ALB,AR,DBratio,logTB,logDB,logAlkphos,logSgpt,logSgot,Female,Target
0,0.18,0.553,-0.911,-1.419,-1.02,1.113,1.204,0.41,-1.447,0.55,0,0
1,-0.37,0.937,1.534,1.523,-0.907,0.229,0.474,-0.571,0.121,0.338,0,0
2,-1.346,-0.216,-0.011,0.223,-0.01,-0.429,-0.381,-0.213,0.088,0.643,0,0
3,-0.186,-0.216,0.118,0.428,0.112,-0.795,-0.697,-0.941,-0.123,0.658,1,0
4,0.424,0.457,0.247,-0.12,1.947,-0.953,-1.236,-0.407,-1.798,-1.627,1,1


In [489]:
X = ILDS.loc[:, ILDS.columns != 'Target']
y = ILDS['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# LDA

In [490]:
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

print('Priors:', lda_model.priors_)

Priors: [0.48674699 0.51325301]


In [491]:
y_train

535    1
144    1
302    0
567    1
558    1
      ..
71     1
106    0
270    0
435    0
102    1
Name: Target, Length: 415, dtype: int64

In [492]:
pd.Series(lda_model.predict(X_train))

0      1
1      1
2      0
3      0
4      1
      ..
410    1
411    1
412    0
413    1
414    1
Length: 415, dtype: int64

Nota: es pot treure molta informació sobre el model amb funcions que té LDA, per tant si algo falla es pot estudiar bé

In [493]:
confusion(y_train, pd.Series(lda_model.predict(X_train))) 

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,106,96
1,37,176


## Computing metrics

In [494]:
results_df = pd.DataFrame(index=[], columns= ['Accuracy', 'F1 Macro', 'Precision Macro', 'Recall Macro'])

In [495]:
cross_val_results = pd.DataFrame(cross_validate(lda_model , X_train, y_train, cv = 5, 
                            scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['LDA',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.665,0.655,0.682,0.661


# QDA

In [496]:
qda_model = QuadraticDiscriminantAnalysis(reg_param=0.1).fit(X_train, y_train)

print('Priors:', qda_model.priors_)
print('Means:\n')
means =pd.DataFrame(qda_model.means_)
means.columns=ILDS.columns[1:]
means

Priors: [0.48674699 0.51325301]
Means:



Unnamed: 0,TP,ALB,AR,DBratio,logTB,logDB,logAlkphos,logSgpt,logSgot,Female,Target
0,0.053,0.04944,-0.051,-0.129,-0.045,0.177,0.159,0.108,0.122,0.132,0.257
1,-0.258,-0.0003105,0.202,0.285,0.323,-0.513,-0.503,-0.448,-0.47,-0.509,0.221


In [497]:
confusion(y_train, pd.Series(qda_model.predict(X_train)))     

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,104,98
1,24,189


In [498]:
cross_val_results = pd.DataFrame(cross_validate(qda_model , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['QDA',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.665,0.655,0.682,0.661
QDA,0.689,0.676,0.716,0.685


# KNN

In [499]:
knn = KNeighborsClassifier()

knn_cv = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': [5, 7, 10, 15, 20],
        'metric': ['euclidean', 'minkowski', 'manhattan']
    },
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    refit=False
)

knn_cv.fit(X_train, y_train)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [500]:
cols = ['param_n_neighbors', 'param_metric',
     'mean_test_accuracy',
    'mean_test_f1_macro', 'mean_test_precision_macro',
    'mean_test_recall_macro', 
    'std_test_accuracy', 'std_test_f1_macro', 'std_test_precision_macro',
    'std_test_recall_macro'
]
results_cv[cols].sort_values(by='mean_test_f1_macro',ascending=False)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_accuracy,mean_test_f1_macro,mean_test_precision_macro,mean_test_recall_macro,std_test_accuracy,std_test_f1_macro,std_test_precision_macro,std_test_recall_macro
2,10,euclidean,0.708,0.699,0.73,0.704,0.031,0.03,0.038,0.03
7,10,minkowski,0.708,0.699,0.73,0.704,0.031,0.03,0.038,0.03
1,7,euclidean,0.708,0.693,0.746,0.703,0.022,0.023,0.026,0.021
6,7,minkowski,0.708,0.693,0.746,0.703,0.022,0.023,0.026,0.021
0,5,euclidean,0.701,0.689,0.726,0.696,0.012,0.012,0.014,0.012
5,5,minkowski,0.701,0.689,0.726,0.696,0.012,0.012,0.014,0.012
12,10,manhattan,0.684,0.675,0.699,0.681,0.021,0.021,0.024,0.02
4,20,euclidean,0.692,0.675,0.728,0.686,0.038,0.039,0.045,0.036
9,20,minkowski,0.692,0.675,0.728,0.686,0.038,0.039,0.045,0.036
3,15,euclidean,0.696,0.674,0.752,0.69,0.034,0.038,0.033,0.031


In [501]:
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')

cross_val_results = pd.DataFrame(cross_validate(knn , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))
knn.fit(X_train, y_train)

results_df.loc['KNN',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.665,0.655,0.682,0.661
QDA,0.689,0.676,0.716,0.685
KNN,0.708,0.699,0.73,0.704


# Gaussian Naive Bayes

In [502]:
gaussian_nb = GaussianNB()

gaussian_nb.fit(X_train,y_train)


cross_val_results = pd.DataFrame(cross_validate(gaussian_nb , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['Gaussian Naive Bayes',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.665,0.655,0.682,0.661
QDA,0.689,0.676,0.716,0.685
KNN,0.708,0.699,0.73,0.704
Gaussian Naive Bayes,0.692,0.679,0.719,0.687


# Logistic regression

In [503]:
logreg = LogisticRegressionCV(Cs=20, random_state=1, cv = 10, scoring = 'accuracy', multi_class='multinomial')

logreg.fit(X_train, y_train)

In [504]:
print(logreg.scores_.keys())

dict_keys([1])


In [505]:
# figure out optimal parameter value for 'C'

avg_crossval_scores = logreg.scores_[1].mean(axis=0)
idx = np.argmax(avg_crossval_scores)
best_C = logreg.Cs_[idx]
print(best_C)

0.012742749857031334


In [506]:
logreg = LogisticRegression(C=best_C, multi_class='multinomial')
logreg_model = logreg.fit(X_train, y_train)
cross_val_results = pd.DataFrame(cross_validate(logreg, X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['Logistic Regression',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values

results_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
KNN,0.708,0.699,0.73,0.704
Logistic Regression,0.696,0.689,0.709,0.693
Gaussian Naive Bayes,0.692,0.679,0.719,0.687
QDA,0.689,0.676,0.716,0.685
LDA,0.665,0.655,0.682,0.661


# Test

In [507]:
ILDS_test = pd.read_csv("log_ILDS_test_X.csv", delimiter=',', header=None)

ILDS_test.columns = ['Age','TP','ALB','AR','DBratio','logTB','logDB','logAlkphos','logSgpt','logSgot','Female']

X_test = ILDS_test.loc[:,:'Female']

ILDS_test['Label'] = qda_model.predict(X_test)

ILDS_test.head()

ILDS_test.index = ILDS_test.index + 1
ILDS_test.index.name = 'ID'

ILDS_test['Label'].to_csv('qda.csv', index=True)