In [25]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split,  KFold, cross_validate, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

from sklearn.metrics import confusion_matrix, \
                  classification_report, accuracy_score,  precision_score, recall_score, f1_score

from IPython.core.interactiveshell import InteractiveShell

pd.set_option('display.precision', 3)

In [26]:
import warnings
warnings.filterwarnings('ignore')

np.random.seed(3451341) # for reproducibility

## Suport functions

In [27]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices
    """
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

In [28]:
ILDS = pd.read_csv("train_features_ILDS.csv", delimiter=',')

ILDS.columns = ['Age', 'Female', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'AR']
ILDS['target'] = pd.read_csv("train_labels_ILDS.csv", delimiter=',')

ILDS.shape

(462, 11)

In [29]:
ILDS.head()

Unnamed: 0,Age,Female,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,AR,target
0,48,0,4.5,2.3,282,13,74,7.0,2.4,0.52,0
1,39,0,1.9,0.9,180,42,62,7.4,4.3,1.38,0
2,23,0,1.0,0.3,212,41,80,6.2,3.1,1.0,0
3,42,1,0.7,0.2,152,35,81,6.2,3.2,1.06,0
4,52,1,0.6,0.1,194,10,12,6.9,3.3,0.9,1


In [30]:
X = ILDS.loc[:, ILDS.columns != 'target']
y = ILDS['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# LDA

In [31]:
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

print('Priors:', lda_model.priors_)

Priors: [0.70873786 0.29126214]


Nota: es pot treure molta informació sobre el model amb funcions que té LDA, per tant si algo falla es pot estudiar bé

In [32]:
confusion(y_train, pd.Series(lda_model.predict(X_train)))    

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,212,7
1,82,8


## Computing metrics

In [33]:
results_df = pd.DataFrame(index=[], columns= ['Accuracy', 'F1 Macro', 'Precision Macro', 'Recall Macro'])

In [34]:
cross_val_results = pd.DataFrame(cross_validate(lda_model , X_train, y_train, cv = 5, 
                            scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['LDA',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.693,0.477,0.56,0.515


# QDA

In [35]:
qda_model = QuadraticDiscriminantAnalysis(reg_param=0.1).fit(X_train, y_train)

print('Priors:', qda_model.priors_)
print('Means:\n')
means =pd.DataFrame(qda_model.means_)
means.columns=ILDS.columns[1:]
means

Priors: [0.70873786 0.29126214]
Means:



Unnamed: 0,Female,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,AR,target
0,46.717,0.233,4.051,1.738,302.283,92.411,119.699,6.418,3.04,0.905
1,41.844,0.289,1.277,0.471,230.522,37.578,46.111,6.492,3.312,1.027


In [36]:
confusion(y_train, pd.Series(qda_model.predict(X_train)))     

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,143
1,8,82


In [37]:
cross_val_results = pd.DataFrame(cross_validate(qda_model , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['QDA',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.693,0.477,0.56,0.515
QDA,0.508,0.506,0.624,0.624


# KNN

In [38]:
knn = KNeighborsClassifier()

knn_cv = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': [1, 3, 5, 7, 10, 15, 20],
        'metric': ['euclidean', 'minkowski', 'manhattan']
    },
    scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
    refit=False
)

knn_cv.fit(X_train, y_train)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [39]:
cols = ['param_n_neighbors', 'param_metric',
     'mean_test_accuracy',
    'mean_test_f1_macro', 'mean_test_precision_macro',
    'mean_test_recall_macro', 
    'std_test_accuracy', 'std_test_f1_macro', 'std_test_precision_macro',
    'std_test_recall_macro'
]
results_cv[cols].sort_values(by='mean_test_f1_macro',ascending=False)

Unnamed: 0,param_n_neighbors,param_metric,mean_test_accuracy,mean_test_f1_macro,mean_test_precision_macro,mean_test_recall_macro,std_test_accuracy,std_test_f1_macro,std_test_precision_macro,std_test_recall_macro
0,1,euclidean,0.705,0.625,0.636,0.625,0.036,0.054,0.046,0.059
7,1,minkowski,0.705,0.625,0.636,0.625,0.036,0.054,0.046,0.059
8,3,minkowski,0.693,0.615,0.624,0.613,0.058,0.067,0.068,0.064
1,3,euclidean,0.693,0.615,0.624,0.613,0.058,0.067,0.068,0.064
14,1,manhattan,0.68,0.598,0.613,0.601,0.047,0.052,0.062,0.048
2,5,euclidean,0.696,0.587,0.617,0.583,0.036,0.034,0.058,0.03
9,5,minkowski,0.696,0.587,0.617,0.583,0.036,0.034,0.058,0.03
15,3,manhattan,0.667,0.586,0.6,0.585,0.073,0.072,0.081,0.07
10,7,minkowski,0.702,0.579,0.616,0.577,0.042,0.059,0.074,0.05
3,7,euclidean,0.702,0.579,0.616,0.577,0.042,0.059,0.074,0.05


In [47]:
knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn.fit(X_train, y_train)

cross_val_results = pd.DataFrame(cross_validate(knn , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['KNN',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.693,0.477,0.56,0.515
QDA,0.508,0.506,0.624,0.624
KNN,0.705,0.625,0.636,0.625
Gaussian Naive Bayes,0.502,0.499,0.632,0.626
Logistic Regression,0.725,0.507,0.611,0.544


# Gaussian Naive Bayes

In [41]:
gaussian_nb = GaussianNB()

gaussian_nb.fit(X_train,y_train)


cross_val_results = pd.DataFrame(cross_validate(gaussian_nb , X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['Gaussian Naive Bayes',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values
results_df

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
LDA,0.693,0.477,0.56,0.515
QDA,0.508,0.506,0.624,0.624
KNN,0.702,0.579,0.616,0.577
Gaussian Naive Bayes,0.502,0.499,0.632,0.626


# Logistic regression

In [42]:
logreg = LogisticRegressionCV(Cs=20, random_state=1, cv = 10, scoring = 'accuracy', multi_class='multinomial')

logreg.fit(X_train, y_train)

In [43]:
print(logreg.scores_.keys())

dict_keys([1])


In [44]:
# figure out optimal parameter value for 'C'

avg_crossval_scores = logreg.scores_[1].mean(axis=0)
idx = np.argmax(avg_crossval_scores)
best_C = logreg.Cs_[idx]
print(best_C)

0.0006951927961775605


In [45]:
logreg = LogisticRegression(C=best_C, multi_class='multinomial')
cross_val_results = pd.DataFrame(cross_validate(logreg, X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

results_df.loc['Logistic Regression',:] = cross_val_results[['test_accuracy', 'test_f1_macro',
       'test_precision_macro', 'test_recall_macro']].mean().values

results_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,F1 Macro,Precision Macro,Recall Macro
Logistic Regression,0.725,0.507,0.611,0.544
KNN,0.702,0.579,0.616,0.577
LDA,0.693,0.477,0.56,0.515
QDA,0.508,0.506,0.624,0.624
Gaussian Naive Bayes,0.502,0.499,0.632,0.626


# Test

In [62]:
ILDS_test = pd.read_csv("test_data_ILDS.csv", delimiter=',', header = None)

ILDS_test.columns = ['Age', 'Female', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'AR']

X_test = ILDS_test.loc[:,:'AR']

ILDS_test['Label'] = gaussian_nb.predict(X_test)

ILDS_test.head()

ILDS_test.index = ILDS_test.index + 1
ILDS_test.index.name = 'ID'

ILDS_test['Label'].to_csv('lin_class_raw.csv', index=True)

In [60]:
X_test

Unnamed: 0,Age,Female,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,AR
0,11,0,0.7,0.1,592,26,29,7.1,4.2,1.40
1,62,0,1.8,0.9,224,69,155,8.6,4.0,0.80
2,60,0,0.7,0.2,174,32,14,7.8,4.2,1.10
3,60,0,5.7,2.8,214,412,850,7.3,3.2,0.78
4,48,1,0.9,0.2,175,24,54,5.5,2.7,0.90
...,...,...,...,...,...,...,...,...,...,...
111,69,1,0.8,0.2,146,42,70,8.4,4.9,1.40
112,35,0,26.3,12.1,108,168,630,9.2,2.0,0.30
113,42,0,0.8,0.2,114,21,23,7.0,3.0,0.70
114,18,0,0.6,0.2,538,33,34,7.5,3.2,0.70
