LDA, QDA, NB on wine dataset

In [8]:
# importing libraries
import numpy as np
import requests
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import urllib
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

Read the wine dataset from the UCI repository

In [None]:
# Remote location of the raw, header-less wine data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# Thirteen column names are supplied for the thirteen measurements.
# NOTE(review): later cells read the class labels (1, 2, 3) from `data.index`,
# which suggests the file carries one extra leading column that pandas turns
# into the index because `names` is one entry short - confirm.
data = pd.read_csv(
    url,
    names=[
        'alcohol',
        'malic_acid',
        'ash',
        'alcalinity',
        'magnesium',
        'total_phenols',
        'flavanoids',
        'nonflavanoid_phenols',
        'proanthocyanins',
        'color_intensity',
        'hue',
        'code',
        'proline',
    ],
    sep=',',
    encoding='utf-8',
)


compare LDA, QDA, NB on full dataset

In [25]:
# One default-configured estimator per method being compared. These
# module-level instances are refit in place by every params_display call.
lda, qda, nb = (
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    GaussianNB(),
)

In [128]:
# build extended confusion matrix 

def CM_large(x, y):
    """Print the confusion matrix of ``x`` vs ``y`` and return macro-averaged metrics.

    The table is built with ``pd.crosstab(x, y)``: rows are the values of
    ``x`` (the callers in this notebook pass the true labels), columns are the
    values of ``y`` (the predictions). Per-class counts are derived one-vs-rest
    and every metric is averaged over the classes with equal weight.

    The table is reindexed to the union of the labels seen in ``x`` and ``y``
    so it is always square. Without this, a class that is never predicted (or
    never present in ``x``) yields a rectangular table whose diagonal no
    longer lines up with the row/column sums.

    Parameters
    ----------
    x : array-like
        Labels tabulated along the rows.
    y : array-like
        Labels tabulated along the columns.

    Returns
    -------
    tuple
        ``(acc, fpr, tpr, ppv, npv)``: class-averaged accuracy, false positive
        rate (1 - specificity), recall, precision and negative predictive value.

    Notes
    -----
    A class whose ratio has a zero denominator (e.g. precision of a class that
    is never predicted) gets NaN for that ratio.
    NOTE(review): ``np.mean`` on the resulting pandas Series appears to skip
    such NaN entries - confirm this is the intended averaging.
    """
    cm = pd.crosstab(x, y)

    # Square the table over every label seen on either axis. A plain list is
    # passed so the axis names printed by crosstab are kept.
    labels = cm.index.union(cm.columns).tolist()
    cm = cm.reindex(index=labels, columns=labels, fill_value=0)
    print(cm)

    TP = np.diag(cm.to_numpy())
    FP = cm.sum(axis=0) - TP  # column total minus hits: predicted k, actually not k
    FN = cm.sum(axis=1) - TP  # row total minus hits: actually k, predicted not k
    TN = cm.sum().sum() - (FP + FN + TP)

    TPR = TP / (TP + FN)  # recall, sensitivity
    PPV = TP / (TP + FP)  # precision
    NPV = TN / (TN + FN)
    FPR = FP / (FP + TN)  # 1 - specificity
    ACC = (TP + TN) / (TP + FP + FN + TN)

    return np.mean(ACC), np.mean(FPR), np.mean(TPR), np.mean(PPV), np.mean(NPV)


In [135]:
def params_display(M):
    """Fit LDA, QDA and NB on the first ``M`` columns of ``data`` and show their scores.

    The module-level estimators ``lda``, ``qda`` and ``nb`` are refit in place
    with ``data.index`` as the target, then evaluated on the very same rows
    they were trained on (no hold-out). ``CM_large`` prints one confusion
    matrix per model; the summary table is displayed sorted by accuracy,
    best first.

    Parameters
    ----------
    M : int
        Number of leading columns of ``data`` used as features.
    """
    features = data.iloc[:, 0:M]
    target = data.index
    estimators = {"lda": lda, "qda": qda, "nb": nb}

    # Train all models before predicting with any of them.
    for estimator in estimators.values():
        estimator.fit(features, target)

    # One column of in-sample predictions per model.
    preds = pd.DataFrame(
        {label: estimator.predict(features) for label, estimator in estimators.items()}
    )

    scores = [CM_large(target, preds[label]) for label in preds.columns]
    res = pd.DataFrame(
        scores,
        index=preds.columns,
        columns=["acc", "fpr", "tpr", "ppv", "npv"],
    )
    # Shown in the table's corner so each output states how many columns were used.
    res.columns.name = f"{M} columns"
    display(res.sort_values('acc', ascending=False))

In [136]:
# Score the three classifiers using all 13 columns of `data`.
params_display(13)

lda     1   2   3
row_0            
1      59   0   0
2       0  71   0
3       0   0  48
qda     1   2   3
row_0            
1      59   0   0
2       1  70   0
3       0   0  48
nb      1   2   3
row_0            
1      58   1   0
2       0  70   1
3       0   0  48


13 columns,acc,fpr,tpr,ppv,npv
lda,1.0,0.0,1.0,1.0,1.0
qda,0.996255,0.002801,0.995305,0.994444,0.996914
nb,0.992509,0.005679,0.989655,0.988502,0.994107


Compare LDA, QDA, NB using only the first 2, 5, and 10 columns

In [137]:
# Score the three classifiers using only the first 2 columns of `data`.
params_display(2)

lda     1   2   3
row_0            
1      51   1   7
2       5  61   5
3       7   9  32
qda     1   2   3
row_0            
1      52   1   6
2       4  62   5
3       7  10  31
nb      1   2   3
row_0            
1      51   1   7
2       4  62   5
3       7  10  31


2 columns,acc,fpr,tpr,ppv,npv
qda,0.876404,0.093285,0.800143,0.804269,0.909472
lda,0.872659,0.095535,0.796743,0.79865,0.905858
nb,0.872659,0.095849,0.794493,0.797609,0.906465


In [138]:
# Score the three classifiers using only the first 5 columns of `data`.
params_display(5)

lda     1   2   3
row_0            
1      54   1   4
2       1  65   5
3       3   8  37
qda     1   2   3
row_0            
1      53   1   5
2       1  65   5
3       2   6  40
nb      1   2   3
row_0            
1      52   3   4
2       2  62   7
3       3   7  38


5 columns,acc,fpr,tpr,ppv,npv
qda,0.925094,0.055851,0.882377,0.883069,0.943905
lda,0.917603,0.062319,0.867194,0.871254,0.939103
nb,0.902622,0.073363,0.848754,0.849634,0.926575


In [139]:
# Score the three classifiers using only the first 10 columns of `data`.
params_display(10)

lda     1   2   3
row_0            
1      59   0   0
2       1  70   0
3       0   1  47
qda     1   2   3
row_0            
1      59   0   0
2       1  70   0
3       0   0  48
nb      1   2   3
row_0            
1      56   3   0
2       2  68   1
3       0   1  47


10 columns,acc,fpr,tpr,ppv,npv
qda,0.996255,0.002801,0.995305,0.994444,0.996914
lda,0.992509,0.005916,0.988361,0.98975,0.99434
nb,0.973783,0.020627,0.962022,0.963043,0.979669


Create train (50%), validation (25%), and test (25%) splits using the first 2 columns

In [None]:
# The class labels are stored in the frame's index (`data` has no `klasa`
# column), so expose them as a Series for splitting and stratification.
labels = data.index.to_series(name="klasa")

# Fixed seed so both splits are identical on every Restart & Run All.
SPLIT_SEED = 42

# First cut: 50% train, 50% held back, keeping class proportions.
x_train, x_rest, y_train, y_rest = train_test_split(
    data.iloc[:, :2],
    labels,
    test_size=0.5,
    stratify=labels,
    random_state=SPLIT_SEED,
)

# Second cut: halve the held-back part into test and validation,
# i.e. 25% of the full dataset each.
x_test, x_val, y_test, y_val = train_test_split(
    x_rest,
    y_rest,
    test_size=0.5,
    stratify=y_rest,
    random_state=SPLIT_SEED,
)

# Report the size of each split.
print("Rozmiary zbiorów:")
print("Train:", len(x_train))
print("Val:", len(x_val))
print("Test:", len(x_test))