Cross validation using Quadratic Discriminant Analysis

In [1]:
from os import listdir
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.metrics import zero_one_loss, confusion_matrix, accuracy_score, balanced_accuracy_score
import matplotlib.pyplot as plt
from matplotlib import colors
# Two-entry colormap for plotting the binary class labels.
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1

In [2]:
# Directories of per-file CSV chunks, one directory per class.
# Files under pathR become the positive class (label 1) and files under pathN
# the negative class (label 0) when labels are built further down.
# The trailing slash is kept so a file name can be appended directly.
pathR='data/ChunkedData_R/'
pathN='data/ChunkedData_NR/'
filesR = listdir(pathR)
filesN = listdir(pathN)
# Show one file name as a sanity check (listdir order is not guaranteed).
filesR[0]

'Daily_2060_S3.csv'

In [3]:
# Read one CSV file. Drop the date column.
def file_mean (filepath):
    """Collapse one CSV file into a single row of per-column means.

    The 'Date' column is discarded before averaging. The result is a
    one-row DataFrame whose columns are the remaining CSV columns.
    """
    frame = pd.read_csv(filepath).drop(columns='Date')
    # mean() yields a Series indexed by column name; turn it into a 1-row frame.
    column_means = frame.mean(axis=0)
    return column_means.to_frame().T

In [4]:
# Read directory of CSV files. Retain only one row per file = column averages.
def mean_per_file (directory):
    """Build a DataFrame with one row of column means per CSV file in a directory.

    Parameters
    ----------
    directory : str
        Path of the directory to scan. Every entry returned by ``listdir``
        is passed to ``file_mean``. A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        One row per file, renumbered sequentially from 0. An empty
        DataFrame is returned when the directory has no entries.
    """
    from os.path import join  # local import: only `listdir` is imported at file level

    # Collect the one-row frames first and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and growing a frame
    # row by row copies the accumulated data on every iteration.
    rows = [file_mean(join(directory, fp)) for fp in listdir(directory)]
    if not rows:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    # Let Pandas number the rows sequentially.
    return pd.concat(rows, ignore_index=True)

In [5]:
# One row of column means per file, for each class directory.
meansR = mean_per_file(pathR)
meansN = mean_per_file(pathN)
# Stack the pathR rows first, then the pathN rows, with a fresh 0..n-1 index.
# The label vector built later relies on this same order.
meansAll = pd.concat((meansR,meansN),ignore_index=True)

In [6]:
def make_labels(positives,negatives):
    """Create a one-column label frame: 1 for each positive row, then 0 for each negative row.

    Only the row counts of the two inputs are used. The result has
    len(positives) + len(negatives) rows, int8 values and a fresh index.
    """
    n_pos, n_neg = positives.shape[0], negatives.shape[0]
    parts = (
        pd.DataFrame(np.ones(n_pos,dtype=np.int8)),   # one = positive = blue
        pd.DataFrame(np.zeros(n_neg,dtype=np.int8)),  # zero = negative = red
    )
    return pd.concat(parts,ignore_index=True)

In [7]:
# Labels follow the row order of meansAll: meansR rows (1) first, then meansN rows (0).
labelsAll = make_labels(meansR,meansN)
print(labelsAll.shape)
#print(labelsAll.T)
# Flatten the (n, 1) label frame into a 1-D array.
y1d = labelsAll.values.ravel() # required for sklearn models
y1d

(40, 1)


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [8]:
def _select_features(X, columns, scale):
    """Return X restricted to `columns`, standardized with StandardScaler when `scale` is true."""
    z = X[columns]
    if scale:
        # A new scaler is fitted on exactly the rows passed in.
        # fit_transform hands back an array rather than a DataFrame.
        z = StandardScaler().fit_transform(z)
    return z


def make_question_features(X,scale=True):
    """Select the first 13 columns of X (Excel columns B-N).

    Returns the standardized values when `scale` is true, otherwise the
    unscaled DataFrame slice.
    """
    return _select_features(X, X.columns[:13], scale)


def make_physiol_features(X,scale=True):
    """Select the columns of X from position 249 onward (Excel columns IQ-IZ).

    Returns the standardized values when `scale` is true, otherwise the
    unscaled DataFrame slice.
    """
    return _select_features(X, X.columns[249:], scale)


def make_all_features(X,scale=True):
    """Select every column of X.

    Returns the standardized values when `scale` is true, otherwise the
    unscaled DataFrame.
    """
    return _select_features(X, X.columns, scale)
# Build the three standardized feature matrices used by the cross-validation cells.
# NOTE(review): scaling is fitted here on all rows, i.e. before any
# train/test split, so test-fold rows influence the scaling of the
# training data. Confirm this is acceptable for the reported scores.
X_questionFeatures = make_question_features(meansAll)
X_physiolFeatures = make_physiol_features(meansAll)
X_allFeatures = make_all_features(meansAll)

In [9]:
def do_cross_val(model,splits,X,y,verbose=False,random_state=456):
    """Run shuffled stratified k-fold cross validation and sum the per-fold confusion matrices.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        The same object is refitted on every fold.
    splits : int
        Number of folds.
    X : array or pandas DataFrame
        Feature rows. pandas input is converted to a NumPy array.
    y : array, pandas Series or one-column DataFrame
        Binary labels (0 / 1), one per row of X.
    verbose : bool
        When true, print indices, predictions and scores for each fold.
    random_state : int or None
        Seed for the fold shuffling. Defaults to the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        2x2 matrix laid out as [[tn, fp], [fn, tp]], summed over folds.
    """
    # The fold indices are positional. Indexing a DataFrame with them would be
    # treated as a column lookup, so convert pandas inputs to arrays first.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.to_numpy().ravel()
    skf = StratifiedKFold(n_splits=splits, random_state=random_state, shuffle=True)
    # Wide integer accumulator so summed counts cannot wrap.
    confusion = np.zeros(shape=[2,2],dtype=np.int64)
    for train_index, test_index in skf.split(X, y):
        X_train,y_train = X[train_index],y[train_index]
        model.fit(X_train, y_train)
        X_test,y_test = X[test_index],y[test_index]
        y_pred = model.predict(X_test)
        # Labels attribute says generate all 4 counts even if none of any category.
        # https://stackoverflow.com/questions/46229965/how-to-make-sklearn-metrics-confusion-matrix-to-always-return-tp-tn-fp-fn
        cf = confusion_matrix(y_test,y_pred,labels=[0,1])
        confusion = confusion + cf
        if verbose:
            ba = balanced_accuracy_score(y_test,y_pred)
            acc = accuracy_score(y_test,y_pred)
            # unintuitive order but from documentation
            tn, fp, fn, tp = cf.ravel()
            print(" Array indices. Train:",train_index, " Test:",test_index)
            print("  y_test=",y_test," y_pred=",y_pred)
            print("  acc=%.2f ba=%.2f tp=%d fp=%d fn=%d tn=%d"%(acc,ba,tp,fp,fn,tn))
    return confusion
def _confusion_rows(cm):
    """Format one 2x2 confusion matrix as its two printable rows (counts plus acc / bal)."""
    # ravel() order is tn, fp, fn, tp for a matrix laid out [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = np.asarray(cm).ravel()
    acc = (tp+tn)/(tp+tn+fp+fn)
    tpr = tp / (tp+fn)
    tnr = tn / (tn+fp)
    # Balanced accuracy: mean of the true-positive and true-negative rates.
    bal = (tpr+tnr)/2
    top = ' tp fp | %2d %2d | acc=%.2f'%(tp,fp,acc)
    bottom = ' fn tn | %2d %2d | bal=%.2f'%(fn,tn,bal)
    return top, bottom


def print_confusion(label1,cm1,label2,cm2):
    """Print two confusion matrices side by side with accuracy and balanced accuracy.

    Parameters
    ----------
    label1, label2 : str
        Headings for the left and right matrix.
    cm1, cm2 : 2x2 array-like
        Confusion matrices laid out as [[tn, fp], [fn, tp]].

    Three lines are printed: the headings, then "tp fp" counts with
    accuracy, then "fn tn" counts with balanced accuracy. Each half is
    left-justified in a 30-character column.
    """
    TL, BL = _confusion_rows(cm1)
    TR, BR = _confusion_rows(cm2)
    print("%-30s %-30s"%(label1,label2))
    print("%-30s %-30s"%(TL,TR))
    print("%-30s %-30s"%(BL,BR))

In [12]:
def get_model():
    """Create a new, unfitted LinearDiscriminantAnalysis classifier using the 'lsqr' solver."""
    classifier = LinearDiscriminantAnalysis(solver='lsqr')
    return classifier
# One verbose 5-fold run on the question features, to inspect the
# per-fold index splits, predictions and scores.
print("Verbose validation run")
confusion = do_cross_val(get_model(),5,X_questionFeatures, y1d, True)

Verbose validation run
 Array indices. Train: [ 0  2  3  5  7  8  9 10 11 12 13 14 15 17 18 19 20 21 22 23 25 26 27 28
 30 31 33 34 35 36 37 38]  Test: [ 1  4  6 16 24 29 32 39]
  y_test= [1 1 1 0 0 0 0 0]  y_pred= [1 1 1 0 0 0 0 0]
  acc=1.00 ba=1.00 tp=3 fp=0 fn=0 tn=5
 Array indices. Train: [ 0  1  2  3  4  6  7  8 10 11 13 14 15 16 17 18 20 21 23 24 25 26 27 29
 31 32 34 35 36 37 38 39]  Test: [ 5  9 12 19 22 28 30 33]
  y_test= [1 1 1 0 0 0 0 0]  y_pred= [0 1 1 0 0 0 0 0]
  acc=0.88 ba=0.83 tp=2 fp=0 fn=1 tn=5
 Array indices. Train: [ 1  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 21 22 23 24 25 26 28 29
 30 31 32 33 35 36 37 39]  Test: [ 0  2  3 18 20 27 34 38]
  y_test= [1 1 1 0 0 0 0 0]  y_pred= [1 0 1 1 0 0 0 0]
  acc=0.75 ba=0.73 tp=2 fp=1 fn=1 tn=4
 Array indices. Train: [ 0  1  2  3  4  5  6  8  9 11 12 14 15 16 18 19 20 22 24 25 27 28 29 30
 32 33 34 35 36 37 38 39]  Test: [ 7 10 13 17 21 23 26 31]
  y_test= [1 1 1 0 0 0 0 0]  y_pred= [1 0 0 0 1 1 0 0]
  acc=0.50 ba=0.47 t

In [13]:
import warnings
# Suppress sklearn warnings that a test set has zero instances of one category.
warnings.filterwarnings("ignore", category=UserWarning)

# Question features vs. physiological features, compared at each fold count.
cv_runs = (
    (5, "Linear Discriminant Analysis with 5-fold Cross Validation"),
    (10, "Linear Discriminant Analysis with 10-fold Cross Validation"),
    (20, "Linear Discriminant Analysis with 20-fold Cross Validation"),
)
for folds, heading in cv_runs:
    print(heading)
    confusionQ = do_cross_val(get_model(), folds, X_questionFeatures, y1d)
    confusionP = do_cross_val(get_model(), folds, X_physiolFeatures, y1d)
    print_confusion("QuestionFeatures", confusionQ, "PhysiolFeatures", confusionP)

# All columns together, 5-fold next to 10-fold.
print("Linear Discriminant Analysis with All Features")
confusion5 = do_cross_val(get_model(), 5, X_allFeatures, y1d)
confusion10 = do_cross_val(get_model(), 10, X_allFeatures, y1d)
print_confusion("5-fold CV", confusion5, "10-fold CV", confusion10)

Linear Discriminant Analysis with 5-fold Cross Validation
QuestionFeatures               PhysiolFeatures               
 tp fp | 10  3 | acc=0.82       tp fp |  9  6 | acc=0.72     
 fn tn |  4 23 | bal=0.80       fn tn |  5 20 | bal=0.71     
Linear Discriminant Analysis with 10-fold Cross Validation
QuestionFeatures               PhysiolFeatures               
 tp fp |  9  4 | acc=0.78       tp fp |  9  5 | acc=0.75     
 fn tn |  5 22 | bal=0.74       fn tn |  5 21 | bal=0.73     
Linear Discriminant Analysis with 20-fold Cross Validation
QuestionFeatures               PhysiolFeatures               
 tp fp | 11  3 | acc=0.85       tp fp |  9  7 | acc=0.70     
 fn tn |  3 23 | bal=0.84       fn tn |  5 19 | bal=0.69     
Linear Discriminant Analysis with All Features
5-fold CV                      10-fold CV                    
 tp fp |  8 16 | acc=0.45       tp fp |  5 12 | acc=0.47     
 fn tn |  6 10 | bal=0.48       fn tn |  9 14 | bal=0.45     
