In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [59]:
# Load the full dataset from disk, then preview the first ten rows to
# confirm the columns were parsed as expected.
df = pd.read_csv("diabetes_70k.csv")
df.head(n=10)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,class
0,1,0,1,31,1,0,0,0,0,0,...,0,1,5,0,0,0,10,4,7,0
1,1,1,1,27,0,0,0,1,0,1,...,0,2,0,0,0,0,11,5,7,0
2,0,1,1,32,1,0,1,0,1,1,...,0,5,0,15,1,0,10,5,5,0
3,0,1,1,23,1,1,0,1,1,0,...,0,2,0,3,0,0,12,5,8,0
4,0,1,1,37,1,0,0,0,0,1,...,0,3,21,14,1,0,6,5,1,0
5,0,0,0,27,0,0,0,1,0,0,...,1,4,0,0,0,1,7,5,3,0
6,0,1,1,37,1,0,0,1,0,0,...,0,4,0,30,0,0,9,4,5,0
7,0,0,1,22,1,0,0,1,0,0,...,0,2,0,0,0,1,10,6,8,0
8,1,0,1,17,1,0,0,0,1,1,...,0,2,30,0,0,1,4,4,4,0
9,0,0,1,24,0,0,0,0,1,1,...,1,2,0,4,0,0,8,5,5,0


Read in the CSV file and see what the data looks like, making sure the data is loaded in correctly.

In [60]:
# Split the frame into target and features WITHOUT mutating `df`.
# (`df.pop` removed the column in place, so re-running this cell raised a
# KeyError because 'class' was already gone.)
y = df['class'].values
X = df.drop(columns='class').values

# Standardise the feature matrix.
# NOTE(review): the scaler is fitted on every row, before any train/test
# split is made, so evaluations run on `x_norm` afterwards reuse statistics
# that were computed from the held-out rows too. Consider fitting the
# scaler inside each split instead.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)

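Separate the `class` target from the feature columns, then standardise the features (zero mean, unit variance) with `StandardScaler`.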

In [61]:
# Candidate classifiers. Scores printed by the evaluation loops follow the
# order of `modelList`.
# The trees get a fixed random_state: scikit-learn permutes the candidate
# features at every split, so an unseeded tree can score differently on
# identical data from one run to the next.
model1 = DecisionTreeClassifier(random_state=0)                # tree, default parameters
model2 = DecisionTreeClassifier(max_depth=10, random_state=0)  # tree, depth capped at 10
model3 = KNeighborsClassifier(n_neighbors=100)                 # KNN, 100 neighbours
model4 = KNeighborsClassifier()                                # KNN, default parameters
model5 = GaussianNB()                                          # Gaussian naive Bayes
modelList = [model1, model2, model3, model4, model5]


Initialise all the models according to the assignment specifications

In [62]:

# Compare the models with k-fold cross-validation for k = 5..10.
for k in range(5, 11):
    # Build the splitter once per k, with a fixed seed, so that every model
    # is scored on exactly the same folds and the numbers are reproducible.
    # (Previously each model received its own unseeded shuffle, so
    # differences between models were mixed with differences between splits.)
    kf = KFold(n_splits=k, shuffle=True, random_state=0)
    cv_scoreList = []
    for model in modelList:
        fold_scores = cross_val_score(model, x_norm, y, cv=kf)
        # float(...) keeps the printed list free of np.float64(...) wrappers.
        cv_scoreList.append(float(np.mean(fold_scores)))
    print(k, " fold val score", cv_scoreList)
        

5  fold val score [np.float64(0.6531573736742322), np.float64(0.732303556935585), np.float64(0.738428754838575), np.float64(0.7082415278870714), np.float64(0.7169976443189678)]
6  fold val score [np.float64(0.6503564759803089), np.float64(0.736320941549256), np.float64(0.7388106150625248), np.float64(0.7098115769818367), np.float64(0.7169976800769536)]
7  fold val score [np.float64(0.6517286912084301), np.float64(0.7345385541246517), np.float64(0.7391077477079003), np.float64(0.7080434188476337), np.float64(0.7168704205190055)]
8  fold val score [np.float64(0.6519548687811182), np.float64(0.7359246977974874), np.float64(0.738923890062017), np.float64(0.710575505983756), np.float64(0.7166582918961917)]
9  fold val score [np.float64(0.6521103029359292), np.float64(0.735938925203912), np.float64(0.738980347779539), np.float64(0.7103915014687471), np.float64(0.7167288120296694)]
10  fold val score [np.float64(0.6546144000249711), np.float64(0.7342133141835089), np.float64(0.738754085561537

Using k-fold cross-validation with k ranging from 5 to 10, the ranking (by mean accuracy) is the same for every value of k:

    1. KNN Classifier where n is 100
    2. Decision Tree Classifier where the max depth is 10
    3. Gaussian Naive Bayes Classifier
    4. KNN Classifier with default parameters
    5. Decision Tree Classifier with default parameters

The ranking appears to be very stable

In [63]:
# Fresh, unfitted instances of the same five classifiers for the hold-out
# experiment; scores are reported in `modelList` order.
# random_state is fixed on the trees because scikit-learn permutes the
# candidate features at every split, which otherwise makes tree scores vary
# between runs on identical data.
model1 = DecisionTreeClassifier(random_state=0)                # tree, default parameters
model2 = DecisionTreeClassifier(max_depth=10, random_state=0)  # tree, depth capped at 10
model3 = KNeighborsClassifier(n_neighbors=100)                 # KNN, 100 neighbours
model4 = KNeighborsClassifier()                                # KNN, default parameters
model5 = GaussianNB()                                          # Gaussian naive Bayes
modelList = [model1, model2, model3, model4, model5]


In [None]:

# Repeated hold-out evaluation: 20 random splits, 2/3 train and 1/3 test.
n_reps = 20
rep_scores = []  # one row of per-model accuracies per repetition
for i in range(n_reps):
    # Seed each split with the repetition index: the 20 splits still differ
    # from one another, but the whole run is reproducible. (The previous
    # wall-clock-based seed changed on every execution.)
    X_train, X_test, y_train, y_test = train_test_split(
        x_norm, y, test_size=1/3, train_size=2/3, random_state=i
    )
    holdout_scoreList = []
    for model in modelList:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        holdout_scoreList.append(acc)
    print(holdout_scoreList)
    rep_scores.append(holdout_scoreList)

# Column-wise mean gives one average accuracy per model, whatever the
# length of `modelList` (no hard-coded 5).
avg_score = np.mean(rep_scores, axis=0)
print("Average scores ", avg_score)

[0.6529451705992192, 0.7323883890680699, 0.7389237820404007, 0.711169580716347, 0.7207604820913257]
[0.6535817348497709, 0.7312001358003735, 0.7382447801731454, 0.7113817687998641, 0.7189356645730776]
[0.6616448820234255, 0.7344678322865388, 0.7407485995586488, 0.7082838227805126, 0.7182991003225259]
[0.6550670514343915, 0.7393905958241385, 0.7390510948905109, 0.7117212697334918, 0.7212272958750636]
[0.6501442878967917, 0.7317518248175182, 0.7399422848412833, 0.7099813274486505, 0.7186386012561534]
[0.6504837888304192, 0.7277202512306908, 0.7338737056526905, 0.7086233237141402, 0.7146494652860296]
[0.6521812934985571, 0.7286114411814633, 0.7393057205907316, 0.7073501952130369, 0.7208453573247326]
[0.6580376846036327, 0.7332371414021388, 0.7366321507384145, 0.7082413851638092, 0.7180444746223053]
[0.6531573586827364, 0.7335342047190629, 0.7390086572738075, 0.7098115769818367, 0.7180869122390087]
[0.6556187404515362, 0.7328127652351044, 0.7374809030724835, 0.7103208283822781, 0.716983534

Using HoldOut, where 1/3 of the data is used as test and 2/3 is used as training data, the ranking is as follows:

    1. KNN Classifier where n is 100
    2. Decision Tree Classifier where the max depth is 10
    3. Gaussian Naive Bayes Classifier
    4. KNN Classifier with default parameters
    5. Decision Tree Classifier with default parameters

Similar to k-fold, the ranking is the exact same and stable

In [65]:
# Reload the full dataset and draw a 3000-row random subset.
# random_state makes the sample (and everything computed from it) the same
# on every run; without it each execution analysed a different subset.
df = pd.read_csv("diabetes_70k.csv")
downsized = df.sample(n=3000, random_state=0)

# Separate target and features without mutating `downsized`.
# NOTE: this rebinds `y`, `X`, `scaler` and `x_norm`, so code that runs
# after this cell operates on the 3000-row sample, not the full dataset.
y = downsized['class'].values
X = downsized.drop(columns='class').values

# Standardise the sampled features with a scaler fitted on the sample only.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)

Here we reload the dataset, randomly sample 3000 rows from it, and then standardise the feature values of the sample once again.

In [66]:
# Fresh instances of the five classifiers; scores are printed in
# `modelList` order. The trees are seeded because scikit-learn permutes the
# candidate features at every split, so unseeded trees are not reproducible.
model1 = DecisionTreeClassifier(random_state=0)                # tree, default parameters
model2 = DecisionTreeClassifier(max_depth=10, random_state=0)  # tree, depth capped at 10
model3 = KNeighborsClassifier(n_neighbors=100)                 # KNN, 100 neighbours
model4 = KNeighborsClassifier()                                # KNN, default parameters
model5 = GaussianNB()                                          # Gaussian naive Bayes
modelList = [model1, model2, model3, model4, model5]

# k-fold cross-validation for k = 5..10 on the current `x_norm` / `y`.
for k in range(5, 11):
    # One seeded splitter per k: every model sees identical folds, so score
    # differences reflect the models rather than different random shuffles.
    kf = KFold(n_splits=k, shuffle=True, random_state=0)
    cv_scoreList = []
    for model in modelList:
        fold_scores = cross_val_score(model, x_norm, y, cv=kf)
        # float(...) keeps the printed list free of np.float64(...) wrappers.
        cv_scoreList.append(float(np.mean(fold_scores)))
    print(k, " fold val score", cv_scoreList)
        

5  fold val score [np.float64(0.663), np.float64(0.6913333333333334), np.float64(0.7336666666666666), np.float64(0.6983333333333333), np.float64(0.7233333333333334)]
6  fold val score [np.float64(0.6536666666666667), np.float64(0.6963333333333334), np.float64(0.7326666666666667), np.float64(0.7016666666666667), np.float64(0.7229999999999999)]
7  fold val score [np.float64(0.6446855325359999), np.float64(0.700010270103728), np.float64(0.7276586342006902), np.float64(0.6976730434674359), np.float64(0.7210025177314898)]
8  fold val score [np.float64(0.651), np.float64(0.6973333333333334), np.float64(0.7343333333333333), np.float64(0.698), np.float64(0.726)]
9  fold val score [np.float64(0.6520092947238657), np.float64(0.6970093846341352), np.float64(0.733665801530073), np.float64(0.6963300625975276), np.float64(0.725307143470816)]
10  fold val score [np.float64(0.6529999999999999), np.float64(0.6973333333333332), np.float64(0.733), np.float64(0.6996666666666667), np.float64(0.723333333333

Using k-fold cross-validation with k ranging from 5 to 10, the approximate ranking for the sampled data is as follows (positions 3 and 4 are close and swap between values of k, as discussed below):

    1. KNN Classifier where n is 100
    2. Gaussian Naive Bayes Classifier
    3. Decision Tree Classifier where the max depth is 10
    4. KNN Classifier with default parameters
    5. Decision Tree Classifier with default parameters

The rankings of the KNN (100 neighbours) and Gaussian NB classifiers remain stable (KNN with 100 neighbours has the highest accuracy, with Gaussian NB second), and their performance is fairly consistent. This differs from the previous ranking, in which Gaussian NB was third; on the smaller dataset it ranks higher (this could be overfitting, although its accuracy of about 0.72 is close to its full-data score, while the depth-10 tree falls from about 0.735 to about 0.70). The default decision tree classifier is consistently the weakest model, which matches the previous ranking. The default KNN and the decision tree capped at depth 10 swap positions, so their relative ranking is not stable, but their performance is similar: for 7-fold and 9-fold the decision tree is more accurate, while for all other values of k the default KNN is more accurate.

In [None]:
# Fresh instances of the five classifiers; scores are printed in
# `modelList` order. The trees are seeded because scikit-learn permutes the
# candidate features at every split, so unseeded trees are not reproducible.
model1 = DecisionTreeClassifier(random_state=0)                # tree, default parameters
model2 = DecisionTreeClassifier(max_depth=10, random_state=0)  # tree, depth capped at 10
model3 = KNeighborsClassifier(n_neighbors=100)                 # KNN, 100 neighbours
model4 = KNeighborsClassifier()                                # KNN, default parameters
model5 = GaussianNB()                                          # Gaussian naive Bayes
modelList = [model1, model2, model3, model4, model5]

# Repeated hold-out evaluation: 20 random splits, 2/3 train and 1/3 test,
# on the current `x_norm` / `y`.
n_reps = 20
rep_scores = []  # one row of per-model accuracies per repetition
for i in range(n_reps):
    # Seeding with the repetition index keeps the 20 splits distinct while
    # making the run reproducible. (The previous wall-clock-based seed
    # changed on every execution.)
    X_train, X_test, y_train, y_test = train_test_split(
        x_norm, y, test_size=1/3, train_size=2/3, random_state=i
    )
    holdout_scoreList = []
    for model in modelList:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        holdout_scoreList.append(acc)
    print(holdout_scoreList)
    rep_scores.append(holdout_scoreList)

# Column-wise mean gives one average accuracy per model, whatever the
# length of `modelList` (no hard-coded list of five zeros).
avg_score = np.mean(rep_scores, axis=0)
print("Average scores ", avg_score)

[0.661, 0.685, 0.743, 0.695, 0.721]
[0.646, 0.668, 0.718, 0.686, 0.701]
[0.655, 0.691, 0.715, 0.684, 0.702]
[0.625, 0.666, 0.718, 0.693, 0.704]
[0.642, 0.677, 0.732, 0.7, 0.735]
[0.646, 0.677, 0.736, 0.698, 0.72]
[0.635, 0.674, 0.75, 0.712, 0.724]
[0.647, 0.678, 0.73, 0.694, 0.73]
[0.659, 0.689, 0.725, 0.705, 0.727]
[0.61, 0.65, 0.708, 0.687, 0.717]
[0.652, 0.678, 0.711, 0.686, 0.704]
[0.646, 0.693, 0.73, 0.712, 0.735]
[0.656, 0.662, 0.735, 0.682, 0.731]
[0.649, 0.699, 0.739, 0.674, 0.743]
[0.655, 0.685, 0.746, 0.704, 0.738]
[0.65, 0.689, 0.727, 0.709, 0.725]
[0.673, 0.693, 0.723, 0.696, 0.721]
[0.644, 0.682, 0.732, 0.689, 0.715]
[0.656, 0.691, 0.731, 0.682, 0.712]
[0.64, 0.66, 0.724, 0.694, 0.73]
[0.64735 0.67935 0.72865 0.6941  0.72175]


Using hold-out, where 1/3 of the sampled data is used for testing and 2/3 for training, the ranking by average accuracy over the 20 repetitions is as follows:

    1. KNN Classifier where n is 100
    2. Gaussian Naive Bayes Classifier
    3. KNN Classifier with default parameters
    4. Decision Tree Classifier where the max depth is 10
    5. Decision Tree Classifier with default parameters

As with k-fold, the positions of KNN (n = 100), Gaussian NB and the default-parameter decision tree are the same and stable. On average the default KNN (about 0.694) is more accurate than the decision tree limited to depth 10 (about 0.679); however, in some repetitions the depth-10 tree is slightly more accurate than the default KNN.