In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

In [4]:
def load_dataset(path_to_file):
    df=pd.read_csv(path_to_file)
    return df

In [5]:
def standardize(df):
    scaler=StandardScaler()
    df_std=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
    df_std[df.columns[-1]]=df[df.columns[-1]]
    return df_std

In [6]:
def train_test(df):
    X=df.drop(df.columns[-1],axis=1)
    y=df[df.columns[-1]]
    XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.3, random_state=42)
    return([XTrain,XTest,yTrain,yTest])

In [20]:
def knn_classifier(k,XTrain,yTrain,XTest):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(XTrain, yTrain)
    yPred = knn.predict(XTest)
    return yPred

In [8]:
def percentage_accuracy(yPred,yTest):
    return(accuracy_score(yTest, yPred))

In [9]:
def confusion_matrixp(yPred,yTest):
    return(confusion_matrix(yTest,yPred))

In [37]:
def bayes_classifier(XTrain,yTrain,XTest):
    gnb = GaussianNB()
    gnb.fit(XTrain, yTrain)
    yPred = gnb.predict(XTest)
    return yPred

def pca(df, lst):
    Xdata=df.drop(df.columns[-1],axis=1)
    DFs = []
    for i in lst:
        pca = PCA(n_components=i)
        pComps = pca.fit_transform(Xdata)
        pDF = pd.DataFrame(data = pComps)
        pDF[df.columns[-1]]=df[df.columns[-1]]
        DFs.append(pDF)
    return DFs

"""
def pca(df, n):
    Xdata=df.drop(df.columns[-1],axis=1)
    pca = PCA(n_components=n)
    principalComponents = pca.fit_transform(Xdata)
    principalDf = pd.DataFrame(data = principalComponents)
    pca = PCA(n_components=n)
    pComps = pca.fit_transform(Xdata)
    pDF = pd.DataFrame(data = pComps)
    principalDf[df.columns[-1]]=df[df.columns[-1]]
    return principalDf
"""

def pda(df,n):
    from sklearn.decomposition import PCA
    x=df.drop(df.columns[-1],axis=1)
    pca = PCA(n_components=n)
    pComps = pca.fit_transform(x)
    pDf = pd.DataFrame(data = pComps)
    pDf[df.columns[-1]]=df[df.columns[-1]]
    return pDf

In [125]:
def pca(df,n):
    x=df.drop(df.columns[-1],axis=1)
    pca = PCA(n_components=n)
    pComps = pca.fit_transform(x)
    pDf = pd.DataFrame(data = pComps)
    pDf[df.columns[-1]]=df[df.columns[-1]]
    return pDf

In [165]:
def main():
    import warnings
    from sklearn.exceptions import DataConversionWarning
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    df=load_dataset("../files/pima-indians-diabetes.csv")
    print(df.shape)
    l=range(len(df.columns)-2)
    for i in l:
        print(i)
        pdf = pca(df,i)
        #print(pdf.head())
        print(pdf.shape)
        dfStd=standardize(df)
        XTrain=train_test(dfStd)[0]
        XTest=train_test(dfStd)[1]
        yTrain=pd.DataFrame(train_test(dfStd)[2])
        yTest=pd.DataFrame(train_test(dfStd)[3])
        print(XTrain.shape,yTrain.shape,XTest.shape,yTest.shape)
        yPredBayes = bayes_classifier(XTrain,yTrain,XTest)
    
        #print(confusion_matrixp(yPredBayes,yTest))
        print("Accuracy by Bayes Classifier:",percentage_accuracy(yPredBayes,yTest))
    
        k = range(1,22,2)
        accuracies=[]
        for i in k:
            #print("k = ",i)
            yPredKnn=knn_classifier(i,XTrain,yTrain,XTest)
            #print(confusion_matrixp(yPredKnn,yTest))
            #print(percentage_accuracy(yPredKnn,yTest))
            accuracies.append(percentage_accuracy(yPredKnn,yTest))
            #print("\n")
        print("Accuracy by KNN Classifier:",sum(accuracies)/len(accuracies))
    
    """
    dfStd=standardize(df)
    XTrain=train_test(dfStd)[0]
    XTest=train_test(dfStd)[1]
    yTrain=train_test(dfStd)[2]
    yTest=train_test(dfStd)[3]
    
    #Xdata=df.drop(df.columns[-1],axis=1)
    #ydata=df[df.columns[-1]]
    """
    """
    l = range(len(df.columns))
    DFs = pca(df, l)
    DFs.append(df)
    
    for df in DFs:
        dfStd=standardize(df)
        XTrain=train_test(dfStd)[0]
        XTest=train_test(dfStd)[1]
        yTrain=train_test(dfStd)[2]
        yTest=train_test(dfStd)[3]
        
        yPredBayes = bayes_classifier(XTrain,yTrain,XTest)
    
        #print(confusion_matrixp(yPredBayes,yTest))
        print("Accuracy by Bayes Classifier: ",percentage_accuracy(yPredBayes,yTest))
    
        k = range(1,22,2)
        accuracies=[]
        for i in k:
            #print("k = ",i)
            yPredKnn=knn_classifier(i,XTrain,yTrain,XTest)
            #print(confusion_matrixp(yPredKnn,yTest))
            #print(percentage_accuracy(yPredKnn,yTest))
            accuracies.append(percentage_accuracy(yPredKnn,yTest))
            #print("\n")
        print("Accuracy by KNN Classifier: ",sum(accuracies)/len(accuracies))
        
    
    yPredBayes = bayes_classifier(XTrain,yTrain,XTest)
    
    #print(confusion_matrixp(yPredBayes,yTest))
    print("Accuracy by Bayes Classifier:",percentage_accuracy(yPredBayes,yTest))
    
    k = range(1,22,2)
    accuracies=[]
    for i in k:
        #print("k = ",i)
        yPredKnn=knn_classifier(i,XTrain,yTrain,XTest)
        #print(confusion_matrixp(yPredKnn,yTest))
        #print(percentage_accuracy(yPredKnn,yTest))
        accuracies.append(percentage_accuracy(yPredKnn,yTest))
        #print("\n")
    print("Accuracy by KNN Classifier:",sum(accuracies)/len(accuracies))
    """
    """
    print("\n")
    plt.plot(range(1,22,2),accuracies,color='b')
    plt.xlabel("Value of k")
    plt.ylabel("Accuracy")
    plt.show()
    """

In [166]:
if __name__ == "__main__":
    main()

(768, 9)
0
(768, 1)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
1
(768, 2)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
2
(768, 3)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
3
(768, 4)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
4
(768, 5)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
5
(768, 6)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671
6
(768, 7)
(537, 8) (537, 1) (231, 8) (231, 1)
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671


In [144]:
Accuracy by Bayes Classifier: 0.7445887445887446
Accuracy by KNN Classifier: 0.70995670995671


SyntaxError: invalid syntax (<ipython-input-144-662214850185>, line 1)

In [116]:
df=load_dataset("../files/pima-indians-diabetes.csv")

l=[1,2,3,4]

def pda(df,n):
    x=df.drop(df.columns[-1],axis=1)
    pca = PCA(n_components=n)
    pComps = pca.fit_transform(x)
    pDf = pd.DataFrame(data = pComps)
    pDf[df.columns[-1]]=df[df.columns[-1]]
    print(pDf.head())
for i in l:
    print(i)
    pda(df,i)

1
           0  class
0 -75.714655      1
1 -82.358268      0
2 -74.630643      1
3  11.077423      0
4  89.743788      1
2
           0          1  class
0 -75.714655 -35.950783      1
1 -82.358268  28.908213      0
2 -74.630643 -67.906496      1
3  11.077423  34.898486      0
4  89.743788  -2.746937      1
3
           0          1          2  class
0 -75.714655 -35.950783  -7.260789      1
1 -82.358268  28.908213  -5.496671      0
2 -74.630643 -67.906496  19.461808      1
3  11.077423  34.898486  -0.053018      0
4  89.743788  -2.746937  25.212859      1
4
           0          1          2          3  class
0 -75.714655 -35.950783  -7.260789  15.669269      1
1 -82.358268  28.908213  -5.496671   9.004554      0
2 -74.630643 -67.906496  19.461808  -5.653056      1
3  11.077423  34.898486  -0.053018   1.314873      0
4  89.743788  -2.746937  25.212859  18.994237      1
