In [13]:
# A simple program to illustrate how to use sckkit-learn 
#Perform PCA dimensionality reduction

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

#Load dataset 
raw = np.genfromtxt('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',delimiter=',')
mask = np.isnan(raw)
raw[mask]=0
mask_sum = np.sum(mask, axis = 0)
col_sum = np.sum(raw,axis=0)
for k in range(10):
    mask1d = mask[:,k]
    raw[mask1d,k] = col_sum[k] / (len(raw) - mask_sum[k])
# Sample features of all_X is 599 x 9
# We don't need ID (not a feature)
df_X = raw[:,1:10]
# Sample labels are in df_y. Shape of df_y is 599
df_y = raw[:,10]


#Split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, stratify=df_y)


for comp in range(1,10): 

    pca = PCA(n_components=comp)
    pca.fit(X_train)    
    print("k = ",comp,"PoV(k) = ",np.sum(pca.explained_variance_ratio_))

    ld_train = pca.transform(X_train)
    ld_test = pca.transform(X_test)
    if (0): #use kNN = 0 or Naive Bayesian = 1
        acc=0
        for i in range(10):
            gnb = GaussianNB()
            gnb.fit(ld_train, y_train)
            acc += gnb.score(ld_test, y_test)
        print('avg acc = ',acc/10)
    else:
        acc=0
        for i in range(10):
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(ld_train, y_train)
            acc += knn.score(ld_test, y_test)
        print('avg acc = ',acc/10)

    print('component = %d Acc = %.3f' % (comp, acc/10) )


k =  1 PoV(k) =  0.6956976157127026
avg acc =  0.9523809523809523
component = 1 Acc = 0.952
k =  2 PoV(k) =  0.7646606678547693
avg acc =  0.9666666666666666
component = 2 Acc = 0.967
k =  3 PoV(k) =  0.8261656046094824
avg acc =  0.9571428571428571
component = 3 Acc = 0.957
k =  4 PoV(k) =  0.8693239925943768
avg acc =  0.9619047619047618
component = 4 Acc = 0.962
k =  5 PoV(k) =  0.9083522334414337
avg acc =  0.9666666666666666
component = 5 Acc = 0.967
k =  6 PoV(k) =  0.9430010704529711
avg acc =  0.9666666666666666
component = 6 Acc = 0.967
k =  7 PoV(k) =  0.9699935626512982
avg acc =  0.9666666666666666
component = 7 Acc = 0.967
k =  8 PoV(k) =  0.9892793792865707
avg acc =  0.9666666666666666
component = 8 Acc = 0.967
k =  9 PoV(k) =  0.9999999999999999
avg acc =  0.9666666666666666
component = 9 Acc = 0.967


In [2]:
!pip install sklearn


Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp37-cp37m-win_amd64.whl (6.8 MB)
Collecting joblib>=0.11
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1321 sha256=3f013e24a3ed956175c0dc9cdddeed95cd939c9b60ee682f0e94e2f4aafcabb8
  Stored in directory: c:\users\lab1223\appdata\local\pip\cache\wheels\46\ef\c3\157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: joblib, threadpoolctl, scikit-learn, sklearn
Successfully installed joblib-0.17.0 scikit-learn-0.23.2 sklearn-0.0 threadpoolctl-2.1.0
