In [1]:

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# read raw file
# 458 samples with class = 2, 241 samples with class = 4
filename = 'breast-cancer-wisconsin.data'
raw = np.genfromtxt(filename, delimiter=',')  


In [3]:
# NaN means missing data, need to fill it
mask = np.isnan(raw)
raw[mask]=0
mask_sum = np.sum(mask, axis = 0)
col_sum = np.sum(raw,axis=0)
for k in range(10):
    mask1d = mask[:,k]
    raw[mask1d,k] = col_sum[k] / (len(raw) - mask_sum[k])

In [4]:
# Sample features of all_X is 599 x 9
# We don't need ID (not a feature)
all_X = raw[:,1:10]
# Sample labels are in df_y. Shape of df_y is 599
df_y = raw[:,10]

In [5]:
all_X.shape[0]

699

In [6]:
train_len = 419
val_len = 140
test_len = 140

In [7]:

train_X = np.zeros((10,train_len,9))
validate_X = np.zeros((10,val_len,9))
test_X = np.zeros((10,test_len,9))
train_y = np.zeros((10,train_len,))
validate_y = np.zeros((10,val_len))
test_y = np.zeros((10,test_len))

In [8]:
train_X.shape,train_y.shape

((10, 419, 9), (10, 419))

In [9]:
validate_X.shape,validate_y.shape

((10, 140, 9), (10, 140))

In [10]:
test_X.shape,test_y.shape

((10, 140, 9), (10, 140))

In [11]:

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(all_X, df_y, test_size=0.2, stratify=df_y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train)

    print(X_train.shape,X_val.shape,X_test.shape)
    train_X[i,:,:] = X_train
    validate_X[i,:,:] = X_val
    test_X[i,:,:] = X_test
    train_y[i,:] = y_train
    validate_y[i,:] = y_val
    test_y[i,:] = y_test


(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)
(419, 9) (140, 9) (140, 9)


In [12]:
train_X,validate_X,test_X

(array([[[ 1.,  1.,  1., ...,  3.,  1.,  1.],
         [ 2.,  5.,  3., ...,  7.,  5.,  1.],
         [ 1.,  1.,  1., ...,  1.,  1.,  1.],
         ...,
         [ 7.,  2.,  4., ...,  3.,  3.,  1.],
         [ 1.,  1.,  1., ...,  3.,  1.,  1.],
         [ 5.,  5.,  5., ...,  3.,  1.,  1.]],
 
        [[ 5.,  1.,  3., ...,  2.,  3.,  1.],
         [ 1.,  1.,  1., ...,  1.,  1.,  1.],
         [ 2.,  1.,  1., ...,  1.,  1.,  1.],
         ...,
         [ 1.,  1.,  1., ...,  3.,  1.,  1.],
         [ 1.,  1.,  1., ...,  2.,  1.,  1.],
         [ 2.,  1.,  1., ...,  1.,  1.,  1.]],
 
        [[ 1.,  3.,  1., ...,  2.,  2.,  1.],
         [10.,  4.,  6., ...,  7.,  1.,  1.],
         [ 3.,  1.,  1., ...,  2.,  1.,  1.],
         ...,
         [ 8.,  7.,  8., ...,  9., 10.,  1.],
         [ 8.,  3.,  4., ...,  3.,  3.,  1.],
         [ 7.,  8.,  8., ...,  7.,  2.,  3.]],
 
        ...,
 
        [[ 1.,  1.,  1., ...,  3.,  1.,  1.],
         [ 1.,  1.,  2., ...,  3.,  1.,  1.],
         [ 3.,

In [13]:
# selected features, T = selected, F = not selected
sel_f = [False,False,False,False,False,False,False,False,False] 

In [14]:
for fset in range (3): # pick at most three features
    mx_avg = 0 
    ix_keep = 10 
    for k in range(9): # totally 9 features
        avg = 0 ;
        tmp = sel_f.copy()
        if (tmp[k] != True): # skip those features we have already chosen
            tmp[k] = True
            df_X = all_X[:,tmp]
            for lp in range(10): # loop exp 10 times
                #Split train and test dataset <--- this part is NOT correct. Just want to mnake codes executable
                X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3,stratify=df_y)
                
                X_train = train_X[i,:,:] 
                X_test = validate_X[i,:,:]
                y_train = train_y[i,:]
                y_test = validate_y[i,:]
                
                
                knn = KNeighborsClassifier(n_neighbors=3)
                #Train the model        
                knn.fit(X_train, y_train)
                #print(knn.score(X_test, y_test))
                avg = avg + knn.score(X_test, y_test)
            avg = avg / 10 # avg accuracy
            if (avg > mx_avg):
                ix_keep = k 
                mx_avg = avg
    #done with k, update feature vector
    sel_f[ix_keep]= True

In [15]:
X_test[:,sel_f].shape

(140, 3)

In [16]:
X_train[:,sel_f].shape

(419, 3)

In [17]:
# we have everyything here ... Do final test
# df_X_train = X_train[:,sel_f]
avg = 0 
for lp in range(10): # repeat exp 10 times
    
    knn = KNeighborsClassifier(n_neighbors=3)
    #Train the model        
    knn.fit( X_train[:,sel_f], y_train)
    #print(knn.score(X_test, y_test))
    avg = avg + knn.score(X_test[:,sel_f], y_test)
avg = avg / 10 # avg accuracy

show_index = np.array(range(9))
print('Final avg accuracy',avg, 'chosen features are', show_index[sel_f] )

Final avg accuracy 0.9642857142857142 chosen features are [0 1 2]


In [18]:
# A simple program to illustrate how to use scikit-learn 
# Read Breast cancer dataset
# Note the program is not correct. It just shows how to perform the feature selection process
# You need to revise the program to make it work correctly





# Need to split train, validation, and test here
# Split the sets for 10 times and store them here
# We will use the same training and validation sets when choosing features ....
# Check problem 7 of HW 1 to see how to do it

# selected features, T = selected, F = not selected
sel_f = [False,False,False,False,False,False,False,False,False] 

for fset in range (3): # pick at most three features
    mx_avg = 0 
    ix_keep = 10 
    for k in range(9): # totally 9 features
        avg = 0 ;
        tmp = sel_f.copy()
        if (tmp[k] != True): # skip those features we have already chosen
            tmp[k] = True
            df_X = all_X[:,tmp]
            for lp in range(10): # loop exp 10 times
            #Split train and test dataset <--- this part is NOT correct. Just want to mnake codes executable
                X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3,stratify=df_y)
                knn = KNeighborsClassifier(n_neighbors=3)
                #Train the model        
                knn.fit(X_train, y_train)
                #print(knn.score(X_test, y_test))
                avg = avg + knn.score(X_test, y_test)
            avg = avg / 10 # avg accuracy
            if (avg > mx_avg):
                ix_keep = k 
                mx_avg = avg
    #done with k, update feature vector
    sel_f[ix_keep]= True
    
# we have everyything here ... Do final test
df_X = all_X[:,sel_f]
avg = 0 
for lp in range(10): # repeat exp 10 times
    # Want to extract stored test dataset
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3) #<----- WRONG codes
    knn = KNeighborsClassifier(n_neighbors=3)
    #Train the model        
    knn.fit(X_train, y_train)
    #print(knn.score(X_test, y_test))
    avg = avg + knn.score(X_test, y_test)
avg = avg / 10 # avg accuracy

show_index = np.array(range(9))
print('Final avg accuracy',avg, 'chosen features are', show_index[sel_f] )

Final avg accuracy 0.9614285714285715 chosen features are [0 1 5]
