In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.style
import matplotlib as mpl


In [2]:
wbcd = pd.read_csv('./data/breast-cancer-wisconsin.data.csv')

print(wbcd.shape,'\n')

wbcd['diagnosis'] = wbcd['diagnosis'].astype('int64')

print('Data types :\n', wbcd.dtypes,'\n')

col_names = list(wbcd.columns)
print('column Names = ', col_names, '\n')

(699, 11) 

Data types :
 id                               int64
clump thickness                  int64
uniformity of cell size          int64
uniformity of cell shape         int64
marginal adhesion                int64
ingle epit helical cell size     int64
bare nucleoi                    object
bland chromatin                  int64
normal nuclei                    int64
mitoses                          int64
diagnosis                        int64
dtype: object 

column Names =  ['id', 'clump thickness', 'uniformity of cell size', 'uniformity of cell shape ', 'marginal adhesion', 'ingle epit helical cell size', 'bare nucleoi', 'bland chromatin', 'normal nuclei', 'mitoses', 'diagnosis'] 



In [3]:
np.where(wbcd.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [4]:
# Check for missing values in dataframe
wbcd.isnull().sum()

id                              0
clump thickness                 0
uniformity of cell size         0
uniformity of cell shape        0
marginal adhesion               0
ingle epit helical cell size    0
bare nucleoi                    0
bland chromatin                 0
normal nuclei                   0
mitoses                         0
diagnosis                       0
dtype: int64

In [5]:
wbcd['bare nucleoi'].describe()

count     699
unique     11
top         1
freq      402
Name: bare nucleoi, dtype: object

In [6]:
wbcd['bare nucleoi'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare nucleoi, dtype: int64

In [7]:
wbcd[wbcd['bare nucleoi'] == "?"]

Unnamed: 0,id,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,ingle epit helical cell size,bare nucleoi,bland chromatin,normal nuclei,mitoses,diagnosis
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [8]:
# Data cleaning, drop unnecessary columns id and Unnamed: 32
wbcd.drop(['id'], axis = 1, inplace = True)

wbcd.head()

Unnamed: 0,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,ingle epit helical cell size,bare nucleoi,bland chromatin,normal nuclei,mitoses,diagnosis
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [9]:
wbcd['diagnosis'].value_counts()

2    458
4    241
Name: diagnosis, dtype: int64

In [10]:
wbcd['bare nucleoi'].replace("?", np.NAN, inplace=True)
wbcd = wbcd.dropna()

In [11]:
wbcd['bare nucleoi'] = wbcd['bare nucleoi'].astype('int64')

In [12]:
type(wbcd)

pandas.core.frame.DataFrame

In [13]:
wbcd['diagnosis']

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: diagnosis, Length: 683, dtype: int64

In [14]:
wbcd['bare nucleoi'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: bare nucleoi, dtype: int64

In [15]:
wbcd['diagnosis'] = wbcd['diagnosis']/2-1

In [16]:
wbcd['diagnosis'].value_counts()

0.0    444
1.0    239
Name: diagnosis, dtype: int64

In [17]:
X = wbcd.drop(['diagnosis'], axis =1)
X_col = X.columns

### F-Score

In [18]:
# Split data in 70-30 partition
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(wbcd, wbcd['diagnosis'], test_size = 0.3, random_state = 99)

In [19]:
X_train

Unnamed: 0,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,ingle epit helical cell size,bare nucleoi,bland chromatin,normal nuclei,mitoses,diagnosis
533,3,1,1,1,2,1,2,1,1,0.0
330,10,4,7,2,2,8,6,1,1,1.0
228,1,1,1,1,1,1,3,1,1,0.0
290,1,1,1,1,2,1,1,1,1,0.0
629,4,1,1,1,2,1,1,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...
70,5,1,3,1,2,1,2,1,1,0.0
174,8,6,5,4,3,10,6,1,1,1.0
191,7,5,10,10,10,10,4,10,3,1.0
36,10,10,10,8,6,1,8,9,1,1.0


In [20]:
Xtrain_p = X_train[y_train == 1]
Xtrain_n = X_train[y_train == 0]

In [21]:
n_pos = Xtrain_p.shape[0]

n_neg = Xtrain_n.shape[0]

print('train malignant (n+):', n_pos)

print('train benign (n-):', n_neg)

train malignant (n+): 176
train benign (n-): 302


In [22]:
x_mean = X_train.mean()
xp_mean = Xtrain_p.mean()
xn_mean = Xtrain_n.mean()
print('x_mean :\n',x_mean,'\n')
print('x+_mean :\n',xp_mean,'\n')
print('x-_mean :\n',xn_mean,'\n')

x_mean :
 clump thickness                 4.525105
uniformity of cell size         3.284519
uniformity of cell shape        3.301255
marginal adhesion               2.926778
ingle epit helical cell size    3.357741
bare nucleoi                    3.675732
bland chromatin                 3.510460
normal nuclei                   3.006276
mitoses                         1.677824
diagnosis                       0.368201
dtype: float64 

x+_mean :
 clump thickness                 7.312500
uniformity of cell size         6.681818
uniformity of cell shape        6.517045
marginal adhesion               5.698864
ingle epit helical cell size    5.477273
bare nucleoi                    7.619318
bland chromatin                 6.034091
normal nuclei                   5.994318
mitoses                         2.744318
diagnosis                       1.000000
dtype: float64 

x-_mean :
 clump thickness                 2.900662
uniformity of cell size         1.304636
uniformity of cell shape        

In [23]:
# FS_num = []
# FS_deno = []
F_Score = np.zeros(len(X_col))

for i in range(len(X_col)):     
    
    FS_num = (xp_mean[i] - x_mean[i])**2 + (xn_mean[i] - x_mean[i])**2

    FS_den = (sum((Xtrain_p[X_col[i]] - xp_mean[i])**2))/(n_pos-1) + (sum((Xtrain_n[X_col[i]]-xn_mean[i])**2))/(n_neg-1)

    F_Score[i] = FS_num/FS_den

In [24]:
F_Score

array([1.22293035, 1.89162361, 1.81917242, 0.91588744, 0.84851828,
       1.84496068, 1.47347907, 0.98985937, 0.20133574])

In [25]:
np.argsort(F_Score)[::-1]+1

array([2, 6, 3, 7, 1, 8, 4, 5, 9], dtype=int64)

## Models

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
from sklearn.model_selection import cross_val_score

for k in range(1, 40, 2):
    # only odd numbers
    knn = KNeighborsClassifier(n_neighbors = k, weights='distance')
    score = cross_val_score(knn, X_train, y_train, cv =5) #scoring = 'accuracy'
    print('k=', k, '; Mean accuracy', score.mean().round(3), ';Std:', score.std().round(3))

k= 1 ; Mean accuracy 0.967 ;Std: 0.017
k= 3 ; Mean accuracy 0.973 ;Std: 0.014
k= 5 ; Mean accuracy 0.971 ;Std: 0.02
k= 7 ; Mean accuracy 0.967 ;Std: 0.021
k= 9 ; Mean accuracy 0.971 ;Std: 0.014
k= 11 ; Mean accuracy 0.971 ;Std: 0.014
k= 13 ; Mean accuracy 0.969 ;Std: 0.015
k= 15 ; Mean accuracy 0.967 ;Std: 0.017
k= 17 ; Mean accuracy 0.967 ;Std: 0.017
k= 19 ; Mean accuracy 0.967 ;Std: 0.017
k= 21 ; Mean accuracy 0.969 ;Std: 0.018
k= 23 ; Mean accuracy 0.969 ;Std: 0.018
k= 25 ; Mean accuracy 0.969 ;Std: 0.018
k= 27 ; Mean accuracy 0.969 ;Std: 0.018
k= 29 ; Mean accuracy 0.969 ;Std: 0.018
k= 31 ; Mean accuracy 0.969 ;Std: 0.018
k= 33 ; Mean accuracy 0.969 ;Std: 0.018
k= 35 ; Mean accuracy 0.967 ;Std: 0.017
k= 37 ; Mean accuracy 0.969 ;Std: 0.018
k= 39 ; Mean accuracy 0.969 ;Std: 0.018



#### Chosen parameter k = 9

In [37]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


KNN_score = []
lr_score = []
NN_score = []
model_seq = np.array([2, 6, 3, 7, 1, 8, 4, 5, 9])-1

for i in range(len(model_seq)):
    print('\nModel number: ', i+1)
    xM_train=X_train[X_col[model_seq[0:i+1]]]
    xM_test=X_test[X_col[model_seq[0:i+1]]]
    # K-NN method
    knn = KNeighborsClassifier(n_neighbors = 9, weights='distance')
    knn.fit(xM_train,y_train)
    y_knn_pred = knn.predict(xM_test)
    mscore=  metrics.accuracy_score(y_test, y_knn_pred).round(4)
    KNN_score.append(mscore)
#     print('metric score for KNN = ', mscore)
    cm = metrics.confusion_matrix(y_test, y_knn_pred)
    wbcd_cm = pd.DataFrame(data = cm, columns = ['predict: Benign', 'predict: Malignant'], 
                    index = ['true: Benign', 'true: Malignant'])

    print(wbcd_cm)
    
    # Logistic Regression method
    lr = LogisticRegression(solver='lbfgs') #instantiate the model-step 1
    lr.fit(xM_train,y_train)
    y_lr_pred = lr.predict(xM_test)
    m_score=  metrics.accuracy_score(y_test, y_lr_pred).round(4)
    lr_score.append(m_score)
#     print('metric score for lr = ', m_score)
    
    # Neural Network
    clf = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(5, 5, 5, 5))
    clf.fit(xM_train, y_train)
    y_NN_pred = clf.predict(xM_test)
    m_NN_score=  metrics.accuracy_score(y_test, y_NN_pred).round(4)
    NN_score.append(m_NN_score)
#     print('metric score for NN = ',m_NN_score)


from tabulate import tabulate
print(tabulate(np.transpose([KNN_score, lr_score, NN_score]), headers=['K-NN', 'Log-Reg', 'NN']))


Model number:  1
                 predict: Benign  predict: Malignant
true: Benign                 131                  11
true: Malignant                1                  62

Model number:  2
                 predict: Benign  predict: Malignant
true: Benign                 138                   4
true: Malignant                5                  58

Model number:  3
                 predict: Benign  predict: Malignant
true: Benign                 139                   3
true: Malignant                4                  59

Model number:  4
                 predict: Benign  predict: Malignant
true: Benign                 137                   5
true: Malignant                5                  58

Model number:  5
                 predict: Benign  predict: Malignant
true: Benign                 138                   4
true: Malignant                3                  60

Model number:  6
                 predict: Benign  predict: Malignant
true: Benign                 139            

In [29]:
# xM5_train = X_train[X_col[model_seq[0:5+1]]]
# xM5_test = X_test[X_col[model_seq[0:5+1]]]
# knn = KNeighborsClassifier(n_neighbors = 9, weights='distance')
# knn.fit(xM5_train,y_train)
# y_pred = knn.predict(xM5_test)
# cm = metrics.confusion_matrix(y_test, y_pred)
# cm

In [30]:
# import statsmodels.formula.api as smf
# result = smf.ols(formula = 'np.log(sales) ~ TV', data = wbcd).fit()
# print(result.summary())

In [31]:
# from sklearn.linear_model import LogisticRegression

In [32]:
# lr = LogisticRegression(solver='lbfgs') #instantiate the model-step 1

# xM_train=X_train[X_col[model_seq[0:1+1]]]
# xM_test=X_test[X_col[model_seq[0:1+1]]]
    
# lr.fit(xM_train,y_train)

# y_pred = lr.predict(xM_test)

# m_score=  metrics.accuracy_score(y_test, y_pred).round(4)

# print(m_score)

In [33]:
# from sklearn.neural_network import MLPClassifier
# clf = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(5, 2))

# xM_train=X_train[X_col[model_seq[0:2+1]]]
# xM_test=X_test[X_col[model_seq[0:2+1]]]

# clf.fit(xM_train, y_train)

In [34]:
# y_pred = clf.predict(xM_test)
# m_score=  metrics.accuracy_score(y_test, y_pred).round(4)
# print(m_score)