# Breast Cancer Wisconsin

In [1]:
import numpy as np
import pandas as pd

In [20]:
col = ['id', 'Clumpy Thickness', 'Uniformity of Cell Size', 
       'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 
       'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

df = pd.read_csv("breast-cancer-wisconsin.data.csv", names=col, header=None)

In [21]:
df.head()

Unnamed: 0,id,Clumpy Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Pre-Processing

In [22]:
# Bare Nuclei is an object?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clumpy Thickness               699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [23]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [24]:
# question mark value amongst the numeric data
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [25]:
df[df['Bare Nuclei'] == '?']

Unnamed: 0,id,Clumpy Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [26]:
# removing the above rows doesn't change the daat too much
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [41]:
df['Bare Nuclei'].replace("?", np.nan, inplace=True)
df['Bare Nuclei'] = df['Bare Nuclei'].astype('int64')
df= df.dropna()

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
id                             683 non-null int64
Clumpy Thickness               683 non-null int64
Uniformity of Cell Size        683 non-null int64
Uniformity of Cell Shape       683 non-null int64
Marginal Adhesion              683 non-null int64
Single Epithelial Cell Size    683 non-null int64
Bare Nuclei                    683 non-null int64
Bland Chromatin                683 non-null int64
Normal Nucleoli                683 non-null int64
Mitoses                        683 non-null int64
Class                          683 non-null float64
dtypes: float64(1), int64(10)
memory usage: 64.0 KB


In [29]:
# binarize target variable
df['Class'] = (df['Class'] / 2) - 1

In [43]:
df['Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [44]:
X = df.drop(['Class', 'id'], axis=1)
X_col = X.columns

In [45]:
y = df['Class']

In [46]:
from sklearn.preprocessing import StandardScaler

In [47]:
X = StandardScaler().fit_transform(X.values)



In [48]:
from sklearn.model_selection import train_test_split

In [49]:
df1 = pd.DataFrame(X, columns=X_col)

In [51]:
df1.head()

Unnamed: 0,Clumpy Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [52]:
X_train, X_test, y_train, y_test = train_test_split(df1, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [54]:
knn = KNeighborsClassifier(n_neighbors=5,
                          p=2, metric='minkowski')

In [55]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [57]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''Print scoring metrics for classifier on training data'''
    if train:
        print("Train Result:\n")
        print("Accuracy score: {0:.4f}".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report:\n {}".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix:\n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t {0:.4f}".format(np.std(res)))
        
      
    elif train == False:
        '''Print scoring metrics for classifier on test data'''  
        
        print("Test Result:\n")
        print("Accuracy score: {0:.4f}".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report:\n {}".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix:\n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

In [58]:
print_score(knn, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy score: 0.9725
Classification Report:
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       365
        1.0       0.96      0.96      0.96       181

avg / total       0.97      0.97      0.97       546

Confusion Matrix:
 [[358   7]
 [  8 173]] 

Average accuracy: 	 0.9635
Accuracy SD: 	 0.0162


In [59]:
print_score(knn, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy score: 0.9562
Classification Report:
              precision    recall  f1-score   support

        0.0       0.94      0.99      0.96        79
        1.0       0.98      0.91      0.95        58

avg / total       0.96      0.96      0.96       137

Confusion Matrix:
 [[78  1]
 [ 5 53]] 



## Grid Search

In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [65]:
params = {"n_neighbors": [x for x in range(1, 11)]}

In [66]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                             params,
                             n_jobs=-1,
                             verbose=1)

In [67]:
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [68]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [70]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy score: 0.9725
Classification Report:
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       365
        1.0       0.96      0.96      0.96       181

avg / total       0.97      0.97      0.97       546

Confusion Matrix:
 [[358   7]
 [  8 173]] 

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   15.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.9s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.0s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Average accuracy: 	 0.9635
Accuracy SD: 	 0.0181


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.2s finished


In [69]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy score: 0.9562
Classification Report:
              precision    recall  f1-score   support

        0.0       0.94      0.99      0.96        79
        1.0       0.98      0.91      0.95        58

avg / total       0.96      0.96      0.96       137

Confusion Matrix:
 [[78  1]
 [ 5 53]] 

