# Project Cancer Detection

In [38]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
col = ['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class']
df = pd.read_csv("data/breast-cancer-wisconsin.data.csv", names=col, header=None)
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Data Pre-Processing

In [3]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [6]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [7]:
df['Bare Nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [11]:
df[df['Bare Nuclei'] == "?"]

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [12]:
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [13]:
df['Bare Nuclei'].replace("?", np.NaN, inplace=True)
df = df.dropna()

In [14]:
df[df['Bare Nuclei'] == "?"]

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class


### Now we need to change the 'Class' variable from 2 & 4 to 0 & 1

$$\frac{\text{df}["Class"]}{2} - 1$$

In [15]:
df['Class'] = (df['Class']/2) - 1

In [16]:
df['Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [30]:
X = df.drop(['id', 'Class'], axis=1)
X_col = X.columns

In [31]:
y=df['Class']

In [32]:
from sklearn.preprocessing import StandardScaler

In [39]:
X = StandardScaler().fit_transform(X)

In [40]:
X

array([[ 0.19790469, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       [ 0.19790469,  0.27725185,  0.26278299, ..., -0.18182716,
        -0.28510482, -0.34839971],
       [-0.51164337, -0.70221201, -0.74177362, ..., -0.18182716,
        -0.61292736, -0.34839971],
       ...,
       [ 0.19790469,  2.23617957,  2.2718962 , ...,  1.86073779,
         2.33747554,  0.22916583],
       [-0.15686934,  1.58320366,  0.93248739, ...,  2.67776377,
         1.02618536, -0.34839971],
       [-0.15686934,  1.58320366,  1.6021918 , ...,  2.67776377,
         0.37054027, -0.34839971]])

***

## Other Types of Pre-Processing

In [46]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id', 'Class'], axis=1).values), columns=X_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


In [47]:
from sklearn.preprocessing import Normalizer
pd.DataFrame(Normalizer().fit_transform(df.drop(['id', 'Class'], axis=1).values), columns=X_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.753778,0.150756,0.150756,0.150756,0.301511,0.150756,0.452267,0.150756,0.150756
1,0.319438,0.255551,0.255551,0.319438,0.447214,0.638877,0.191663,0.127775,0.063888
2,0.538816,0.179605,0.179605,0.179605,0.359211,0.359211,0.538816,0.179605,0.179605
3,0.380235,0.506979,0.506979,0.063372,0.190117,0.25349,0.190117,0.443607,0.063372
4,0.609994,0.152499,0.152499,0.457496,0.304997,0.152499,0.457496,0.152499,0.152499


***
## Training

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
df1 = pd.DataFrame(X, columns=X_col)

In [43]:
df1.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [44]:
X_train, X_test, y_train, y_test = train_test_split(df1, y,
                                                    train_size=0.8,
                                                    random_state=42) 

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [49]:
knn = KNeighborsClassifier(n_neighbors=5,
                           p=2,
                           metric='minkowski')

In [50]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [52]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [53]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {} \n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {} \n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

In [54]:
print_score(knn, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9725

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

   micro avg       0.97      0.97      0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion Matrix: 
 [[358   7]
 [  8 173]] 

Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0162


In [55]:
print_score(knn, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9562

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

   micro avg       0.96      0.96      0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[78  1]
 [ 5 53]] 



***
## Grid Search

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [61]:
params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [62]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                              params,
                              n_jobs=-1,
                              verbose=1)

In [63]:
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [64]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

In [65]:
grid_search_cv.best_score_

0.9688644688644689

In [69]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9725

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

   micro avg       0.97      0.97      0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion Matrix: 
 [[358   7]
 [  8 173]] 

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0181


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished


In [71]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9562

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

   micro avg       0.96      0.96      0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[78  1]
 [ 5 53]] 



***

## Other Models

In [74]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import svm

In [76]:
rf_clf = RandomForestClassifier()
xgb_clf = xgb.XGBClassifier(max_depth=3, n_estimators=1000, learning_rate=0.2)
svm_clf = svm.SVC()

In [78]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [79]:
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [80]:
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

***

## Random Forest

In [81]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9982

Classification Report: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       365
         1.0       0.99      1.00      1.00       181

   micro avg       1.00      1.00      1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546
 

Confusion Matrix: 
 [[364   1]
 [  0 181]] 

Average Accuracy: 	 0.9652
Accuracy SD: 		 0.0153


In [82]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9343

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.92      0.97      0.94        79
         1.0       0.96      0.88      0.92        58

   micro avg       0.93      0.93      0.93       137
   macro avg       0.94      0.93      0.93       137
weighted avg       0.94      0.93      0.93       137
 

Confusion Matrix: 
 [[77  2]
 [ 7 51]] 



***

## XGBoost

In [83]:
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       365
         1.0       1.00      1.00      1.00       181

   micro avg       1.00      1.00      1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546
 

Confusion Matrix: 
 [[365   0]
 [  0 181]] 

Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0215


In [84]:
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9635

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.95      0.99      0.97        79
         1.0       0.98      0.93      0.96        58

   micro avg       0.96      0.96      0.96       137
   macro avg       0.97      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[78  1]
 [ 4 54]] 



***

## Support Vector Machine

In [85]:
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9799

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.98       365
         1.0       0.96      0.98      0.97       181

   micro avg       0.98      0.98      0.98       546
   macro avg       0.98      0.98      0.98       546
weighted avg       0.98      0.98      0.98       546
 

Confusion Matrix: 
 [[358   7]
 [  4 177]] 

Average Accuracy: 	 0.9635
Accuracy SD: 		 0.0199


In [87]:
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.9635

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.96      0.97      0.97        79
         1.0       0.96      0.95      0.96        58

   micro avg       0.96      0.96      0.96       137
   macro avg       0.96      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[77  2]
 [ 3 55]] 

