## Capstone Project - Winonsin Breast Cancer Diagnosis Deep Learning Revisited






In [19]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
# from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import ShuffleSplit

# Import supplementary visualizations code visuals.py
# import visuals as vs

# Pretty display for notebooks
%matplotlib inline

#### Load in WBCD dataset

In [20]:
# Load the Boston housing dataset
headers = ["ID","CT","UCSize","UCShape","MA","SECSize","BN","BC","NN","Mitoses","Diagnosis"]
data = pd.read_csv('breast-cancer-wisconsin.csv', names = headers)
print(data.head(n = 10))

diagnosis = data['Diagnosis']
features_raw = data.drop(['ID','Diagnosis'], axis = 1)
print(features_raw.head(n = 10))

        ID  CT  UCSize  UCShape  MA  SECSize  BN  BC  NN  Mitoses  Diagnosis
0  1000025   5       1        1   1        2   1   3   1        1          2
1  1002945   5       4        4   5        7  10   3   2        1          2
2  1015425   3       1        1   1        2   2   3   1        1          2
3  1016277   6       8        8   1        3   4   3   7        1          2
4  1017023   4       1        1   3        2   1   3   1        1          2
5  1017122   8      10       10   8        7  10   9   7        1          4
6  1018099   1       1        1   1        2  10   3   1        1          2
7  1018561   2       1        2   1        2   1   3   1        1          2
8  1033078   2       1        1   1        2   1   1   1        5          2
9  1033078   4       2        1   1        2   1   2   1        1          2
   CT  UCSize  UCShape  MA  SECSize  BN  BC  NN  Mitoses
0   5       1        1   1        2   1   3   1        1
1   5       4        4   5        7  10

#### Handle missing data 

In [21]:
features_with_missing_data = features_raw.replace('?', np.nan)
#print(features_with_missing_data)
features = features_with_missing_data.fillna(0)
print(features.head(n = 20))

    CT  UCSize  UCShape  MA  SECSize  BN  BC  NN  Mitoses
0    5       1        1   1        2   1   3   1        1
1    5       4        4   5        7  10   3   2        1
2    3       1        1   1        2   2   3   1        1
3    6       8        8   1        3   4   3   7        1
4    4       1        1   3        2   1   3   1        1
5    8      10       10   8        7  10   9   7        1
6    1       1        1   1        2  10   3   1        1
7    2       1        2   1        2   1   3   1        1
8    2       1        1   1        2   1   1   1        5
9    4       2        1   1        2   1   2   1        1
10   1       1        1   1        1   1   3   1        1
11   2       1        1   1        2   1   2   1        1
12   5       3        3   3        2   3   4   4        1
13   1       1        1   1        2   3   3   1        1
14   8       7        5  10        7   9   5   5        4
15   7       4        6   4        6   1   4   3        1
16   4       1

#### Scale dataset to the range of [0, 1]

In [22]:
# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ["CT","UCSize","UCShape","MA","SECSize","BN","BC","NN","Mitoses"]
features[numerical] = scaler.fit_transform(features[numerical])

# Show an example of a record with scaling applied
print(features.head(n = 10))

         CT    UCSize   UCShape        MA   SECSize   BN        BC        NN  \
0  0.444444  0.000000  0.000000  0.000000  0.111111  0.1  0.222222  0.000000   
1  0.444444  0.333333  0.333333  0.444444  0.666667  1.0  0.222222  0.111111   
2  0.222222  0.000000  0.000000  0.000000  0.111111  0.2  0.222222  0.000000   
3  0.555556  0.777778  0.777778  0.000000  0.222222  0.4  0.222222  0.666667   
4  0.333333  0.000000  0.000000  0.222222  0.111111  0.1  0.222222  0.000000   
5  0.777778  1.000000  1.000000  0.777778  0.666667  1.0  0.888889  0.666667   
6  0.000000  0.000000  0.000000  0.000000  0.111111  1.0  0.222222  0.000000   
7  0.111111  0.000000  0.111111  0.000000  0.111111  0.1  0.222222  0.000000   
8  0.111111  0.000000  0.000000  0.000000  0.111111  0.1  0.000000  0.000000   
9  0.333333  0.111111  0.000000  0.000000  0.111111  0.1  0.111111  0.000000   

    Mitoses  
0  0.000000  
1  0.000000  
2  0.000000  
3  0.000000  
4  0.000000  
5  0.000000  
6  0.000000  
7  0.00

#### Split dataset into training and testing datasets

In [23]:
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, diagnosis, test_size=0.25, random_state=42)

#### Classify dataset using Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(X_train, y_train)
# y_predict = rfc.predict(X_test)

score = rfc.score(X_test, y_test)
print("score = ", score)

('score = ', 0.95999999999999996)


#### Deep learning

In [25]:
print(X_train.head())
print(y_train.head())

           CT  UCSize  UCShape        MA   SECSize   BN        BC   NN  \
163  0.000000     0.0      0.0  0.111111  0.000000  0.3  0.000000  0.0   
286  1.000000     1.0      1.0  1.000000  1.000000  1.0  0.333333  1.0   
612  0.777778     1.0      1.0  1.000000  0.555556  1.0  1.000000  1.0   
517  0.000000     0.0      0.0  0.000000  0.000000  0.1  0.111111  0.0   
464  0.333333     0.0      0.0  0.000000  0.111111  0.1  0.000000  0.0   

      Mitoses  
163  0.666667  
286  1.000000  
612  1.000000  
517  0.000000  
464  0.000000  
163    2
286    4
612    4
517    2
464    2
Name: Diagnosis, dtype: int64


In [30]:
from keras.layers import Dense
from keras.models import Sequential
import keras
import keras.utils
from keras import utils as np_utils

In [38]:
# Convert diagnosis to categorical one-hot encoding
one_hot_diagnosis = keras.utils.to_categorical(diagnosis)

In [39]:
model = Sequential()

#Define your architecture.
model.add(Dense(9, activation='relu', input_dim=9))
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 50        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 6         
Total params: 146
Trainable params: 146
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [41]:
model.fit(X_train, y_train, epochs=20, batch_size=10)
score = model.evaluate(X_test, y_test, batch_size=10)

Epoch 1/20


KeyError: '[479 490  72 426 120 328 191 443  36  76] not in index'