## Capstone Project - Winonsin Breast Cancer Diagnosis Deep Learning Revisited






In [281]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
# from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import ShuffleSplit

# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Pretty display for notebooks
%matplotlib inline

#### Load in WBCD dataset

In [282]:
# Load the Boston housing dataset
headers = ["ID","CT","UCSize","UCShape","MA","SECSize","BN","BC","NN","Mitoses","Diagnosis"]
data = pd.read_csv('breast-cancer-wisconsin.csv', names = headers)
data = data.reset_index(drop=True)
print(data.head(n = 10))

        ID  CT  UCSize  UCShape  MA  SECSize  BN  BC  NN  Mitoses  Diagnosis
0  1000025   5       1        1   1        2   1   3   1        1          2
1  1002945   5       4        4   5        7  10   3   2        1          2
2  1015425   3       1        1   1        2   2   3   1        1          2
3  1016277   6       8        8   1        3   4   3   7        1          2
4  1017023   4       1        1   3        2   1   3   1        1          2
5  1017122   8      10       10   8        7  10   9   7        1          4
6  1018099   1       1        1   1        2  10   3   1        1          2
7  1018561   2       1        2   1        2   1   3   1        1          2
8  1033078   2       1        1   1        2   1   1   1        5          2
9  1033078   4       2        1   1        2   1   2   1        1          2


#### Handle missing data

In [283]:
data = data.replace('?', np.nan)
#print(features_with_missing_data)
data = data.fillna(0)
print(data.head(n = 20))

         ID  CT  UCSize  UCShape  MA  SECSize  BN  BC  NN  Mitoses  Diagnosis
0   1000025   5       1        1   1        2   1   3   1        1          2
1   1002945   5       4        4   5        7  10   3   2        1          2
2   1015425   3       1        1   1        2   2   3   1        1          2
3   1016277   6       8        8   1        3   4   3   7        1          2
4   1017023   4       1        1   3        2   1   3   1        1          2
5   1017122   8      10       10   8        7  10   9   7        1          4
6   1018099   1       1        1   1        2  10   3   1        1          2
7   1018561   2       1        2   1        2   1   3   1        1          2
8   1033078   2       1        1   1        2   1   1   1        5          2
9   1033078   4       2        1   1        2   1   2   1        1          2
10  1035283   1       1        1   1        1   1   3   1        1          2
11  1036172   2       1        1   1        2   1   2   1       

#### Scale dataset to the range of [0, 1]

In [284]:
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ["Diagnosis","CT","UCSize","UCShape","MA","SECSize","BN","BC","NN","Mitoses"]
data[numerical] = scaler.fit_transform(data[numerical])
print(data.head(n = 10))

# features[numerical] = scaler.fit_transform(features[numerical])

# Show an example of a record with scaling applied
# print(features.head(n = 10))

        ID        CT    UCSize   UCShape        MA   SECSize   BN        BC  \
0  1000025  0.444444  0.000000  0.000000  0.000000  0.111111  0.1  0.222222   
1  1002945  0.444444  0.333333  0.333333  0.444444  0.666667  1.0  0.222222   
2  1015425  0.222222  0.000000  0.000000  0.000000  0.111111  0.2  0.222222   
3  1016277  0.555556  0.777778  0.777778  0.000000  0.222222  0.4  0.222222   
4  1017023  0.333333  0.000000  0.000000  0.222222  0.111111  0.1  0.222222   
5  1017122  0.777778  1.000000  1.000000  0.777778  0.666667  1.0  0.888889   
6  1018099  0.000000  0.000000  0.000000  0.000000  0.111111  1.0  0.222222   
7  1018561  0.111111  0.000000  0.111111  0.000000  0.111111  0.1  0.222222   
8  1033078  0.111111  0.000000  0.000000  0.000000  0.111111  0.1  0.000000   
9  1033078  0.333333  0.111111  0.000000  0.000000  0.111111  0.1  0.111111   

         NN   Mitoses  Diagnosis  
0  0.000000  0.000000        0.0  
1  0.111111  0.000000        0.0  
2  0.000000  0.000000    

In [285]:
diagnosis = data['Diagnosis']
features = data.drop(['ID','Diagnosis'], axis = 1)
print(features.head(n = 10))
print(diagnosis.head(n = 10))

         CT    UCSize   UCShape        MA   SECSize   BN        BC        NN  \
0  0.444444  0.000000  0.000000  0.000000  0.111111  0.1  0.222222  0.000000   
1  0.444444  0.333333  0.333333  0.444444  0.666667  1.0  0.222222  0.111111   
2  0.222222  0.000000  0.000000  0.000000  0.111111  0.2  0.222222  0.000000   
3  0.555556  0.777778  0.777778  0.000000  0.222222  0.4  0.222222  0.666667   
4  0.333333  0.000000  0.000000  0.222222  0.111111  0.1  0.222222  0.000000   
5  0.777778  1.000000  1.000000  0.777778  0.666667  1.0  0.888889  0.666667   
6  0.000000  0.000000  0.000000  0.000000  0.111111  1.0  0.222222  0.000000   
7  0.111111  0.000000  0.111111  0.000000  0.111111  0.1  0.222222  0.000000   
8  0.111111  0.000000  0.000000  0.000000  0.111111  0.1  0.000000  0.000000   
9  0.333333  0.111111  0.000000  0.000000  0.111111  0.1  0.111111  0.000000   

    Mitoses  
0  0.000000  
1  0.000000  
2  0.000000  
3  0.000000  
4  0.000000  
5  0.000000  
6  0.000000  
7  0.00

#### Split dataset into training and testing datasets

In [288]:
from sklearn.model_selection import train_test_split

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, diagnosis, test_size=0.25, random_state=42)

#### Reindex 

In [289]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

#### Convert Pandas DataFrame to Numpy ndarray
* just need to convert X_train since X_test, y_train, and y_test are already numpy ndarrays.

In [290]:
X_train = X_train.values

#### Classify dataset using Random Forest Classifier

In [291]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(X_train, y_train)
# y_predict = rfc.predict(X_test)

score = rfc.score(X_test, y_test)
print("score = ", score)

('score = ', 0.95999999999999996)


#### Deep learning

In [None]:
from keras.layers import Dense
from keras.models import Sequential
import keras
import keras.utils
from keras import utils as np_utils

In [295]:
model = Sequential()

#Define your architecture.
model.add(Dense(9, activation='relu', input_dim=9))
model.add(Dense(5, activation='relu', input_shape=(9,)))
model.add(Dense(1, activation='relu', input_shape=(5,)))
model.add(Dense(1, activation='sigmoid'))
# model.add(Dense(1, activation='sigmoid', input_shape=(1,)))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_68 (Dense)             (None, 9)                 90        
_________________________________________________________________
dense_69 (Dense)             (None, 5)                 50        
_________________________________________________________________
dense_70 (Dense)             (None, 1)                 6         
_________________________________________________________________
dense_71 (Dense)             (None, 1)                 2         
Total params: 148
Trainable params: 148
Non-trainable params: 0
_________________________________________________________________


In [296]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [297]:
print("X_train: ", X_train)
print("y_train: ", y_train)

('X_train: ', array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.66666667],
       [ 1.        ,  1.        ,  1.        , ...,  0.33333333,
         1.        ,  1.        ],
       [ 0.77777778,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       ..., 
       [ 0.77777778,  0.33333333,  0.66666667, ...,  0.22222222,
         0.88888889,  0.11111111],
       [ 1.        ,  0.77777778,  1.        , ...,  0.44444444,
         0.        ,  0.        ],
       [ 0.33333333,  0.        ,  0.11111111, ...,  0.22222222,
         0.        ,  0.        ]]))
('y_train: ', 0      0.0
1      1.0
2      1.0
3      0.0
4      0.0
5      0.0
6      0.0
7      1.0
8      0.0
9      0.0
10     0.0
11     1.0
12     0.0
13     0.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     0.0
23     1.0
24     1.0
25     1.0
26     1.0
27     1.0
28     1.0
29     0.0
      ... 
494    0.0
495  

In [298]:
model.fit(X_train, y_train, epochs=20, batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x124ff2d10>

In [299]:
score = model.evaluate(X_test, y_test, batch_size=10)

KeyError: '[0 1 2 3 4 5 6 7 8 9] not in index'