## Capstone Project - Wisconsin Breast Cancer Diagnosis Deep Learning Revisited






#### Import necessary libraries

In [59]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import MinMaxScaler

# Import supplementary visualizations code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

#### Load in WBCD dataset

In [60]:
# Load the WBCD dataset. Column names are passed explicitly through `names=`,
# so pandas treats every row of the CSV as data.
headers = [
    "ID",
    "CT", "UCSize", "UCShape", "MA", "SECSize", "BN", "BC", "NN", "Mitoses",
    "Diagnosis",
]
data = pd.read_csv('breast-cancer-wisconsin.csv', names=headers).reset_index(drop=True)

#### Handle missing data

In [61]:
# Missing values are encoded as '?' in the file: turn them into NaN first,
# then fill every NaN with 0.
data = data.replace('?', np.nan).fillna(0)

#### Define a data generation function with random noise

In [62]:
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def generateData(data, mu=0, sigma=0.1):
    """Return a noisy copy of the feature columns of `data` together with its labels.

    Every column except 'ID' and 'Diagnosis' is treated as a feature. Gaussian
    noise drawn from N(mu, sigma) is added to each feature value; the
    'Diagnosis' column is carried over untouched and the 'ID' column is dropped.
    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'ID' and 'Diagnosis'.
    mu : float, optional
        Mean of the Gaussian noise (default 0).
    sigma : float, optional
        Standard deviation of the Gaussian noise (default 0.1).

    Returns
    -------
    pandas.DataFrame
        Same index as `data`; the feature columns (with noise) followed by
        'Diagnosis'.

    Raises
    ------
    KeyError
        If 'ID' or 'Diagnosis' is missing from `data`.
    """
    diagnosis = data['Diagnosis']
    # drop() returns a new frame, so the caller's data is left untouched.
    features = data.drop(['ID', 'Diagnosis'], axis=1)

    # Columns read from the CSV may hold numeric strings; convert what can be
    # converted and leave anything else as it is.
    features = features.apply(_to_numeric_or_keep)

    # Build the noise on the features' own index and columns. DataFrame.add
    # aligns on labels, so a default RangeIndex here would add extra rows
    # instead of perturbing the existing ones whenever `data` is indexed
    # differently.
    noise = pd.DataFrame(
        np.random.normal(mu, sigma, features.shape),
        index=features.index,
        columns=features.columns,
    )

    # fill_value=0 keeps a missing feature value from turning the sum into NaN.
    features_with_noise = features.add(noise, fill_value=0)

    return pd.concat([features_with_noise, diagnosis], axis=1)

#### Add the new generated dataset into the existing dataset

In [63]:
# Two rounds of noisy augmentation; each round appends one generated row per
# existing row, so the row count doubles every time.
# NOTE(review): this runs before the train/test split further down, so noisy
# copies of test rows can end up in the training set -- confirm that is intended.
for _ in range(2):
    new_data = generateData(data)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([data, new_data], ignore_index=True)

# DataFrame.size is the total number of cells (rows * columns), not the row count.
print("data size = ", data.size)

('data size = ', 30756)


#### Scale dataset to the range of [0, 1]

In [64]:
# Columns to rescale to [0, 1]: the "Diagnosis" label plus the nine features.
numerical = [
    "Diagnosis",
    "CT", "UCSize", "UCShape", "MA", "SECSize", "BN", "BC", "NN", "Mitoses",
]

# Fit the scaler on those columns, then overwrite them with the scaled values.
scaler = MinMaxScaler().fit(data[numerical])
data[numerical] = scaler.transform(data[numerical])

#### Separate Labels/Classes from Features

In [None]:
# Features: every column except the record ID and the label.
features = data.drop(['ID', 'Diagnosis'], axis=1)
# Labels: the Diagnosis column on its own.
diagnosis = data.loc[:, 'Diagnosis']

In [None]:
# Pairwise scatter plots of the features, with a KDE on the diagonal.
# The trailing semicolon suppresses the notebook's echo of the return value.
pd.plotting.scatter_matrix(
    features,
    alpha=0.3,
    figsize=(14, 8),
    diagonal='kde',
);

In [None]:
# Compress the feature range with a logarithm. The features were min-max
# scaled to [0, 1] earlier, so log1p (log(1 + x)) is used: plain log(x) sends
# the minimum (0) to -inf, which the cleanup below would turn into 0 -- the
# same value log(1) yields for the maximum, collapsing both ends of the range.
log_features = np.log1p(features)

# Safety net: replace any non-finite result (e.g. from inputs <= -1) with 0.
log_features = log_features.replace([np.inf, -np.inf], np.nan)
log_features = log_features.fillna(0)

# Rescale the transformed features to [0, 1]; this refits the shared scaler
# and returns a numpy ndarray.
log_features = scaler.fit_transform(log_features)

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(pd.DataFrame(log_features), alpha = 0.3, figsize = (14,8), diagonal = 'kde');

# convert numpy ndarray into Pandas DataFrame (columns become 0..n-1)
features = pd.DataFrame(log_features)

#### Split dataset into training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle, then hold out 25% of the rows for testing; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    diagnosis,
    test_size=0.25,
    random_state=42,
)

#### Reindex 

In [None]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original one.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

#### Convert Pandas DataFrame to Numpy ndarray

In [None]:
# Swap each pandas object for its underlying numpy array.
X_train, y_train = X_train.values, y_train.values
X_test, y_test = X_test.values, y_test.values

#### Classify dataset using Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest with a fixed seed; fit() returns the estimator itself.
rfc = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

# Classifier.score() reports mean accuracy on the given data.
score = rfc.score(X_test, y_test)
print("score = ", score)

#### Deep learning

In [None]:
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
import keras
import keras.utils
from keras import utils as np_utils

#### Define NN Architecture

In [None]:
# Layer stack, in order:
#   9 inputs -> Dense(9, relu) -> Dropout(0.5) -> Dense(5, relu) -> Dropout(0.5)
#            -> Dense(1, relu) -> Dense(1, sigmoid)
# NOTE(review): the single-unit ReLU layer just before the sigmoid output is a
# one-neuron bottleneck -- confirm it is intentional.
model = Sequential([
    Dense(9, activation='relu', input_dim=9),
    Dropout(0.5),
    Dense(5, activation='relu', input_shape=(9,)),
    Dropout(0.5),
    Dense(1, activation='relu', input_shape=(5,)),
    Dense(1, activation='sigmoid'),
])

# Print the layer-by-layer summary.
model.summary()

#### Compile NN Model

In [None]:
# Binary cross-entropy loss, RMSprop optimizer, accuracy reported as a metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

#### Train NN Model

In [None]:
# Train for 200 passes over the training set in mini-batches of 10 samples.
model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=200,
)

#### Test NN Model

In [None]:
# evaluate() returns the loss followed by each compiled metric (here: accuracy).
score = model.evaluate(x=X_test, y=y_test, batch_size=10)
print("score = ", score)