In [71]:
#Imports and data imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

#Using UCI Machine Learning Repository's database.
#Using data from the " Breast Cancer Wisconsin (Diagnostic) Data Set" (https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)
#Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 

#Reading CSV Data into Pandas
col_names = [
    "id",
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave_points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_standard_error",
    "texture_standard_error",
    "perimeter_standard_error",
    "area_standard_error",
    "smoothness_standard_error",
    "compactness_standard_error",
    "concavity_standard_error",
    "concave_points_standard_error",
    "symmetry_standard_error",
    "fractal_dimension_standard_error",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave_points_worst",
    "symmetry_worst",
    "fractal_dimension_worst"
]
raw_data = pd.read_csv(
    "./wdbc.data",
    names=col_names,
    na_values="?")


def massage_data(df):
    
    #Drop unknown values
    df = df.dropna()
    
    #One-hot encoding for only the feature
    col = "diagnosis"
    one_hot = pd.get_dummies(df[col])
    df = df.drop(col,axis=1)
    df = df.join(one_hot)
        
    #separating into features and labels
    #ID is irrelevant data, so skip the first column
    features = df[df.columns[1:-2]]
    labels = df[df.columns[-2:]]
    
    #Feature normalization
    min_max_scaler = preprocessing.MinMaxScaler()
    features = min_max_scaler.fit_transform(features)
    
    #Added to make concat in the future work.
    #May or may not be bad practice.
    labels = min_max_scaler.fit_transform(labels) 
    
    return features, labels

all_features, all_labels = massage_data(raw_training)
training_features, testing_features, training_labels, testing_labels = train_test_split(all_features, all_labels, test_size=0.2)


In [69]:
from IPython.display import clear_output


#Training and saving models.
def build_model():
    model = keras.Sequential([
        layers.Flatten(input_shape=(30,)),
        layers.Dropout(0.5),
        layers.Dense(30, activation='elu'),
        layers.Dropout(0.5),
        layers.Dense(30, activation='elu'),
        layers.Dropout(0.5),
        layers.Dense(2, activation='softmax')
    ])

    optimizer = keras.optimizers.Adam(learning_rate=0.01)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    
    model.compile(loss=loss,
                optimizer=optimizer,
                metrics=['accuracy'])
    return model

model = build_model()

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./checkpoints/cp-{epoch:04d}.ckpt", 
    verbose=0, 
    save_freq='epoch')


model.fit(
    training_features,
    training_labels,
    epochs=20,
    callbacks=[checkpoint],
    verbose=0)

clear_output()

#Accuracy in a NN ranges between 0.8 and 1.0.
#When the decision tree is subject to the test split, its best accuracy is worse.
testing_loss, testing_accuracy = model.evaluate(testing_features,testing_labels,verbose=2)

3/3 - 0s - loss: 0.3536 - accuracy: 0.9560


In [70]:
#Predictions on speculative examples.
#These examples are built from the decision tree.
features_df = pd.DataFrame(all_features, columns=col_names[2:])
labels_df = pd.DataFrame(all_labels, columns=['B','M'])
total_normalized_df = pd.concat([features_df, labels_df], axis=1)

#Based on the decision tree in the other file, these should only be Beign.
onlyB = total_normalized_df[total_normalized_df.radius_worst > 0.315]
onlyB = onlyB[onlyB.texture_mean <= 0.216]
onlyB = onlyB[onlyB.compactness_standard_error <= 0.139]

#The model should predict that these are Beign.
features_onlyB = onlyB[onlyB.columns[:-2]]
print(model.predict(features_onlyB))



[[0.99062574 0.00937422]
 [0.99522746 0.0047725 ]
 [0.94895905 0.05104094]
 [0.9964205  0.00357949]
 [0.8991218  0.10087816]
 [0.89266884 0.10733119]
 [0.9917957  0.00820429]
 [0.98639536 0.01360462]]
