In [11]:
#Imports and data imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing

#Using UCI Machine Learning Repository's database.
#Using data from the " Breast Cancer Wisconsin (Diagnostic) Data Set" (https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)
#Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 

#Reading CSV Data into Pandas
col_names = [
    "id",
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave_points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_standard_error",
    "texture_standard_error",
    "perimeter_standard_error",
    "area_standard_error",
    "smoothness_standard_error",
    "compactness_standard_error",
    "concavity_standard_error",
    "concave_points_standard_error",
    "symmetry_standard_error",
    "fractal_dimension_standard_error",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave_points_worst",
    "symmetry_worst",
    "fractal_dimension_worst"
]
raw_data = pd.read_csv(
    "./wdbc.data",
    names=col_names,
    na_values="?")


def massage_data(df):
    
    #Drop unknown values
    df = df.dropna()
    
    #One-hot encoding for only the feature
    col = "diagnosis"

        
    #separating into features and labels
    #ID is irrelevant data, so skip the first column
    features = df[df.columns[2:]]
    labels = df[df.columns[1:2]]
    
    #Feature normalization
    min_max_scaler = preprocessing.MinMaxScaler()
    features = min_max_scaler.fit_transform(features)
    
    return features, labels

all_features, all_labels = massage_data(raw_data)


ValueError: could not convert string to float: 'M'

In [3]:
from sklearn import tree
import graphviz 

breastCancerTree = tree.DecisionTreeClassifier()
breastCancerTree = breastCancerTree.fit(all_features, all_labels)

#Generate decision tree and output it into a pdf
#notably, this tree has NO testing data, only training data. It may be not be general enough.
dot_data = tree.export_graphviz(
    breastCancerTree, 
    out_file=None,
    feature_names=col_names[2:],
    class_names=["B","M"]) 
graph = graphviz.Source(dot_data)
graph.render("breastCancerTree_All")


'breastCancerTree_All.pdf'

In [4]:
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_validate
import numpy as np
import graphviz 

train_features, test_features, train_labels, test_labels = train_test_split(all_features, all_labels, test_size=0.25)

#This decision tree has a 80-20 training-testing split.
breastCancerTree = tree.DecisionTreeClassifier()
breastCancerTree = breastCancerTree.fit(train_features, train_labels)
scores = cross_validate(breastCancerTree, test_features, test_labels)

#Accuracy appears to be around the low 90s.
print(scores)
print("Average Score:",np.average(scores['test_score']))


#Generate decision tree and output it into a pdf
dot_data = tree.export_graphviz(
    breastCancerTree, 
    out_file=None,
    feature_names=col_names[2:],
    class_names=["B","M"]) 
graph = graphviz.Source(dot_data)
graph.render("breastCancerTree_Split")


{'fit_time': array([0.00618505, 0.00485039, 0.01170325, 0.00343251, 0.00326896]), 'score_time': array([0.00121284, 0.00068116, 0.00053835, 0.00075388, 0.00127625]), 'test_score': array([0.89655172, 0.93103448, 0.86206897, 0.92857143, 0.92857143])}
Average Score: 0.90935960591133


'breastCancerTree_Split.pdf'

In [19]:
features_df = pd.DataFrame(all_features, columns=col_names[2:])
labels_df = all_labels
total_normalized_df = pd.concat([features_df, labels_df], axis=1)


#Based on the decision tree, I expect only Beign diagonsises in this. This is true.
onlyB = total_normalized_df[total_normalized_df.radius_worst > 0.315]
onlyB = onlyB[onlyB.texture_mean <= 0.216]
onlyB = onlyB[onlyB.compactness_standard_error <= 0.139]
print(onlyB)

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
133     0.413129      0.142712        0.402253   0.262227         0.379074   
209     0.392304      0.108218        0.375786   0.246872         0.263519   
347     0.368167      0.170105        0.352982   0.222778         0.326081   
375     0.434900      0.215083        0.431967   0.273595         0.416810   
406     0.433480      0.174163        0.418147   0.278473         0.382053   
472     0.375740      0.176530        0.363900   0.230498         0.255936   
484     0.414075      0.053094        0.407781   0.256076         0.466462   
491     0.514411      0.119040        0.489323   0.359958         0.232464   
508     0.441053      0.202570        0.420911   0.286872         0.375914   

     compactness_mean  concavity_mean  concave_points_mean  symmetry_mean  \
133          0.230783        0.167174             0.294881       0.381818   
209          0.131648        0.138051             0.156909       