In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import numpy as np

# Visualization
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus



In [2]:
# Predictor columns, in the order they appear in the data file.
feature_names = [
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

# Full column layout of the file: the class label comes first, then the predictors.
col_names = ["classification"] + feature_names

In [3]:
# The data file carries no header row, so the column names are supplied explicitly.
ds = pd.read_csv(
    "../Datasets/breast-cancer.data",
    names=col_names,
    header=None,
)

In [4]:
# Preview the first five records to confirm the columns were parsed as expected.
ds.head(n=5)

Unnamed: 0,classification,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [5]:
# "deg-malig" is stored as integers; make it categorical so that it is one-hot
# encoded like the string-valued columns instead of being kept as a number.
ds["deg-malig"] = pd.Categorical(ds["deg-malig"])

In [6]:
# Distinct levels present in the "menopause" column.
set(ds["menopause"])

{'ge40', 'lt40', 'premeno'}

In [7]:
# get_dummies is used here only to obtain readable "<column>_<level>" labels
# for the one-hot columns; the encoded values themselves are discarded.
# NOTE(review): these labels are later paired with OneHotEncoder output, which
# presumably orders its columns the same way - confirm if the data changes.
dummy_encoded = pd.get_dummies(ds[feature_names])
one_hot_feature_names = dummy_encoded.columns.values
one_hot_feature_names

array(['age_20-29', 'age_30-39', 'age_40-49', 'age_50-59', 'age_60-69',
       'age_70-79', 'menopause_ge40', 'menopause_lt40',
       'menopause_premeno', 'tumor-size_0-4', 'tumor-size_10-14',
       'tumor-size_15-19', 'tumor-size_20-24', 'tumor-size_25-29',
       'tumor-size_30-34', 'tumor-size_35-39', 'tumor-size_40-44',
       'tumor-size_45-49', 'tumor-size_5-9', 'tumor-size_50-54',
       'inv-nodes_0-2', 'inv-nodes_12-14', 'inv-nodes_15-17',
       'inv-nodes_24-26', 'inv-nodes_3-5', 'inv-nodes_6-8',
       'inv-nodes_9-11', 'node-caps_?', 'node-caps_no', 'node-caps_yes',
       'deg-malig_1', 'deg-malig_2', 'deg-malig_3', 'breast_left',
       'breast_right', 'breast-quad_?', 'breast-quad_central',
       'breast-quad_left_low', 'breast-quad_left_up',
       'breast-quad_right_low', 'breast-quad_right_up', 'irradiat_no',
       'irradiat_yes'], dtype=object)

In [8]:
# Learn the category levels and encode the predictors in one pass, then
# densify the sparse result so rows can be indexed as a plain NumPy array.
enc = preprocessing.OneHotEncoder()
FEATURES = enc.fit_transform(ds[feature_names]).toarray()

In [9]:
# Dense one-hot matrix produced by the encoder: one row per record in ds.
FEATURES

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 1., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [10]:
# Encoder that maps the class strings in "classification" to integer labels.
le = preprocessing.LabelEncoder()
le = le.fit(ds["classification"])

In [11]:
# Integer-encoded target, aligned row-for-row with ds.
# In the recorded run "no-recurrence-events" maps to 0; the other class maps to 1.
LABELS = le.transform(ds["classification"])

In [12]:
# Inspect the encoded target vector (0/1 per record).
LABELS

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Hyperparameters searched: criterion, min_samples_split, max_depth.
# Every combination is scored with leave-one-out cross-validation: the tree is
# refit once per record, each time predicting the single record left out.
# The best setting by accuracy and the best setting by sensitivity are tracked
# separately (names ending in _acc and _sen respectively).

TREE_SEED = 0  # fixed so that repeated runs of the search give the same scores


def leave_one_out_confusion(features, labels, **tree_params):
    """Score one hyperparameter setting with leave-one-out cross-validation.

    Parameters
    ----------
    features : 2-D array with one row per sample.
    labels : 1-D array of 0/1 class labels aligned with ``features``.
    **tree_params : keyword arguments forwarded to ``DecisionTreeClassifier``.

    Returns
    -------
    tuple of int
        ``(true_positive, true_negative, false_positive, false_negative)``,
        with label 1 treated as the positive class.
    """
    all_rows = np.arange(len(features))
    tp = tn = fp = fn = 0
    for held_out in all_rows:
        # Train on every row except the one being predicted.
        train_rows = np.delete(all_rows, held_out)
        tree = DecisionTreeClassifier(**tree_params)
        tree.fit(features[train_rows], labels[train_rows])
        predicted = tree.predict(features[held_out].reshape(1, -1))[0]
        actual = labels[held_out]
        if predicted == actual:
            if actual == 0:
                tn += 1
            else:
                tp += 1
        elif actual == 0:  # predicted 1, true class is 0
            fp += 1
        else:  # predicted 0, true class is 1
            fn += 1
    return tp, tn, fp, fn


N = len(FEATURES)
# FEATURES is one-hot encoded (0/1 columns), so a root-to-leaf path can split
# on each column at most once: no tree can be deeper than the number of
# columns. Larger max_depth values would only rebuild identical trees.
max_useful_depth = min(N - 1, FEATURES.shape[1])

best_accuracy = 0
best_sensitivity = 0
for p_max_depth in range(1, max_useful_depth + 1):
    print ("max_depth set to " + str(p_max_depth))
    for p_min_samples_split in range(2, N):
        for p_criterion in ["gini", "entropy"]:
            true_positive, true_negative, false_positive, false_negative = (
                leave_one_out_confusion(
                    FEATURES,
                    LABELS,
                    criterion=p_criterion,
                    max_depth=p_max_depth,
                    min_samples_split=p_min_samples_split,
                    random_state=TREE_SEED,
                )
            )
            accuracy = 1 - ((false_positive + false_negative) / N)
            positives = true_positive + false_negative
            # Guard against a label vector that contains no positive records.
            sensitivity = true_positive / positives if positives else 0.0

            # Strict comparison: on ties the earliest (simplest) setting is kept.
            if best_accuracy < accuracy:
                best_accuracy = accuracy
                best_min_samples_split_acc = p_min_samples_split
                best_max_depth_acc = p_max_depth
                best_criterion_acc = p_criterion
                print("best_max_depth (accuracy) " + str(best_max_depth_acc))
                print("best_min_samples_split (accuracy) " + str(best_min_samples_split_acc))
                print("best_criterion (accuracy) " + best_criterion_acc)
                print("best ACCURACY so far " + str(best_accuracy))
                print("sensitivity: " + str(sensitivity))
            if best_sensitivity < sensitivity:
                best_sensitivity = sensitivity
                best_min_samples_split_sen = p_min_samples_split
                best_max_depth_sen = p_max_depth
                best_criterion_sen = p_criterion
                print("best_max_depth (sensitivity) " + str(best_max_depth_sen))
                print("best_min_samples_split (sensitivity) " + str(best_min_samples_split_sen))
                print("best_criterion (sensitivity) " + best_criterion_sen)
                print("best SENSITIVITY so far " + str(best_sensitivity))
                print("accuracy: " + str(accuracy))

        
    

max_depth set to 1
best_max_depth (accuracy) 1
best_min_samples_split (accuracy) 2
best_criterion (accuracy) gini
best ACCURACY so far 0.7202797202797202
sensitivity: 0.5294117647058824
best_max_depth (sensitivity) 1
best_min_samples_split (sensitivity) 2
best_criterion (accuracy) gini
best SENSITIVITY so far 0.5294117647058824
accuracy: 0.7202797202797202
max_depth set to 2
best_max_depth (accuracy) 2
best_min_samples_split (accuracy) 2
best_criterion (accuracy) gini
best ACCURACY so far 0.7272727272727273
sensitivity: 0.24705882352941178
best_max_depth (accuracy) 2
best_min_samples_split (accuracy) 2
best_criterion (accuracy) entropy
best ACCURACY so far 0.7342657342657343
sensitivity: 0.24705882352941178


In [None]:
# Refit one tree on the full dataset using the accuracy-optimal settings found
# by the search cell (best_*_acc), then render it with Graphviz.
# To draw the sensitivity-optimal tree instead, swap in the best_*_sen names.
dt = DecisionTreeClassifier(criterion=best_criterion_acc,
                            max_depth=best_max_depth_acc,
                            min_samples_split=best_min_samples_split_acc,
                            random_state=0)  # fixed seed: same tree on every run
dt = dt.fit(FEATURES, LABELS)

# out_file=None makes export_graphviz return the DOT source as a string,
# so no in-memory file object is needed.
dot_data = export_graphviz(dt, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True,
                           feature_names=one_hot_feature_names,
                           class_names=["no-recurrence-events", "RECURRENCE-EVENTS"])
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('dt.png')  # keep a copy on disk next to the notebook
Image(graph.create_png())