In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw CSV. header=None tells pandas the file has no header row, so
# the four column names are supplied explicitly through `names`.
survival = pd.read_csv(
    'survival_data.csv',
    sep=',',
    header=None,
    names=['Patient_Age', 'Year_of_Operation', 'Number_of_Nodes', 'Survival_Status'])

In [3]:
# Dataset dimensions as (rows, columns). The call form of print behaves the
# same on Python 2 for a single argument and also runs on Python 3, where the
# bare print statement is a SyntaxError.
print(survival.shape)

(306, 4)


In [4]:
# Show the first five rows to check that the supplied column names line up.
survival.head(n=5)

Unnamed: 0,Patient_Age,Year_of_Operation,Number_of_Nodes,Survival_Status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [5]:
# Number of rows carrying each Survival_Status label (class balance check).
survival['Survival_Status'].value_counts()

1    225
2     81
Name: Survival_Status, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Feature matrix: the three predictor columns, i.e. everything except the label.
survival_data = survival.loc[:, ['Patient_Age', 'Year_of_Operation', 'Number_of_Nodes']]
survival_data.head(n=5)

Unnamed: 0,Patient_Age,Year_of_Operation,Number_of_Nodes
0,30,64,1
1,30,62,3
2,30,65,0
3,31,59,2
4,31,65,4


In [8]:
# Label column. Selecting with a list keeps this a one-column DataFrame (not a
# Series), which later cells rely on when they read `y_train.Survival_Status`.
survival_target = survival.loc[:, ['Survival_Status']]
survival_target.head(n=5)

Unnamed: 0,Survival_Status
0,1
1,1
2,1
3,1
4,1


In [9]:
# Hold out 40% of the rows for evaluation. The fixed random_state keeps the
# shuffle, and therefore the split, identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    survival_data, survival_target, test_size=0.4, random_state=0)

In [10]:
# First five training feature rows; the index shows the rows were shuffled.
x_train.head(n=5)

Unnamed: 0,Patient_Age,Year_of_Operation,Number_of_Nodes
182,55,58,1
168,54,65,23
33,38,67,5
83,45,60,0
218,59,64,1


In [11]:
# Training labels; the index should match the x_train rows shown above.
y_train.head(n=5)

Unnamed: 0,Survival_Status
182,1
168,2
33,1
83,1
218,1


In [12]:
# Class balance inside the training split.
y_train['Survival_Status'].value_counts()

1    144
2     39
Name: Survival_Status, dtype: int64

In [13]:
# First five held-out feature rows.
x_test.head(n=5)

Unnamed: 0,Patient_Age,Year_of_Operation,Number_of_Nodes
274,67,63,1
66,43,63,14
258,65,58,0
210,58,61,1
159,53,60,9


In [14]:
# Held-out labels; the index should match the x_test rows shown above.
y_test.head(n=5)

Unnamed: 0,Survival_Status
274,2
66,1
258,2
210,1
159,2


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters.
# random_state is pinned because the learner randomly permutes the candidate
# features at every split; without a seed, ties between equally good splits
# can resolve differently from run to run and change the metrics below.
clf = DecisionTreeClassifier(random_state=0)
# fit() returns the estimator itself, so `fit` is just another name for `clf`.
fit = clf.fit(x_train, y_train)
# Predicted Survival_Status for every held-out row.
y_pre = fit.predict(x_test)

In [16]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline tree: rows are true labels, columns are
# predicted labels.
cm = confusion_matrix(y_test, y_pre)
# Call form of print: same output on Python 2, and valid on Python 3.
print(cm)

[[61 20]
 [31 11]]


In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the baseline tree.
# Call form of print: same output on Python 2, and valid on Python 3.
print(classification_report(y_test, y_pre))

             precision    recall  f1-score   support

          1       0.66      0.75      0.71        81
          2       0.35      0.26      0.30        42

avg / total       0.56      0.59      0.57       123



In [18]:
from sklearn import tree

# Write the baseline tree as Graphviz DOT source to survival_tree.dot.
# export_graphviz writes straight into the handle and returns None when
# out_file is given, so its result is deliberately not assigned: rebinding
# `f` would shadow the open file handle with None inside the with-block.
with open('survival_tree.dot', 'w') as f:
    tree.export_graphviz(clf, out_file=f,
        feature_names=['Patient_Age', 'Year_of_Operation', 'Number_of_Nodes'],
        rounded=True, special_characters=True, filled=True)

In [19]:
# Second tree: an internal node needs at least 20 samples before it may be
# split, which curbs very small, over-fitted branches.
# random_state is pinned so tie-breaking between equally good splits is
# repeatable and the metrics below do not drift between runs.
clf_2 = DecisionTreeClassifier(min_samples_split=20, random_state=0)
# fit() returns the estimator itself, so `fit_2` is another name for `clf_2`.
fit_2 = clf_2.fit(x_train, y_train)
y_pre_2 = fit_2.predict(x_test)

In [20]:
# Confusion matrix of the min_samples_split=20 tree (rows: true, columns: predicted).
cm_2 = confusion_matrix(y_test, y_pre_2)
# Call form of print: same output on Python 2, and valid on Python 3.
print(cm_2)

[[68 13]
 [34  8]]


In [21]:
# Per-class precision / recall / F1 for the min_samples_split=20 tree.
# Call form of print: same output on Python 2, and valid on Python 3.
print(classification_report(y_test, y_pre_2))

             precision    recall  f1-score   support

          1       0.67      0.84      0.74        81
          2       0.38      0.19      0.25        42

avg / total       0.57      0.62      0.58       123



In [22]:
# Write the min_samples_split=20 tree as Graphviz DOT source.
# export_graphviz returns None when out_file is given, so its result is not
# assigned: rebinding `f` would shadow the open file handle with None.
with open('survival_tree_2.dot', 'w') as f:
    tree.export_graphviz(clf_2, out_file=f,
        feature_names=['Patient_Age', 'Year_of_Operation', 'Number_of_Nodes'],
        rounded=True, special_characters=True, filled=True)

In [23]:
# Third tree: depth capped at 5 levels to limit over-fitting.
# random_state is pinned so tie-breaking between equally good splits is
# repeatable and the confusion matrix does not drift between runs.
clf_3 = DecisionTreeClassifier(max_depth=5, random_state=0)
# fit() returns the estimator itself, so `fit_3` is another name for `clf_3`.
fit_3 = clf_3.fit(x_train, y_train)
y_pre_3 = fit_3.predict(x_test)
# Rows are true labels, columns are predicted labels.
cm_3 = confusion_matrix(y_test, y_pre_3)
# Call form of print: same output on Python 2, and valid on Python 3.
print(cm_3)

[[68 13]
 [32 10]]


In [24]:
# Per-class precision / recall / F1 for the max_depth=5 tree.
# Call form of print: same output on Python 2, and valid on Python 3.
print(classification_report(y_test, y_pre_3))

             precision    recall  f1-score   support

          1       0.68      0.84      0.75        81
          2       0.43      0.24      0.31        42

avg / total       0.60      0.63      0.60       123



In [25]:
# Write the max_depth=5 tree as Graphviz DOT source.
# export_graphviz returns None when out_file is given, so its result is not
# assigned: rebinding `f` would shadow the open file handle with None.
with open('survival_tree_3.dot', 'w') as f:
    tree.export_graphviz(clf_3, out_file=f,
        feature_names=['Patient_Age', 'Year_of_Operation', 'Number_of_Nodes'],
        rounded=True, special_characters=True, filled=True)