In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

In [2]:
df = pd.read_csv('transfusion.data',sep=',', header = 0)

In [3]:
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [4]:
df.columns 

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [5]:
df[0:748]

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
5,4,4,1000,4,0
6,2,7,1750,14,1
7,1,12,3000,35,0
8,2,9,2250,22,1
9,5,46,11500,98,1


# test model

In [6]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [8]:
cm = confusion_matrix(y_test, y_pre)
print (cm)


[[164  12]
 [ 33  16]]


In [9]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       176
           1       0.57      0.33      0.42        49

    accuracy                           0.80       225
   macro avg       0.70      0.63      0.65       225
weighted avg       0.78      0.80      0.78       225



# Default  model 

## model 80-20

In [22]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [48]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[147  19]
 [ 42  17]]


In [49]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       166
           1       0.47      0.29      0.36        59

    accuracy                           0.73       225
   macro avg       0.62      0.59      0.59       225
weighted avg       0.70      0.73      0.70       225



## model 60-40

In [50]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4)
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [52]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[208  38]
 [ 36  18]]


In [53]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       246
           1       0.32      0.33      0.33        54

    accuracy                           0.75       300
   macro avg       0.59      0.59      0.59       300
weighted avg       0.76      0.75      0.76       300



## model 40-60

In [31]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.6)
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [33]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[311  32]
 [ 72  34]]


In [34]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       343
           1       0.52      0.32      0.40       106

    accuracy                           0.77       449
   macro avg       0.66      0.61      0.63       449
weighted avg       0.74      0.77      0.75       449



# entropy model

## model 80-20

In [38]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [55]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[106  12]
 [ 25   7]]


In [41]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       110
           1       0.73      0.28      0.40        40

    accuracy                           0.78       150
   macro avg       0.76      0.62      0.63       150
weighted avg       0.77      0.78      0.74       150



## model 60-40

In [42]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [57]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[318  29]
 [ 87  15]]


In [58]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       347
           1       0.34      0.15      0.21       102

    accuracy                           0.74       449
   macro avg       0.56      0.53      0.53       449
weighted avg       0.68      0.74      0.70       449



## model 40-60

In [59]:
X = df.values[:, 0:3]
Y = df.values[:,4]

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.6)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None)
fit = clf.fit(X_train, y_train)
y_pre = fit.predict(X_test)

In [62]:
cm = confusion_matrix(y_test, y_pre)
print (cm)

[[211  20]
 [ 53  16]]


In [63]:
print (classification_report(y_test, y_pre))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       231
           1       0.44      0.23      0.30        69

    accuracy                           0.76       300
   macro avg       0.62      0.57      0.58       300
weighted avg       0.72      0.76      0.73       300

