In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Load dataset
filename = 'breast-cancer-wisconsin.data'
# Entries that genfromtxt cannot parse as numbers come back as NaN.
raw = np.genfromtxt(filename, delimiter=',')

# NaN means missing data, need to fill it.
# Every NaN in columns 0-9 (ID + features) is replaced by the mean of the
# observed (non-NaN) values of its own column. Column 10 (the labels) is
# deliberately left untouched: a missing label must not be turned into a
# made-up class value.
n_imputed_cols = 10
col_means = np.nanmean(raw[:, :n_imputed_cols], axis=0)
nan_rows, nan_cols = np.nonzero(np.isnan(raw[:, :n_imputed_cols]))
raw[nan_rows, nan_cols] = col_means[nan_cols]

# Sample features: columns 1-9, so df_X has shape (n_samples, 9).
# We don't need ID (column 0, not a feature)
df_X = raw[:, 1:10]
# Sample labels are in column 10, so df_y has shape (n_samples,).
df_y = raw[:, 10]

In [3]:
# Number of independent random train/test splits the results are averaged over.
repeat_times = 50

# d_size: number of samples, dim: number of feature columns in df_X.
d_size, dim = np.shape(df_X)

In [4]:
d_size = len(df_y)
# These lengths size the buffers that later receive the output of
# train_test_split(test_size=0.3), so they must follow scikit-learn's rule:
# the test set gets ceil(0.3 * n) samples and the training set the remainder.
# Rounding both parts independently can disagree with that (n = 11 would give
# 8 + 3 here while the split yields 7 + 4) and break the array assignment.
test_len = int(np.ceil(d_size * 0.3))
train_len = d_size - test_len

In [5]:
# Zero-filled buffers with one slot per repetition. Feature buffers are indexed
# (repetition, sample, feature); label buffers are indexed (repetition, sample).
train_X_shape = (repeat_times, train_len, dim)
test_X_shape = (repeat_times, test_len, dim)

all_train_X = np.zeros(train_X_shape)
all_test_X = np.zeros(test_X_shape)
all_train_y = np.zeros(train_X_shape[:2])
all_test_y = np.zeros(test_X_shape[:2])

In [6]:
# Draw `repeat_times` stratified 70/30 splits and store them, so that every
# experiment run afterwards is evaluated on exactly the same partitions.
for i in range(repeat_times):
    X_train, X_test, y_train, y_test = train_test_split(
        df_X,
        df_y,
        test_size=0.3,
        stratify=df_y,
        # Seed with the repetition index: each repetition gets a different
        # split, but re-running the notebook reproduces the same splits.
        random_state=i,
    )
    all_train_X[i, :, :] = X_train
    all_test_X[i, :, :] = X_test
    all_train_y[i, :] = y_train
    all_test_y[i, :] = y_test

In [7]:

# AdaBoost with depth-2 decision trees as weak learners: for every ensemble
# size, fit on each stored split and print the mean train / test accuracy.
for est_no in range(5, 105, 10):  # number of weak classifiers (trees)
    train_acc = 0  # running sum of training accuracy over the repetitions
    acc = 0  # running sum of test accuracy over the repetitions
    for i in range(repeat_times):
        X_train = all_train_X[i, :, :]
        X_test = all_test_X[i, :, :]
        y_train = all_train_y[i, :]
        y_test = all_test_y[i, :]
        # NOTE(review): the earlier comment said to increase depth if execution
        # time is too long; deeper trees generally take longer to fit, so
        # lowering max_depth / repeat_times is the likelier remedy - confirm.
        ada = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=est_no,
            # Seed the weak learners so re-running the cell on the same
            # splits reproduces the same accuracies.
            random_state=i,
        )
        ada.fit(X_train, y_train)
        train_acc += ada.score(X_train, y_train)
        acc += ada.score(X_test, y_test)

    # Turn the accumulated sums into means over all repetitions.
    train_acc = train_acc / repeat_times
    acc = acc / repeat_times
    print('Number of estimator = %d Train acc = %.3f test acc = %.3f' % (est_no, train_acc, acc))

Number of estimator = 5 Train acc = 0.975 test acc = 0.954
Number of estimator = 15 Train acc = 1.000 test acc = 0.954
Number of estimator = 25 Train acc = 1.000 test acc = 0.957
Number of estimator = 35 Train acc = 1.000 test acc = 0.957
Number of estimator = 45 Train acc = 1.000 test acc = 0.959
Number of estimator = 55 Train acc = 1.000 test acc = 0.960
Number of estimator = 65 Train acc = 1.000 test acc = 0.962
Number of estimator = 75 Train acc = 1.000 test acc = 0.961
Number of estimator = 85 Train acc = 1.000 test acc = 0.962
Number of estimator = 95 Train acc = 1.000 test acc = 0.963


In [8]:
# AdaBoost with depth-1 decision trees (stumps) as weak learners: for every
# ensemble size, fit on each stored split and print the mean train / test
# accuracy.
for est_no in range(5, 105, 10):  # number of weak classifiers (trees)
    train_acc = 0  # running sum of training accuracy over the repetitions
    acc = 0  # running sum of test accuracy over the repetitions
    for i in range(repeat_times):
        X_train = all_train_X[i, :, :]
        X_test = all_test_X[i, :, :]
        y_train = all_train_y[i, :]
        y_test = all_test_y[i, :]
        # NOTE(review): the earlier comment said to increase depth if execution
        # time is too long; deeper trees generally take longer to fit, so
        # lowering repeat_times is the likelier remedy here - confirm.
        ada = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1),
            n_estimators=est_no,
            # Seed the weak learners so re-running the cell on the same
            # splits reproduces the same accuracies.
            random_state=i,
        )
        ada.fit(X_train, y_train)
        train_acc += ada.score(X_train, y_train)
        acc += ada.score(X_test, y_test)

    # Turn the accumulated sums into means over all repetitions.
    train_acc = train_acc / repeat_times
    acc = acc / repeat_times
    print('Number of estimator = %d Train acc = %.3f test acc = %.3f' % (est_no, train_acc, acc))

Number of estimator = 5 Train acc = 0.956 test acc = 0.946
Number of estimator = 15 Train acc = 0.971 test acc = 0.953
Number of estimator = 25 Train acc = 0.981 test acc = 0.954
Number of estimator = 35 Train acc = 0.987 test acc = 0.954
Number of estimator = 45 Train acc = 0.992 test acc = 0.954
Number of estimator = 55 Train acc = 0.995 test acc = 0.955
Number of estimator = 65 Train acc = 0.997 test acc = 0.954
Number of estimator = 75 Train acc = 0.998 test acc = 0.953
Number of estimator = 85 Train acc = 0.999 test acc = 0.954
Number of estimator = 95 Train acc = 1.000 test acc = 0.953


### 1. depth = 2 時，弱分類器 = 15 即可達到 no training error（train acc = 1.000）。
### 2. depth = 2 時，弱分類器越多，test acc 還是會微幅上升（0.954 → 0.963）；depth = 1 時，test acc 則大致持平（約 0.954）。
### 3. depth = 1 時，要到弱分類器 = 95 才達到 no training error（train acc = 1.000）。