In [None]:
# Same dataset loading and plotting approach as Section 2, Q7.

from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Space-separated text file without a header row; the last column is the
# label and every column before it is a feature.
file = "../data/Dbig.txt"
data = pd.read_table(file, sep=' ', header=None)
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# Fixed-seed shuffle of the row positions 0..9999 so the split is the same
# on every run.
random_idx = np.random.RandomState(seed=6).permutation(10000)

# Training-set sizes to evaluate, and the per-size results collected below.
set_list = [32, 128, 512, 2048, 8192]
error_arr, node_arr = [], []

# Everything from shuffled position 8192 onward is the fixed test set
# (1808 rows when the file holds 10000 rows).
test_rows = data.iloc[random_idx[8192:]]
x_test = test_rows.iloc[:, :-1]
y_test = test_rows.iloc[:, -1]

def split_traindata(data, rd_idx, num):
    """Return the first ``num`` shuffled rows of ``data`` as (features, labels).

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; the last column is the label, the rest are features.
    rd_idx : sequence of int
        Shuffled row *positions* into ``data``.
    num : int
        Number of training rows to take from the front of ``rd_idx``.
        Must not exceed 8192, because positions from 8192 onward are
        reserved for the test set.

    Returns
    -------
    (x_train, y_train)
        Feature DataFrame and label Series for the selected rows.

    Raises
    ------
    SystemExit
        If ``num`` is larger than 8192 (after printing a message).
    """
    if num > 8192:
        print("train too large")
        raise SystemExit()
    # rd_idx contains positions, not index labels, so select positionally.
    # Label-based .loc only gives the same rows when the frame happens to
    # carry a default 0..n-1 index.
    data_train = data.iloc[rd_idx[:num]]
    x_train = data_train.iloc[:, :-1]
    y_train = data_train.iloc[:, -1]
    return x_train, y_train

def rate_error(T, x, y):
    """Return the fraction of rows in ``x`` that ``T`` labels differently from ``y``.

    ``T`` is any fitted classifier exposing ``predict``; ``x`` holds the
    features and ``y`` the matching true labels.
    """
    predictions = T.predict(x)
    # Element-wise mismatch flags; their sum is the misclassification count.
    mismatches = predictions != y
    wrong_count = mismatches.sum()
    return wrong_count / len(y)

def plot_boundary(T, low=-1.5, high=1.5):
    """Draw the decision regions of classifier ``T`` on the current axes.

    A square grid covering [low, high) on both axes with a 0.01 step is
    classified by ``T`` and rendered as a translucent filled contour. The
    module-level dataset ``x`` / ``y`` is scattered on the same axes, one
    scatter call per label value (1 first, then 0).
    """
    ticks_0 = np.arange(low, high, 0.01)
    ticks_1 = np.arange(low, high, 0.01)
    xx, yy = np.meshgrid(ticks_0, ticks_1)

    # One row per grid point, columns (x0, x1), so the classifier can label
    # the whole grid in a single predict call.
    grid = pd.DataFrame(np.column_stack((xx.ravel(), yy.ravel())))
    zz = T.predict(grid).reshape(xx.shape)

    is_one = y == 1
    is_zero = y == 0
    plt.scatter(x.iloc[:, 0][is_one], x.iloc[:, 1][is_one])
    plt.scatter(x.iloc[:, 0][is_zero], x.iloc[:, 1][is_zero])
    plt.contourf(xx, yy, zz, alpha=0.5)
    plt.xlabel('x0')
    plt.ylabel('x1')

plt.figure(1, figsize=(15, 10))

# Fit one decision tree per training-set size. Each tree gets its own panel
# in a 2x3 grid, titled with the size actually used for that panel, and its
# node count and test error are recorded in the same order as set_list.
for panel, n_train in enumerate(set_list, start=1):
    x_train, y_train = split_traindata(data, random_idx, n_train)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(x_train, y_train)

    plt.subplot(2, 3, panel)
    plot_boundary(clf)
    node_arr.append(clf.tree_.node_count)
    error_arr.append(rate_error(clf, x_test, y_test))
    plt.title("n = {}".format(n_train))
print(error_arr, node_arr)

# Test error as a function of training-set size.
plt.figure(2)
plt.plot(set_list, error_arr)
plt.xlabel("number of train set")
plt.ylabel("error rate")
