In [None]:
# data preprocessing
def _binarize_labels(src_path, dst_path):
    """Copy a label-prefixed text file, rewriting non-'1' labels to '0'.

    Each non-blank line of ``src_path`` is written to ``dst_path`` followed by
    exactly one newline:

    * a line whose first character is ``'1'`` is copied unchanged;
    * any other line has its first two characters replaced by ``'0'``.

    Parameters
    ----------
    src_path : str
        Path of the input file, one record per line.
    dst_path : str
        Path of the output file; it is created or overwritten.

    Notes
    -----
    The two-character slice assumes the non-positive label is a two-character
    token such as ``-1`` -- TODO confirm against the ``.scaled`` files.  A
    label spelled ``+1`` would also take the rewrite branch and become ``0``.
    """
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for line in src:
            # Lines read from a file keep their trailing newline; strip it so
            # the output gets one newline per record instead of a blank line
            # after every record.
            record = line.rstrip('\n')
            if not record.strip():
                # A blank input line is not a record; do not emit a bare "0".
                continue
            if record[0] == '1':
                dst.write(record + '\n')
            else:
                dst.write('0' + record[2:] + '\n')


_binarize_labels('../hw2c/test.scaled', 'test.data')
_binarize_labels('../hw2c/train.scaled', 'train.data')


In [7]:
import numpy as np
import matplotlib.pyplot as plt
import xgboost as bst
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2


In [5]:
# utility functions
def get_error_rate(pred, Y):
    """Return the fraction of positions where ``pred`` differs from ``Y``.

    Parameters
    ----------
    pred : array-like
        Predicted labels (list, tuple, numpy array, pandas Series, ...).
    Y : array-like
        Reference labels with the same number of elements as ``pred``.

    Returns
    -------
    numpy.float64
        Number of mismatching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # Convert first: comparing two plain Python lists with ``!=`` yields a
    # single bool rather than an element-wise result.
    pred_arr = np.asarray(pred).ravel()
    y_arr = np.asarray(Y).ravel()
    if pred_arr.shape != y_arr.shape:
        raise ValueError(
            "pred and Y must have the same number of elements "
            "(got %d and %d)" % (pred_arr.size, y_arr.size)
        )
    if y_arr.size == 0:
        raise ValueError("cannot compute an error rate for empty inputs")
    return np.mean(pred_arr != y_arr)

def generic_clf(Ytrain, Xtrain, Ytest, Xtest, clf):
    """Fit ``clf`` on the training split and score it on both splits.

    Parameters
    ----------
    Ytrain, Xtrain : array-like
        Training labels and features, passed to ``clf.fit(Xtrain, Ytrain)``.
    Ytest, Xtest : array-like
        Held-out labels and features.
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.

    Returns
    -------
    tuple
        ``(train_error, test_error)`` as computed by ``get_error_rate``.
    """
    clf.fit(Xtrain, Ytrain)

    # Predict on both splits before scoring either one.
    predictions = [clf.predict(features) for features in (Xtrain, Xtest)]

    train_error, test_error = (
        get_error_rate(predicted, actual)
        for predicted, actual in zip(predictions, (Ytrain, Ytest))
    )
    return train_error, test_error

def plot_error_rate(err_train, err_test, title):
    """Plot training and test error-rate curves on one figure.

    The two sequences are plotted against their positional index, and a dashed
    red horizontal line is drawn at ``err_test[0]`` as a reference level.

    Parameters
    ----------
    err_train : sequence of float
        Training error rates, one per plotted point.
    err_test : sequence of float
        Test error rates, one per plotted point; must be non-empty because
        its first element sets the reference line.
    title : str
        Figure title.
    """
    df_error = pd.DataFrame([err_train, err_test]).T
    df_error.columns = ['Training', 'Test']
    ax = df_error.plot(linewidth=3, figsize=(8, 6),
                       color=['lightblue', 'darkblue'], grid=True)
    ax.set_xlabel('Number of iterations', fontsize=12)
    ax.set_ylabel('Error rate', fontsize=12)
    ax.set_title(title, fontsize=16)
    # Draw on the Axes returned by DataFrame.plot rather than on pyplot's
    # "current axes", so the line always lands on this figure.
    ax.axhline(y=err_test[0], linewidth=1, color='red', ls='dashed')


In [8]:
# boosting algorithm
def adaboost_clf(Ytrain, Xtrain, Ytest, Xtest, T, clf):
    """Run ``T`` rounds of discrete AdaBoost around ``clf`` and score the ensemble.

    Each round refits ``clf`` on the training data with the current sample
    weights, computes its weighted training error ``err_m``, derives the
    learner weight ``alpha_m = 0.5 * log((1 - err_m) / err_m)``, up-weights
    the misclassified samples and adds ``alpha_m * prediction`` to the
    running train/test scores.  The final prediction is the sign of the
    accumulated score.

    Parameters
    ----------
    Ytrain, Xtrain : array-like
        Training labels and features.
    Ytest, Xtest : array-like
        Held-out labels and features.
    T : int
        Number of boosting rounds.
    clf : object
        Estimator exposing ``fit(X, y, sample_weight=...)`` and
        ``predict(X)``.  It is refitted in place every round.

    Returns
    -------
    tuple
        ``(train_error, test_error)`` of the boosted ensemble, as computed by
        ``get_error_rate``.

    Notes
    -----
    * Predictions are combined with ``np.sign``, so labels are presumably
      encoded as -1 / +1 -- verify against the caller.
    * ``err_m`` is clipped away from 0 and 1.  Without that, a weak learner
      that is perfect (or always wrong) on the weighted sample gives an
      infinite ``alpha_m`` and turns the weights and scores into 0/NaN.
    * Weights are renormalised to sum to 1 after every update.  ``err_m`` is
      already a ratio, so this only rescales ``sample_weight``; it stops the
      weight total from shrinking towards zero over many rounds.
    """
    y_train = np.asarray(Ytrain)
    n_train, n_test = len(Xtrain), len(Xtest)
    w = np.ones(n_train) / n_train
    score_train = np.zeros(n_train)
    score_test = np.zeros(n_test)
    eps = np.finfo(float).eps

    for _ in range(T):
        clf.fit(Xtrain, Ytrain, sample_weight=w)
        pred_train_i = np.asarray(clf.predict(Xtrain))
        pred_test_i = np.asarray(clf.predict(Xtest))

        # Boolean mask of misclassified training samples.
        miss = pred_train_i != y_train

        # Weighted error, kept strictly inside (0, 1) so the log is finite.
        err_m = np.dot(w, miss) / np.sum(w)
        err_m = np.clip(err_m, eps, 1.0 - eps)

        alpha_m = 0.5 * np.log((1.0 - err_m) / err_m)

        # exp(+alpha) for misclassified samples, exp(-alpha) for correct ones.
        w = w * np.exp(alpha_m * np.where(miss, 1.0, -1.0))
        w = w / np.sum(w)

        score_train += alpha_m * pred_train_i
        score_test += alpha_m * pred_test_i

    pred_train, pred_test = np.sign(score_train), np.sign(score_test)
    return get_error_rate(pred_train, Ytrain), get_error_rate(pred_test, Ytest)

def logitboost_clf(Ytrain, Xtrain, Ytest, Xtest, T, clf):
    """Run ``T`` boosting rounds around ``clf`` with per-round weight normalisation.

    Each round refits ``clf`` with the current sample weights, computes the
    weighted training error ``err_m``, derives
    ``alpha_m = 0.5 * log((1 - err_m) / err_m)``, multiplies the weights by
    ``exp(+alpha_m)`` for misclassified samples and ``exp(-alpha_m)`` for
    correct ones, renormalises them to sum to 1, and adds
    ``alpha_m * prediction`` to the running train/test scores.  The final
    prediction is the sign of the accumulated score.

    Parameters
    ----------
    Ytrain, Xtrain : array-like
        Training labels and features.
    Ytest, Xtest : array-like
        Held-out labels and features.
    T : int
        Number of boosting rounds.
    clf : object
        Estimator exposing ``fit(X, y, sample_weight=...)`` and
        ``predict(X)``.  It is refitted in place every round.

    Returns
    -------
    tuple
        ``(train_error, test_error)`` of the boosted ensemble, as computed by
        ``get_error_rate``.

    Notes
    -----
    * NOTE(review): despite the name, the update implemented here is the
      exponential reweighting of discrete AdaBoost plus weight normalisation;
      no logistic-loss working response is computed.  Confirm whether that
      is intended.
    * Predictions are combined with ``np.sign``, so labels are presumably
      encoded as -1 / +1 -- verify against the caller.
    * ``err_m`` is clipped away from 0 and 1.  Without that, a weak learner
      that is perfect (or always wrong) on the weighted sample gives an
      infinite ``alpha_m``, and normalising the resulting weights is 0/0.
    """
    # initializing
    y_train = np.asarray(Ytrain)
    n_train, n_test = len(Xtrain), len(Xtest)
    w = np.ones(n_train) / n_train
    score_train = np.zeros(n_train)
    score_test = np.zeros(n_test)
    eps = np.finfo(float).eps

    for _ in range(T):
        # getting h_t and calculating its weighted error rate
        clf.fit(Xtrain, Ytrain, sample_weight=w)
        pred_train_i = np.asarray(clf.predict(Xtrain))
        pred_test_i = np.asarray(clf.predict(Xtest))

        # Boolean mask of misclassified training samples.
        miss = pred_train_i != y_train

        # Weighted error, kept strictly inside (0, 1) so the log is finite.
        err_m = np.dot(w, miss) / np.sum(w)
        err_m = np.clip(err_m, eps, 1.0 - eps)

        alpha_m = 0.5 * np.log((1.0 - err_m) / err_m)

        # updating w, then renormalising so the weights sum to 1
        w = w * np.exp(alpha_m * np.where(miss, 1.0, -1.0))
        w = w / np.sum(w)

        # accumulate the weighted votes of this round
        score_train += alpha_m * pred_train_i
        score_test += alpha_m * pred_test_i

    pred_train, pred_test = np.sign(score_train), np.sign(score_test)
    return get_error_rate(pred_train, Ytrain), get_error_rate(pred_test, Ytest)

