In [1]:
import numpy as np
import random 

## Generate data

In [6]:
def sign(value):
    """Return +1 for a non-negative value and -1 otherwise.

    Zero is treated as positive, so ``sign(0) == 1``.
    """
    return 1 if value >= 0 else -1

def generate_uniform_data(start, end, size, noice_rate):
    """Draw a noisy 1-D sample whose clean label is the sign of each point.

    Parameters
    ----------
    start, end : float
        Bounds passed to ``random.uniform`` for every x value.
    size : int
        Number of points to generate.
    noice_rate : float
        Probability with which each label is flipped.

    Returns
    -------
    (x, y) : tuple of lists
        ``x`` holds the sampled points; ``y`` holds labels in {-1, +1}, equal
        to ``sign(x[i])`` unless that label was flipped by noise.
    """
    x = [random.uniform(start, end) for _ in range(size)]
    clean_labels = [sign(point) for point in x]

    # One uniform draw per label decides whether it is flipped. All x values
    # are drawn before any noise draw, so a seeded run yields the same sample
    # as the previous implementation.
    # A conditional expression replaces the `cond and a or b` trick, which
    # silently breaks whenever the "true" branch is falsy.
    y = [-label if random.uniform(0, 1) < noice_rate else label
         for label in clean_labels]

    return x, y

def evaluate(x, y, theta, pn):
    """Return the error rate of the decision stump ``sign(pn * (x - theta))``.

    Parameters
    ----------
    x : sequence of numbers
        Sample points.
    y : sequence
        Labels, indexed in parallel with ``x``.
    theta : number
        Stump threshold.
    pn : number
        Stump direction multiplier (callers in this file pass -1 or +1).

    Returns
    -------
    float
        Number of mismatching predictions divided by ``len(y)``.
    """
    # Index into y rather than zipping so that mismatched lengths behave
    # exactly as an index-based loop would.
    mistakes = sum(
        1
        for idx, point in enumerate(x)
        if y[idx] != sign(pn * (point - theta))
    )
    return mistakes / len(y)

def compute_Eout(theta, noice_rate, pn):
    """Closed-form out-of-sample error of a stump with threshold ``theta``.

    Evaluates ``0.5 + pn * (|theta| - 1) * ((1 - noice_rate) - 0.5)``.

    NOTE(review): this expression appears to assume the setup used elsewhere
    in this notebook (x uniform on [-1, 1], target sign(x), labels flipped
    with probability ``noice_rate``) -- confirm before reusing it elsewhere.

    Parameters
    ----------
    theta : number
        Stump threshold; only its absolute value is used.
    noice_rate : number
        Label-flip probability.
    pn : number
        Stump direction multiplier (callers in this file pass -1 or +1).
    """
    distance_from_edge = abs(theta) - 1
    keep_margin = (1 - noice_rate) - 0.5
    return 0.5 + pn * distance_from_edge * keep_margin

## Question 17/18

In [83]:
def stump_algorithm(x, y, noice_rate=0.2):
    """Search for the decision stump with the lowest in-sample error.

    Every value in ``x`` (in sorted order) is tried as a threshold, with both
    directions -1 and +1. A candidate replaces the current best only when its
    in-sample error is strictly lower, so the first of several tied
    candidates is kept.

    Parameters
    ----------
    x, y : sequences
        Sample points and their labels.
    noice_rate : float, optional
        Label-flip probability forwarded to ``compute_Eout``.

    Returns
    -------
    (best_Ein, best_Eout) : tuple
        In-sample error of the chosen stump and its closed-form out-of-sample
        error. Both remain 1 if no candidate has an in-sample error below 1.
    """
    best_Ein, best_Eout = 1, 1

    for threshold in np.sort(x):
        for direction in (-1, 1):
            candidate_Ein = evaluate(x, y, threshold, direction)

            if candidate_Ein < best_Ein:
                best_Ein = candidate_Ein
                best_Eout = compute_Eout(threshold, noice_rate, direction)

    return best_Ein, best_Eout

In [84]:
def measurement(iterations, noice_rate):
    """Average the stump's in-sample and out-of-sample error over many runs.

    Each run draws 20 points uniformly from [-1, 1] with label noise
    ``noice_rate``, fits a stump with ``stump_algorithm`` and accumulates the
    two errors. Progress is printed every 500 runs.

    Parameters
    ----------
    iterations : int
        Number of independent runs; must be positive, because the sums are
        divided by it.
    noice_rate : float
        Label-flip probability, used both to generate the data and to compute
        the closed-form out-of-sample error.

    Returns
    -------
    (mean_Ein, mean_Eout) : tuple of floats
    """
    Ein_sum = 0
    Eout_sum = 0

    # `round_idx` instead of `iter`, to avoid shadowing the builtin.
    for round_idx in range(iterations):
        # Forward the caller's noise rate: it used to be hard-coded to 0.2
        # here, so the `noice_rate` argument had no effect.
        x, y = generate_uniform_data(-1, 1, 20, noice_rate)
        Ein, Eout = stump_algorithm(x, y, noice_rate=noice_rate)
        Ein_sum += Ein
        Eout_sum += Eout

        if round_idx % 500 == 0:
            print("-----------------------")
            print("Round {}: \nEin: {}\nEout: {}".format(round_idx, Ein, Eout))

    return Ein_sum/iterations, Eout_sum/iterations

In [85]:
# Average the stump errors over 5000 independent runs with 20% label noise,
# then show the mean in-sample error followed by the mean out-of-sample error.
Ein, Eout = measurement(iterations=5000, noice_rate=0.2)

print(Ein, Eout, sep="\n")

-----------------------
Round 0: 
Ein: 0.1
Eout: 0.2538189356874775
-----------------------
Round 500: 
Ein: 0.25
Eout: 0.20729175539987288
-----------------------
Round 1000: 
Ein: 0.15
Eout: 0.4009782628251082
-----------------------
Round 1500: 
Ein: 0.15
Eout: 0.3226857872551057
-----------------------
Round 2000: 
Ein: 0.2
Eout: 0.31637553161737286
-----------------------
Round 2500: 
Ein: 0.35
Eout: 0.2905572210846784
-----------------------
Round 3000: 
Ein: 0.2
Eout: 0.2086976881482293
-----------------------
Round 3500: 
Ein: 0.2
Eout: 0.24310421831233664
-----------------------
Round 4000: 
Ein: 0.05
Eout: 0.26292615607373687
-----------------------
Round 4500: 
Ein: 0.15
Eout: 0.20486752220840426
0.17044000000000095
0.26154228002831953


## Question 19/20

In [7]:
def read_data(fileName):
    """Load a whitespace-separated numeric data file into features and labels.

    Every non-blank line is split on whitespace and converted to floats. The
    last column is returned as the label vector and all preceding columns as
    the feature matrix.

    Parameters
    ----------
    fileName : str
        Path of the text file to read.

    Returns
    -------
    x : numpy.ndarray
        2-D array holding all columns but the last.
    y : numpy.ndarray
        1-D array holding the last column.
    """
    # The context manager closes the file deterministically; it used to be
    # left open until garbage collection.
    with open(fileName) as data_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise produce a ragged row list and make the float
        # conversion fail.
        rows = [line.split() for line in data_file if line.strip()]

    content = np.asarray(rows, dtype='float')
    x = content[:, :-1]
    y = content[:, -1:].flatten()

    return x, y

In [8]:
def multi_dimension_stump(x, y):
    """Find the best decision stump for a single feature column.

    Every value in ``x`` (in sorted order) is tried as a threshold, with both
    directions -1 and +1. A candidate replaces the current best only when its
    in-sample error is strictly lower, so the first of several tied
    candidates is kept.

    Parameters
    ----------
    x : sequence of numbers
        Values of one feature.
    y : sequence
        Labels, indexed in parallel with ``x``.

    Returns
    -------
    (best_Ein, best_theta, best_pn) : tuple
        Lowest in-sample error found and the threshold and direction that
        achieve it. The defaults ``(1, 0, 0)`` are returned if no candidate
        has an in-sample error below 1.
    """
    best_Ein, best_theta, best_pn = 1, 0, 0

    for threshold in np.sort(x):
        for direction in (-1, 1):
            candidate_Ein = evaluate(x, y, threshold, direction)

            if candidate_Ein < best_Ein:
                best_Ein, best_theta, best_pn = candidate_Ein, threshold, direction

    return best_Ein, best_theta, best_pn

In [20]:
def multi_dimension_measurement(train_file, test_file):
    """Fit the best single-feature stump on a training file and test it.

    A stump is fitted to each feature column of the training data with
    ``multi_dimension_stump``. The column with the strictly lowest in-sample
    error is kept (the earliest column wins ties) and that stump is then
    evaluated on the same column of the test data.

    Parameters
    ----------
    train_file, test_file : str
        Paths handed to ``read_data``.

    Returns
    -------
    (best_Ein, Eout) : tuple
        In-sample error of the selected stump and its error rate on the test
        data.
    """
    X_train, y_train = read_data(train_file)
    X_test, y_test = read_data(test_file)

    # Running best: error, feature index, threshold, direction.
    best_Ein, best_dimension, best_theta, best_pn = 1, 0, 0, 0

    n_features = len(X_train[0])
    for dim in range(n_features):
        Ein, theta, pn = multi_dimension_stump(X_train[:, dim], y_train)

        if Ein < best_Ein:
            best_Ein, best_dimension, best_theta, best_pn = Ein, dim, theta, pn

    Eout = evaluate(X_test[:, best_dimension], y_test, best_theta, best_pn)

    return best_Ein, Eout

In [21]:
# Data files are given as relative paths, so they are resolved against the
# notebook's working directory.
train_file = "hw2_train.dat.txt"
test_file = "hw2_test.dat.txt"
Ein, Eout = multi_dimension_measurement(train_file=train_file, test_file=test_file)

# Report the training error of the selected stump and its test error.
print(
    "------------------",
    "Result:",
    "Ein:  {}".format(Ein),
    "Eout: {}".format(Eout),
    sep="\n",
)

------------------
Result:
Ein:  0.25
Eout: 0.355
