# Problem 17

In [20]:
import numpy as np
from numpy import random

def getData(data_size):
    """Draw `data_size` samples from the uniform distribution on [-1, 1)."""
    samples = random.uniform(low=-1, high=1, size=data_size)
    return samples

def sign(data, data_size, noise_rate):
    """Label each point by its sign and flip labels at random.

    Args:
        data: sequence of points.
        data_size: number of uniform draws to make (one per point).
        noise_rate: probability with which each label is flipped.

    Returns:
        An int8 array holding sign(x) for each point, negated wherever
        the point's uniform draw fell below `noise_rate`.
    """
    data = np.array(data)
    # One uniform draw in [0, 1) per point; a label is flipped when its draw
    # is below noise_rate.  Comparing, rather than taking the sign of
    # (draw - noise_rate), can never yield a 0 factor when the draw happens
    # to equal noise_rate.
    flip = random.uniform(size=data_size) < noise_rate
    # Map the boolean mask to a factor: False -> +1, True -> -1.
    noise = 1 - 2 * flip
    return (np.sign(data) * noise).astype(np.int8)
    
def signTest():
    """Sanity-check `sign` at the two deterministic noise rates (0 and 1)."""
    n_points = 10
    points = random.uniform(-1, 1, n_points)
    clean_labels = np.sign(points).astype(np.int8)

    # noise_rate = 0: the labels must equal the plain signs
    assert np.array_equal(sign(points, n_points, 0), clean_labels)

    # noise_rate = 1: every label must be the negated sign
    assert np.array_equal(sign(points, n_points, 1), clean_labels * -1)
    
def getTheta(data):
    """Return the candidate thresholds for a sequence of points.

    The points are padded with -1 on the left and 1 on the right, and the
    midpoint of every adjacent pair is taken, so n points give n + 1
    thresholds.  The points are used in the order given; callers in this
    file pass them sorted.

    Args:
        data: 1-D sequence of points.

    Returns:
        A NumPy array with len(data) + 1 midpoints.
    """
    data = np.array(data)
    padded = np.concatenate([[-1], data, [1]])
    # Average each element with its right neighbour in one vectorised step.
    return (padded[:-1] + padded[1:]) / 2

def getThetaTest():
    """Check `getTheta` on a three-point example."""
    data = np.array([0.1, 0.2, 0.3])

    # the range of data is [-1, 1]
    assert np.all(data >= -1) and np.all(data <= 1)

    theta = getTheta(data)
    # Compare element-wise: a plain sum of signed differences would let
    # errors cancel out and would accept any negative total.
    assert theta.shape == (4,)
    assert np.allclose(theta, [-0.45, 0.15, 0.25, 0.65], rtol=0, atol=1e-10)
    

def findThreshold(data, labels, theta, s):
    """Find the threshold whose ray agrees best with the labels.

    For every candidate threshold t the prediction for a point x is
    s * sign(x - t).  The score of a threshold is the inner product of its
    predictions with `labels`.

    Args:
        data: 1-D sequence of points.
        labels: 1-D sequence of labels, one per point.
        theta: 1-D sequence of candidate thresholds.
        s: direction of the ray, 1 or -1.

    Returns:
        A two-element list [max_innerProduct, threshold_index]: the best
        score and the index into `theta` of the first threshold reaching it.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    theta = np.asarray(theta)

    # Broadcasting builds a (len(theta), len(data)) table: row i holds the
    # predictions of threshold theta[i] for every point.  int64 keeps the
    # scores from wrapping around when the labels are a narrow integer type
    # such as int8 and there are more than 127 points.
    differences = data[np.newaxis, :] - theta[:, np.newaxis]
    predictions = s * np.sign(differences).astype(np.int64)

    # One score per threshold (matrix-vector product), instead of a full
    # threshold-by-threshold matrix of which only the diagonal is needed.
    inner_product = predictions @ labels

    max_innerProduct = np.max(inner_product)
    threshold_index = np.argmax(inner_product)
    return [max_innerProduct, threshold_index]
    
# s = 1 selects the positive ray, s = -1 the negative ray
def calEin(data, labels, theta, s):
    """Return the smallest number of misclassified points over all thresholds.

    Uses the best inner-product score from `findThreshold`.  When every
    prediction and label is +1 or -1, score = correct - wrong and
    len(data) = correct + wrong, so wrong = (len(data) - score) / 2.

    Returns:
        The error count truncated to an int.
    """
    best_score = findThreshold(data, labels, theta, s)[0]

    mistakes = (len(data) - best_score) / 2
    assert mistakes >= 0

    return int(mistakes)

def calEinTest():
    """Print the error count of both ray directions on a five-point sample."""
    points = np.sort([-0.1, 0.1, 0.2, 0.3, -0.4])
    # NOTE(review): the labels are not permuted when the points are sorted,
    # so they pair with the sorted positions -- confirm this is intended.
    labels = [-1, 1, 1, -1, -1]
    thresholds = getTheta(points)
    for direction in (1, -1):
        print(calEin(points, labels, thresholds, direction))
    
        
        

if __name__ == '__main__':
    signTest()
    getThetaTest()
    # calEinTest()
    data_size = 20
    noise_rate = 0.2
    epoch = 5000

    # The builtin float is the float64 dtype; the np.float alias was removed
    # from NumPy, so it must not be used here.
    errors = np.zeros(epoch, dtype=float)
    for i in range(epoch):

        # get Data by uniform distribution in [-1, 1]
        data = np.sort(getData(data_size))

        # Draw the noisy labels once per experiment: both ray directions must
        # be scored on the same dataset for their minimum to be the Ein of
        # the best hypothesis.
        labels = sign(data, data_size, noise_rate)

        # calculate (n + 1) theta from (n) data
        theta = getTheta(data)

        # positive ray
        pos_error = calEin(data, labels, theta, 1)

        # negative ray
        neg_error = calEin(data, labels, theta, -1)

        errors[i] = min(pos_error, neg_error) / data_size

    print("Average Ein is : {}".format(np.mean(errors)))
    

Average Ein is : 0.17021


# Problem 18

In [21]:
import math
def calEout(data, labels, theta, s, noise_rate=0.2):
    """Return the closed-form out-of-sample error of the best ray.

    The threshold is the one `findThreshold` selects for direction `s`.
    The formula assumes the setting used by `getData` and `sign` in this
    file: x uniform on [-1, 1], target sign(x), and each label flipped with
    probability `noise_rate`.  The hypothesis s * sign(x - t) then disagrees
    with the noiseless target with probability mu = |t| / 2 for s = 1 and
    1 - |t| / 2 for s = -1, giving

        Eout = noise_rate * (1 - mu) + (1 - noise_rate) * mu
             = 0.5 + (0.5 - noise_rate) * s * (|t| - 1)

    Args:
        data: 1-D sequence of points.
        labels: labels of the points.
        theta: candidate thresholds.
        s: direction of the ray, 1 or -1.
        noise_rate: label flip probability; the default 0.2 reproduces the
            previously hard-coded coefficient 0.3.

    Returns:
        The out-of-sample error as a float.
    """
    [_, threshold_index] = findThreshold(data, labels, theta, s)
    threshold = theta[threshold_index]
    return 0.5 + (0.5 - noise_rate) * s * (math.fabs(threshold) - 1)

if __name__ == '__main__':

    data_size = 20
    noise_rate = 0.2
    epoch = 5000
    # The builtin float is the float64 dtype; the np.float alias was removed
    # from NumPy, so it must not be used here.
    errors = np.zeros(epoch, dtype=float)
    for i in range(epoch):

        # get Data by uniform distribution in [-1, 1]
        data = np.sort(getData(data_size))

        # One noisy label set per experiment, shared by both ray directions.
        labels = sign(data, data_size, noise_rate)

        # calculate (n + 1) theta from (n) data
        theta = getTheta(data)

        # Pick the ray direction the learning algorithm would choose, i.e.
        # the one with the smaller in-sample error (ties go to the positive
        # ray), and report the Eout of that hypothesis.  Taking the minimum
        # of the two Eout values instead would peek at the quantity being
        # estimated.
        pos_ein = calEin(data, labels, theta, 1)
        neg_ein = calEin(data, labels, theta, -1)
        best_s = 1 if pos_ein <= neg_ein else -1

        errors[i] = calEout(data, labels, theta, best_s)
    print('Eout is : {}'.format(np.mean(errors)))

Eout is : 0.25354118069189935


# Problem 19

In [6]:
def readFile(fileName, typeOfElement):
    """Read a whitespace-separated table from a text file.

    Args:
        fileName: path of the file to read.
        typeOfElement: dtype passed to `np.array` for the parsed fields.

    Returns:
        A NumPy array with one row per non-blank line of the file.
    """
    # The context manager closes the file even if parsing fails.
    with open(fileName) as fp:
        split_lines = (line.split() for line in fp)
        # Blank lines split into an empty list; drop them so the rows stay
        # rectangular and can be converted to an array.
        content = [fields for fields in split_lines if fields]
    return np.array(content, dtype=typeOfElement)

def splitDataAndLabel(rawData, num_features=9):
    """Split each row into its feature values and its label.

    Args:
        rawData: iterable of rows; each row holds the features first and
            the label right after them.
        num_features: number of leading feature columns; the label is read
            from column `num_features`.  Defaults to 9.

    Returns:
        A tuple (points, labels) of NumPy arrays: the first `num_features`
        values of every row, and the value at index `num_features` of every
        row.
    """
    points = np.array([row[0:num_features] for row in rawData])
    labels = np.array([row[num_features] for row in rawData])
    return points, labels
        

In [7]:
fileName = "hw2_train.dat"

# Load the training set and separate the feature columns from the labels
rawData = readFile(fileName, 'float')
points, labels = splitDataAndLabel(rawData)
dims = points.shape[1]  # It's 9 here

# Best in-sample error rate of a one-dimensional ray, per feature
errors = np.zeros(dims)
for dim in range(dims):

    # Order the points of this feature, carrying their labels along;
    # a stable sort keeps tied values in their original order
    column = points[:, dim]
    order = np.argsort(column, kind="stable")
    sorted_data = column[order]
    sorted_labels = labels[order]

    # calculate (n + 1) theta from (n) data
    # NOTE(review): getTheta pads with -1 and 1, i.e. it presumes the
    # feature values lie in [-1, 1] -- confirm for this dataset
    theta = getTheta(sorted_data)

    # error counts of the positive ray (s = 1) and the negative ray (s = -1)
    pos_error = calEin(sorted_data, sorted_labels, theta, 1)
    neg_error = calEin(sorted_data, sorted_labels, theta, -1)

    errors[dim] = np.min([pos_error, neg_error]) / len(sorted_data)

print('Ein is : {}'.format(np.min(errors)))

Ein is : 0.25


# Problem 20

In [None]:
trainFileName = "hw2_train.dat"
fileName = "hw2_test.dat"

# --- Learn the decision stump on the TRAINING set -------------------------
# Searching for the best threshold on the test set itself would report the
# test set's own in-sample error, not the out-of-sample error of the
# learned hypothesis.
train_points, train_labels = splitDataAndLabel(readFile(trainFileName, 'float'))

best_score = None
best_dim = best_s = best_theta = None
for dim in range(train_points.shape[1]):

    # Sort this feature's values, carrying the labels along
    order = np.argsort(train_points[:, dim], kind="stable")
    sorted_data = train_points[order, dim]
    sorted_labels = train_labels[order]

    # calculate (n + 1) theta from (n) data
    theta = getTheta(sorted_data)

    # positive ray (s = 1) and negative ray (s = -1)
    for s in (1, -1):
        score, index = findThreshold(sorted_data, sorted_labels, theta, s)
        # A larger inner product with the labels means fewer training
        # errors; keep the first best (dimension, direction, threshold).
        if best_score is None or score > best_score:
            best_score = score
            best_dim, best_s, best_theta = dim, s, theta[index]

# --- Evaluate that single stump on the TEST set ---------------------------
rawData = readFile(fileName, 'float')
points, labels = splitDataAndLabel(rawData)

predictions = best_s * np.sign(points[:, best_dim] - best_theta)
# A point lying exactly on the threshold is predicted 0 and counts as an error.
test_error = np.mean(predictions != labels)

print('Eout is : {}'.format(test_error))