In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import copy

## Part I SVM implementation

In [None]:
# Load the Online News Popularity table and discard the columns at positions
# 0-1, 4-6 and 13-38; what remains is converted to a plain NumPy matrix.
df = pd.read_csv('./OnlineNewsPopularity.csv')
dropped_positions = [*range(0, 2), *range(4, 7), *range(13, 39)]
df = df.drop(columns=df.columns[dropped_positions])
dataset = np.array(df)

In [None]:
def split_attribute_and_label(data):
    """Separate a 2-D sample matrix into features and labels.

    Every column except the last is returned as the feature matrix ``x``;
    the last column is returned as the label vector ``y``.
    """
    feature_columns = slice(None, -1)
    return data[:, feature_columns], data[:, -1]
def convert_label(threshold,label):
    """Binarise labels against ``threshold``.

    Returns a deep copy of ``label`` in which every entry that is greater
    than or equal to ``threshold`` becomes 1 and every other entry becomes
    -1. The input itself is left untouched.
    """
    new_label = copy.deepcopy(label)
    for position, value in enumerate(label):
        new_label[position] = 1 if value >= threshold else -1
    return new_label

In [None]:
# Shuffle the rows reproducibly, then carve out three disjoint 1000-row splits.
np.random.seed(42)
# permutation() returns a shuffled copy and leaves `dataset` untouched, so
# re-running this cell always produces the same splits (an in-place shuffle
# would reshuffle already-shuffled data on every re-run).
shuffled = np.random.permutation(dataset)
train = shuffled[:1000]
dev = shuffled[1000:2000]
# The test split starts right after dev at row 2000; starting at 2001 silently
# skipped one row.
test = shuffled[2000:3000]
f'the train size is {train.shape[0]}, the dev shape is {dev.shape[0]}, and the test shape is {test.shape[0]}'

In [None]:
# Split each partition into its feature matrix and its label vector.
(train_x, train_y), (dev_x, dev_y), (test_x, test_y) = (
    split_attribute_and_label(part) for part in (train, dev, test)
)

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the train, dev and test features.
stdScaler = StandardScaler().fit(train_x)
centered_train, centered_dev, centered_test = (
    stdScaler.transform(part) for part in (train_x, dev_x, test_x)
)

In [None]:
# Binarise the labels of every split: +1 at or above 2000, -1 below it.
conv_train_y, conv_dev_y, conv_test_y = (
    convert_label(2000, labels) for labels in (train_y, dev_y, test_y)
)

In [None]:
def generate_kernel(dataset,kernel,spread = None,dataset1 = None):
    """Build the kernel (Gram) matrix between the rows of two sample matrices.

    Parameters
    ----------
    dataset : 2-D array-like, shape (n, d)
        Samples indexing the rows of the result.
    kernel : {'linear', 'gaussian'}
        Kernel to evaluate. 'linear' is the inner product of two rows;
        'gaussian' is ``exp(-||x - y||**2 / (2 * spread))``.
    spread : float, optional
        Denominator scale of the gaussian kernel (used as-is, not squared).
        Required when ``kernel == 'gaussian'``; ignored for 'linear'.
    dataset1 : 2-D array-like, shape (m, d), optional
        Samples indexing the columns of the result. Defaults to ``dataset``.

    Returns
    -------
    numpy.ndarray, shape (n, m)
        Entry ``[i, j]`` is the kernel of ``dataset[i]`` and ``dataset1[j]``.

    Raises
    ------
    ValueError
        If ``kernel`` is not a supported name, or if the gaussian kernel is
        requested without a ``spread``.
    """
    left = np.asarray(dataset, dtype=float)
    # `is None`, not `== None`: comparing an ndarray with `==` is elementwise,
    # and using that result in an `if` raises ValueError.
    same_data = dataset1 is None
    right = left if same_data else np.asarray(dataset1, dtype=float)

    if kernel == 'linear':
        return left @ right.T

    if kernel == 'gaussian':
        if spread is None:
            raise ValueError("spread is required for the gaussian kernel")
        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, evaluated for all pairs at once.
        left_sq = np.sum(left ** 2, axis=1)[:, np.newaxis]
        right_sq = np.sum(right ** 2, axis=1)[np.newaxis, :]
        # Round-off can push tiny distances slightly below zero; clamp them.
        sq_dist = np.maximum(left_sq + right_sq - 2.0 * (left @ right.T), 0.0)
        if same_data:
            # The distance of a sample to itself is exactly zero.
            np.fill_diagonal(sq_dist, 0.0)
        return np.exp(-sq_dist / (2.0 * spread))

    raise ValueError(f"unknown kernel {kernel!r}; expected 'linear' or 'gaussian'")

def compute_linear_kernel(X,Y):
    """Return ``dot(X.T, Y)``; for two 1-D vectors this is their inner product."""
    X_transposed = X.T
    return np.dot(X_transposed, Y)
def _compute_linear_kernel(X,Y):
    return np.dot(X,Y.T)     
def compute_gaussian_kernel(X,Y,spread):
    """Gaussian kernel of two vectors: ``exp(-||X - Y||**2 / (2 * spread))``.

    ``spread`` enters the denominator as given; it is not squared here.
    """
    squared_distance = np.linalg.norm(X - Y) ** 2
    exponent = -1 * (squared_distance / (2 * spread))
    return np.exp(exponent)
def _gaussian_kernel(data_matrix_1, data_matrix_2, spread):
    kernel_matrix = np.zeros((len(data_matrix_1), len(data_matrix_2)))
    for i in range(len(data_matrix_1)):
        for j in range(len(data_matrix_2)):
            numer = np.linalg.norm(data_matrix_1[i] - data_matrix_2[j]) ** 2
            denom = float(2 * (spread))
            kernel_matrix[i][j] = math.exp(-numer/denom)
    return kernel_matrix

In [None]:
def SVM(x, y, kernel, loss, eps, C, spread=None):
    """Train a kernel SVM in the dual by stochastic coordinate-wise ascent.

    Parameters
    ----------
    x : 2-D array, shape (n, d)
        Training samples.
    y : array-like, shape (n,)
        Training labels; the update rule treats them as +1 / -1.
    kernel : {'linear', 'gaussian'}
        Kernel passed on to ``generate_kernel``.
    loss : {'hinge'}
        Loss to optimise. Only the hinge loss is implemented.
    eps : float
        Training stops once the Euclidean norm of the change in ``alpha``
        over one full pass is no larger than ``eps``.
    C : float
        Upper bound of the box constraint ``0 <= alpha_k <= C``.
    spread : float, optional
        Spread of the gaussian kernel; unused for the linear kernel.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The dual variables ``alpha``.

    Raises
    ------
    ValueError
        If ``loss`` or ``kernel`` is not supported. (Previously these cases
        failed later with an unbound-variable error on ``K``.)
    """
    if loss != 'hinge':
        raise ValueError(f"unsupported loss {loss!r}; only 'hinge' is implemented")
    if kernel not in ('linear', 'gaussian'):
        raise ValueError(f"unknown kernel {kernel!r}; expected 'linear' or 'gaussian'")

    y = np.asarray(y, dtype=float)
    # Adding 1 to every kernel entry folds the bias term into the kernel.
    K = generate_kernel(x, kernel, spread) + 1
    # Per-coordinate step size: the reciprocal of the matching diagonal entry.
    eta = 1.0 / np.diag(K)
    n_samples = K.shape[0]
    alpha = np.random.rand(n_samples)
    index = list(range(n_samples))
    # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
    step_norm = np.inf
    while step_norm > eps:
        alpha_prev = alpha.copy()
        # Visit the coordinates in a fresh random order on every pass.
        np.random.shuffle(index)
        for k_index in index:
            margin = y[k_index] * np.sum(alpha * y * K[:, k_index])
            updated = alpha[k_index] + eta[k_index] * (1 - margin)
            # Project back onto the box constraint [0, C].
            alpha[k_index] = min(max(updated, 0.0), C)
        step_norm = np.linalg.norm(alpha - alpha_prev)
    return alpha

In [None]:
def prediction(alpha, y, kernel):
    """Predict labels as the sign of the kernel-weighted sum of ``alpha * y``.

    Each row of ``kernel`` is multiplied elementwise by ``alpha * y`` and
    summed along axis 1; the sign of that sum is returned per row.
    """
    weights = alpha * y
    scores = (weights * kernel).sum(axis=1)
    return np.sign(scores)

To find the best C for the linear kernel, we perform a grid search over C from 0.001 to 0.01.

In [None]:
# Grid-search C for the linear kernel and score every candidate on the dev split.
# The dev-vs-train kernel does not depend on C, so it is built once up front.
dev_linear_kernel = _compute_linear_kernel(centered_dev,centered_train) + 1
for i in np.linspace(0.001,0.01,15):
    linear_alpha = SVM(centered_train,conv_train_y,'linear','hinge',0.001,i)
    linear_pred = prediction(linear_alpha,conv_train_y,dev_linear_kernel)
    # Sanity check: the model must not assign all 1000 dev samples to one class.
    assert all(np.sum(linear_pred == label) != 1000 for label in (-1, 1))
    print(f'The precision is {np.sum(conv_dev_y == linear_pred)/1000} on dev set when C is {i}')

The best accuracy on the dev set for the linear kernel (printed as "precision" by the cell above) occurs when C is 0.005.

To find the best C and spread for the Gaussian kernel, we perform a grid search over C from 1 to 10 and spread from 50 to 200.

In [None]:
# Grid-search (C, spread) for the gaussian kernel and score each pair on the dev split.
for i in np.linspace(1,10,3):
    for j in np.linspace(50,200,3):
        gaussian_alpha = SVM(centered_train,conv_train_y,'gaussian','hinge',0.001,i,j)
        dev_gaussian_kernel = _gaussian_kernel(centered_dev,centered_train,j)+1
        gaussian_pred = prediction(gaussian_alpha,conv_train_y,dev_gaussian_kernel)
        # A model that puts all 1000 dev samples in one class has not learned
        # anything; report it instead of asserting, since the search space is large.
        single_class = any(np.sum(gaussian_pred == label) == 1000 for label in (-1, 1))
        if not single_class:
            print(f'The precision is {np.sum(conv_dev_y == gaussian_pred)/1000} on dev set when C is {i} and Spread is {j}')
        else:
            print(f'model is not learning when C is {i} and Spread is {j}')

The best combination for the Gaussian kernel occurs when C is 10 and spread is 125.

In [None]:
# Retrain the linear SVM with the chosen C and report accuracy on the test split.
linear_C = 0.005
linear_alpha = SVM(centered_train,conv_train_y,'linear','hinge',0.001,linear_C)
linear_pred = prediction(linear_alpha,conv_train_y,_compute_linear_kernel(centered_test,centered_train)+1)
# Report the C that was actually used; the leftover loop variable `i` from an
# earlier grid-search cell holds an unrelated value. np.mean gives the fraction
# of correct predictions without hard-coding the split size.
print(f'The accuarcy of linear SVM on test set is {np.mean(conv_test_y == linear_pred)} when C is {linear_C}')

In [None]:
# Retrain the gaussian SVM and report accuracy on the test split.
# The same spread must be used for training and for the test kernel; the model
# was previously trained with spread 50 but evaluated with a spread-125 kernel.
gaussian_spread = 125
# NOTE(review): the markdown above reports C = 10 as the best value, while this
# cell trains with C = 5.5 -- confirm which one is intended.
gaussian_alpha = SVM(centered_train,conv_train_y,'gaussian','hinge',0.001,5.5,gaussian_spread)
gaussian_pred = prediction(gaussian_alpha,conv_train_y,_gaussian_kernel(centered_test,centered_train,gaussian_spread)+1)
print(f'The accuarcy of gaussian SVM on test set is {np.sum(conv_test_y == gaussian_pred)/1000}')

## Part II Exam 1 Q5

In [None]:
import pandas as pd
import numpy as np
# Parse the tab-separated seeds file into an (n, 7) float matrix.
dataset_list = []
# `with` guarantees the file handle is closed; it was previously left open.
with open('seeds_dataset.txt', 'r') as seeds_file:
    for line in seeds_file:
        # Consecutive tabs yield empty fields; drop them before validating.
        data = [field for field in line.strip().split('\t') if field != '']
        assert len(data) == 8, f'expected 8 fields per row, got {len(data)}'
        # Keep the first seven fields; the eighth is not used here.
        dataset_list.append([float(value) for value in data[:7]])
# reshape(-1, 7) keeps the result 2-D even when the file has no rows.
dataset = np.array(dataset_list, dtype=float).reshape(-1, 7)

In [None]:
def get_mean(X):
    """Column-wise mean: the sum over axis 0 divided by the number of rows."""
    row_count = X.shape[0]
    column_totals = X.sum(axis=0)
    return column_totals / row_count

In [None]:
# Explicit feature map of the homogeneous quadratic kernel: for every pair of
# feature positions i <= j emit x_i * x_j, scaled by sqrt(2) when i != j, so
# that the inner product of two mapped points equals (x . y)**2.
# The output shape is derived from `dataset` instead of hard-coding (210, 28).
n_points, n_features = dataset.shape
# Upper-triangle index pairs in row-major order: (0,0), (0,1), ..., (1,1), ...
# which is the column order the nested i/j loops produced.
rows, cols = np.triu_indices(n_features)
scale = np.where(rows == cols, 1.0, math.sqrt(2))
transformed_data = scale * dataset[:, rows] * dataset[:, cols]
assert transformed_data.shape == (n_points, n_features * (n_features + 1) // 2)
# Centre every transformed feature at zero mean.
transformed_data = transformed_data - get_mean(transformed_data)

In [None]:
## helper functions
def length(a):
    """Euclidean length of ``a``: the square root of ``dot(a.T, a)``."""
    squared = np.dot(a.T, a)
    return np.sqrt(squared)
def normalize(a):
    """Divide ``a`` by its Euclidean length, ``sqrt(dot(a.T, a))``."""
    magnitude = np.sqrt(np.dot(a.T, a))
    return a / magnitude
def get_cov(X,X_mean):
    """Covariance of the columns of ``X`` about ``X_mean``.

    Computes ``(X - X_mean).T @ (X - X_mean)`` divided by the number of rows
    (the divide-by-n form, not n - 1).
    """
    centered = X - X_mean
    return np.dot(centered.T, centered) / X.shape[0]
def get_mean(X):
    """Column-wise mean: the sum over axis 0 divided by the number of rows.

    Note: a ``get_mean`` with the same body is also defined in an earlier
    cell; running this cell simply rebinds the name.
    """
    return np.sum(X, axis=0) / len(X)
def total_variance(X,X_mean):
    """Sum of all squared deviations of ``X`` from ``X_mean``, divided by the
    number of rows of ``X``."""
    squared_deviations = np.square(X - X_mean)
    return squared_deviations.sum() / X.shape[0]
def project_data(eigen_vectors,centered_kernel):
    """Project ``centered_kernel`` onto two columns of ``eigen_vectors``.

    The basis is built from column 1 followed by column 0 of
    ``eigen_vectors``, and the result is ``basis.T @ centered_kernel``.

    NOTE(review): the basis takes column 1 before column 0 -- confirm this
    order matches how the caller sorts its eigenvectors.
    """
    selected_columns = (eigen_vectors[:, 1], eigen_vectors[:, 0])
    basis = np.column_stack(selected_columns)
    return np.dot(basis.T, centered_kernel)

In [None]:
# Estimate the two leading eigenvectors (and their eigenvalues) of the
# covariance of the transformed data by repeatedly multiplying two column
# vectors by the covariance matrix, orthogonalising the second against the
# first, and renormalising both.
np.random.seed(42)
A = np.copy(transformed_data)
A = A - get_mean(A)
# d is the number of columns (features) of the transformed data
d = A.shape[1]
# Convergence tolerance. Note that 10e-6 equals 1e-5.
epsilon = 10e-6
# Every iterate is appended here; the loop condition compares the last two.
intermediate_result = list()
# Random d x 2 starting matrix whose columns are scaled to unit length.
X_i = np.random.random_sample([d,2])
X_i[:,0] = normalize(X_i[:,0])
X_i[:,1] = normalize(X_i[:,1])
cov = get_cov(A,get_mean(A))
# Run at least two iterations, then stop once the norm of the difference
# between the last two iterates drops below epsilon.
while len(intermediate_result)<2 or np.linalg.norm(intermediate_result[-1] - intermediate_result[-2])>=epsilon:
    X_iplus1 = np.dot(cov,X_i)
    # Remove from column 1 its projection onto column 0 (a Gram-Schmidt step),
    # so the second vector stays orthogonal to the first.
    X_iplus1[:,1] = X_iplus1[:,1] - np.dot((np.dot(X_iplus1[:,1].T,X_iplus1[:,0])/np.dot(X_iplus1[:,0].T,X_iplus1[:,0])),X_iplus1[:,0])
    X_iplus1[:,0] = normalize(X_iplus1[:,0])
    X_iplus1[:,1] = normalize(X_iplus1[:,1])
    X_i = X_iplus1
    intermediate_result.append(X_iplus1)
# The columns of the final iterate are the two estimated directions.
u_1 = X_iplus1[:,0]
u_2 = X_iplus1[:,1]
# Rayleigh quotients u.T @ cov.T @ u / (u.T @ u) give the matching eigenvalue estimates.
labda_1 = np.dot(np.dot(u_1.T,cov.T),u_1)/(np.dot(u_1.T,u_1))
labda_2 = np.dot(np.dot(u_2.T,cov.T),u_2)/(np.dot(u_2.T,u_2))

In [None]:
# Announce and display the first estimated direction (u_1) as the cell output.
print('The best direction is:')
u_1