Use the result of PCA to select s features 
Train the model using only selected features using SVM with slack

File Paths:  
- training data: data/gisette_train.data
- validation data: data/gisette_valid.data
- test data: data/gisette_test.data

In [2]:
import numpy as np
import cvxopt
import cvxopt.solvers
import random

#read file and store data to X and Y 
def read_data(filename): 
    file = open(filename, 'r', encoding='utf-8-sig')
    dataset = []
    for line in file:
        data = line.split(',')
        y_data = int(data[0])
        x_data = [float(x) for x in data[1:]] 
        dataset.append((x_data, y_data))

    X = np.array([x for x, y in dataset])
    Y = np.array([y for x, y in dataset])

    return X, Y

#training model for the train_data and get an optimal w and b
def train(c, X, Y): 
    num_of_data, num_of_features = X.shape

    #P matrix (identity matrix of w's, 0 for b and slack)
    P_matrix = np.identity(num_of_features) #identity matrix
    P_matrix = np.pad(P_matrix, ((0, 2), (0, 0)), mode='constant', constant_values=0) #add 2 new rows of 0's to the end
    P_matrix = np.pad(P_matrix, ((0, 0), (0, 2)), mode='constant', constant_values=0) #add 2 new cols of 0's to the end 

    #q vector (0's for w's and b, c for slack)
    q_vector = np.zeros(num_of_features+1) #col of 0's
    q_vector = np.append(q_vector, c) #add c value 

    #G matrix 
    Y = Y.reshape(Y.shape[0],1) 
    G_matrix = -np.multiply(Y, X) #-yixi
    bcol = np.array([-y for y in Y]) #-yi
    G_matrix = np.hstack((G_matrix, bcol)) #add bcol to G matrix
    G_matrix = np.pad(G_matrix, ((0, num_of_data), (0, 0)), mode='constant', constant_values=0) #add 3000 new rows of 0's to the end (for second equality)
    scol = -np.ones(num_of_data*2).reshape(num_of_data*2, 1) #col of slack variable (-1)
    G_matrix = np.hstack((G_matrix, scol)) #add bcol to G matrix

    #h vector
    h_vector = -np.ones(num_of_data) #col of -1's
    new_col = np.zeros(num_of_data) #col of 0's
    h_vector = np.append(h_vector, new_col)

    P = cvxopt.matrix(P_matrix)
    q = cvxopt.matrix(q_vector)
    G = cvxopt.matrix(G_matrix)
    h = cvxopt.matrix(h_vector) 

    cvxopt.solvers.options['show_progress'] = False #disable progess to console 
    sol = cvxopt.solvers.qp(P, q, G, h) 
    sol_arr = np.array(sol['x'])

    w = sol_arr[:num_of_features]
    b = sol_arr[num_of_features]

    return w, b

#constuct matrix W
def construct_W(X): 
    #get transpose of X (column becomes the data point)
    trans_X = X.T
    num_data = trans_X.shape[1]

    #compute sum of all data points 
    sum = 0
    for j in range(num_data): 
        sum += trans_X[:, j]
    #compute mean
    mean = sum / num_data

    W = np.zeros(trans_X.shape)
    for i in range(num_data): 
        W[:, i] = trans_X[:, i] - mean
    
    return W

#compute pi
def compute_pi(k_eigenvectors): 
    num_feature, k = k_eigenvectors.shape
    pi = []
    for j in range(num_feature): 
        sum = 0
        for i in range(k): 
            sum += k_eigenvectors[j][i] * k_eigenvectors[j][i]
        pi_j = 1/k * sum
        pi.append(pi_j)
    return np.array(pi)

#training dataset
train_X, train_Y = read_data('data/gisette_train.data')
#test dataset
X, Y = read_data("data/gisette_test.data")

#mean-centering the data
means = np.mean(train_X, axis=0)
mean_centered_train_X = train_X - means

means = np.mean(X, axis=0)
mean_centered_X = X - means

#construct W
W = construct_W(mean_centered_train_X)
covariance_matrix = np.dot(W, W.T)
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

#sort eigenvalues/eigenvectors in decreasing order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices] #reorder the columns

K = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
S = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
C = [1e6]

#select s features with each k
for k in K: 
    #get first k eigenvectors 
    k_eigenvectors = eigenvectors[:, :k]
    pi = compute_pi(k_eigenvectors)

    for s in S: 
        errors = []
        for _ in range(10): 

            #select s features randomly using pi as weight for each feature
            selected_features = random.choices(range(len(pi)), pi, k=s)
            selected_train_x = mean_centered_train_X[:, selected_features]
            selected_x = mean_centered_X[:, selected_features]
        
            #train model 
            for c in C: 
                w, b = train(c, selected_train_x, train_Y)  
                correct_predictions = 0
                total_data = selected_x.shape[0]

                for x, y in zip(selected_x, Y):
                    result = y * (np.dot(w.reshape(1, w.shape[0]),x) + b)
                    if (result > 0): #correct classification 
                        correct_predictions += 1
                        
                accuracy = correct_predictions / total_data
                errors.append(1 - accuracy)
                
                # print("c:", c)
                # print("accuracy:", accuracy, "%")
        print('k:', k)
        print('s', s)
        print('avg error:', np.mean(errors))
        print()

  if not _isfinite(total):


k: 10
s 10
avg error: 0.28659999999999997

k: 10
s 20
avg error: 0.2072

k: 10
s 30
avg error: 0.1824

k: 10
s 40
avg error: 0.19780000000000003

k: 10
s 50
avg error: 0.14920000000000005

k: 10
s 60
avg error: 0.1418

k: 10
s 70
avg error: 0.11679999999999997

k: 10
s 80
avg error: 0.11779999999999999

k: 10
s 90
avg error: 0.10559999999999999

k: 10
s 100
avg error: 0.09939999999999997

k: 20
s 10
avg error: 0.2840000000000001

k: 20
s 20
avg error: 0.23000000000000004

k: 20
s 30
avg error: 0.19679999999999997

k: 20
s 40
avg error: 0.16320000000000004

k: 20
s 50
avg error: 0.14560000000000003

k: 20
s 60
avg error: 0.13540000000000002

k: 20
s 70
avg error: 0.127

k: 20
s 80
avg error: 0.11279999999999997

k: 20
s 90
avg error: 0.1116

k: 20
s 100
avg error: 0.10619999999999999

k: 30
s 10
avg error: 0.2948

k: 30
s 20
avg error: 0.22920000000000001

k: 30
s 30
avg error: 0.20939999999999995

k: 30
s 40
avg error: 0.17260000000000003

k: 30
s 50
avg error: 0.1688

k: 30
s 60
avg e