In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

In [2]:
X_train = np.loadtxt("Dataset/train/X_train.txt")
y_train = np.loadtxt("Dataset/train/y_train.txt")

X_test = np.loadtxt("Dataset/test/X_test.txt")
y_test = np.loadtxt("Dataset/test/y_test.txt")

In [3]:
def standardize(X_train, X_test):
    mu = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    
    X_train_standardized = (X_train - mu)/std
    X_test_standardized = (X_test - mu)/std
    return X_train_standardized, X_test_standardized

X_train, X_test = standardize(X_train, X_test)

In [4]:
def get_pooled_cov(X_train, y_train):

    class_num = np.unique(y_train).astype(int)
    K = len(class_num)
    
    p = len(X_train[0,:])
    
    N = len(X_train[:,0])
    
    pooled_cov = np.zeros((p,p))
    
    mu = []
    
    pi = np.zeros((K))
    
    for i in class_num:
        idx = np.array(np.where(y_train == i))
        # print(i)
        
        idx = idx[0]        
        # print(idx)
        
        X_train_class_i = X_train[idx,:]
        # print(y_train[idx])
        
        pi[i-1] = len(X_train_class_i[:,0]) / N
        
        mu_i = np.matrix(np.mean(X_train_class_i, axis=0)).T
        mu.append(mu_i)
        
        # print(mu_i.shape)
        
        for j in range(len(X_train_class_i)):
            # pooled_cov = pooled_cov + X_train_class_i
            x_current = np.matrix(X_train_class_i[j,:]).T
            
            pooled_cov = pooled_cov + (x_current - mu_i) * (x_current - mu_i).T
            
    pooled_cov = 1/(N - K) * pooled_cov

    return pooled_cov, mu, pi

In [5]:
pooled_cov, mu, pi = get_pooled_cov(X_train, y_train)

In [6]:
def LDA(X_train, y_train, X_test, shrinkage):
    
    class_num = np.unique(y_train).astype(int)
    K = len(class_num)
    
    N_test = len(y_test)
    
    p = len(X_train[0,:])
    
    pooled_cov, mu, pi = get_pooled_cov(X_train, y_train)
    
    pooled_cov = (1-shrinkage) * pooled_cov + shrinkage * np.identity(p)
    
    pooled_cov_inv = pooled_cov.I
        
    delta = np.zeros((K))
    
    y_test_pred = np.zeros((N_test))
    
    for i in range(N_test):
        
        x_one = np.matrix(X_test[i,:])
        # print(x_one)
        
        for j in range(K):
            
           # print(x_one.shape)
            #print(pooled_cov.shape)
            #print(mu[j].shape)
            
            delta[j] = np.log(pi[j]) + x_one *(pooled_cov_inv) * (mu[j]) - 0.5 * mu[j].T * (pooled_cov_inv) * (mu[j])
        # print(delta)
    
    # np.log(pi[j]) 
        y_test_pred[i] = np.argmax(delta) + 1
        
    return y_test_pred

In [7]:
def LDA_usd_in_loop(X_test, shrinkage, mu, pi, pooled_cov):
    
    class_num = np.unique(y_train).astype(int)
    K = len(class_num)
    
    N_test = len(y_test)
    
    p = len(X_train[0,:])
    
    
    # pooled_cov = (1-shrinkage) * pooled_cov + shrinkage * np.identity(p)
    
    pooled_cov =  pooled_cov + shrinkage * np.identity(p)
    
    pooled_cov_inv = pooled_cov.I
        
    delta = np.zeros((K))
    
    y_test_pred = np.zeros((N_test))
    
    for i in range(N_test):
        
        x_one = np.matrix(X_test[i,:])
        # print(x_one)
        
        for j in range(K):
            
           # print(x_one.shape)
            #print(pooled_cov.shape)
            #print(mu[j].shape)
            
            delta[j] = np.log(pi[j]) + x_one *(pooled_cov_inv) * (mu[j]) - 0.5 * mu[j].T * (pooled_cov_inv) * (mu[j])
        # print(delta)
    
    # np.log(pi[j]) 
        y_test_pred[i] = np.argmax(delta) + 1
        
    return y_test_pred

In [8]:
y_test_pred = LDA_usd_in_loop(X_test, 0.5, mu, pi, pooled_cov)
accuracy_score(y_test_pred, y_test)

0.9416355615880556

In [9]:
accuracy = []
for shrinkage in np.arange(0.00,1.01,0.05):
    y_test_pred = LDA_usd_in_loop(X_test, shrinkage, mu, pi, pooled_cov)
    accuracy.append(accuracy_score(y_test_pred, y_test))
    
print(accuracy)

[0.030539531727180182, 0.9569053274516457, 0.9524940617577197, 0.9501187648456056, 0.9494401085850017, 0.9467254835425857, 0.9446895147607737, 0.9440108585001696, 0.9423142178486597, 0.9419748897183576, 0.9416355615880556, 0.9395995928062436, 0.9382422802850356, 0.9375636240244316, 0.9358669833729216, 0.9348489989820156, 0.9331523583305056, 0.9341703427214116, 0.9338310145911096, 0.9331523583305056, 0.9324737020699015]


array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])