In [1]:
import matplotlib.pyplot as plt
import numpy as np
import PIL 
import pandas as pd
import seaborn as sns #Ensure seaborn is installed for use on dcs machine
import random
from sklearn.metrics.pairwise import euclidean_distances

# Read the accent MFCC data set; the file has no header row handling (header=None)
dat = pd.read_csv('accent-mfcc-data-1.data', header=None)

dat_str = dat[0]  # Column 0 holds the accent symbol for each recording

# Numeric code for each known accent symbol; any other symbol is coded as 0
accent_codes = {'FR': 1, 'GE': 2, 'IT': 3, 'UK': 4, 'US': 5}
datY = np.array([accent_codes.get(symbol, 0) for symbol in dat_str], dtype=float)

# Data is not linearly separable in the base data set

datX = dat.loc[:, 1:]  # Every column after the label column is a feature

sizeX = datX.shape[0]  # Number of samples (rows)

print("Total num: ", sizeX)



ModuleNotFoundError: No module named 'seaborn'

In [None]:
def getPca(X, num):
    """Compute the leading principal directions of X via SVD.

    Parameters
    ----------
    X : array-like, one sample per row
        Data to analyse; it is centred column-wise before the decomposition.
    num : int
        Number of leading principal directions to keep.

    Returns
    -------
    comps : ndarray
        First ``num`` rows of the right-singular-vector matrix of the
        centred data (one principal direction per row).
    mean : column-wise mean of ``X``
    dat_centred : ``X`` with the column means subtracted
    """
    column_means = np.mean(X, axis=0)
    centred = X - column_means
    # np.linalg.svd returns (U, S, Vh); only Vh is needed, whose rows are the
    # principal directions in order of decreasing singular value.
    right_vectors = np.linalg.svd(centred, full_matrices=True)[2]
    return right_vectors[:num], column_means, centred

top_2, _, _ = getPca(datX, 2) #Keep the two leading principal directions of the features; the returned mean and centred data are discarded here


In [None]:

# Centre the features column-wise and project them onto the two principal directions
mean = np.mean(datX, axis=0)
dat_centred = datX - mean
PcaTransformed2 = np.dot(dat_centred, top_2.T)

labels = datY
# Colour and legend text for each numeric accent code
colours = {0: 'red', 1: 'green', 2: 'blue', 3: 'yellow', 4: 'black', 5: 'purple'}
labl = {0: 'ES', 1: 'FR', 2: 'GE', 3: 'IT', 4: 'UK', 5: 'US'}

fig, ax = plt.subplots(figsize=(8, 8))
first_pc = PcaTransformed2[:, 0]
second_pc = PcaTransformed2[:, 1]
for code in np.unique(labels):
    # One scatter series per accent so that each gets its own legend entry
    members = labels == code
    ax.scatter(first_pc[members], second_pc[members], s=20, c=colours[code], label=labl[code])

ax.set_xlabel("First PC", fontsize=15)
ax.set_ylabel("Second PC", fontsize=15)

ax.legend()
plt.show()

In [None]:
#Evidently, the data is not linearly separable in 2 dimensions, given how much the clusters on the left overlap

In [None]:

#Old function before discovering euclidean_distances
#def getKernel(x,gamma):
#    X = x.to_numpy()
#    K = np.zeros((x.shape[0],x.shape[0]))
#    for i in range(0,X.shape[0]):
#        for j in range(0,X.shape[0]):
#            n = (np.linalg.norm(X[i,:]-X[j,:]))
#            K[i][j] = np.exp(-gamma * (n * n))
#    A = np.full((sizeX,sizeX),1/sizeX)
#    AK = np.matmul(A,K) #Saves having to compute AK twice
#    Knorm = np.add(np.subtract(np.subtract(K,AK),np.matmul(K,A)),np.matmul(AK,A))
#    return Knorm


def getKernel(x,gamma): #Function to find kernel
    """Build the double-centred RBF (Gaussian) kernel matrix of ``x``.

    Parameters
    ----------
    x : array-like, one sample per row
        Input samples (a DataFrame or a 2-D array).
    gamma : float
        RBF width parameter; ``K[i, j] = exp(-gamma * ||x_i - x_j||^2)``.

    Returns
    -------
    ndarray of shape (n, n)
        ``K - A K - K A + A K A`` where ``A`` is the n x n matrix filled with
        ``1/n`` and ``n`` is the number of rows of ``x``.
    """
    X = np.asarray(x, dtype=float)
    dist = euclidean_distances(X, X, squared=True) #Pairwise squared euclidean distances
    K = np.exp(-gamma * dist) #RBF kernel values
    # With A = full((n, n), 1/n): every row of A@K holds the column means of K,
    # every column of K@A holds the row means of K, and A@K@A is the overall
    # mean. Using the means directly avoids three dense n x n matrix products
    # and takes n from the input itself rather than from a notebook global.
    col_means = K.mean(axis=0, keepdims=True)
    row_means = K.mean(axis=1, keepdims=True)
    Knorm = K - col_means - row_means + K.mean()
    return Knorm



In [None]:


def Perceptron(x,y,max_steps):
    """Train a multi-class perceptron and report how well it fits ``x``.

    Each class has one weight column. A sample is assigned to the class whose
    column gives the largest score; on a mistake the sample is added to the
    true class column and subtracted from the predicted class column.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Training samples, one per row (include a bias column if wanted).
    y : array-like of shape (n_samples,)
        Non-negative integer-valued class labels (float dtype is accepted).
    max_steps : int
        Number of passes (epochs) over the shuffled data.

    Returns
    -------
    w : ndarray of shape (n_features, max(y) + 1)
        Learned weights, initialised to ones.
    accuracy : float
        ``1 - mistakes / predictions`` accumulated over all training passes;
        ``nan`` when no training prediction was made (``max_steps <= 0``).
    errors : int
        Number of samples misclassified by the final weights.

    Raises
    ------
    ValueError
        If ``x`` is not a non-empty 2-D array, if ``y`` does not hold one
        label per row of ``x``, or if a label is negative.
    """
    x_arr = np.array(x, dtype=float)
    labels = np.asarray(y).astype(int)
    if x_arr.ndim != 2 or x_arr.shape[0] == 0:
        raise ValueError("x must be a non-empty 2-D array of samples")
    if labels.shape != (x_arr.shape[0],):
        raise ValueError("y must contain exactly one label per row of x")
    if labels.min() < 0:
        raise ValueError("class labels must be non-negative integers")

    n_samples, n_features = x_arr.shape
    # One weight column per class: the class count is the largest label + 1
    # (np.argmax would give the *position* of that label, not its value).
    n_classes = int(labels.max()) + 1
    w = np.ones((n_features, n_classes)) #Initialise weights to 1

    tot_steps = 0
    tot_errors = 0
    step = 0
    while step < max_steps: #One pass over the data per step
        # Shuffle samples and labels together so presentation order varies
        shuffler = np.random.permutation(n_samples)
        x_arr = x_arr[shuffler]
        labels = labels[shuffler]
        for xi, yi in zip(x_arr, labels):
            y_pred = int(np.argmax(xi @ w))
            if y_pred != yi:
                # Pull the true class towards the sample, push the wrong one away
                w[:, yi] += xi
                w[:, y_pred] -= xi
                tot_errors += 1
            tot_steps += 1
        step += 1

    # Final evaluation with the trained weights. Counted on its own so the
    # result is neither mixed with the last epoch's mistakes nor added to the
    # training accuracy tally.
    errors = int(sum(int(np.argmax(xi @ w)) != yi for xi, yi in zip(x_arr, labels)))
    accuracy = 1 - (tot_errors / tot_steps) if tot_steps else float("nan")
    return w, accuracy, errors






In [None]:
# Grid search over the RBF width (gamma) and the number of kernel principal
# components, recording how many samples the trained perceptron misclassifies.

variance = np.var(datX, axis=0).to_numpy()

max_var = np.max(variance)
min_var = np.min(variance)
# gamma = 1 / (2 * sigma^2): the largest feature variance (plus a margin) gives
# the smallest gamma, the smallest variance (minus a margin) gives the largest.
gammax = 1 / (2 * ((min_var) - 0.05))
gammin = 1 / (2 * ((max_var) + 0.05)) #Calculate min and max gamma

mean = np.mean(datX, axis=0)
dat_centred = datX - mean

gamma_step = 0.01
gammas = []
g = gammin
while g < gammax:
    gammas.append(g)
    g += gamma_step
if not gammas:
    raise ValueError("empty gamma range: gammin=%r, gammax=%r" % (gammin, gammax))

increment = 10 #Increases principal components by 10 each time. Can be lowered.

# Component counts actually evaluated below; the heatmap tick labels are taken
# from this list, so it must match the loop exactly.
nums = list(range(2, sizeX, increment))

errors = np.zeros((len(nums), len(gammas))) #Store for errors for each hyperparameter combo

# Perceptron shuffles with numpy's global RNG; seed it so the grid is reproducible
np.random.seed(0)

for gamma_index, gamma in enumerate(gammas): #Grid search
    Ker = getKernel(dat_centred, gamma) #Get kernel for current gamma
    # The decomposition depends only on the kernel, so compute it once per
    # gamma and slice off the leading rows for every component count.
    all_comps, _, _ = getPca(Ker, sizeX)
    for n, i in enumerate(nums):
        PCA = all_comps[:i]
        transformed = np.dot(Ker, PCA.T) #Project kernel onto the leading components
        transformed = np.append(np.ones((len(transformed), 1)), transformed, axis=1) #Add bias column
        w, acc, errs = Perceptron(transformed, datY, 10) #Run perceptron to get errors
        errors[n][gamma_index] = errs
    print("gamma %d/%d done: %s" % (gamma_index + 1, len(gammas), gamma))

In [None]:
np.random.seed(0)
sns.set()

# Heatmap of the error counts: one row per component count, one column per gamma
hm = sns.heatmap(errors, xticklabels=gammas, yticklabels=nums)
ax1 = hm
ax1.set_xlabel('Gamma')
ax1.set_ylabel('Principle Components')
plt.show()


In [None]:
#The data seems to be linearly separable for 20+ PCs; using fewer components generates a high number of errors.