### LDA before k Nearest Neighbor as a pre-processing step

In [1]:
# Performing the k-nearest neighbor algorithm on the iris dataset along with the preprocessing of the data using the linear discriminant analysis

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# loading the dataset
data = pd.read_csv('iris.csv')
# converting the data into numpy array
data = data.values
# remove the first column of the data
data = data[:,1:]
# converting the string labels into integers
for i in range(data.shape[0]):
    if data[i,4] == 'Iris-setosa':
        data[i,4] = 0
    elif data[i,4] == 'Iris-versicolor':
        data[i,4] = 1
    else:
        data[i,4] = 2
# splitting the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(data[:,0:4], data[:,4], test_size=0.2, random_state=42, stratify=data[:,4])

In [2]:
# separate the data for each class
mean_classes = np.zeros((3,4))
# finding the mean of each class
for i in range(3):
    # finding the mean of the class i
    mean_classes[i] = np.mean(X_train[Y_train[:] == i], axis=0)
# overall mean of the data
mean_overall = np.mean(X_train, axis=0)

In [3]:
# computing the within class scatter matrix for the training data
# declare the within class scatter matrix
within_class_scatter_matrix = np.zeros((4,4))
for i in range(3):
    within_class_scatter_matrix_i = np.zeros((4, 4))
    for j in range(X_train.shape[0]):
        if Y_train[j] == i:
            x = X_train[j]
            mean = mean_classes[i]
            within_class_scatter_matrix_i = within_class_scatter_matrix_i + (x - mean).reshape(-1, 1).dot((x - mean).reshape(1, -1))
    within_class_scatter_matrix = within_class_scatter_matrix + within_class_scatter_matrix_i

In [4]:
# declare the between class scatter matrix
between_class_scatter_matrix = np.zeros((4,4))
for i in range(3):
    n = X_train[Y_train[:] == i].shape[0]
    mean = mean_classes[i]
    between_class_scatter_matrix = between_class_scatter_matrix + n * (mean - mean_overall).reshape(-1, 1).dot((mean - mean_overall).reshape(1, -1))


In [5]:
# converting the matrices from float to float64
within_class_scatter_matrix = np.array(within_class_scatter_matrix, dtype='float64')
between_class_scatter_matrix = np.array(between_class_scatter_matrix, dtype='float64')

In [6]:
# finding matrix product of the inverse of the within class scatter matrix and the between class scatter matrix
req_matrix = np.matmul(np.linalg.inv(within_class_scatter_matrix), between_class_scatter_matrix)

In [7]:
# computing the eigen values and eigen vectors of the matrix
eigen_values, eigen_vectors = np.linalg.eig(req_matrix)
# sorting the eigen values and eigen vectors in descending order
indices = np.argsort(eigen_values)[::-1]
eigen_values = eigen_values[indices]
eigen_vectors = eigen_vectors[:,indices]
# choosing the eigenvectors with the highest eigen value
eigen_vectors = eigen_vectors[:,0:1]

In [8]:
# projecting the data onto the new feature space
# compute the matrix product of the chosen eigenvectors and the data
X_train_lda = np.matmul(X_train, eigen_vectors)
X_test_lda = np.matmul(X_test, eigen_vectors)


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# instantiating the number of neighbors
k = 5
# instantiating the KNN classifier
knn = KNeighborsClassifier(n_neighbors=k)
# fitting the model
Y_train = Y_train.astype('int')
knn.fit(X_train_lda, Y_train)


In [10]:
# predicting the values
Y_pred = knn.predict(X_test_lda)
# calculate the accuracy of the model
count = 0
for i in range(len(Y_test)):
    if (Y_test[i] == Y_pred[i]):
        count+=1
print("Accuracy of the model with the pre-processing step of LDA is: ", count/len(Y_test)*100 ,"%")
accuracy_with_LDA = count/len(Y_test)*100


Accuracy of the model with the pre-processing step of LDA is:  100.0 %


### K-Nearest Neighbor implemented without LDA pre-processing

In [11]:
# Performing the k-nearest neighbor algorithm on the iris dataset without the preprocessing of the data using the linear discriminant analysis
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# loading the dataset
data = pd.read_csv('iris.csv')
# converting the data into numpy array
data = data.values
# remove the first column of the data
data = data[:,1:]
# converting the string labels into integers
for i in range(data.shape[0]):
    if data[i,4] == 'Iris-setosa':
        data[i,4] = 0
    elif data[i,4] == 'Iris-versicolor':
        data[i,4] = 1
    else:
        data[i,4] = 2
# splitting the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(data[:,0:4], data[:,4], test_size=0.2, random_state=2, stratify=data[:,4])

In [12]:
# k represents the number of neighbors
k = 5
# instantiating the KNN classifier
knn = KNeighborsClassifier(n_neighbors=k)
# fitting the model
Y_train = Y_train.astype('int')
knn.fit(X_train, Y_train)
# predicting the values
Y_pred = knn.predict(X_test)
# calculate the accuracy of the model
count = 0
for i in range(len(Y_test)):
    if (Y_test[i] == Y_pred[i]):
        count+=1
print("Accuracy of the model is: ", count/len(Y_test)*100 ,"%")
accuracy_without_LDA = count/len(Y_test)*100

Accuracy of the model is:  96.66666666666667 %


In [13]:
# Final accuracy of the model with and without the preprocessing step of LDA
print("Training size: 120 samples")
print("Test size: 30 samples")
print("Accuracy of the model with the pre-processing step of LDA is: ", accuracy_with_LDA ,"%")
print("Accuracy of the model is: ", accuracy_without_LDA ,"%")

Training size: 120 samples
Test size: 30 samples
Accuracy of the model with the pre-processing step of LDA is:  100.0 %
Accuracy of the model is:  96.66666666666667 %
