In [61]:
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import data_utils
import download
from scipy.stats import skew, kurtosis
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import cv2 
import keras
from keras import layers
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from keras import ops

In [62]:
def loadData(path):
    """
    Load pickled training and test batches from a directory.

    The directory listing is sorted by name, then entries 1-5 are read as
    training batches and entry 7 as the test batch.
    NOTE(review): these positions presumably correspond to the extracted
    CIFAR-10 python archive (batches.meta, data_batch_1..5, readme.html,
    test_batch) -- confirm the directory contains exactly those files.

    :param path: Directory holding the batch files (trailing separator optional).
    :return: dict with keys 'train_data', 'train_labels', 'test_data' and
             'test_labels'; batches are stacked along the first axis.
    :raises IndexError: if the directory has fewer than 8 entries.
    """
    # os.listdir gives no ordering guarantee; sort so the positional selection
    # below picks the same files on every platform.
    listOfTestFiles = sorted(os.listdir(path))
    train_files = listOfTestFiles[1:6]
    test_file = listOfTestFiles[7]

    def _read_batch(name):
        # SECURITY: pickle.load can execute arbitrary code -- only use on trusted files.
        with open(os.path.join(path, name), 'rb') as fo:
            batch = pickle.load(fo, encoding='bytes')
        return np.asarray(batch[b'data']), np.asarray(batch[b'labels'])

    print("Training files = ", train_files)
    # For collecting Training data:
    train_batches = [_read_batch(name) for name in train_files]

    print(test_file)
    # For collecting Testing data:
    test_data, test_labels = _read_batch(test_file)

    # Concatenating along axis 0 merges the per-batch arrays into one
    # (total_samples, ...) array without requiring equally sized batches.
    dictData = {}
    dictData['train_data'] = np.concatenate([data for data, _ in train_batches], axis=0)
    dictData['train_labels'] = np.concatenate([labels for _, labels in train_batches], axis=0)
    dictData['test_data'] = test_data
    dictData['test_labels'] = test_labels
    return dictData

In [63]:
class kNearestNeighbour(object):
    """
    Brute-force k-nearest-neighbour classifier with inverse-distance weighted voting.

    Labels must be non-negative integers (or values that truncate to them),
    because they are used as indices into the per-class vote array.
    """

    def __init__(self, metric='l1'):
        """
        Initialize the KNN classifier.
        :param metric: The distance metric to use ('l1', 'l2', 'cosine').
                       Any other value falls back to 'l1'.
        """
        self.metric = metric

    def train(self, X, Y):
        """
        Memorize the training data.
        :param X: Training data of shape (N, F).
        :param Y: Training labels of shape (N,).
        """
        # asarray is a no-op for ndarrays and lets plain sequences work too.
        self.Xtr = np.asarray(X)
        self.Ytr = np.asarray(Y)

    def _compute_distance(self, x1, x2):
        """
        Compute the distance between a single test example and all training examples.
        :param x1: A single test example of shape (F,).
        :param x2: Training examples of shape (N, F).
        :return: Distance of shape (N,).
        """
        if self.metric == 'l2':
            return np.sqrt(np.sum((x2 - x1) ** 2, axis=1))
        if self.metric == 'cosine':
            norms = np.linalg.norm(x1) * np.linalg.norm(x2, axis=1)
            # Zero-length vectors would give 0/0 = nan; flooring the denominator
            # makes their similarity 0 (distance 1) instead.
            similarity = np.dot(x2, x1) / np.maximum(norms, 1e-12)
            # Round-off can push the distance marginally below 0, which would
            # turn into a negative vote weight in predict(); clamp to [0, 2].
            return np.clip(1 - similarity, 0.0, 2.0)
        # 'l1' and any unrecognised metric: L1 distance.
        return np.sum(np.abs(x2 - x1), axis=1)

    def predict(self, X, k):
        """
        Predict labels for test data.
        :param X: Test data of shape (M, F).
        :param k: Number of neighbors to consider (1 <= k <= N).
        :return: Predicted labels of shape (M,), with the dtype of the training labels.
        :raises ValueError: if k is smaller than 1 or larger than the training set.
        """
        if k < 1:
            raise ValueError(f"k={k} must be at least 1")
        if k > self.Xtr.shape[0]:
            raise ValueError(f"k={k} is greater than the number of training samples={self.Xtr.shape[0]}")

        test_samples = X.shape[0]
        Ypred = np.zeros(test_samples, dtype=self.Ytr.dtype)

        # Integer view of the labels, computed once; the vote array is sized
        # from the data instead of assuming exactly 10 classes.
        labels = self.Ytr.reshape(-1).astype(np.intp)
        num_classes = int(labels.max()) + 1

        for i in range(test_samples):
            print(f"Test example = {i}", end="\r")

            # Compute distances based on the chosen metric
            dist = self._compute_distance(X[i, :], self.Xtr)

            # Indices of the k smallest distances. kth must be k - 1: it is a
            # zero-based position, and kth == N is out of bounds when k == N.
            idx = np.argpartition(dist, k - 1)[:k]

            # Inverse-distance weighted vote; the epsilon avoids division by zero
            # for exact matches.
            weights = 1 / (dist[idx] + 1e-9)
            label_count = np.bincount(labels[idx], weights=weights, minlength=num_classes)

            # Assign the label with the maximum weighted vote
            Ypred[i] = np.argmax(label_count)

        return Ypred


In [64]:
# Fetch the CIFAR-10 train/test split through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

# Sanity-check the array shapes before any preprocessing: images first, then labels.
assert (x_train.shape, x_test.shape) == ((50000, 32, 32, 3), (10000, 32, 32, 3))
assert (y_train.shape, y_test.shape) == ((50000, 1), (10000, 1))

In [65]:
# Scale pixel values by 1/255 as float32.
# NOTE: x_train / x_test are rebound here, so re-running this cell without
# reloading the data divides by 255 a second time.
x_train, x_test = (images.astype('float32') / 255.0 for images in (x_train, x_test))

# One row per image: collapse every non-batch axis into a single feature axis.
x_train_flat = x_train.reshape(len(x_train), -1)  # (50000, 32*32*3)
x_test_flat = x_test.reshape(len(x_test), -1)     # (10000, 32*32*3)

# Labels as 1-D vectors
y_train, y_test = y_train.flatten(), y_test.flatten()

In [66]:
# Baseline: KNN on the flattened pixel features with the classifier's default metric.
# Only a small slice of the test set is scored to keep the runtime manageable.
num_test_samples = 100

knn = kNearestNeighbour()
knn.train(x_train_flat, y_train)  # memorises the training set

# Classify the first `num_test_samples` test images using 20 neighbours
y_pred = knn.predict(x_test_flat[:num_test_samples], k=20)

# Fraction of correctly classified samples in that slice
accuracy = accuracy_score(y_test[:num_test_samples], y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.37 99


In [67]:
# Supervised dimensionality reduction with LDA.
# At most (number of classes - 1) discriminant axes exist, i.e. 9 for 10 classes.
lda = LinearDiscriminantAnalysis(n_components=9)

# Fit on the training split only, then project both splits with the fitted model.
lda.fit(x_train_flat, y_train)
x_train_lda = lda.transform(x_train_flat)
x_test_lda = lda.transform(x_test_flat)

In [68]:
# Train the classifier after LDA.
# Create the classifier explicitly instead of reusing whatever `knn` an earlier
# cell left behind, so the metric used here (L1) does not depend on execution order.
knn = kNearestNeighbour(metric='l1')
knn.train(x_train_lda, y_train)

# Define the range of k values to test
k_values = range(1, 75)

# To save time, use a small subset of the test data (loop-invariant, so set once)
num_test_samples = 100

# Store the accuracies for each k
accuracies = []

best_k = None
highest_accuracy = 0

for k in k_values:
    # Predict the labels for the test subset
    y_pred = knn.predict(x_test_lda[:num_test_samples], k=k)

    # Calculate accuracy
    accuracy = accuracy_score(y_test[:num_test_samples], y_pred)
    accuracies.append(accuracy)

    # Strict '>' keeps the smallest k when several values tie for the best accuracy
    if accuracy > highest_accuracy:
        highest_accuracy = accuracy
        best_k = k

    # Print accuracy for this k
    print(f"Accuracy for k = {k}: {accuracy * 100:.2f}%")

# Print the best k and highest accuracy.
# NOTE(review): k is selected on test samples, so this figure is an optimistic estimate.
print("\nBest k value:")
print(f"k = {best_k}: Accuracy = {highest_accuracy * 100:.2f}%")


Accuracy for k = 1: 30.00%
Accuracy for k = 2: 30.00%
Accuracy for k = 3: 34.00%
Accuracy for k = 4: 38.00%
Accuracy for k = 5: 38.00%
Accuracy for k = 6: 35.00%
Accuracy for k = 7: 40.00%
Accuracy for k = 8: 39.00%
Accuracy for k = 9: 40.00%
Accuracy for k = 10: 37.00%
Accuracy for k = 11: 38.00%
Accuracy for k = 12: 36.00%
Accuracy for k = 13: 36.00%
Accuracy for k = 14: 33.00%
Accuracy for k = 15: 36.00%
Accuracy for k = 16: 35.00%
Accuracy for k = 17: 34.00%
Accuracy for k = 18: 33.00%
Accuracy for k = 19: 37.00%
Accuracy for k = 20: 37.00%
Accuracy for k = 21: 37.00%
Accuracy for k = 22: 36.00%
Accuracy for k = 23: 37.00%
Accuracy for k = 24: 37.00%
Accuracy for k = 25: 36.00%
Accuracy for k = 26: 38.00%
Accuracy for k = 27: 37.00%
Accuracy for k = 28: 38.00%
Accuracy for k = 29: 38.00%
Accuracy for k = 30: 39.00%
Accuracy for k = 31: 41.00%
Accuracy for k = 32: 39.00%
Accuracy for k = 33: 40.00%
Accuracy for k = 34: 41.00%
Accuracy for k = 35: 41.00%
Accuracy for k = 36: 41.00%
A

In [69]:
# Cosine-distance KNN on the LDA-projected features.
knn = kNearestNeighbour(metric='cosine')
knn.train(x_train_lda, y_train)

# Sweep settings: candidate k values, scored on a small test subset to save time.
k_values = range(1, 75)
num_test_samples = 100

accuracies = []                      # accuracy for each k, in sweep order
best_k, highest_accuracy = None, 0   # running best over the sweep

for k in k_values:
    # Classify the test subset with the current k and score it
    y_pred = knn.predict(x_test_lda[:num_test_samples], k=k)
    accuracy = accuracy_score(y_test[:num_test_samples], y_pred)
    accuracies.append(accuracy)

    # Only a strictly better score replaces the current best (ties keep the smaller k)
    if accuracy > highest_accuracy:
        best_k, highest_accuracy = k, accuracy

    print(f"Accuracy for k = {k}: {accuracy * 100:.2f}%")

# Summary of the sweep
print("\nBest k value:")
print(f"k = {best_k}: Accuracy = {highest_accuracy * 100:.2f}%")

Accuracy for k = 1: 22.00%
Accuracy for k = 2: 22.00%
Accuracy for k = 3: 27.00%
Accuracy for k = 4: 33.00%
Accuracy for k = 5: 33.00%
Accuracy for k = 6: 36.00%
Accuracy for k = 7: 35.00%
Accuracy for k = 8: 35.00%
Accuracy for k = 9: 37.00%
Accuracy for k = 10: 37.00%
Accuracy for k = 11: 35.00%
Accuracy for k = 12: 35.00%
Accuracy for k = 13: 36.00%
Accuracy for k = 14: 38.00%
Accuracy for k = 15: 40.00%
Accuracy for k = 16: 37.00%
Accuracy for k = 17: 40.00%
Accuracy for k = 18: 41.00%
Accuracy for k = 19: 41.00%
Accuracy for k = 20: 38.00%
Accuracy for k = 21: 40.00%
Accuracy for k = 22: 39.00%
Accuracy for k = 23: 40.00%
Accuracy for k = 24: 41.00%
Accuracy for k = 25: 41.00%
Accuracy for k = 26: 41.00%
Accuracy for k = 27: 40.00%
Accuracy for k = 28: 41.00%
Accuracy for k = 29: 41.00%
Accuracy for k = 30: 41.00%
Accuracy for k = 31: 41.00%
Accuracy for k = 32: 41.00%
Accuracy for k = 33: 41.00%
Accuracy for k = 34: 41.00%
Accuracy for k = 35: 41.00%
Accuracy for k = 36: 41.00%
A

In [70]:
# Apply PCA
n_components = 100  # Set the number of components you want to keep after PCA

# Fix the seed so the projection (and every accuracy derived from it) is
# reproducible across runs.
# NOTE(review): for an input of this shape scikit-learn's default 'auto' solver
# may select the randomized SVD, which depends on random_state -- confirm
# against the installed scikit-learn version.
pca = PCA(n_components=n_components, random_state=0)

# Fit PCA on training data and transform both train and test data
x_train_pca = pca.fit_transform(x_train_flat)
x_test_pca = pca.transform(x_test_flat)

In [73]:
# Make sure the test labels are a 1-D vector
y_test = y_test.flatten()

# Cosine-distance KNN on the PCA-projected features.
knn = kNearestNeighbour(metric='cosine')
knn.train(x_train_pca, y_train)

# Sweep settings: candidate k values, scored on a small test subset to save time.
k_values = range(1, 75)
num_test_samples = 100

accuracies = []                      # accuracy for each k, in sweep order
best_k, highest_accuracy = None, 0   # running best over the sweep

for k in k_values:
    # Classify the PCA-projected test subset with the current k and score it
    y_pred = knn.predict(x_test_pca[:num_test_samples], k=k)
    accuracy = accuracy_score(y_test[:num_test_samples], y_pred)
    accuracies.append(accuracy)

    # Only a strictly better score replaces the current best (ties keep the smaller k)
    if accuracy > highest_accuracy:
        best_k, highest_accuracy = k, accuracy

    print(f"Accuracy for k = {k}: {accuracy * 100:.2f}%")

# Summary of the sweep
print("\nBest k value:")
print(f"k = {best_k}: Accuracy = {highest_accuracy * 100:.2f}%")


Accuracy for k = 1: 36.00%
Accuracy for k = 2: 36.00%
Accuracy for k = 3: 37.00%
Accuracy for k = 4: 43.00%
Accuracy for k = 5: 41.00%
Accuracy for k = 6: 40.00%
Accuracy for k = 7: 43.00%
Accuracy for k = 8: 46.00%
Accuracy for k = 9: 44.00%
Accuracy for k = 10: 42.00%
Accuracy for k = 11: 45.00%
Accuracy for k = 12: 45.00%
Accuracy for k = 13: 46.00%
Accuracy for k = 14: 48.00%
Accuracy for k = 15: 48.00%
Accuracy for k = 16: 49.00%
Accuracy for k = 17: 46.00%
Accuracy for k = 18: 46.00%
Accuracy for k = 19: 48.00%
Accuracy for k = 20: 46.00%
Accuracy for k = 21: 47.00%
Accuracy for k = 22: 48.00%
Accuracy for k = 23: 50.00%
Accuracy for k = 24: 52.00%
Accuracy for k = 25: 50.00%
Accuracy for k = 26: 50.00%
Accuracy for k = 27: 49.00%
Accuracy for k = 28: 50.00%
Accuracy for k = 29: 49.00%
Accuracy for k = 30: 51.00%
Accuracy for k = 31: 52.00%
Accuracy for k = 32: 51.00%
Accuracy for k = 33: 50.00%
Accuracy for k = 34: 48.00%
Accuracy for k = 35: 52.00%
Accuracy for k = 36: 50.00%
A