In [31]:
#Added to avoid Kernel crash.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier

import time

In [32]:
# Load mnist dataset
(x_train_mnist, y_train_mnist), (x_test_mnist, y_test_mnist) = tf.keras.datasets.mnist.load_data()

In [3]:
#Generating the random class images for mnist dataset.
class_images_mnist =[]
for i in range(10):
  class_images_mnist.append(x_train_mnist[y_train_mnist==i][np.random.randint(len(x_train_mnist[y_train_mnist==i]))])

In [4]:
# Take one random image from each class and estimate L1 distance to every other class. Display these results as cell output.

def L1_distance(class_images):
  distances = np.zeros((10,10))
  for i in range(10):
    for j in range(10):
      distances[i][j] = np.sum(np.abs(class_images[i]-class_images[j]))
  return distances

In [5]:
#function call for mnist dataset
L1_distances_mnist = L1_distance(class_images_mnist)
print(L1_distances_mnist)

[[    0. 29732. 30822. 33370. 31675. 30239. 29639. 32595. 34288. 36786.]
 [30684.     0. 24386. 29750. 21911. 19195. 25507. 27439. 24268. 29326.]
 [32410. 28350.     0. 32756. 31573. 34233. 27745. 30957. 32138. 33356.]
 [33702. 32714. 34316.     0. 36193. 33221. 34413. 37881. 36502. 41560.]
 [32837. 20841. 25515. 32671.     0. 26468. 23820. 20632. 23605. 34295.]
 [34529. 23045. 32583. 34619. 28828.     0. 33448. 34100. 30417. 37779.]
 [35385. 33629. 35231. 40851. 36852. 37720.     0. 40076. 37929. 40939.]
 [30381. 26833. 26899. 33031. 29544. 32460. 29812.     0. 26013. 33887.]
 [41744. 31796. 33398. 37482. 36811. 35375. 36823. 32867.     0. 40642.]
 [33358. 28274. 28852. 33192. 28425. 33645. 32021. 30369. 31294.     0.]]


In [6]:
# Load Cifar 10 dataset
(x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = tf.keras.datasets.cifar10.load_data()

In [7]:
# Generate the class name for cifar10
class_names_cifar = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [8]:
cifar_label = np.ravel(y_train_cifar)
print(cifar_label[0])

6


In [9]:
#Generating the random class images for Cifar-10 dataset.
class_images_cifar =[]
for i in range(10):
    class_images_cifar.append(x_train_cifar[cifar_label==i][np.random.randint(len(x_train_cifar[cifar_label==i]))])

In [10]:
#function call for Cifar10 dataset
L1_distances_cifar = L1_distance(class_images_cifar)
print(L1_distances_cifar)

[[     0. 404982. 386052. 361540. 412997. 431712. 355500. 432510. 343938.
  392868.]
 [375050.      0. 371982. 357966. 429903. 447082. 393398. 386696. 372108.
  422062.]
 [395772. 410098.      0. 388672. 445505. 487260. 350888. 424826. 375934.
  374688.]
 [424124. 426418. 396224.      0. 442881. 379164. 365928. 455482. 387646.
  383840.]
 [371387. 352433. 337599. 341503.      0. 451355. 436327. 358969. 439869.
  415327.]
 [352672. 334998. 297124. 405988. 330213.      0. 445516. 247838. 448290.
  397124.]
 [428116. 390986. 433496. 418968. 347801. 336564.      0. 383954. 381654.
  367096.]
 [349826. 396152. 356742. 329414. 421575. 537058. 400430.      0. 413700.
  400166.]
 [440190. 412020. 408450. 394178. 342723. 334046. 402474. 370172.      0.
  364578.]
 [389468. 361554. 410464. 401056. 369057. 383932. 394504. 383706. 419806.
       0.]]


In [11]:
# Find the closest neighboring class (K=1) based on L1 distance for each image class and provide your thoughts on why these images from different class looks similar. 

def find_nearest_neighbor(distances):
  nearest_neighbors = []
  for i in range(10):
    min_distance = np.min(distances[i])
    nearest_neighbor_index = np.where(distances[i] == min_distance)[0][0]
    nearest_neighbors.append(nearest_neighbor_index)
  return nearest_neighbors

In [12]:
nearest_neighbors_mnist = find_nearest_neighbor(L1_distances_mnist)
nearest_neighbors_cifar = find_nearest_neighbor(L1_distances_cifar)

print("Nearest neighbors for MNIST:")
for i in range(10):
  print(f"Class {i}: -> {[nearest_neighbors_mnist[i]]}")

print("\nNearest neighbors for CIFAR-10:")
for i in range(10):
  print(f"Class {i}: {class_names_cifar[i]} -> {class_names_cifar[nearest_neighbors_cifar[i]]}")

Nearest neighbors for MNIST:
Class 0: -> [0]
Class 1: -> [1]
Class 2: -> [2]
Class 3: -> [3]
Class 4: -> [4]
Class 5: -> [5]
Class 6: -> [6]
Class 7: -> [7]
Class 8: -> [8]
Class 9: -> [9]

Nearest neighbors for CIFAR-10:
Class 0: airplane -> airplane
Class 1: automobile -> automobile
Class 2: bird -> bird
Class 3: cat -> cat
Class 4: deer -> deer
Class 5: dog -> dog
Class 6: frog -> frog
Class 7: horse -> horse
Class 8: ship -> ship
Class 9: truck -> truck


In [13]:
# Take one random image from each class and estimate L1 distance to every other class. Display these results as cell output.

def L2_distance(class_images):
  distances = np.zeros((10,10))
  for i in range(10):
    for j in range(10):
      distances[i][j] = np.sqrt(np.sum(np.power(np.abs(class_images[i]-class_images[j]), 2)))
  return distances

In [14]:
#function call for mnist L2 distance dataset
L2_distances_mnist = L2_distance(class_images_mnist)
print(L2_distances_mnist)

[[  0.         139.39153489 146.17113258 149.42556675 122.10241603
  138.6758811  143.34922393 139.38794783 150.1932089  152.74815875]
 [139.39153489   0.         116.67904696 148.69431731  83.108363
  119.72050785 136.26077939 117.5542428  128.98837157 135.60973416]
 [146.17113258 116.67904696   0.         152.5516306  107.80074211
  134.04103849 143.3352713  132.3669143  138.82362911 136.96714935]
 [149.42556675 148.69431731 152.5516306    0.         133.18032888
  151.22499793 152.88230767 144.26018162 152.25636276 153.84407691]
 [122.10241603  83.108363   107.80074211 133.18032888   0.
  102.52804494 117.38824473  94.3504107  115.7367703  116.94870671]
 [138.6758811  119.72050785 134.04103849 151.22499793 102.52804494
    0.         145.83552379 129.05812644 143.10485666 147.25148556]
 [143.34922393 136.26077939 143.3352713  152.88230767 117.38824473
  145.83552379   0.         137.46999673 147.53643618 147.91551643]
 [139.38794783 117.5542428  132.3669143  144.26018162  94.3504107

In [15]:
#function call for cifar10 L2 distance dataset
L2_distances_cifar = L2_distance(class_images_cifar)
print(L2_distances_cifar)

[[  0.         560.04999777 572.74252505 579.06649014 552.43370643
  566.3938559  576.95753743 572.07866592 575.03565107 563.62753659]
 [560.04999777   0.         564.52812153 569.82628932 575.49022581
  567.51211441 579.28921965 562.27929003 573.46839494 568.20066878]
 [572.74252505 564.52812153   0.         571.91782627 569.44271002
  565.5934936  566.64097981 575.41463311 567.38170573 574.38837036]
 [579.06649014 569.82628932 571.91782627   0.         576.06162865
  576.74084301 575.81941614 573.59218963 565.20615708 580.84937807]
 [552.43370643 575.49022581 569.44271002 576.06162865   0.
  561.00356505 574.15241879 562.35309193 571.09106104 572.53558841]
 [566.3938559  567.51211441 565.5934936  576.74084301 561.00356505
    0.         573.43003061 566.89681601 570.52081469 554.09205011]
 [576.95753743 579.28921965 566.64097981 575.81941614 574.15241879
  573.43003061   0.         568.15842861 577.35084654 544.95871403]
 [572.07866592 562.27929003 575.41463311 573.59218963 562.35309

In [16]:
nearest_neighbors_mnist = find_nearest_neighbor(L2_distances_mnist)
nearest_neighbors_cifar = find_nearest_neighbor(L2_distances_cifar)

print("Nearest neighbors for MNIST:")
for i in range(10):
  print(f"Class {i}: -> {[nearest_neighbors_mnist[i]]}")

print("\nNearest neighbors for CIFAR-10:")
for i in range(10):
  print(f"Class {i}: {class_names_cifar[i]} -> {class_names_cifar[nearest_neighbors_cifar[i]]}")

Nearest neighbors for MNIST:
Class 0: -> [0]
Class 1: -> [1]
Class 2: -> [2]
Class 3: -> [3]
Class 4: -> [4]
Class 5: -> [5]
Class 6: -> [6]
Class 7: -> [7]
Class 8: -> [8]
Class 9: -> [9]

Nearest neighbors for CIFAR-10:
Class 0: airplane -> airplane
Class 1: automobile -> automobile
Class 2: bird -> bird
Class 3: cat -> cat
Class 4: deer -> deer
Class 5: dog -> dog
Class 6: frog -> frog
Class 7: horse -> horse
Class 8: ship -> ship
Class 9: truck -> truck


In [33]:
#Combine the mnist dataset..
X = np.concatenate([x_train_mnist, x_test_mnist], axis=0)
y = np.concatenate([y_train_mnist, y_test_mnist], axis=0)

print(X.shape)
print(y.shape)

(70000, 28, 28)
(70000,)


In [34]:
#Split the training and testing data into 80% and 20%
from sklearn.model_selection import train_test_split

# Assuming X and y are already defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and test sets
print("Training set shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("\nTest set shapes:")
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

Training set shapes:
X_train: (56000, 28, 28)
y_train: (56000,)

Test set shapes:
X_test: (14000, 28, 28)
y_test: (14000,)


In [19]:
#Based on L1 distance function, run K-nearest neighbor classifier for K=5 on test set.

def distance_formula(val, metric):
  
  #Define the KNN Classifier with k=5
  knn_mnist = KNeighborsClassifier(n_neighbors=val,metric= metric)
  
  #Fit the classifier on the traininf data
  knn_mnist.fit(X_train.reshape(-1,28*28),y_train)
  
  #Predict the labels for the test data
  predicted_labels_mnist = knn_mnist.predict(X_test.reshape(-1, 28 * 28))
  
  # Calculate the accuracy of the predictions
  accuracy_mnist = knn_mnist.score(X_test.reshape(-1, 28 * 28), y_test)
  
  return predicted_labels_mnist,accuracy_mnist

In [20]:
#Time execution started here...
start_time = time.time()

#Function run time..
predicted_labels,accuracy = distance_formula(5,'manhattan')

print("Predicted Labels: ",predicted_labels)

print("Accuracy: ",accuracy)

#Time execution ended here...
end_time = time.time()

elapsed_time = end_time-start_time

print("Time taken to complete the execution is:",elapsed_time)

Predicted Labels:  [8 4 8 ... 2 7 1]
Accuracy:  0.965
Time taken to complete the execution is: 2418.2009992599487


In [21]:
#Time execution started here...
start_time = time.time()

#Function run time..
predicted_labels,accuracy = distance_formula(25,'manhattan')

print("Predicted Labels: ",predicted_labels)

print("Accuracy: ",accuracy)

#Time execution ended here...
end_time = time.time()

elapsed_time = end_time-start_time

print("Time taken to complete the execution is:",elapsed_time)

Predicted Labels:  [8 4 3 ... 2 7 1]
Accuracy:  0.9515714285714286
Time taken to complete the execution is: 2361.558183193207


In [22]:
#Time execution started here...
start_time = time.time()

#Function run time..
predicted_labels,accuracy = distance_formula(55,'manhattan')

print("Predicted Labels: ",predicted_labels)

print("Accuracy: ",accuracy)

#Time execution ended here...
end_time = time.time()

elapsed_time = end_time-start_time

print("Time taken to complete the execution is:",elapsed_time)

Predicted Labels:  [8 4 8 ... 2 7 1]
Accuracy:  0.9405714285714286
Time taken to complete the execution is: 1480.6820652484894


In [23]:
#Time execution started here...
start_time = time.time()

#Function run time..
predicted_labels,accuracy = distance_formula(105,'manhattan')

print("Predicted Labels: ",predicted_labels)

print("Accuracy: ",accuracy)

#Time execution ended here...
end_time = time.time()

elapsed_time = end_time-start_time

print("Time taken to complete the execution is:",elapsed_time)

Predicted Labels:  [8 4 3 ... 2 7 1]
Accuracy:  0.9271428571428572
Time taken to complete the execution is: 1146.7460265159607


In [74]:
import numpy as np
import time

class KNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test,i):
        if i =='euclidean':
            pred = predict_using_L2(self,X_test)
            return pred
        elif i == 'manhattan':
            predictions = np.zeros(X_test.shape[0], dtype=int)
            for i, x_test in enumerate(X_test):
                distances = np.sum(np.abs(self.X_train - x_test), axis=1)
                k_neighbors_indices = np.argsort(distances)[:self.k]
                k_nearest_labels = self.y_train[k_neighbors_indices]
                predictions[i] = np.argmax(np.bincount(k_nearest_labels))

            return predictions

    def predict_using_L2(self, X_test):
      predictions = np.zeros(X_test.shape[0], dtype=int)
      for i, x_test in enumerate(X_test):
        # L2 or Euclidean Distance calculation
        distances = np.linalg.norm(self.X_train - x_test, axis=1)
        k_neighbors_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_neighbors_indices]
        predictions[i] = np.argmax(np.bincount(k_nearest_labels))

      return predictions


# Load MNIST dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)

# Extract features (images) and labels
X = np.array(mnist.data.astype('float32'))
y = np.array(mnist.target.astype('int'))

# Normalize pixel values to be between 0 and 1
X_normalized = X / 255.0

# Split the data into train (80%) and test (20%) sets
split_index = int(0.8 * len(X_normalized))
X_train, X_test = X_normalized[:split_index], X_normalized[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print(X_train.shape,X_test.shape)



  warn(


(56000, 784) (14000, 784)


In [71]:
#X_train= X_train.reshape(56000,28*28)
#X_test = X_test.reshape(14000,28*28)
for i in ['manhattan','euclidean']:
    for j in [5,25,55,105]:
        start_time = time.time()
        print("Distance calculating using " + i +" the value k is "+str(j))
        clf = KNN(j)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test,i)
        acc = np.mean(predictions == y_test)
        print("Accuracy of the Execution: ",acc)
        end_time = time.time()
        elapsed_time = (end_time-start_time)
        print("Time taken for the execution is: ",elapsed_time)

Distance calculating using manhattan the value k is 5
Accuracy of the Execution:  0.9648571428571429
Time taken for the execution is:  3267.532927751541
Distance calculating using manhattan the value k is 25
Accuracy of the Execution:  0.9572142857142857
Time taken for the execution is:  2522.59134721756
Distance calculating using manhattan the value k is 55
Accuracy of the Execution:  0.9470714285714286
Time taken for the execution is:  3214.1539709568024
Distance calculating using manhattan the value k is 105
Accuracy of the Execution:  0.9367857142857143
Time taken for the execution is:  1784.227282524109
Distance calculating using euclidean the value k is 5


NameError: name 'predict_using_L2' is not defined

In [None]:
#Posted the values for the manhattan distance execution.
Distance calculating using manhattan the value k is 5
Accuracy of the Execution:  0.9648571428571429
Time taken for the execution is:  3267.532927751541
Distance calculating using manhattan the value k is 25
Accuracy of the Execution:  0.9572142857142857
Time taken for the execution is:  2522.59134721756
Distance calculating using manhattan the value k is 55
Accuracy of the Execution:  0.9470714285714286
Time taken for the execution is:  3214.1539709568024
Distance calculating using manhattan the value k is 105
Accuracy of the Execution:  0.9367857142857143
Time taken for the execution is:  1784.227282524109

In [77]:
    i='euclidean'
    for j in [5,25,55,105]:
        start_time = time.time()
        print("Distance calculating using " + i +" the value k is "+str(j))
        clf = KNN(j)
        clf.fit(X_train, y_train)
        predictions = clf.predict_using_L2(X_test)
        acc = np.mean(predictions == y_test)
        print("Accuracy of the Execution: ",acc)
        end_time = time.time()
        elapsed_time = (end_time-start_time)
        print("Time taken for the execution is: ",elapsed_time)

Distance calculating using euclidean the value k is 5
Accuracy of the Execution:  0.9714285714285714
Time taken for the execution is:  2224.3867087364197
Distance calculating using euclidean the value k is 25
Accuracy of the Execution:  0.964
Time taken for the execution is:  3634.046198129654
Distance calculating using euclidean the value k is 55
Accuracy of the Execution:  0.9555
Time taken for the execution is:  3739.5229263305664
Distance calculating using euclidean the value k is 105
Accuracy of the Execution:  0.9473571428571429
Time taken for the execution is:  2852.9343724250793


In [None]:
Q4: Summarize the key observations in Q1, Q2, and Q3. 
Q1 description:I have loaded the mnist and cifar 10 datasets and calculated the manhattan and euclidean distances for them respectively.
For the Calculation of the distance, i have taken the random images from the datasets and took the one image from each class and calculated
the distance for one among the other datapoints.

Q2 description: I have previously loaded the data but it seems like it is divided in 90 10 ratio.But i have merged the test and training data 
in one cell and used the train test split method for the purpose of dividing the data in 80 and 20 in train and test ratios respectively.
Then i have initially calculated k=5,25,55,105 with manhattan distance using the inbuilt function KneighbourClassifier.But later on i have 
executed all the cases with knn code from the scratch using the classes and function.

From the observations..
If the K value is increasing then the Accuracy of the Execution is decreasing.
If the K value is increasing then the time taken by the execution also is increasing.
If there is any larger data the algorithm performance is going to be bad.
If the K=1 the algorithm performing better but it might take the longer time needs more processing speed.
Comparatively the performance is little better when we use the euclidean distance than using the Manhattan distance.
When coming to the execution with the dense layer networks.

NOTE:Executed the Dense Layer network in Different NoteBook[EE 623 Assignment1_vchalla3_Q3]