In [3]:

import numpy as np
import os
import cv2
from scipy.spatial.distance import cdist, euclidean
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import pandas as pd


In [4]:
def load_local_data(data_path):
    images = {}

    for label in range(10):
        images[label] = []

        folder_path = os.path.join(data_path, str(label))
        file_names = os.listdir(folder_path)

        for file_name in file_names:
            image_path = os.path.join(folder_path, file_name)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            if image is not None:
                # Resize the image to a consistent size if needed
                # image = cv2.resize(image, (desired_width, desired_height))

                images[label].append(image.flatten())

    return images



In [5]:
data_path = 'TinyMNIST_2/TinyMNIST/train/'
images = load_local_data(data_path)

for label, image in images.items():
    print(f"Number {label}: {len(image)} images")

Number 0: 400 images
Number 1: 400 images
Number 2: 400 images
Number 3: 400 images
Number 4: 400 images
Number 5: 400 images
Number 6: 400 images
Number 7: 400 images
Number 8: 400 images
Number 9: 400 images


In [6]:
images_mean = {}

for label in images.keys():
    images_mean[label] = np.mean(images[label], axis=0) 

In [7]:

def calculate_minimum_distance(test_image_path, images_mean):
    test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE).flatten()
    distances = list(range(0,10))
    for label in range(0,10):
        # distances[label] = cdist(np.array([images_mean[label]]), np.array([test_image]), metric='euclidean')
        distances[label] = euclidean(images_mean[label], test_image)
    min_distance = np.min(distances)
    return distances.index(min_distance)#, min_distance, distances

In [13]:
test_image = cv2.imread('TinyMNIST_2/TinyMNIST/test/0/img_1.jpg', cv2.IMREAD_GRAYSCALE).flatten()
distances = list(range(0,10))
for label in range(0,10):
    # distances[label] = cdist(np.array([images_mean[label]]), np.array([test_image]), metric='euclidean')
    distances[label] = m, c = np.linalg.lstsq(np.vstack([images_mean, np.ones(len(images_mean))]).T, test_image,rcond=None)[0]
min_distance = np.min(distances)
# distances.index(min_distance)#, min_distance, distances

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 10

In [8]:
df = pd.DataFrame(columns=['id', 'label'])

for label in range(10):
    img_list = os.listdir(f"TinyMNIST_2/TinyMNIST/test/{label}")
    for img in img_list:
        data = {
        "id": int(img.split(".")[0].split('_')[1]),
        "label": int(label)
        }

        # df.append(pd.DataFrame(data))
        df.loc[len(df)] = data

test = df.drop_duplicates().sort_values('id')

In [9]:
df = pd.DataFrame(columns=['id', 'label'])

for img in os.listdir('TinyMNIST/test/test'):
    data = {
        "id": int(img.split(".")[0].split('_')[1]),
        "label": int(calculate_minimum_distance(f'TinyMNIST/test/test/{img}', images_mean))
    }
    df.loc[len(df)] = data

pred = df.drop_duplicates().sort_values('id')

In [11]:
accuracy_score(test['label'], pred['label'])

0.79

In [12]:
confusion_matrix(test['label'], pred['label'])

array([[53,  0,  0,  1,  0,  4,  2,  0,  1,  0],
       [ 0, 56,  0,  0,  0,  1,  0,  0,  2,  0],
       [ 0,  5, 41,  2,  3,  0,  3,  1,  3,  0],
       [ 0,  2,  1, 50,  1,  3,  0,  1,  1,  1],
       [ 0,  0,  0,  0, 50,  0,  2,  0,  1,  7],
       [ 1,  7,  0, 10,  1, 36,  2,  0,  1,  1],
       [ 1,  4,  1,  1,  3,  2, 48,  0,  1,  0],
       [ 0,  4,  1,  0,  3,  0,  1, 43,  3,  7],
       [ 0,  2,  0,  7,  0,  2,  0,  0, 47,  2],
       [ 1,  1,  0,  1,  2,  1,  0,  2,  2, 50]])

In [189]:
def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    
    #creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        
        #creating a list of all the classes except the current class 
        other_class = [x for x in unique_class if x != per_class]

        #marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]

        #using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc

    return roc_auc_dict

roc_auc_score_multiclass(test['label'], pred['label'])

{0: 0.9316432981538367,
 1: 0.9514709107428178,
 2: 0.8506807481867922,
 3: 0.8962962962962964,
 4: 0.9046296296296298,
 5: 0.793069958332028,
 6: 0.8841661851029533,
 7: 0.843056721429428,
 8: 0.8777777777777777,
 9: 0.9}

In [10]:
test

Unnamed: 0,id,label
61,0,1
0,1,0
72,2,1
255,3,4
43,4,0
...,...,...
534,653,8
478,654,7
412,659,6
536,672,8


In [11]:
pred

Unnamed: 0,id,label
0,0,1
1,1,0
112,2,1
223,3,4
334,4,0
...,...,...
558,653,8
559,654,9
560,659,6
563,672,8
