In [1]:
!pip install -q -r ../requirements.txt

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import dist

**Read Data**

---

In [3]:
# Quick look at the test inputs before the shared reader is defined.
# NOTE(review): pandas infers a header row by default; if test_in.csv has no
# header line, the first sample is consumed as column names -- confirm.
test_in = pd.read_csv("test_in.csv", index_col=None)
# The reshape below requires each row to hold 16 * 16 = 256 values.
im = test_in.iloc[0].values.reshape((16, 16))
# plt.imshow(im)
# Kept as the last expression so the notebook actually displays the shape
# (mid-cell, the bare expression was evaluated and silently discarded).
test_in.shape

In [4]:
def no_index_read():
    """Build a CSV loader that never promotes a data column to the index.

    Returns:
        A one-argument callable that forwards its argument to
        ``pd.read_csv`` with ``index_col=None`` and returns the DataFrame.
    """
    def _load_without_index(input: str):
        # index_col=None: keep pandas' default positional row index
        frame = pd.read_csv(input, index_col=None)
        return frame

    return _load_without_index


# Notebook-wide reader used for every CSV file
read_csv = no_index_read()

In [5]:
# Load the four CSV files through the shared reader, one `*_in` / `*_out`
# pair per line (files are read in the same order as before).
test_in, test_out = read_csv("test_in.csv"), read_csv("test_out.csv")
train_in, train_out = read_csv("train_in.csv"), read_csv("train_out.csv")

**1.1**  

---
We have to:
- find centers
- calculate distances between centers
- explain what we expect about the accuracy of the model
- what pairs are most difficult to separate?
---


In [6]:
# CALCULATE CENTERS
# Flatten the label frame once so the row mask used below is one-dimensional;
# `.iloc` is not meant to receive the 2-D mask that `(train_out == d).values` yields.
train_labels = train_out.values.flatten()
# Sorted list of the distinct label values found in the training labels
digits = sorted(set(train_labels))
# centers[d] is the column-wise mean of every training row labelled d
centers = {d: train_in.iloc[train_labels == d].mean(axis=0) for d in digits}

In [7]:
# CALCULATE DISTANCES BETWEEN CENTERS
# distances[i, j] is the Euclidean distance between the centers of digits[i]
# and digits[j]. Rows/columns are addressed by position in `digits`, not by
# the label value itself, so the matrix stays valid even when the labels are
# not exactly 0..n-1.
n_digits = len(digits)
distances = np.zeros((n_digits, n_digits))
for i, d1 in enumerate(digits):
    # The matrix is symmetric with a zero diagonal: compute each pair once.
    for j in range(i + 1, n_digits):
        pair_distance = dist(centers[d1], centers[digits[j]])
        distances[i, j] = pair_distance
        distances[j, i] = pair_distance

In [None]:
# VISUALIZE DISTANCES
# Heat map of the center-to-center distance matrix.
plt.imshow(distances, cmap="gnuplot")
# Ticks sit at matrix positions and are labelled with the digit they represent
# (positions and label values only coincide when the labels are 0..n-1).
tick_positions = range(len(digits))
tick_labels = [str(d) for d in digits]
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)
plt.xlabel("digit")
plt.ylabel("digit")
plt.title("Euclidean distance between class centers")
plt.colorbar(label="Euclidean distance")
plt.show()

On this colour scale, similar digits show up as purple areas, where the Euclidean distance between centers is low.
- 4 is similar to 5 and, to a lesser extent, to 6
- 9 is similar to 4 and 7 and, to a lesser extent, to 8
- 0 and 1 are very different from each other

**1.2**

---
Use the following on the MNIST dataset:
- PCA
- U-MAP
- T-SNE
---

In [None]:
# PCA
# Project the training inputs onto two principal components for plotting.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

# pca_2d holds the two-component projection of train_in (one row per sample)
pca_2d = pca.fit_transform(train_in)
def plot2d(arr_2d, title, labels=None):
    """Scatter-plot a 2-D embedding, colouring every point by its class label.

    Args:
        arr_2d: 2-D array whose first two columns are read as the x and y
            coordinates (indexed as ``arr_2d[:, 0]`` and ``arr_2d[:, 1]``).
        title: Title drawn above the plot.
        labels: Optional sequence with one colour value per point. When
            omitted, the flattened notebook-level ``train_out`` is used, which
            is what the function always used before this parameter existed.
    """
    if labels is None:
        # Backward-compatible default: colour by the training labels
        labels = train_out.values.flatten()
    plt.scatter(arr_2d[:, 0], arr_2d[:, 1], c=labels, cmap="tab10")
    plt.colorbar(label="class label")
    plt.xlabel("component 1")
    plt.ylabel("component 2")
    plt.title(title)
plot2d(pca_2d,"PCA")

We can see that the PCA projection leaves the classes largely non-separable overall.
- 1 is highly separable
- part of 0 lies away from everything else
- all the other numbers are blended with at least 2 other numbers

In [None]:
# UMAP
import umap
# UMAP is stochastic: without a fixed seed every run produces a different
# layout, so the figure (and any comment about where clusters sit) could not
# be reproduced. NOTE(review): umap-learn documents that fixing the seed
# trades away some parallelism -- acceptable for a dataset of this size?
reducer = umap.UMAP(n_components=2, random_state=42)
umap_2d = reducer.fit_transform(train_in)
plot2d(umap_2d,"UMAP")

UMAP does a much better job of gathering most of the items from a class close together.
Some classes are still grouped close together:
- 0 with 6
- 3,8,5 
- 9,4,7 (same as what we expected from centers)
- 1 with some 4s
- 2 stands alone but in between everything



In [None]:
# T-SNE
from sklearn.manifold import TSNE
# t-SNE is stochastic: pin the seed so that the embedding, and therefore the
# figure discussed in the text, is the same on every Restart & Run All.
reducer = TSNE(n_components=2, random_state=42)
tsne_2d = reducer.fit_transform(train_in)
plot2d(tsne_2d,"TSNE")

The results are similar to UMAP, but t-SNE manages to separate 6 from 0:
- 1 alone (again with little amount of 4s)
- 4,9,7
- 2 in the middle
- 3,5,8 (bottom right)
- 0 to the right
- 6 at the top