In [14]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy.stats as ss
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import time

In [None]:
# Load the compressed output: "data" is printed below as the latent space,
# "names" is the per-row label array that gets compared against 0-9.
compressed = np.load("../MNIST_project/output/compressed_output/compressed.npz")
names, data = compressed["names"], compressed["data"]
print(f"Latent space shape: {data.shape}")

# Row indices of the samples carrying each label, one array per digit 0-9.
(
    zeros, ones, twos, threes, fours,
    fives, sixes, sevens, eights, nines,
) = (np.where(names == digit)[0] for digit in range(10))

# [row indices, display label] pairs; the position in the list equals the digit.
numbers = [
    [indices, label]
    for indices, label in zip(
        (zeros, ones, twos, threes, fours, fives, sixes, sevens, eights, nines),
        ("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"),
    )
]

# Count the selected rows whose label equals the position of their group.
good = sum(
    int(np.count_nonzero(names[indices] == digit))
    for digit, (indices, _) in enumerate(numbers)
)
print(good)

In [None]:
# Linear baseline: project `data` onto its first two principal components and
# report how long the fit takes.
RANDOM_SEED = 42  # keeps the projection reproducible if the randomized SVD solver is selected

pca = PCA(n_components=2, random_state=RANDOM_SEED)
# perf_counter() is the clock intended for measuring intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
latent_2d_pca = pca.fit_transform(data)
print(f"Time taken linearly {time.perf_counter()-start:.2f}s")

In [None]:
# Non-linear embedding: reduce `data` to 10 principal components first, then
# run t-SNE on that 10-d representation to obtain a 2-d embedding.
RANDOM_SEED = 42  # t-SNE is stochastic; without a seed every run yields a different layout

pca = PCA(n_components=10, random_state=RANDOM_SEED)
latent_10d_pca = pca.fit_transform(data)

tsne = TSNE(n_components=2, perplexity=50, random_state=RANDOM_SEED)
latent_2d_tsne = tsne.fit_transform(latent_10d_pca)

In [None]:
# One scatter panel per entry of `numbers`, arranged as a 2 x 5 grid.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(36, 14))
panels = axs.flatten()

# Tick positions collected from every panel; the shared axis limits are the
# extremes of these. Both lists start with 0 so the origin is always in range.
x_ticks = [0]
y_ticks = [0]

for (indices, label), ax in zip(numbers, panels):
    ax.scatter(latent_2d_pca[indices, 0], latent_2d_pca[indices, 1], alpha=0.1)
    ax.set_title(label, fontsize=20)
    x_ticks.extend(ax.get_xticks())
    y_ticks.extend(ax.get_yticks())
    ax.set_xlabel("Feature 1", fontsize=16)
    ax.set_ylabel("Feature 2", fontsize=16)

x_min, x_max = min(x_ticks), max(x_ticks)
y_min, y_max = min(y_ticks), max(y_ticks)

# Apply the same limits everywhere so the panels are directly comparable.
for ax in panels:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

fig.suptitle("PCA dimensionality reduction from 79 features -> 2 features\nModel: AE, Epochs: 80, Batch Size: 512", fontsize=36)
plt.savefig("latent_space_2_features_plot.png", dpi=600)

In [71]:
# For every digit group in `numbers`, find the sample closest to and the sample
# farthest from the group's mean latent vector (Euclidean distance).
#   means    - mean latent vector of each group, in `numbers` order
#   min_idxs - row index into `data` of the closest sample of each group
#   max_idxs - row index into `data` of the farthest sample of each group
means = []
covs = []  # NOTE(review): never populated; presumably meant for per-group covariances - confirm
min_idxs = []
max_idxs = []

# Covariance of the whole latent set. It does not depend on the group, so it is
# computed once instead of once per digit.
cov = np.cov(data, rowvar=0)

for number in numbers:
    class_rows = number[0]
    if len(class_rows) == 0:
        # Without this guard there is no nearest/farthest sample to report and
        # the result lists would fall out of step with `numbers`.
        raise ValueError(f"No samples found for label '{number[1]}'")

    class_data = data[class_rows]
    mean = np.mean(class_data, axis=0)
    means.append(mean)

    # Distance of every sample in the group to the group mean, computed in one
    # shot; the reshape keeps this valid if a latent vector has more than one axis.
    offsets = (class_data - mean).reshape(len(class_rows), -1)
    distances = np.sqrt(np.sum(offsets**2, axis=1))

    # Map the position inside the group back to the row in `data` through the
    # index array. Searching with `np.where(data == image)` is wrong here: the
    # comparison is element-wise, so it reports the first row that shares any
    # single component with the sample, which need not be the sample itself.
    min_idx = class_rows[np.argmin(distances)]
    max_idx = class_rows[np.argmax(distances)]
    min_idxs.append(min_idx)
    max_idxs.append(max_idx)





In [None]:
# Compare original and reconstructed images for the samples listed in
# `max_idxs` (original, reconstruction and their difference side by side).

# NOTE(review): absolute, user-specific location - adjust when running elsewhere.
WORKSPACE_DIR = "/gluster/home/ofrebato/baler/workspaces/MNIST"

orig_file = np.load(f"{WORKSPACE_DIR}/data/mnist_combined.npz")
decomp_file = np.load(
    f"{WORKSPACE_DIR}/MNIST_project/output/decompressed_output/decompressed.npz"
)

# Fall back to the alternative ordering when the labels do not line up.
# np.array_equal returns False for arrays of different length, whereas
# `np.all(a == b)` raises on a shape mismatch in recent NumPy releases.
if not np.array_equal(orig_file["names"], decomp_file["names"]):
    orig_file = np.load(f"{WORKSPACE_DIR}/data/mnist_combined_outlier_order.npz")
    print("Outlier order")

orig_data = orig_file["data"].astype(np.float32)
decomp_data = decomp_file["data"].astype(np.float32)
names = orig_file["names"][: len(decomp_data)]
print(decomp_data.shape)


for idx in max_idxs:
    print("=== Plotting ===")

    # Both arrays are already float32, so no further cast is needed here.
    tile_data = orig_data[idx]
    tile_data_decompressed = decomp_data[idx]

    diff = tile_data - tile_data_decompressed

    # One colour scale for all three panels so they can share a single colorbar.
    max_value = np.amax([np.amax(tile_data), np.amax(tile_data_decompressed), np.amax(diff)])
    min_value = np.amin([np.amin(tile_data), np.amin(tile_data_decompressed), np.amin(diff)])

    # Figure size is given in centimetres and converted to inches.
    fig, axs = plt.subplots(1, 3, figsize=(29.7 * (1 / 2.54), 10 * (1 / 2.54)), sharey=True)
    axs[0].set_title("Original", fontsize=11)
    im1 = axs[0].imshow(tile_data, vmax=max_value, vmin=min_value)
    axs[1].set_title("Reconstructed", fontsize=11)
    im2 = axs[1].imshow(tile_data_decompressed, vmax=max_value, vmin=min_value)
    axs[2].set_title("Difference", fontsize=11)
    im3 = axs[2].imshow(diff, vmax=max_value, vmin=min_value)

    # Reserve space on the right and place the shared colorbar there.
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.815, 0.2, 0.02, 0.59])
    cb2 = fig.colorbar(im3, cax=cbar_ax, location="right", aspect=10)
    # NOTE(review): the file name below says "closest" but this loop iterates
    # max_idxs (the farthest samples) - confirm the name before enabling.
    # plt.savefig(f"/gluster/home/ofrebato/baler/workspaces/MNIST/testing/latent_closest/closest_{names[idx]}.png", dpi=600)
