In [None]:
import sklearn.datasets
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

###   t-SNE, UMAP and LargeVis

In this and the next notebook we will use manifold learning for data visualization of large data sets (with high dimensionality). In addition to t-SNE, two relatively new methods will be used that are more efficient on large data sets.


- UMAP (Uniform Manifold Approximation and Projection) - Install this Python package: https://umap-learn.readthedocs.io/en/latest/index.html. UMAP package is compatible with scikit-learn, making use of the same API and able to be added to sklearn pipelines. UMAP can work as a drop in replacement for t-SNE and other dimension reduction classes from scikit-learn


- LargeVis (Visualizing Large-scale and High-dimensional Data) - Many techniques (like t-SNE, UMAP and LargeVis) first compute a similarity structure of the data points and then project them into a low-dimensional space with the structure preserved. These two steps suffer from considerable computational costs. Compared to t-SNE, LargeVis significantly reduces the computational cost of the graph construction step and employs a principled probabilistic model for the visualization step, the objective of which can be effectively optimized through asynchronous stochastic gradient descent with a linear time complexity. Download this algorithm repository and follow the installation instructions: https://github.com/lferry007/LargeVis


In [None]:
from sklearn.manifold import TSNE
import umap

To get the data we use the sklearn.datasets.fetch_openml method, which, as the name suggests, fetches a dataset from OpenML by name or dataset id. We will use MNIST and Fashion-MNIST (Zalando's article images). Fashion-MNIST is intended to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. Instead of digits it contains thumbnails of clothing images.

In [None]:
# Request pandas output explicitly: later cells use `.data.values` and
# `.target.to_numpy()`, which only work when the bunch holds a DataFrame/Series.
mnist = sklearn.datasets.fetch_openml('mnist_784', as_frame=True)
fmnist = sklearn.datasets.fetch_openml('Fashion-MNIST', as_frame=True)

Below are drawings of some samples from mnist and fmnist data sets

In [None]:
# MNIST class names are simply the digits 0-9.
mnist_names = list(range(10))

# Show the first 40 digits on a 5x8 grid, each captioned with its label.
plt.figure(figsize=(14, 10))
for idx, pixels in enumerate(mnist.data.values[:40]):
    plt.subplot(5, 8, idx + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Each row holds a flattened 28x28 image.
    plt.imshow(pixels.reshape((28, 28)), cmap=plt.cm.binary)
    plt.xlabel(mnist_names[int(mnist.target[idx])])
plt.show()

In [None]:
# Human-readable names for the ten Fashion-MNIST classes, indexed by label.
fmnist_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# Show the first 40 articles on a 5x8 grid, each captioned with its class name.
plt.figure(figsize=(14, 10))
for idx, pixels in enumerate(fmnist.data.values[:40]):
    plt.subplot(5, 8, idx + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Each row holds a flattened 28x28 image.
    plt.imshow(pixels.reshape((28, 28)), cmap=plt.cm.binary)
    plt.xlabel(fmnist_names[int(fmnist.target[idx])])
plt.show()

#### Use t-SNE, UMAP and LargeVis to project the mnist and fmnist data sets into a 2-dimensional space. For LargeVis, you need to create a function that saves the data in the txt file format required by LargeVis, and a function that loads the resulting file. Draw charts for all visualizations.

## UMAP

In [None]:
def umap_vis(embedding, target, title, custom_labels=None, method="UMAP"):
    """Scatter-plot a 2-D embedding with points coloured by class.

    Parameters
    ----------
    embedding : array-like
        Projected points; column 0 is drawn on the x axis, column 1 on the y axis.
    target : array-like
        Class label per point. Must support ``.astype(int)``; the resulting
        integers drive the colour map.
    title : str
        Dataset name shown in the plot title.
    custom_labels : sequence of str, optional
        One display name per class, ordered by integer class id. When given,
        a discrete colorbar labelled with these names is drawn; otherwise a
        legend is built from the distinct colours in the scatter.
    method : str, default "UMAP"
        Name of the projection method shown in the plot title. The default
        keeps the title produced for existing callers.
    """
    embedding = np.asarray(embedding)
    fig, ax = plt.subplots(figsize=(12, 10))
    color = target.astype(int)
    scatter = ax.scatter(embedding[:, 0], embedding[:, 1], c=color, cmap="Spectral", s=0.1)

    if custom_labels is not None:
        # One colour band per label, centred on the integer class ids.
        n_classes = len(custom_labels)
        cbar = fig.colorbar(scatter, ax=ax, boundaries=np.arange(n_classes + 1) - 0.5)
        cbar.set_ticks(np.arange(n_classes))
        cbar.set_ticklabels(custom_labels)
    else:
        # produce a legend with the unique colors from the scatter
        legend = ax.legend(*scatter.legend_elements(),
                           loc="upper right", title="Classes")
        ax.add_artist(legend)

    ax.set_title(f"{method} for {title}", fontsize=18)
    # Embedding coordinates carry no meaning of their own, so hide the ticks.
    ax.set_xticks([])
    ax.set_yticks([])
    plt.show()

In [None]:
# Project MNIST to 2-D with UMAP; random_state is fixed at 42.
reducer_mnist = umap.UMAP(random_state=42)
embedding_mnist = reducer_mnist.fit_transform(mnist.data)

# Scatter-plot the embedding coloured by digit label.
umap_vis(embedding_mnist, mnist.target, 'MNIST')

In [None]:
# Sanity check: length of the first embedding coordinate (one entry per sample).
embedding_mnist[:, 0].shape

UMAP grupuje cyfry w poszczególne klasy, jak i zachowuje ogólną strukturę między różnymi klasami. Metoda ta zachowuje 1 daleko od 0 oraz grupuje 3,5,8 i 4,7,9 razem (w ostatnim przypadku cyfry te są stosunkowo do siebie podobne).

In [None]:
# Project Fashion-MNIST to 2-D with UMAP; random_state is fixed at 42.
reducer_fmnist = umap.UMAP(random_state=42)
embedding_fmnist = reducer_fmnist.fit_transform(fmnist.data)

# Scatter-plot the embedding with a colorbar labelled by clothing class name.
umap_vis(embedding_fmnist, fmnist.target, 'Fashion-MNIST', custom_labels=fmnist_names)

Bardzo dobrze rozdzielone zostały klasy: trouser i bag. Różne rodzaje butów (Ankle boot, Sneaker, Sandal) zostały zgrupowane razem, jednak nadal dostrzegalny jest podział między nimi. Duże podobieństwo między klasami: Shirt, Coat, Dress, Pullover i T-shirt/top spowodowało, że zostały one zobrazowane jako jedna grupa, gdzie podział nie jest jednoznaczny. 

#### In order to compare the results of these three methods, calculate for each case the average distance between two points belonging to the same class divided by the average distance between points belonging to 2 different classes

In [None]:
from scipy.spatial.distance import cdist

In [None]:
def calc_distance(embedding, target, metric='euclidean'):
    ''' Calculate the average pairwise distance between every pair of classes.

    Parameters
    ----------
    embedding : array-like of shape (n_samples, n_dims)
        Coordinates of the projected points.
    target : array-like of length n_samples
        Class label of each point (NumPy array, list or pandas Series).
    metric : str or callable, default 'euclidean'
        Passed straight to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    numpy.ndarray of shape (n_classes, n_classes)
        Entry ``[i, j]`` is the mean distance between points of class ``i``
        and points of class ``j``, with classes ordered as by ``np.unique``.
        Diagonal entries are within-class averages; they include the zero
        distance of every point to itself.

    Raises
    ------
    ValueError
        If ``embedding`` and ``target`` describe different numbers of samples.
    '''

    embedding = np.asarray(embedding)
    target = np.asarray(target).ravel()
    if embedding.shape[0] != target.shape[0]:
        raise ValueError(
            f'embedding has {embedding.shape[0]} rows but target has '
            f'{target.shape[0]} labels'
        )

    # Split the points by class once, instead of re-filtering inside the
    # double loop. Coordinates are cast to float32 as before.
    class_coords = [
        embedding[target == label].astype('float32')
        for label in np.unique(target)
    ]

    # Both (i, j) and (j, i) are evaluated because a user-supplied metric is
    # not guaranteed to be symmetric.
    return np.array([
        [np.mean(cdist(base_coord, compare_coord, metric))
         for compare_coord in class_coords]
        for base_coord in class_coords
    ])

def plot_distances_heatmap(distance, dataset_name, method_name, ticklabels=None):
    ''' Draw an annotated heatmap of a class-to-class average distance matrix.

    distance     -- square matrix; rows are base classes, columns comparison classes
    dataset_name -- dataset name inserted into the figure title
    method_name  -- projection method name inserted into the figure title
    ticklabels   -- optional class names used for both axes instead of indices
    '''

    _, ax = plt.subplots(figsize=(15, 12))
    sn.heatmap(distance, annot=True, ax=ax)
    ax.set_title(f'Average distance between 2 points for {dataset_name} dataset using {method_name} method')
    ax.set_xlabel('comparison class')
    ax.set_ylabel('base class')
    if ticklabels is not None:
        # Keep the tick positions chosen by seaborn and only swap the labels.
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ticklabels)
        ax.set_yticks(ax.get_yticks())
        ax.set_yticklabels(ticklabels)
    plt.show()
    
def calc_performance_metric(distance):
    ''' Return the average distance between two points belonging to the same class
    divided by the average distance between points belonging to 2 different classes.

    Parameters
    ----------
    distance : array-like of shape (n_classes, n_classes)
        Class-to-class average distance matrix: within-class averages on the
        diagonal, between-class averages off the diagonal.

    Returns
    -------
    float
        mean(diagonal) / mean(off-diagonal). Every class (pair) carries equal
        weight, regardless of how many points it contains. Lower values mean
        tighter, better separated classes.

    Raises
    ------
    ValueError
        If ``distance`` is not a square 2-D matrix or has fewer than two
        classes (no off-diagonal entries to average).
    '''

    distance = np.asarray(distance, dtype=float)
    if distance.ndim != 2 or distance.shape[0] != distance.shape[1]:
        raise ValueError('distance must be a square 2-D matrix')
    n_classes = distance.shape[0]
    if n_classes < 2:
        raise ValueError('at least two classes are required')

    diagonal_sum = np.trace(distance)
    # Divide each sum by its entry count (n on the diagonal, n*(n-1) off it):
    # a plain ratio of sums is too small by a factor of (n_classes - 1).
    within_mean = diagonal_sum / n_classes
    between_mean = (np.sum(distance) - diagonal_sum) / (n_classes * (n_classes - 1))
    return within_mean / between_mean

### UMAP on MNIST performance measurement

In [None]:
%%time
# Class-by-class average distance matrix for the UMAP embedding of MNIST.
distance_mnist_umap = calc_distance(embedding_mnist, mnist.target.to_numpy())

In [None]:
# Heatmap of the class-to-class average distances (UMAP, MNIST).
plot_distances_heatmap(distance_mnist_umap, 'MNIST', 'UMAP')

In [None]:
# Report the class-separation score for UMAP on MNIST, rounded to 4 decimals.
print('Performance metric for UMAP method on MNIST dataset')
print(format(calc_performance_metric(distance_mnist_umap), '.4f'))

### UMAP on fashion-MNIST performance measurement

In [None]:
%%time
# Class-by-class average distance matrix for the UMAP embedding of Fashion-MNIST.
distance_fmnist_umap = calc_distance(embedding_fmnist, fmnist.target.to_numpy())

In [None]:
# Heatmap of the class-to-class average distances (UMAP, Fashion-MNIST), with class names on both axes.
plot_distances_heatmap(distance_fmnist_umap, 'fashion-MNIST', 'UMAP', fmnist_names)

In [None]:
# Report the class-separation score for UMAP on Fashion-MNIST, rounded to 4 decimals.
print('Performance metric for UMAP method on f-MNIST dataset')
print(format(calc_performance_metric(distance_fmnist_umap), '.4f'))

## LargeVis

In [None]:
# LargeVis is run on a subset: only the first `reduced` samples are kept.
reduced = 10_000
mnistReducedData = mnist.data.to_numpy()[:reduced]
mnistReducedTarget = mnist.target.iloc[:reduced]

In [None]:
import LargeVis

In [None]:
def saveToTxt(data, filename="input.txt"):
    """Write `data` to a text file with a "<rows> <cols>" header line.

    The first line holds the number of rows and the length of the first row;
    each following line is one row, with values written via the "%u" format
    (unsigned integers) separated by single spaces.
    """
    n_rows = len(data)
    n_cols = len(data[0])
    np.savetxt(
        filename,
        data,
        fmt="%u",
        header=f"{n_rows} {n_cols}",
        comments="",  # no comment prefix in front of the header line
    )

In [None]:
# Dump the MNIST subset to the text file that LargeVis will load.
filename = "mnist.txt"
saveToTxt(mnistReducedData, filename)

In [None]:
# `finalPerplexity` has to be defined before it is used; 20 matches the value
# used for the single Fashion-MNIST LargeVis run further down.
finalPerplexity = 20
LargeVis.loadfile(filename)
# First argument is the output dimensionality, last is the perplexity.
# NOTE(review): the -1 values presumably select LargeVis defaults - confirm against the LargeVis README.
LargeVis_MNIST = np.array(LargeVis.run(2, -1, -1, -1, -1, -1, -1, -1, -1, finalPerplexity))

In [None]:
%%time
# Sweep the LargeVis perplexity on the MNIST subset and print the class-separation score for each value.
filename = "mnist.txt"
perplexities = [20,40]
for perplexity in perplexities:
    # The input file is loaded again before every run.
    LargeVis.loadfile(filename)
    LargeVis_MNIST = np.array(LargeVis.run(2, -1, -1, -1, -1, -1, -1, -1, -1, perplexity))
    distance_mnist_largeVis = calc_distance(LargeVis_MNIST, mnistReducedTarget.to_numpy())
    print(f'perplexity[{perplexity}]:{calc_performance_metric(distance_mnist_largeVis):.4f}')
# NOTE: after the loop, LargeVis_MNIST and distance_mnist_largeVis hold the results for the last perplexity in the list.

In [None]:
# Scatter-plot the LargeVis embedding of the MNIST subset.
# NOTE(review): umap_vis prefixes the title with "UMAP for", so this LargeVis plot is titled "UMAP for MNIST".
umap_vis(LargeVis_MNIST, mnistReducedTarget , 'MNIST')

In [None]:
# Same subset size as for MNIST: keep only the first `reduced` Fashion-MNIST samples.
fmnistReducedData = fmnist.data.to_numpy()[:reduced]
fmnistReducedTarget = fmnist.target.iloc[:reduced]

In [None]:
# Dump the Fashion-MNIST subset to the text file that LargeVis will load.
filename = "fmnist.txt"
saveToTxt(fmnistReducedData,filename)

In [None]:
%%time
# Single LargeVis run on the Fashion-MNIST subset: 2 output dimensions, perplexity 20 (last argument).
LargeVis.loadfile(filename)
LargeVis_FMNIST = np.array(LargeVis.run(2, -1, -1, -1, -1, -1, -1, -1, -1, 20))

In [None]:
%%time
# Sweep the LargeVis perplexity on the Fashion-MNIST subset and print the class-separation score for each value.
filename = "fmnist.txt"
perplexities = [20,40]
for perplexity in perplexities:
    # The input file is loaded again before every run.
    LargeVis.loadfile(filename)
    LargeVis_FMNIST = np.array(LargeVis.run(2, -1, -1, -1, -1, -1, -1, -1, -1, perplexity))
    distance_fmnist_largeVis = calc_distance(LargeVis_FMNIST, fmnistReducedTarget.to_numpy())
    print(f'perplexity[{perplexity}]:{calc_performance_metric(distance_fmnist_largeVis):.4f}', flush=True)
# NOTE: after the loop, LargeVis_FMNIST and distance_fmnist_largeVis hold the results for the last perplexity in the list.

In [None]:
# Scatter-plot the LargeVis embedding of the Fashion-MNIST subset.
# NOTE(review): umap_vis prefixes the title with "UMAP for", so this LargeVis plot is titled "UMAP for FMNIST".
umap_vis(LargeVis_FMNIST, fmnistReducedTarget , 'FMNIST')

### LargeVis on MNIST performance measurement

In [None]:
%%time
# Class-by-class average distance matrix for the current LargeVis embedding of the MNIST subset.
distance_mnist_largeVis = calc_distance(LargeVis_MNIST, mnistReducedTarget.to_numpy())

In [None]:
# Heatmap of the class-to-class average distances (LargeVis, MNIST);
# the method name in the title must say largeVis, not UMAP.
plot_distances_heatmap(distance_mnist_largeVis, 'MNIST', 'largeVis')

In [None]:
# Report the class-separation score for LargeVis on MNIST, rounded to 4 decimals.
print('Performance metric for largeVis method on MNIST dataset')
print(format(calc_performance_metric(distance_mnist_largeVis), '.4f'))

### LargeVis on fashion-MNIST performance measurement

In [None]:
%%time
distance_fmnist_largeVis = calc_distance(LargeVis_FMNIST, fmnist.target.to_numpy())

In [None]:
# Heatmap of the class-to-class average distances (LargeVis, Fashion-MNIST),
# labelled with the clothing class names like the UMAP heatmap for this dataset.
plot_distances_heatmap(distance_fmnist_largeVis, 'fashion-MNIST', 'largeVis', fmnist_names)

In [None]:
# Report the class-separation score for LargeVis on Fashion-MNIST, rounded to 4 decimals.
print('Performance metric for largeVis method on f-MNIST dataset')
print(format(calc_performance_metric(distance_fmnist_largeVis), '.4f'))