# Homework 4 - Dimensionality Reduction

In [1]:
### install the env
! pip install nbformat==5.7.3
! pip install plotly==4.14.3
! pip install mlxtend
! pip install scikit-learn
! pip install umap-learn

### download the data 
! curl -O http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
! curl -O http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
! gunzip t*-ubyte.gz

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nbformat==5.7.3
  Downloading nbformat-5.7.3-py3-none-any.whl.metadata (5.0 kB)
Downloading nbformat-5.7.3-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.1/78.1 kB[0m [31m611.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nbformat
  Attempting uninstall: nbformat
    Found existing installation: nbformat 5.9.2
    Uninstalling nbformat-5.9.2:
      Successfully uninstalled nbformat-5.9.2
Successfully installed nbformat-5.7.3
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting plotly==4.14.3
  Downloading plotly-4.14.3-py2.py3-none-any.whl.metadata (7.6 kB)
Collecting retrying>=1.3.3 (from plotly==4.14.3)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2

In [2]:
import numpy as np
import pandas as pd
import time

# import plot
import plotly.io as plt_io
import plotly.graph_objects as go
import matplotlib

from sklearn.preprocessing import StandardScaler
# import tool - PCA
from sklearn.decomposition import PCA
# import tool - TSNE
from sklearn.manifold import TSNE
# import tool - UMAP
import umap
# import tool - LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn import datasets

import platform
from mlxtend.data import loadlocal_mnist

In [3]:
def Loaddataset(name='digits'):
    """Load one of the supported datasets as a ``(features, labels)`` pair.

    Parameters
    ----------
    name : str, default 'digits'
        Which dataset to load:

        * ``'sign_mnist'`` - reads ``dataset/sign_mnist_test.csv`` and keeps
          only the rows whose ``label`` is below 10.
        * ``'digits'`` - scikit-learn's bundled digits set, each image
          flattened to 64 columns.
        * ``'mnist'`` - the local ``train-images-idx3-ubyte`` /
          ``train-labels-idx1-ubyte`` files, truncated to the first
          10000 samples.

    Returns
    -------
    x, y
        Feature matrix and the matching label vector.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    if name == 'sign_mnist':
        # Sign-language MNIST:
        # https://www.kaggle.com/datasets/datamunge/sign-language-mnist?resource=download
        # NOTE(review): the variable is called `train` but the file read is the
        # *test* CSV - confirm this is intentional.
        train = pd.read_csv('dataset/sign_mnist_test.csv')
        # Keep only the rows with label < 10
        train = train[train['label'] < 10]
        # Label column, and every column from 'pixel1' onwards as features
        y = train.loc[:, 'label'].values
        x = train.loc[:, 'pixel1':].values

    elif name == 'digits':
        # scikit-learn digits:
        # https://scikit-learn.org/stable/auto_examples/datasets/plot_digits_last_image.html
        digits = datasets.load_digits()
        # Flatten each image to a 64-element row
        x = np.array(digits.images).reshape((-1, 64))
        y = np.array(digits.target)

    elif name == 'mnist':
        # MNIST idx files downloaded by the setup cell
        x, y = loadlocal_mnist(
            images_path='train-images-idx3-ubyte',
            labels_path='train-labels-idx1-ubyte')

        # Select a subset to save time
        x = x[:10000]
        y = y[:10000]

    else:
        # Previously an unknown name fell through every branch and failed with
        # an UnboundLocalError at the return statement.
        raise ValueError(
            "Unknown dataset name {!r}; expected one of "
            "'sign_mnist', 'digits', 'mnist'".format(name))

    return x, y

In [4]:
def plot_2d(component1, component2, labels=None):
    """Show a 2-D scatter plot of two embedding components with plotly.

    Parameters
    ----------
    component1, component2 : array-like
        Values plotted on the x and y axis respectively.
    labels : array-like, optional
        Per-point values used to colour the markers. When omitted, the
        notebook-level variable ``y`` is used, which keeps existing
        ``plot_2d(a, b)`` calls working unchanged.
    """
    if labels is None:
        # Backward-compatible fallback to the global label vector; pass
        # `labels` explicitly to avoid depending on hidden notebook state.
        labels = y

    fig = go.Figure(data=go.Scatter(
        x=component1,
        y=component2,
        mode='markers',
        marker=dict(
            size=5,
            color=labels,          # colour each marker by its label value
            colorscale='Rainbow',  # one of plotly's colorscales
            showscale=True,
            line_width=1
        )
    ))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100), width=500, height=500)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [53]:
def plot_3d(component1, component2, component3, labels=None):
    """Show a 3-D scatter plot of three embedding components with plotly.

    Parameters
    ----------
    component1, component2, component3 : array-like
        Values plotted on the x, y and z axis respectively.
    labels : array-like, optional
        Per-point values used to colour the markers. When omitted, the
        notebook-level variable ``y`` is used, which keeps existing
        ``plot_3d(a, b, c)`` calls working unchanged.
    """
    if labels is None:
        # Backward-compatible fallback to the global label vector; pass
        # `labels` explicitly to avoid depending on hidden notebook state.
        labels = y

    fig = go.Figure(data=[go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        mode='markers',
        marker=dict(
            size=10,
            color=labels,          # colour each marker by its label value
            colorscale='Rainbow',  # one of plotly's colorscales
            opacity=1,
            line_width=1
        )
    )])
    fig.update_layout(margin=dict(l=5, r=5, b=5, t=5), width=200, height=220)
    fig.layout.template = 'plotly_dark'
    fig.show()

In [37]:
# NOTE(review): `principalComponents` is first assigned by the PCA cell further
# down in this notebook, so on a fresh kernel (Restart & Run All) this line
# raises NameError. It also echoes the whole array; consider moving it below
# the PCA cell and showing only a slice such as `principalComponents[:5]`.
principalComponents

array([[ 1.91421366, -0.95450157, -3.94603482, ...,  0.42243302,
        -0.17028567,  0.15055005],
       [ 0.58898033,  0.9246358 ,  3.92475494, ..., -0.13759245,
        -0.10897935,  0.7641426 ],
       [ 1.30203906, -0.31718882,  3.02333293, ...,  0.33828028,
         0.15937163,  0.50380069],
       ...,
       [ 1.02259599, -0.14791087,  2.46997366, ..., -0.84622956,
        -0.22598017, -0.10986348],
       [ 1.07605522, -0.38090625, -2.45548693, ..., -0.41645303,
         0.09456011,  0.30842642],
       [-1.25770233, -2.22759087,  0.28362789, ..., -0.57034025,
         0.47231596,  0.27707267]])

In [38]:
x, y = Loaddataset()

## Standardizing the data
x = StandardScaler().fit_transform(x)

# PCA
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()

# Project the standardized features onto the first 50 principal components.
# more information in https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# random_state pins the result in case the default solver choice is a
# randomized one, and matches the seed used for t-SNE / UMAP in this notebook.
pca = PCA(n_components=50, random_state=42)
principalComponents = pca.fit_transform(x)

print('Duration: {} seconds'.format(time.perf_counter() - start))

# Keep the first two components in a labelled DataFrame
principal = pd.DataFrame(data = principalComponents[:, :2],
                        columns = ['principal component 1', 'principal component 2'])

# Visualize the first two principal components
plot_2d(principalComponents[:, 0],principalComponents[:, 1])

Duration: 0.019315481185913086 seconds


In [39]:
x, y = Loaddataset()

## Standardizing the data
x = StandardScaler().fit_transform(x)

# LDA
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()

# This section previously fitted PCA again, so the "LDA" plot was just a second
# PCA projection. LDA is supervised: it needs the labels `y` as well as `x`.
# more information in https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
lda = LDA(n_components=2)
# The variable name `principalComponents` is kept so that code relying on it
# keeps working; it now holds the two linear-discriminant projections.
principalComponents = lda.fit_transform(x, y)

print('Duration: {} seconds'.format(time.perf_counter() - start))

# Column labels are left unchanged for compatibility with existing lookups.
principal = pd.DataFrame(data = principalComponents,
                        columns = ['principal component 1', 'principal component 2'])

plot_2d(principalComponents[:, 0],principalComponents[:, 1])

Duration: 0.007923603057861328 seconds


In [40]:
x, y = Loaddataset()

# tSNE
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
# The timing covers both the PCA pre-reduction and the t-SNE fit.
start = time.perf_counter()

# Reduce to 50 dimensions with PCA first, then embed with t-SNE.
# random_state on PCA pins the t-SNE input in case the default solver choice
# is a randomized one; t-SNE itself was already seeded.
pca_50 = PCA(n_components=50, random_state=42)
pca_result_50 = pca_50.fit_transform(x)

# more information in https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
tsne_pre = TSNE(n_components=2, random_state=42)
tsne = tsne_pre.fit_transform(pca_result_50)

print('Duration: {} seconds'.format(time.perf_counter() - start))

# Plot the 2-D embedding (only two components are computed, so there is no 3-D view here)
plot_2d(tsne[:, 0], tsne[:, 1])

Duration: 3.6778552532196045 seconds


In [54]:
# Not used in this cell; kept because other cells may rely on `px`.
# NOTE(review): consider moving this to the import cell at the top.
import plotly.express as px

x, y = Loaddataset()

# UMAP
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
# The timing covers both the PCA pre-reduction and the UMAP fit.
start = time.perf_counter()

# Recompute the 50-component PCA input here instead of reusing the
# `pca_result_50` left behind by the t-SNE cell: that made this cell depend on
# hidden state while the freshly loaded `x` went unused.
pca_50 = PCA(n_components=50, random_state=42)
pca_result_50 = pca_50.fit_transform(x)

# more information in https://github.com/lmcinnes/umap
umap_pre = umap.UMAP(n_components=3, random_state=42)
embedding = umap_pre.fit_transform(pca_result_50)

print('Duration: {} seconds'.format(time.perf_counter() - start))

# First two components in 2-D, then all three in 3-D
plot_2d(embedding[:, 0], embedding[:, 1])
plot_3d(embedding[:, 0], embedding[:, 1], embedding[:, 2])


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Duration: 3.076197624206543 seconds
