<center><h3> The purpose of this notebook is to show techniques of </h3></center>
<center><h1> (Non-linear) Dimensionality Reduction 🔀🔀🔀 </h1></center>

# Simple Setup

In [None]:
# Start from a clean namespace: -f skips the confirmation prompt,
# -s does a "soft" reset (namespace only, history is kept).
%reset -sf

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Data

In [None]:
# Load the three competition CSV files: train set, test set and sample submission

from pandas import read_csv

train, test, sample_subm = map(read_csv, (
    '/kaggle/input/tabular-playground-series-mar-2022/train.csv',
    '/kaggle/input/tabular-playground-series-mar-2022/test.csv',
    '/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv',
))

In [None]:
# Parse the `time` column of the training set into datetimes

from pandas import to_datetime

train['time'] = train['time'].pipe(to_datetime)

In [None]:
# Integer-encode the `direction` labels into a new `direction_c` column

from sklearn.preprocessing import LabelEncoder

# `fit` returns the encoder itself, so it can be built and fitted in one step
le = LabelEncoder().fit(train['direction'])
train['direction_c'] = le.transform(train['direction'])

In [None]:
# Inspecting Roads

from pandas import crosstab

# Distinct values taken by each column that identifies a road
'x =>', train['x'].unique()
'y =>', train['y'].unique()
'direction =>', train['direction_c'].unique()

# Number of congestion records for each (x, direction) and (y, direction) pair.
# `values` has to be the column itself: a bare string such as 'congestion' is
# not looked up in `train`, it is broadcast as a constant, so 'count' would
# tally rows instead of non-null congestion values.
crosstab(train['x'], train['direction_c'], values=train['congestion'], aggfunc='count')
crosstab(train['y'], train['direction_c'], values=train['congestion'], aggfunc='count')

# Number of distinct (x, y, direction) combinations actually present
'roads =>', train.groupby(['x', 'y', 'direction_c']).ngroups

# Some combinations are not present, therefore the total number of roads
# is not 3 * 4 * 8 = 96, but rather 65 (the count displayed just above)

# Helpers

In [None]:
# Helpers

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling for every figure in the notebook:
# dark grid background, and the 'talk' context for larger fonts and lines.
sns.set_style('darkgrid')
sns.set_context('talk')

# Time-of-day windows used to colour the plots, one record per window:
# (label, start 'HHMM', end 'HHMM', edge colour)
_windows = [
    ('Morning',   '0000', '1200', '#1b9e77'),
    ('Afternoon', '1200', '1700', '#d95f02'),
    ('Night',     '1700', '0000', '#7570b3'),
]

# Transposed into four parallel lists: labels, starts, ends, colours
times = tuple(list(field) for field in zip(*_windows))

def plot_reduced_data(df, title=None):
    """Scatter-plot the first two columns of `df`, coloured by time of day.

    Parameters
    ----------
    df : DataFrame
        Reduced data. Its index must be a DatetimeIndex (rows are selected
        by time of day) and it needs at least two columns, which are drawn
        as component 1 (x axis) and component 2 (y axis).
    title : str, optional
        Name of the reduction technique, shown as the second line of the
        figure title. When omitted, only the generic first line is shown.

    Returns
    -------
    None
        The figure is shown and then closed.

    Notes
    -----
    Labels, window bounds and colours come from the module-level `times`.
    Each window is half-open, [start, end), so a timestamp that falls exactly
    on a boundary is drawn once, in the window that starts there, instead of
    being plotted in two adjacent windows.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 15), constrained_layout=True)
    for moment, t1, t2, color in zip(*times):
        # Select the window once, excluding its end bound so that adjacent
        # windows do not share (and double-plot) their boundary timestamps.
        rows = df.index.indexer_between_time(
            t1, t2, include_start=True, include_end=False)
        window = df.iloc[rows]
        ax.scatter(window.iloc[:, 0],
                   window.iloc[:, 1],
                   label=moment,
                   s=100,
                   alpha=0.5,
                   facecolor='none',
                   edgecolor=color,
                   linewidth=2
                  )
    heading = 'Non-Linear Dimensionality Reduction'
    # Avoid rendering a literal "None" when no technique name is given
    ax.set_title(heading if title is None else f'{heading}\n{title}')
    ax.set_xlabel('Comp. 1')
    ax.set_ylabel('Comp. 2')
    ax.legend()
    plt.show()
    # Release the figure explicitly; this function is called once per technique
    plt.close(fig)

# Dimensionality Reduction 🔀

#### We will be working over a pivot dataset, with:
- datetimes in rows,
- roads in columns, and,
- congestion in values.

#### This means that we'll be going from N road columns to M components (M < N) for each of the K datetimes.

In [None]:
# Setups

from random import randint

from pandas import DataFrame, Series

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Pivot: mean congestion per road (x, y, direction_c) and 20-minute bin,
# then move the three road levels into the columns so that rows are datetimes.
pv = train.set_index('time')
# '20min' is the portable spelling of the 20-minute frequency;
# the 'T' alias is deprecated in recent pandas releases.
pv = pv.groupby(['x', 'y', 'direction_c']).resample('20min')[['congestion']].mean()
pv = pv.unstack(level=[0,1,2])
pv.shape

# Preprocessing steps: fill missing values (SimpleImputer defaults), then standardise
si = SimpleImputer()
ss = StandardScaler()

# Fit the preprocessing pipeline and apply it to the pivot table
pipe = make_pipeline(si, ss)
new_pv = pipe.fit_transform(pv.to_numpy())
new_pv.shape
# Restore the datetime index and the road columns on the transformed array
new_pv = DataFrame(new_pv, index=pv.index, columns=pv.columns)
new_pv

# Isomap

In [None]:
# Isomap embedding of the preprocessed pivot table

from pandas import IndexSlice
from sklearn.manifold import Isomap

idxs = IndexSlice

mani = Isomap()
# Keep the datetime index so the plot helper can split rows by time of day
mani_pv = DataFrame(mani.fit_transform(new_pv), index=new_pv.index)

plot_reduced_data(mani_pv, title='Isomap')

# LLE

In [None]:
# Locally Linear Embedding

from sklearn.manifold import LocallyLinearEmbedding

# Fixed seed: the ARPACK eigen-solver starts from a random vector, so without
# it the embedding is not guaranteed to be identical between runs.
mani = LocallyLinearEmbedding(random_state=42)
mani_pv = mani.fit_transform(new_pv)
# Keep the datetime index so the plot helper can split rows by time of day
mani_pv = DataFrame(mani_pv, index=new_pv.index)

plot_reduced_data(mani_pv, title='Locally Linear Embedding')

# t-SNE

In [None]:
# T-SNE

from sklearn.manifold import TSNE

# t-SNE is stochastic: fix the seed so the figure is reproducible on re-run
mani = TSNE(random_state=42)
mani_pv = mani.fit_transform(new_pv)
# Keep the datetime index so the plot helper can split rows by time of day
mani_pv = DataFrame(mani_pv, index=new_pv.index)

plot_reduced_data(mani_pv, title='T-SNE')

# Spectral Embedding

In [None]:
# Spectral Embedding

from sklearn.manifold import SpectralEmbedding

# Fixed seed: the eigen-decomposition is randomly initialised, so without it
# the embedding can differ between runs.
mani = SpectralEmbedding(random_state=42)
mani_pv = mani.fit_transform(new_pv)
# Keep the datetime index so the plot helper can split rows by time of day
mani_pv = DataFrame(mani_pv, index=new_pv.index)

plot_reduced_data(mani_pv, title='Spectral Embedding')

# UMAP

In [None]:
# UMAP

import umap

# UMAP is stochastic: fix the seed so the figure is reproducible on re-run.
# NOTE(review): umap-learn gives up some parallelism when a seed is set, so
# this cell may run slower than the unseeded version; confirm the cost is acceptable.
mani = umap.UMAP(random_state=42)
mani_pv = mani.fit_transform(new_pv)
# Keep the datetime index so the plot helper can split rows by time of day
mani_pv = DataFrame(mani_pv, index=new_pv.index)

plot_reduced_data(mani_pv, title='UMAP')

# MDS

In [None]:
# MDS

from sklearn.manifold import MDS

# Beware: MDS takes a long time to compute, so it is skipped by default.
# Set this flag to True to run it.
RUN_MDS = False

if RUN_MDS:
    # MDS starts from a random configuration: fix the seed for reproducibility
    mani = MDS(random_state=42)
    mani_pv = mani.fit_transform(new_pv)
    # Keep the datetime index so the plot helper can split rows by time of day
    mani_pv = DataFrame(mani_pv, index=new_pv.index)

    plot_reduced_data(mani_pv, title='MDS')

So, looking at all the graphs, it seems that morning datetimes are more heterogeneous than those at other times of day.

Now try different settings for each reducer and see what new results you get.

# Hope you liked it!

# Any comments or suggestions are welcome!