<center><h3> The purpose of this notebook is to detect </h3></center>
<center><h1> Outlier Datapoints 🔎🔎🔎 </h1></center>

# Simple Setup

In [None]:
# Clear the interactive namespace before anything else runs
# (-s: soft reset, -f: do not ask for confirmation)
%reset -sf

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not just the last
# one; this is why later cells assign unwanted return values to `_`
InteractiveShell.ast_node_interactivity = 'all'

# Data

In [None]:
# Load the three competition files: train set, test set and sample submission

from pandas import read_csv

csv_paths = (
    '/kaggle/input/tabular-playground-series-mar-2022/train.csv',
    '/kaggle/input/tabular-playground-series-mar-2022/test.csv',
    '/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv',
)

# Same order as csv_paths
train, test, sample_subm = map(read_csv, csv_paths)

In [None]:
# Parse the 'time' column into datetimes

from pandas import to_datetime

# assign() keeps the column in place and returns a new frame bound to the same name
train = train.assign(time=to_datetime(train['time']))

In [None]:
# Integer-encode the 'direction' column into 'direction_c'

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# fit_transform learns the direction labels and encodes them in a single call;
# `le` stays fitted afterwards
train['direction_c'] = le.fit_transform(train['direction'])

In [None]:
# Inspecting Roads

from pandas import crosstab

# Distinct values of each column that identifies a road
'x =>', train['x'].unique()
'y =>', train['y'].unique()
'direction =>', train['direction_c'].unique()

# Number of congestion readings per (x, direction) and (y, direction) pair.
# `values` has to be the array of values to aggregate: a bare string such as
# 'congestion' is broadcast as a constant rather than looked up as a column.
# Combinations that never occur show up as NaN.
crosstab(train['x'], train['direction_c'], values=train['congestion'], aggfunc='count')
crosstab(train['y'], train['direction_c'], values=train['congestion'], aggfunc='count')

# Number of distinct roads, i.e. (x, y, direction) combinations actually present
'roads =>', len(train[['x', 'y', 'direction_c']].drop_duplicates())

# There are some combinations not present, therefore
# total number of roads is not 3 * 4 * 8 = 96, but the count displayed above
# (65 when this notebook was written)

# Outliers Detection 🔎

# On PCA data

In [None]:
# Outlier Detection on the pivoted road data, visualised on the first 2 PCA components

from random import randint

from pandas import DataFrame, Series
from numpy import column_stack

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('talk')

# Pivot: one row per 20-minute timestamp, one column per (x, y, direction_c) road.
# '20min' is the non-deprecated spelling of the '20T' offset alias.
pv = train.set_index('time')
pv = pv.groupby(['x', 'y', 'direction_c']).resample('20min')[['congestion']].mean()
pv = pv.unstack(level=[0,1,2])

# Prepr
# Both estimators are seeded so that the outlier flags (and every plot and
# comment derived from them) are reproducible on Restart & Run All
si = SimpleImputer()
ss = StandardScaler()
pca = PCA(n_components=5, random_state=42)
isf = IsolationForest(random_state=42)

# Fit pipe
# PCA is deliberately not part of the pipeline: the isolation forest is fitted
# on the imputed + standardized data, PCA is only used for plotting
pipe = make_pipeline(si, ss)
new_pv = pipe.fit_transform(pv.to_numpy())

# Fit algo
_ = isf.fit(new_pv)
preds = Series(isf.predict(new_pv))  # -1 => outlier, 1 => inlier (see plot loop below)

fig, ax = plt.subplots(1, 1, figsize=(15,15), constrained_layout=True)
pca_pv = pca.fit_transform(new_pv)
# The PCA holds 5 components; only the 2 that are plotted here are reported
print(f"PCA 2-comps explains {pca.explained_variance_ratio_[:2]}")
for pred, pred_l, pred_s, pred_c in zip([-1, 1], ['Outlier', 'Inlier'], [150, 25], ['red', 'blue']):  # first is predictions
    temp = pca_pv[(preds == pred).to_numpy()]
    _ = ax.scatter(temp[:, 0],
               temp[:, 1],
               label=pred_l,
               s=pred_s,
               alpha=0.5,
               facecolor='none',
               edgecolor=pred_c,
               linewidth=1.5
              )
_ = ax.set_title(f'Outliers Detection\nEach 20 Minutes')
_ = ax.set_xlabel('Comp. 1')
_ = ax.set_ylabel('Comp. 2')
_ = ax.legend()

In [None]:
# Visualizing more PCA Components (one panel per pair of consecutive components)

fig, axs = plt.subplots(2, 2, figsize=(15,15), constrained_layout=True)
axs = axs.flatten()
# Project with the PCA that is already fitted instead of refitting it, so the
# panels show exactly the same components as the 2-component plot
pca_pv = pca.transform(new_pv)
print(f"PCA {pca.n_components_}-comps explains {pca.explained_variance_ratio_}")
for comp_col, ax in enumerate(axs):
    for pred, pred_l, pred_s, pred_c in zip([-1, 1], ['Outlier', 'Inlier'], [150, 25], ['red', 'blue']):  # first is predictions
        temp = pca_pv[preds==pred]
        _ = ax.scatter(temp[:, comp_col],
                   temp[:, comp_col+1],
                   label=pred_l,
                   s=pred_s,
                   alpha=0.5,
                   facecolor='none',
                   edgecolor=pred_c,
                   linewidth=1.5
                  )
    _ = ax.set_title(f'Outliers Detection\nEach 20 Minutes')
    # Components are labelled 1-based, matching the 'Comp. 1' / 'Comp. 2' plot
    _ = ax.set_xlabel(f'Comp. {comp_col+1}')
    _ = ax.set_ylabel(f'Comp. {comp_col+2}')
    _ = ax.legend()

The roads dataset seems to be quite heterogeneous: the share of variance explained by the first 2 PCA components is quite low...

However, most outliers are close to each other...

Let's try Umap

In [None]:
# Visualizing on 2D UMAP dimensionality

import umap

fig, ax = plt.subplots(1, 1, figsize=(15,15), constrained_layout=True)
# UMAP is stochastic: a fixed seed keeps the embedding (and therefore where the
# outliers appear in the plot) the same across runs
umap_pv = umap.UMAP(random_state=42).fit_transform(new_pv)
for pred, pred_l, pred_s, pred_c in zip([-1, 1], ['Outlier', 'Inlier'], [150, 25], ['red', 'blue']):  # first is predictions
    temp = umap_pv[preds==pred]
    _ = ax.scatter(temp[:, 0],
               temp[:, 1],
               label=pred_l,
               s=pred_s,
               alpha=0.5,
               facecolor='none',
               edgecolor=pred_c,
               linewidth=1.5
              )
_ = ax.set_title(f'Outliers Detection\nEach 20 Minutes')
_ = ax.set_xlabel('Umap Comp. 1')
_ = ax.set_ylabel('Umap Comp. 2')
_ = ax.legend()

The fact that we can't see clear groupings in the UMAP-transformed data suggests that
the roads dataset is quite heterogeneous, as said before...

However, most outliers indeed are found in the upper right of the graph

In [None]:
# Visualizing in the fitted (imputed + standardized) data, one road column per axis

# Hit this cell repeatedly, for new column results

fig, ax = plt.subplots(1, 1, figsize=(15,15), constrained_layout=True)

# Pick two *different* random columns: drawing both independently could return
# the same column twice, which plots a meaningless diagonal.
# (needs at least 2 columns in new_pv)
n_feats = new_pv.shape[1]
col1 = randint(0, n_feats - 1)
col2 = randint(0, n_feats - 2)
if col2 >= col1:
    col2 += 1  # skip over col1 so that every other column is equally likely

for pred, pred_l, pred_s, pred_c in zip([-1, 1], ['Outlier', 'Inlier'], [150, 25], ['red', 'blue']):  # first is predictions
    temp = new_pv[preds==pred]
    _ = ax.scatter(temp[:, col1],
               temp[:, col2],
               label=pred_l,
               s=pred_s,
               alpha=0.5,
               facecolor='none',
               edgecolor=pred_c,
               linewidth=1.5
              )
_ = ax.set_title(f'Outliers Detection\nEach 20 Minutes')
# Label each axis with the pivot column the feature comes from
_ = ax.set_xlabel(str(pv.columns[col1]))
_ = ax.set_ylabel(str(pv.columns[col2]))
_ = ax.legend()

In [None]:
# More on fitted data

# Scatter plots of consecutive feature pairs (0 vs 1, 1 vs 2, ...), one per panel

# Marker appearance per prediction value; outliers are listed (and drawn) first
marker_styles = {
    -1: dict(label='Outlier', s=150, edgecolor='red'),
    1: dict(label='Inlier', s=25, edgecolor='blue'),
}

fig, axs = plt.subplots(2, 2, figsize=(15,15), constrained_layout=True)
axs = axs.flatten()
for feat_idx, panel in enumerate(axs):
    for flag, style in marker_styles.items():
        points = new_pv[preds==flag]
        _ = panel.scatter(
            points[:, feat_idx],
            points[:, feat_idx+1],
            alpha=0.5,
            facecolor='none',
            linewidth=1.5,
            **style,
        )
    _ = panel.set(
        title=f'Outliers Detection\nEach 20 Minutes',
        xlabel=f'Feat. {feat_idx}',
        ylabel=f'Feat. {feat_idx+1}',
    )
    _ = panel.legend()

Similar to before, but here the outliers are not found close to each other; rather, they sit
at the bounds of the data.

# Showing Outliers

In [None]:
# Identifying outliers date times

from numpy import arange, array

# One row per timestamp: month, day, hour, minute and the -1/1 prediction
temp = DataFrame({'m': pv.index.month,
                  'd': pv.index.day,
                  'h': pv.index.hour,
                  't': pv.index.minute,
                  'p': preds.to_numpy(),
                  })
temp = temp.set_index(['m', 'd', 'h', 't'])
# Rows: (month, day) in descending order; columns: ('p', hour, minute)
temp = temp.unstack(level=['h', 't']).sort_index(ascending=False)

# Keep one tick every `each` rows / columns.
# The x ticks are enumerated over the number of *columns*: enumerating over
# len(temp) (the number of rows) silently drops ticks whenever there are
# fewer rows than columns.
each = 10
row_labels = temp.index
col_labels = temp.droplevel(0, axis=1).columns
yticks = [(t, tl) for t, tl in zip(arange(len(row_labels)), row_labels) if t % each == 0]
xticks = [(t, tl) for t, tl in zip(arange(len(col_labels)), col_labels) if t % each == 0]

fig, ax = plt.subplots(figsize=(10,15))
im = ax.pcolor(temp, cmap='Reds_r', linewidths=.1)

# + 0.5 puts each tick at the centre of its cell (on both axes)
_ = ax.set_yticks(array([t[0] for t in yticks]) + 0.5, minor=False)
_ = ax.set_yticklabels([t[1] for t in yticks])
_ = ax.set_ylabel('Month | Day', rotation=360, labelpad=60)

_ = ax.set_xticks(array([t[0] for t in xticks]) + 0.5, minor=False)
_ = ax.set_xticklabels([t[1] for t in xticks])
_ = ax.set_xlabel('Hour | Minute')
_ = ax.xaxis.tick_top()
_ = ax.xaxis.set_label_position('top')

_ = ax.set_title('Outlier or Not\nby Date times')

In [None]:
# Identifying outliers date times, more granular with scores

# One row per timestamp: month, day, hour, minute and the isolation-forest
# decision score. Building the frame column by column keeps the date parts as
# integers, so the tick labels read (4, 1) instead of (4.0, 1.0).
temp = DataFrame({'m': pv.index.month,
                  'd': pv.index.day,
                  'h': pv.index.hour,
                  't': pv.index.minute,
                  'p': isf.decision_function(new_pv),
                  })
temp = temp.set_index(['m', 'd', 'h', 't'])
# Rows: (month, day) in descending order; columns: ('p', hour, minute)
temp = temp.unstack(level=['h', 't']).sort_index(ascending=False)

# Keep one tick every `each` rows / columns.
# The x ticks are enumerated over the number of *columns*: enumerating over
# len(temp) (the number of rows) silently drops ticks whenever there are
# fewer rows than columns.
each = 15
row_labels = temp.index
col_labels = temp.droplevel(0, axis=1).columns
yticks = [(t, tl) for t, tl in zip(arange(len(row_labels)), row_labels) if t % each == 0]
xticks = [(t, tl) for t, tl in zip(arange(len(col_labels)), col_labels) if t % each == 0]

fig, ax = plt.subplots(figsize=(10,15))
im = ax.pcolor(temp, cmap='Greens_r', linewidths=.1)

# + 0.5 puts each tick at the centre of its cell (on both axes)
_ = ax.set_yticks(array([t[0] for t in yticks]) + 0.5, minor=False)
_ = ax.set_yticklabels([t[1] for t in yticks])
_ = ax.set_ylabel('Month | Day', rotation=360, labelpad=60)

_ = ax.set_xticks(array([t[0] for t in xticks]) + 0.5, minor=False)
_ = ax.set_xticklabels([t[1] for t in xticks])
_ = ax.set_xlabel('Hour | Minute')
_ = ax.xaxis.tick_top()
_ = ax.xaxis.set_label_position('top')

_ = ax.set_title('Outlier Scores\nby Date times')

cb = fig.colorbar(im, drawedges=False)
_ = cb.set_label('Outlier Score\n(Mean)', rotation=360, labelpad=25)

# The lower the score, the more of an outlier the timestamp is

## It seems that, for almost all roads, the congestion behavior between
## hours 1.5 and 4 is considered an outlier.
## No other date times seem to show this behavior.

# Profiling Outliers

In [None]:
# Lastly, can we say something about those outliers?

# One row per timestamp: month, day, hour, minute and the standard deviation
# of the fitted (imputed + standardized) congestion across all road columns.
# Building the frame column by column keeps the date parts as integers, so the
# tick labels read (4, 1) instead of (4.0, 1.0).
temp = DataFrame({'m': pv.index.month,
                  'd': pv.index.day,
                  'h': pv.index.hour,
                  't': pv.index.minute,
                  'p': new_pv.std(axis=1),
                  })
temp = temp.set_index(['m', 'd', 'h', 't'])
# Rows: (month, day) in descending order; columns: ('p', hour, minute)
temp = temp.unstack(level=['h', 't']).sort_index(ascending=False)

# Keep one tick every `each` rows / columns.
# The x ticks are enumerated over the number of *columns*: enumerating over
# len(temp) (the number of rows) silently drops ticks whenever there are
# fewer rows than columns.
each = 15
row_labels = temp.index
col_labels = temp.droplevel(0, axis=1).columns
yticks = [(t, tl) for t, tl in zip(arange(len(row_labels)), row_labels) if t % each == 0]
xticks = [(t, tl) for t, tl in zip(arange(len(col_labels)), col_labels) if t % each == 0]

fig, ax = plt.subplots(figsize=(10,15))
im = ax.pcolor(temp, cmap='Blues', linewidths=.1)

# + 0.5 puts each tick at the centre of its cell (on both axes)
_ = ax.set_yticks(array([t[0] for t in yticks]) + 0.5, minor=False)
_ = ax.set_yticklabels([t[1] for t in yticks])
_ = ax.set_ylabel('Month | Day', rotation=360, labelpad=60)

_ = ax.set_xticks(array([t[0] for t in xticks]) + 0.5, minor=False)
_ = ax.set_xticklabels([t[1] for t in xticks])
_ = ax.set_xlabel('Hour | Minute')
_ = ax.xaxis.tick_top()
_ = ax.xaxis.set_label_position('top')

_ = ax.set_title('Congestion (Standard deviation)\nby Date times')

cb = fig.colorbar(im, drawedges=False)
_ = cb.set_label('Congestion\n(Standard\ndeviation)', rotation=360, labelpad=55)

## It appears that between hours 1.5 and 4 the standardized congestion
## differs a lot from road to road (high standard deviation across roads),
## which probably contributed to these date times being flagged as outliers.

# Hope you liked it!

# Any comments, suggestions are welcome!