# UMAP for Data Exploration

## Env Preparation

In [1]:
import os
import sys
from os.path import join as JP

# Switch to the project root; the relative paths used later in the notebook
# ('config.yaml', 'data/...') are resolved against this directory.
os.chdir('/app/')
project_root = os.getcwd()
print(project_root)

# Put the root and its `utils` / `scripts` folders on the import path.
sys.path.extend([
    project_root,
    JP(project_root, 'utils'),
    JP(project_root, 'scripts'),
])

/app


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline; the commented-out magic below selects the
# interactive notebook backend instead.
# %matplotlib notebook
%matplotlib inline
# Show floats with three decimals in NumPy output and in pandas displays
# (the pandas format also adds a thousands separator).
np.set_printoptions(precision=3)
pd.options.display.float_format = '{:,.3f}'.format

In [3]:
import pickle
from pprint import pprint
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

# Load the project configuration; 'config.yaml' is resolved relative to the
# current working directory.
config = parse_yaml('config.yaml')
# `paths` is the 'paths' section of the config; paths['data'] is used at the
# end of the notebook as the output folder for the exported CSV.
paths = config['paths']
# Presumably creates any configured directory that does not exist yet —
# confirm in utils.general.
ensure_directories(paths)

## Data

In [4]:
# Load the pre-processed BBC articles. The path is relative to the working
# directory, and `.iloc[:, 1:]` drops the first CSV column (presumably a saved
# index — confirm against whatever wrote the file).
# `df` and `data` start out as the same DataFrame object; later cells rebind
# `df` to new frames that carry the projection columns.
df = data = pd.read_csv(JP('data','bbc-text-processed.csv')).iloc[:,1:]
data.head()

FileNotFoundError: [Errno 2] File b'data/bbc-text-processed.csv' does not exist: b'data/bbc-text-processed.csv'

## TFIDF

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Cap on the vocabulary size; passed to TfidfVectorizer as `max_features`.
EMBED_SIZE = 10000        # TODO: Increase
# Number of distinct labels in the `category` column.
NUM_CLUSTERS = data['category'].nunique()
# Not referenced by any other cell visible in this notebook.
WORDS_PER_CLUSTER = None
print(NUM_CLUSTERS)

In [None]:
# English stop-word list from NLTK, excluded from the vocabulary.
english_stop_words = stopwords.words('english')

# TF-IDF over lower-cased unigrams, bigrams and trigrams. Terms found in fewer
# than 5% or more than 90% of the documents are dropped, the vocabulary is
# capped at EMBED_SIZE features, and each document vector is L2-normalised.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words=english_stop_words,
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.9,
    max_features=EMBED_SIZE,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

## Moving to a Catalog Format

In [None]:
# One empty Document per row of `data`, then attach that row's pre-processed
# text (looked up by row label in the `processed` column).
documents = [Document() for _ in range(data.shape[0])]
for row_label, document in enumerate(documents):
    document.processed_text = data['processed'][row_label]

In [None]:
# Wrap the Document objects in a Catalog (project class from scripts.catalog);
# the corpus/matrix helpers in the next cell are called on this object.
catalog = Catalog()
catalog.documents = documents

In [None]:
# Collect the `processed_text` attribute of every document. The return value
# is discarded, so the call is made for its effect on `catalog` (presumably it
# stores the corpus on the catalog — confirm in scripts.catalog).
_ = catalog.collect_corpus(attr='processed_text', form=list)
# Vectorize the corpus with the TF-IDF vectorizer configured earlier.
# max_docs=None presumably means "use every document" — confirm.
tfidf = catalog.to_matrix(
    vectorizer=vectorizer,
    modelname='TFIDF',
    max_docs=None)
# `representation` is used like a DataFrame here (.shape, .head()) and in the
# cells below (.values).
print(tfidf.representation.shape)
tfidf.representation.head()

## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 3-component PCA on the TF-IDF matrix and keep the projected
# coordinates (`pca_output`) for the plots and the t-SNE run further down.
pca = PCA(n_components=3)
pca_output = pca.fit_transform(tfidf.representation.values)
# Report how much of the data the three components capture
# (labels corrected from the misspelled "Explaiend Varaince").
print('Singular Values = ', pca.singular_values_)
print('Explained Variance = ', pca.explained_variance_)
print('Explained Variance Ratio = ', pca.explained_variance_ratio_)

In [None]:
# Add the three PCA coordinates as columns; `assign` returns a new frame, so
# `data` is left untouched.
df = df.assign(
    pca1=pca_output[:, 0],
    pca2=pca_output[:, 1],
    pca3=pca_output[:, 2],
)

In [None]:
# 2-D view of the first two PCA components, coloured by article category.
plt.figure(figsize=(16, 10))
sns.scatterplot(x=df['pca1'], y=df['pca2'], hue=data['category'])

In [None]:
# Interactive 3-D view of the three PCA components, coloured by category;
# markers are shrunk to size 3.
fig = (
    px.scatter_3d(df, x='pca1', y='pca2', z='pca3', color='category')
    .update_traces(marker={'size': 3})
)
fig.show()

### PCA + Scaling
I am not sure whether standardizing (scaling) the TF-IDF features makes any sense.

In [None]:
# Standardise the TF-IDF features, then re-fit the existing `pca` object on the
# scaled matrix. This overwrites the attributes fitted on the unscaled data;
# `pca_output` from the earlier cell is unaffected.
scaled_tfidf = StandardScaler().fit_transform(tfidf.representation.values)
pca_ = pca.fit_transform(scaled_tfidf)
df = df.assign(
    pca_sc1=pca_[:, 0],
    pca_sc2=pca_[:, 1],
    pca_sc3=pca_[:, 2],
)

In [None]:
# 2-D view of the first two scaled-PCA components, coloured by category.
# The coordinates are passed as raw arrays, so the axes carry no column names.
plt.figure(figsize=(16, 10))
sns.scatterplot(x=pca_[:, 0], y=pca_[:, 1], hue=data['category'])

In [None]:
# Interactive 3-D view of the scaled-PCA components, coloured by category.
fig = (
    px.scatter_3d(df, x='pca_sc1', y='pca_sc2', z='pca_sc3', color='category')
    .update_traces(marker={'size': 3})
)
fig.show()

## Truncated SVD
Truncated SVD is preferred for sparse input matrices

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Project the (unscaled) TF-IDF matrix onto 3 Truncated-SVD components and
# store them as columns of `df`.
svd = TruncatedSVD(n_components=3)
svd_output = svd.fit_transform(tfidf.representation.values)
df = df.assign(
    svd_1=svd_output[:, 0],
    svd_2=svd_output[:, 1],
    svd_3=svd_output[:, 2],
)

In [None]:
# 2-D view of the first two Truncated-SVD components, coloured by category.
plt.figure(figsize=(16, 10))
sns.scatterplot(x=df['svd_1'], y=df['svd_2'], hue=data['category'])

In [None]:
# Interactive 3-D view of the Truncated-SVD components, coloured by category.
fig = (
    px.scatter_3d(df, x='svd_1', y='svd_2', z='svd_3', color='category')
    .update_traces(marker={'size': 3})
)
fig.show()

## T-SNE without PCA

In [None]:
from sklearn.manifold import TSNE
# 3-D t-SNE fitted directly on the TF-IDF matrix, with no prior dimensionality
# reduction. verbose=1 prints progress; the %time magic reports how long the
# fit takes.
tsne = TSNE(n_components=3, verbose=1)
%time tsne_ = tsne.fit_transform(tfidf.representation.values)

In [None]:
# Store the three t-SNE coordinates as columns of `df`.
df = df.assign(
    tsne_1=tsne_[:, 0],
    tsne_2=tsne_[:, 1],
    tsne_3=tsne_[:, 2],
)

In [None]:
# 2-D view of the first two t-SNE coordinates, coloured by category.
plt.figure(figsize=(16, 10))
ax = sns.scatterplot(x=df['tsne_1'], y=df['tsne_2'], hue=data['category'])
ax.set_title('t-SNE')
plt.show()

In [None]:
# Interactive 3-D view of the t-SNE embedding, coloured by category.
fig = (
    px.scatter_3d(df, x='tsne_1', y='tsne_2', z='tsne_3', color='category')
    .update_traces(marker={'size': 3})
)
fig.show()

## T-SNE with PCA

In [None]:
# Re-fit the same TSNE instance on `pca_output` (the 3 PCA components of the
# unscaled TF-IDF matrix) and time the fit.
# NOTE(review): the input is already 3-dimensional, so t-SNE only re-embeds a
# 3-D projection here — confirm that such an aggressive PCA reduction is intended.
%time tsne_pca_ = tsne.fit_transform(pca_output)

In [None]:
# Store the t-SNE-on-PCA coordinates as columns of `df`.
df = df.assign(
    tsne_pca_1=tsne_pca_[:, 0],
    tsne_pca_2=tsne_pca_[:, 1],
    tsne_pca_3=tsne_pca_[:, 2],
)

In [None]:
# 2-D view of the t-SNE embedding computed on the PCA output.
plt.figure(figsize=(16, 10))
ax = sns.scatterplot(x=df['tsne_pca_1'], y=df['tsne_pca_2'], hue=data['category'])
ax.set_title('t-SNE w. PCA')
plt.show()

In [None]:
# Interactive 3-D view of the t-SNE embedding computed on the PCA output.
fig = (
    px.scatter_3d(df, x='tsne_pca_1', y='tsne_pca_2', z='tsne_pca_3', color='category')
    .update_traces(marker={'size': 3})
    .update_layout(title_text="t-SNE w. PCA ", title_font_size=30)
)
fig.show()

## T-SNE with PCA + Scaling

In [None]:
# Pipeline: standardise the TF-IDF features -> 3-component PCA -> 3-D t-SNE.
# Both `pca` and `tsne` are the objects created earlier and are re-fitted here.
scaled_tfidf = StandardScaler().fit_transform(tfidf.representation.values)
scaled_pca_output = pca.fit_transform(scaled_tfidf)
tsne_pca_sc = tsne.fit_transform(scaled_pca_output)

In [None]:
# Store the t-SNE-on-scaled-PCA coordinates as columns of `df`.
df = df.assign(
    tsne_pca_sc_1=tsne_pca_sc[:, 0],
    tsne_pca_sc_2=tsne_pca_sc[:, 1],
    tsne_pca_sc_3=tsne_pca_sc[:, 2],
)

In [None]:
# 2-D view of the t-SNE embedding computed on the *scaled* PCA output.
# Plots the tsne_pca_sc_* columns; the previous version re-plotted the unscaled
# tsne_pca_* columns under the "scaled PCA" title (copy-paste slip).
plt.figure(figsize=(16,10))
sns.scatterplot(x=df.tsne_pca_sc_1, y=df.tsne_pca_sc_2, hue=data.category).set_title('t-SNE w. scaled PCA')
plt.show()

In [None]:
# Interactive 3-D view of the t-SNE embedding computed on the scaled PCA output.
fig = (
    px.scatter_3d(df, x='tsne_pca_sc_1', y='tsne_pca_sc_2', z='tsne_pca_sc_3', color='category')
    .update_traces(marker={'size': 3})
    .update_layout(title_text="t-SNE w. Scaled PCA ", title_font_size=30)
)
fig.show()

## T-SNE with Truncated SVD

In [None]:
# Pipeline: standardise the TF-IDF features -> 3-component Truncated SVD ->
# 3-D t-SNE. `svd` and `tsne` are the objects created earlier, re-fitted here.
# NOTE(review): the standalone Truncated SVD cell fits on the unscaled matrix,
# while this pipeline standardises first — confirm the scaling step is intended.
scaled_tfidf = StandardScaler().fit_transform(tfidf.representation.values)
scaled_svd_output = svd.fit_transform(scaled_tfidf)
tsne_svd_ = tsne.fit_transform(scaled_svd_output)

In [None]:
# Store the t-SNE-on-SVD coordinates as columns of `df`.
df = df.assign(
    tsne_svd_1=tsne_svd_[:, 0],
    tsne_svd_2=tsne_svd_[:, 1],
    tsne_svd_3=tsne_svd_[:, 2],
)

In [None]:
# 2-D view of the t-SNE embedding computed on the Truncated-SVD output.
plt.figure(figsize=(16, 10))
ax = sns.scatterplot(x=df['tsne_svd_1'], y=df['tsne_svd_2'], hue=data['category'])
ax.set_title('t-SNE w. TruncSVD')
plt.show()

In [None]:
# Interactive 3-D view of the t-SNE embedding computed on the Truncated-SVD output.
fig = (
    px.scatter_3d(df, x='tsne_svd_1', y='tsne_svd_2', z='tsne_svd_3', color='category')
    .update_traces(marker={'size': 3})
    .update_layout(title_text="t-SNE w. TruncSVD ", title_font_size=30)
)
fig.show()

## UMAP

In [None]:
import umap

In [None]:
# Embed the TF-IDF matrix in 3 dimensions with UMAP, using cosine distance and
# 50 neighbours. The result is bound to `umap_` because the following cell
# reads `umap_[:, k]`; it was previously bound to `c`, so that cell raised a
# NameError.
umap_ = umap.UMAP(
    n_neighbors=50,
    n_components=3,
    metric='cosine').fit_transform(tfidf.representation.values)

In [None]:
# Store the three UMAP coordinates as columns of `df`.
df = df.assign(
    umap_1=umap_[:, 0],
    umap_2=umap_[:, 1],
    umap_3=umap_[:, 2],
)

In [None]:
# 2-D view of the first two UMAP coordinates, coloured by category.
plt.figure(figsize=(16, 10))
ax = sns.scatterplot(x=df['umap_1'], y=df['umap_2'], hue=data['category'])
ax.set_title('UMAP')
plt.show()

In [None]:
# Interactive 3-D view of the UMAP embedding, coloured by category.
fig = (
    px.scatter_3d(df, x='umap_1', y='umap_2', z='umap_3', color='category')
    .update_traces(marker={'size': 3})
    .update_layout(title_text="UMAP", title_font_size=30)
)
fig.show()

In [None]:
# Persist the articles together with every projection column added above.
# The index is written as the first CSV column (pandas default).
visualization_csv = JP(paths['data'], 'visualization.csv')
df.to_csv(visualization_csv)