# Genomics & High Dimensional Data

#### Preliminaries

##### Libraries

In [14]:
from os import path
import numpy as np
import pandas as pd
from sklearn import decomposition, cluster, manifold
import plotly.express as px
import plotly.graph_objects as go

##### Utilities

In [2]:
from utilities import json as utl_json

##### Configuration

In [3]:
env_config = utl_json.to_dict(file_path="../../config/env.json")

## PCA

In [4]:
X = np.load(
    file=path.normpath(
        path.join(
            env_config['root'],
            "modules/m2/data/p1",
            "X.npy"
        )
    )
)

In [5]:
X_log = np.log2((X + 1))

In [6]:
max(X_log[:, 0])

3.6939215228197613

In [7]:
pca = decomposition.PCA(n_components=50)
X_redux = pca.fit_transform(X_log)
print(X_redux.shape)

(511, 50)


In [69]:
n_clusters = 6

In [70]:
kmeans = cluster.KMeans(n_clusters=n_clusters, max_iter=10000)

In [71]:
kmeans.fit(X_redux)

In [72]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=X_redux[:,0],
       y=X_redux[:,1],
       mode="markers",
       marker = {
           "color":kmeans.labels_
       }
    )
)
fig.show()


In [73]:
mds = manifold.MDS(n_components=2, n_init=100)
mds_X_redux = mds.fit_transform(X_log)
print(mds_X_redux.shape)

KeyboardInterrupt: 

In [122]:
tsne = manifold.TSNE(n_components=2, perplexity = 40, max_iter = 1000, n_iter_without_progress=500)
tnse_X_redux = tsne.fit_transform(X_redux)
print(tnse_X_redux.shape)

(511, 2)


In [123]:
frame = (
    pd.DataFrame(
        {
            "pca_1":X_redux[:,0],
            "pca_2":X_redux[:,1],
            "mds_1":mds_X_redux[:,0],
            "mds_2":mds_X_redux[:,1],
            "tsne_1":tnse_X_redux[:,0],
            "tsne_2":tnse_X_redux[:,1],
            "kmeans":kmeans.labels_
        }
    )
)

In [75]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=frame.mds_1,
        y=frame.mds_2,
        mode="markers",
        marker={
            "color":frame.kmeans
        }
    )
)

In [57]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=frame.tsne_1,
        y=frame.tsne_2,
        mode="markers",
        marker={
            "color":frame.kmeans
        }
    )
)
fig.show()

In [85]:
sse = {}
labels = {}
for k in range(1, 10):
    kmeans = cluster.KMeans(n_clusters=k, max_iter=1000)
    kmeans.fit(X_redux)
    labels[k]=kmeans.labels_
    sse[k]=kmeans.inertia_

In [86]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x = list(sse.keys()),
        y = list(sse.values())
    )
)
fig.show()

In [88]:
sse[4]

6166425.4599450305

In [89]:
n_clusters = 4

In [92]:
kmeans = cluster.KMeans(n_clusters=n_clusters, max_iter=1000)
kmeans.fit(X_redux)

In [114]:
frame_X = (
    pd.DataFrame(X)
    .assign(
        label = kmeans.labels_
    )
)

In [115]:
frame_X.sample()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45759,45760,45761,45762,45763,45764,45765,45766,45767,label
438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [116]:
grouped_frame = (
    frame_X.groupby(by=['label'])
    .mean()
    .reset_index()
    )
grouped_frame.columns = grouped_frame.columns.astype(str)

In [117]:
grouped_frame.sample()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,45758,45759,45760,45761,45762,45763,45764,45765,45766,45767
2,2,0.0,0.0,189.981828,204.982643,0.0,18.288789,6.277745,0.0,162.34521,...,0.0,0.095191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
pca = decomposition.PCA(n_components=2)
X_grouped_redux = pca.fit_transform(grouped_frame)
print(X_grouped_redux.shape)

(4, 2)


In [120]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=X_grouped_redux[:,0],
       y=X_grouped_redux[:,1],
       mode="markers",
       marker = {
           "color":"blue",
           "size":12,
           "symbol":"circle-cross-open",
       }
    )
)

fig.show()


In [124]:
fig = go.Figure()

fig.add_traces(
    go.Scatter(
        x=frame.tsne_1,
        y=frame.tsne_2,
        mode="markers",
        marker={
            "color":frame.kmeans
        }
    )
)
fig.show()
