# Genomics & High Dimensional Data

#### Preliminaries

##### Libraries

In [8]:
from os import path
import numpy as np
import pandas as pd
from sklearn import decomposition, cluster, manifold
import plotly.express as px
import plotly.graph_objects as go
from alive_progress import alive_it

##### Utilities

In [2]:
from utilities import json as utl_json

##### Configuration

In [3]:
env_config = utl_json.to_dict(file_path="../../config/env.json")

## PCA

In [4]:
X = np.load(
    file=path.normpath(
        path.join(
            env_config['root'],
            "modules/m2/data/p1",
            "X.npy"
        )
    )
)

In [5]:
X_log = np.log2((X + 1))

In [6]:
pca = decomposition.PCA(n_components=50)
X_redux = pca.fit_transform(X_log)
print(X_redux.shape)

(511, 50)


In [50]:
n_clusters = 5

In [51]:
kmeans_dict = {}
inertia_dict = {}
kmeans = cluster.KMeans(
        n_clusters=n_clusters,
        n_init = 10,
        max_iter=10000)

for seed in alive_it(range(0, 10000, 10)):
    print("Now running...", seed)
    kmeans.set_params(random_state = seed)
    kmeans.fit(X_redux)
    kmeans_dict[seed] = kmeans.labels_
    inertia_dict[seed] = kmeans.inertia_

on 0: Now running... 0
on 1: Now running... 10
on 2: Now running... 20
on 3: Now running... 30
on 4: Now running... 40
on 5: Now running... 50
on 6: Now running... 60
on 7: Now running... 70
on 8: Now running... 80
on 9: Now running... 90
on 10: Now running... 100
on 11: Now running... 110
on 12: Now running... 120
on 13: Now running... 130
on 14: Now running... 140
on 15: Now running... 150
on 16: Now running... 160
on 17: Now running... 170
on 18: Now running... 180
on 19: Now running... 190
on 20: Now running... 200
on 21: Now running... 210
on 22: Now running... 220
on 23: Now running... 230
on 24: Now running... 240
on 25: Now running... 250
on 26: Now running... 260
on 27: Now running... 270
on 28: Now running... 280
on 29: Now running... 290
on 30: Now running... 300
on 31: Now running... 310
on 32: Now running... 320
on 33: Now running... 330
on 34: Now running... 340
on 35: Now running... 350
on 36: Now running... 360
on 37: Now running... 370
on 38: Now running... 380
on 39: 

In [53]:
min(inertia_dict, key=inertia_dict.get)

0

In [54]:
max(inertia_dict, key=inertia_dict.get)

3830

In [52]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=list(inertia_dict.keys()),
       y=list(inertia_dict.values()),
       mode="lines+markers",

    )
)
fig.show()


In [55]:
kmeans = cluster.KMeans(
    n_clusters=n_clusters, 
    random_state=0,
    n_init = 10,
    max_iter=10000)
kmeans.fit(X_redux)

In [56]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=X_redux[:,0],
       y=X_redux[:,1],
       mode="markers",
       marker = {
           "color":kmeans.labels_
       }
    )
)
fig.show()


In [62]:
kmeans_frame = (
    pd.DataFrame(
        X
    )
    .assign(
        cluster = kmeans.labels_
    )
    .groupby(by=["cluster"])
    .mean()
    .reset_index()
)
kmeans_frame.columns = kmeans_frame.columns.astype(str)

In [73]:
kmeans_frame.sample(2)

Unnamed: 0,cluster,0,1,2,3,4,5,6,7,8,...,45758,45759,45760,45761,45762,45763,45764,45765,45766,45767
3,3,0.019438,0.0,94.812166,80.772638,0.191462,5.166394,5.366168,0.0,85.617858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,113.0616,190.917347,0.0,16.199633,17.318562,0.0,101.819874,...,0.0,0.049426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
pca_kmeans = decomposition.PCA(n_components=2)
kmeans_redux = pca_kmeans.fit_transform(kmeans_frame.drop(columns=['cluster']))
print(kmeans_redux.shape)

(5, 2)


In [71]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=kmeans_redux[:,0],
       y=kmeans_redux[:,1],
       mode="markers",
       marker = {
           "size":20,
           "symbol":'triangle-up-open-dot'
       }

    )
)
fig.show()


In [74]:
mds_kmeans = manifold.MDS(n_components=2)
kmeans_mds_redux = mds_kmeans.fit_transform(kmeans_frame.drop(columns=['cluster']))
print(kmeans_mds_redux.shape)

(5, 2)


In [75]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=kmeans_mds_redux[:,0],
       y=kmeans_mds_redux[:,1],
       mode="markers",
       marker = {
           "size":20,
           "symbol":'triangle-up-open-dot'
       }

    )
)
fig.show()


In [80]:
tsne = manifold.TSNE(n_components=2, perplexity=40)
tsne_redux = tsne.fit_transform(X_redux)
print(tsne_redux.shape)


(511, 2)


In [81]:
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=tsne_redux[:,0],
       y=tsne_redux[:,1],
       mode="markers",
       marker = {
           "color":kmeans.labels_
       }
    )
)
fig.show()


In [82]:
pca = decomposition.PCA(n_components=2)
X_redux = pca.fit_transform(X)
print(X_redux.shape)
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=X_redux[:,0],
       y=X_redux[:,1],
       mode="markers",
       marker = {
           "color":kmeans.labels_
       }
    )
)
fig.show()


(511, 2)


In [84]:
mds_kmeans = manifold.MDS(n_components=2)
kmeans_mds_redux = mds_kmeans.fit_transform(X)
print(kmeans_mds_redux.shape)
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=kmeans_mds_redux[:,0],
       y=kmeans_mds_redux[:,1],
       mode="markers",

    )
)
fig.show()


(511, 2)


In [85]:
tsne = manifold.TSNE(n_components=2, perplexity=40)
tsne_redux = tsne.fit_transform(X)
print(tsne_redux.shape)
fig = go.Figure()
fig.add_traces(
    go.Scatter(
        x=tsne_redux[:,0],
       y=tsne_redux[:,1],
       mode="markers",
       marker = {
           "color":kmeans.labels_
       }
    )
)
fig.show()


(511, 2)
