# Genomics & High Dimensional Data

#### Preliminaries

##### Libraries

In [1]:
from os import path
import numpy as np
import pandas as pd
from sklearn import decomposition, cluster, manifold
import plotly.express as px
import plotly.graph_objects as go
from alive_progress import alive_it

##### Utilities

In [2]:
from utilities import json as utl_json

##### Configuration

In [3]:
# Read the environment settings from ../../config/env.json through the project's
# JSON helper (presumably returns a dict - its 'root' entry is used to locate the data).
env_config = utl_json.to_dict(file_path="../../config/env.json")

## Data Set

In [4]:
# Load the data matrix X.npy from the module's data folder under the configured root.
X = np.load(
    path.normpath(
        path.join(env_config["root"], "modules/m2/data/p2_unsupervised", "X.npy")
    )
)

In [5]:
# log2(x + 1) transform of the raw matrix; the +1 keeps zero entries finite.
X_log = np.log2(X + 1)

In [6]:
# Dimensions of the loaded matrix (rows, columns).
X.shape

(2169, 45768)

# Visualization

### Three Main Types

Let's use 100 components. 
<br>
We aim to capture variability along the top 100 dimensions.

In [7]:
# Project the log-transformed data onto its top 100 principal components.
# For a matrix this large with only 100 requested components, scikit-learn's
# automatic solver selection may use the randomized SVD, whose result depends
# on the random state. Pinning the seed keeps the projection - and every
# clustering built on top of it - reproducible across kernel restarts.
pca = decomposition.PCA(n_components=100, random_state=0)
X_redux = pca.fit_transform(X_log)
print(X_redux.shape)

(2169, 100)


We aim to showcase three distinct clusters. 
<br>
One for each type of brain cell. 

In [8]:
# Three clusters: one per type of brain cell.
n_clusters = 3

In [9]:
# K-means estimator reused by the seed sweep; the random state is set per iteration there.
kmeans = cluster.KMeans(
    max_iter=10000,
    n_clusters=n_clusters,
)

To ensure that neither the initialization of K-means nor the random state of the computer is influencing our results, we will run a loop varying the random state.
<br>
We will track the inertia of each fit so that we can keep the run with the lowest value.

In [10]:
# Refit K-means once for every seed in 0..1000 and record the inertia of each fit.
inertia_dict = {}

for seed in alive_it(range(1001)):
    kmeans.set_params(random_state=seed)
    # fit() returns the estimator itself, so the inertia can be read directly.
    inertia_dict[seed] = kmeans.fit(X_redux).inertia_

|████████████████████████████████████████| 1001/1001 [100%] in 6.0s (166.88/s) 


In [11]:
# Scatter of the final inertia against the random seed used for the fit.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(inertia_dict.keys()),
        y=list(inertia_dict.values()),
        mode="markers",
        marker={"color": "blue"},
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title_text=f"K-means inertia per random seed (k = {n_clusters})"
)
fig.update_xaxes(title_text="Random seed")
fig.update_yaxes(title_text="Inertia")
fig.show()

In [12]:
# Lowest inertia reached across all seeds of the sweep.
min(inertia_dict.values())

30255607.00978434

In [13]:
# Seed with the lowest inertia; min() returns the first such key if several seeds tie.
min(inertia_dict, key=inertia_dict.get)

0

In [14]:
# Inertia obtained with seed 1, for comparison with the minimum over all seeds
# (in the recorded run the two values are identical).
inertia_dict[1]

30255607.00978434

In [15]:
# Final 3-cluster model. The seed is taken from the sweep (the one with the
# lowest inertia) instead of a hand-picked constant, so the choice stays valid
# if the sweep results change on a re-run.
# NOTE: relies on inertia_dict and n_clusters still holding the 3-cluster
# values, i.e. on the cells being run in order.
kmeans_3 = cluster.KMeans(
    n_clusters=n_clusters,
    max_iter=10000,
    random_state=min(inertia_dict, key=inertia_dict.get),
)

kmeans_3.fit(X_redux)

In [16]:
# First two principal components, one marker per row, coloured by 3-cluster label.
fig = go.Figure(
    data=[
        go.Scatter(
            x=X_redux[:, 0],
            y=X_redux[:, 1],
            mode="markers",
            marker=dict(color=kmeans_3.labels_),
        )
    ]
)
fig.update_layout(
    title_text="Cell clusters projected on the two principal dimensions of highest variability in gene expression",
    xaxis_title_text="Principal Component 1",
    yaxis_title_text="Principal Component 2",
)
fig.show()

### Main subtypes

Let's attempt to identify at least two sub-types within each cluster directly. 
<br>
That is, without any further categorization, let's see if we can clearly identify two more subtypes.


In [17]:
# Nine clusters overall, to be read as sub-types of the three main cell types.
n_clusters = 9

In [18]:
# Fresh K-means estimator for the sub-type sweep; the random state is set per iteration.
kmeans = cluster.KMeans(
    max_iter=10000,
    n_clusters=n_clusters,
)

In [19]:
# Same seed sweep as before, now for the sub-type model: one fit per seed in
# 0..1000, keeping the inertia of each.
inertia_dict = {}

for seed in alive_it(range(1001)):
    kmeans.set_params(random_state=seed)
    # fit() returns the estimator itself, so the inertia can be read directly.
    inertia_dict[seed] = kmeans.fit(X_redux).inertia_

|████████████████████████████████████████| 1001/1001 [100%] in 15.5s (64.48/s) 


In [20]:
# Scatter of the final inertia against the random seed used for the fit.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(inertia_dict.keys()),
        y=list(inertia_dict.values()),
        mode="markers",
        marker={"color": "blue"},
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title_text=f"K-means inertia per random seed (k = {n_clusters})"
)
fig.update_xaxes(title_text="Random seed")
fig.update_yaxes(title_text="Inertia")
fig.show()

In [21]:
# Lowest inertia reached across all seeds of the sub-type sweep.
min(inertia_dict.values())

21531692.808437258

In [22]:
# Refit the sub-type model using the seed that gave the lowest inertia in the sweep.
kmeans = cluster.KMeans(
    n_clusters=n_clusters,
    max_iter=10000,
    random_state=min(inertia_dict, key=inertia_dict.get),
)
kmeans.fit(X_redux)

In [23]:
# One row per observation: its principal-component scores plus both cluster
# labellings (3-cluster main type and the finer sub-type).
# Column names are derived from the width of X_redux instead of a hard-coded
# 100, so the frame still builds if the number of PCA components is changed.
frame_multi_cluster = (
    pd.DataFrame(
        X_redux,
        columns=[
            f"pc_{i}"
            for i in range(1, X_redux.shape[1] + 1)
        ]
    )
    .assign(
        main_cell_type=kmeans_3.labels_,
        cell_sub_type=kmeans.labels_
    )
)

In [24]:
# Preview the scores on the first principal component.
frame_multi_cluster.pc_1


0       -37.352022
1        77.835382
2        66.618090
3        14.490310
4       -11.685859
           ...    
2164   -363.323327
2165   -336.711237
2166   -346.301792
2167   -331.708470
2168   -291.154565
Name: pc_1, Length: 2169, dtype: float64

In [40]:
# Scatter of the first two principal components with one trace per sub-type:
# the colour identifies the sub-type, the marker symbol its main cell type.
fig = go.Figure()
symbol_types = [
    "star-triangle-down-open-dot",
    "octagon-open-dot",
    "triangle-up-open",
]
color_subtypes = [
    "blue",
    "cadetblue",
    "red",
    "coral",
    "cyan",
    "cornflowerblue",
    "chartreuse",
    "green",
    "lightpink",
]

# Grouping by the column name (rather than a one-element list) yields the bare
# sub-type label as the group key on every pandas version; with a list, the key
# is a scalar on older releases and a 1-tuple on newer ones, which made the
# previous `key[0]` indexing version-dependent.
for sub_type, sub_group in frame_multi_cluster.groupby(by="cell_sub_type"):
    # Hand-assigned mapping from sub-type label to main cell type.
    # NOTE(review): these label sets are tied to the labels of one specific
    # K-means fit - confirm they still hold whenever the clustering is re-run.
    if sub_type in {0, 4, 5}:
        symbol = symbol_types[0]
        name_str = "Cell Type A"
    elif sub_type in {2, 8, 3}:
        symbol = symbol_types[1]
        name_str = "Cell Type B"
    else:
        symbol = symbol_types[2]
        name_str = "Cell Type C"

    fig.add_trace(
        go.Scatter(
            x=sub_group.pc_1,
            y=sub_group.pc_2,
            mode="markers",
            name=name_str + f" Subtype {sub_type}",
            # A fresh dict per trace: no marker state is shared between iterations.
            marker={
                "opacity": 0.7,
                "color": color_subtypes[sub_type],
                "symbol": symbol,
            },
        )
    )
fig.update_layout(
    title_text = "Cell clusters projected on the two principal dimensions of highest variability in gene expression"
)
fig.update_xaxes(title_text="Principal Component 1")
fig.update_yaxes(title_text="Principal Component 2")
fig.show()