# Genomics & High-Dimensional Data

#### Preliminaries

##### Libraries

In [1]:
from os import path
import numpy as np
import pandas as pd
from sklearn import decomposition
import plotly.express as px
import plotly.graph_objects as go

##### Utilities

In [2]:
from utilities import json as utl_json

##### Configuration

In [3]:
# Load the environment configuration from the JSON file into `env_config`;
# a later cell reads env_config['root'] to locate the data directory.
# NOTE(review): utl_json.to_dict is a project helper not visible here —
# presumably it parses the file into a dict; confirm.
env_config = utl_json.to_dict(file_path="../../config/env.json")

## PCA

In [4]:
# Load the data matrix from <root>/modules/m2/data/p1/X.npy, where <root>
# comes from the environment configuration.
X = np.load(
    path.normpath(path.join(env_config['root'], "modules/m2/data/p1", "X.npy"))
)

In [5]:
# log2(x + 1) transform; the +1 offset keeps zero entries finite (log2(1) == 0).
X_log = np.log2(X + 1)

In [6]:
# Largest log2-transformed value in the first column of X_log.
# Uses the vectorised ndarray method rather than Python's builtin max(),
# which iterates element by element and gives order-dependent results
# when NaNs are present.
X_log[:, 0].max()

3.6939215228197613

In [7]:
# Two independent PCA estimators with default settings (no n_components
# requested): one for the raw data, one for the log2-transformed data.
pca_raw, pca_log = decomposition.PCA(), decomposition.PCA()

In [8]:
# Fit each estimator on its own version of the data.
pca_raw.fit(X=X)
# Last expression of the cell: the fitted estimator is shown as the output.
pca_log.fit(X=X_log)

In [9]:
# Number of components kept by the PCA fitted on the raw data
# (the estimator was created without an explicit n_components).
pca_raw.n_components_

511

In [10]:
# Number of components kept by the PCA fitted on the log2-transformed data
# (the estimator was created without an explicit n_components).
pca_log.n_components_

511

In [11]:
# Share of the total variance explained by the first principal component
# of the raw data.
pca_raw.explained_variance_ratio_[0]

0.4277967098357265

In [12]:
# Share of the total variance explained by the first principal component
# of the log2-transformed data.
pca_log.explained_variance_ratio_[0]

0.13887564870826197

In [13]:
# One row per principal component: per-component and cumulative explained
# variance ratios for the raw and the log2-transformed fits.
pca_frame = (
    pd.DataFrame({
        "explained_variance_ratio_raw": pca_raw.explained_variance_ratio_,
        "cumsum_explained_variance_raw": np.cumsum(pca_raw.explained_variance_ratio_),
        "explained_variance_ratio_log": pca_log.explained_variance_ratio_,
        "cumsum_explained_variance_log": np.cumsum(pca_log.explained_variance_ratio_),
        # 1-based component index. Sized from the number of fitted components,
        # not from X.shape[0] (the sample count): the two only coincide when
        # n_samples <= n_features, and otherwise the column lengths would
        # mismatch and the DataFrame constructor would raise.
        "pc_idx": range(1, pca_raw.n_components_ + 1),
    })
)

In [14]:
# Peek at one randomly chosen row; the fixed seed makes the displayed row
# reproducible across Restart & Run All.
pca_frame.sample(random_state=0)

Unnamed: 0,explained_variance_ratio_raw,cumsum_explained_variance_raw,explained_variance_ratio_log,cumsum_explained_variance_log,pc_idx
503,4.8e-05,0.999732,0.000643,0.996381,504


In [17]:
# Cumulative explained variance against the number of principal components,
# for the raw and the log2-transformed data.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=pca_frame.pc_idx,
        y=pca_frame.cumsum_explained_variance_raw,
        mode='lines+markers',
        name='Raw Data'
    )
)
fig.add_trace(
    go.Scatter(
        x=pca_frame.pc_idx,
        y=pca_frame.cumsum_explained_variance_log,
        mode='lines+markers',
        name='Transformed Data [log2]'
    )
)
# Reference line at 85% cumulative explained variance.
fig.add_hline(y=0.85, annotation_text="85% explained variance")
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Cumulative explained variance by number of principal components",
    xaxis_title="Number of principal components",
    yaxis_title="Cumulative explained variance ratio",
)
fig.show()

In [18]:
# Scatter of the first two columns of the log2-transformed data, one point
# per row of X_log.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X_log[:, 0],
        y=X_log[:, 1],
        # Without an explicit mode plotly draws lines for traces with 20 or
        # more points, joining unordered rows with meaningless segments.
        mode='markers'
    )
)
fig.update_layout(
    title="First two columns of the log2-transformed data",
    xaxis_title="X_log[:, 0]",
    yaxis_title="X_log[:, 1]",
)
fig.show()

In [25]:
# Reduce the log2-transformed data to its first two principal components
# and report the shape of the projected array.
pca = decomposition.PCA(n_components=2)
X_redux = pca.fit_transform(X=X_log)
print(X_redux.shape)

(511, 2)


In [26]:
# Rows of X_log projected onto the first two principal components.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X_redux[:, 0],
        y=X_redux[:, 1],
        mode='markers'
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="PCA projection of the log2-transformed data",
    xaxis_title="PC 1",
    yaxis_title="PC 2",
)
fig.show()