# Genomics & High Dimensional Data

#### Preliminaries

##### Libraries

In [44]:
from os import path
import numpy as np
import pandas as pd
from sklearn import decomposition, cluster, linear_model, multiclass
import plotly.express as px
import plotly.graph_objects as go
from alive_progress import alive_it
import pickle as pkl 

##### Utilities

In [3]:
from utilities import json as utl_json

##### Configuration

In [4]:
# Load the environment configuration; later cells read its 'root' key to build data paths.
# NOTE(review): the path is relative, so this presumably requires the kernel's working
# directory to be the notebook's own folder — confirm.
env_config = utl_json.to_dict(file_path="../../config/env.json")

## Data Set

In [5]:
# Build the normalised path of the raw matrix under the configured project root,
# then read the array into memory.
X_file_path = path.normpath(
    path.join(env_config['root'], "modules/m2/data/p2_unsupervised", "X.npy")
)
X = np.load(file=X_file_path)

In [6]:
# Log-transform the raw matrix; the +1 offset keeps zero entries finite (log2(1) == 0).
X_log = np.log2(X + 1)

In [7]:
# Dimensions of the raw matrix as (rows, columns).
X.shape

(2169, 45768)

In [8]:
# Project the log-transformed matrix onto its first 100 principal components.
# A fixed random_state is supplied because PCA's default solver selection can pick the
# randomized SVD for an input of this size; without a seed the components — and the
# cluster ids derived from them further down — could differ from one run to the next.
pca = decomposition.PCA(n_components=100, random_state=777)
X_redux = pca.fit_transform(X_log)
print(X_redux.shape)

(2169, 100)


# Unsupervised Feature Selection

## Cell sub-types

The previous visualization shows 3 distinct cell types. 
<br>
Each of these can be further divided into 2 clearly visible sub-types. 
<br>
In total, this gives 6 clusters.
<br>
This is the initial number of clusters used to attempt the same visualization again. 

In [19]:
# Fit k-means once per random seed (0 through 1000) and record the final inertia
# of every run, keyed by the seed that produced it.
n_clusters = 6
kmeans = cluster.KMeans(n_clusters=n_clusters, max_iter=10000)
inertia_dict = {}

for seed in alive_it(range(1001)):
    kmeans.set_params(random_state=seed)
    # fit() returns the estimator itself, so the inertia can be read off directly.
    inertia_dict[seed] = kmeans.fit(X_redux).inertia_

|████████████████████████████████████████| 1001/1001 [100%] in 11.5s (87.16/s) 


In [20]:
# Scatter of the final k-means inertia against the random seed that produced it.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=list(inertia_dict.keys()),
        y=list(inertia_dict.values()),
        mode="markers",
        marker={
            "color": "blue"
        }
    )
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(title_text="K-means inertia by random seed")
fig.update_xaxes(title_text="Random seed")
fig.update_yaxes(title_text="Inertia")
fig.show()

In [32]:
# Refit k-means using the seed whose run reached the lowest inertia in the sweep.
best_seed = min(inertia_dict, key=inertia_dict.get)
kmeans = cluster.KMeans(
    n_clusters=n_clusters,
    max_iter=10000,
    random_state=best_seed,
)

kmeans.fit(X_redux)

In [33]:
# Wrap the PCA scores in a DataFrame (one pc_<k> column per component, 1-based)
# and attach each row's k-means cluster label.
frame_multi_cluster = (
    pd.DataFrame(
        X_redux,
        # Derive the column count from the data instead of hard-coding 100, so a
        # different number of PCA components cannot break this cell.
        columns=[f"pc_{i}" for i in range(1, X_redux.shape[1] + 1)],
    )
    .assign(cell_sub_type=kmeans.labels_)
)

In [34]:
# Scatter of the first two principal components, one trace per k-means cluster.
fig = go.Figure()

# Marker symbol for each main cell type (A, B, C in that order).
symbol_types = [
    "star-triangle-down-open-dot",
    "octagon-open-dot",
    "triangle-up-open",
]
# Marker colour per cluster id; the cluster id is used as the list index.
color_subtypes = [
    "blue",
    "green",
    "red",
    "cornflowerblue",
    "chartreuse",
    "lightpink",
    "cyan",
]

# Group by the column name rather than a one-element list: the key is then a plain
# scalar, whereas list-form grouping yields tuple keys only on newer pandas versions.
for sub_type, sub_group in frame_multi_cluster.groupby(by="cell_sub_type"):
    # NOTE(review): the cluster-id -> cell-type mapping is hard-coded (0/3 -> A,
    # 2/5 -> B, everything else -> C); presumably it was read off one particular
    # clustering run — verify it still holds whenever the clustering is redone.
    if sub_type in {0, 3}:
        symbol, name_str = symbol_types[0], "Cell Type A"
    elif sub_type in {2, 5}:
        symbol, name_str = symbol_types[1], "Cell Type B"
    else:
        symbol, name_str = symbol_types[2], "Cell Type C"

    fig.add_trace(
        go.Scatter(
            x=sub_group.pc_1,
            y=sub_group.pc_2,
            mode="markers",
            name=name_str + f" Subtype {sub_type}",
            # A fresh marker dict per trace, so no state is shared between iterations.
            marker={
                "opacity": 0.5,
                "color": color_subtypes[sub_type],
                "symbol": symbol,
            },
        )
    )

fig.update_layout(
    title_text = "Cell clusters projected on the two principal dimensions of highest variability in gene expression"
)
fig.update_xaxes(title_text="1st Principal Component")
fig.update_yaxes(title_text="2nd Principal Component")
fig.show()

## Logistic regression

***LogisticRegressionCV***, the **cross-validation estimator** from scikit-learn, will be used. 
<br>
Here are some parameters chosen:
* Solver: ***liblinear***
* Regularization: ***L2***
* cv: ***10***. This implies 10 folds for the ***stratified cross-validation***.
* Multi class : ***one versus rest*** (ovr).
* Random state: 777

In [48]:
# Set to True to refit the classifier and overwrite the pickled model;
# when False the previously pickled model is loaded from disk instead.
RETRAIN = False

In [35]:
# Raw matrix as a DataFrame with 1-based Gene<k> column names, plus the k-means
# cluster label of every row.
gene_columns = [f"Gene{i}" for i in range(1, X.shape[1] + 1)]
frame = pd.DataFrame(X, columns=gene_columns).assign(cell_sub_type=kmeans.labels_)

In [41]:
# Location of the cached (pickled) one-vs-rest classifier.
pkl_file_addrss = path.normpath(
    path.join(
        env_config['root'],
        'modules/m2/data/p2_unsupervised',
        "multi_class_model.pkl"
    )
)

# Fit when explicitly requested, and also when no cached model exists yet, so that a
# fresh checkout does not fail with FileNotFoundError while RETRAIN is False.
if RETRAIN or not path.exists(pkl_file_addrss):
    # One binary, L2-penalised logistic regression per cluster label, each
    # cross-validated over 10 folds.
    multi_class_model = multiclass.OneVsRestClassifier(
        linear_model.LogisticRegressionCV(
            cv=10,
            penalty="l2",
            solver="liblinear",
            random_state=777
        )
    )

    multi_class_model.fit(
        X=X_log,
        y=kmeans.labels_
    )

    # Persist the fitted model so later runs can skip the expensive fit.
    with open(pkl_file_addrss, "wb") as f:
        pkl.dump(
            obj=multi_class_model,
            file=f,
            protocol=pkl.HIGHEST_PROTOCOL
        )

else:
    # NOTE(review): unpickling executes code stored in the file — only load a pickle
    # that this notebook produced. The cached model was fitted against the cluster
    # labels of the run that created it; presumably it should be retrained whenever
    # the clustering changes — confirm.
    with open(pkl_file_addrss, "rb") as f:
        multi_class_model = pkl.load(file=f)


## Feature Selection

The classes known to the classifier are:

In [49]:
# Class labels known to the fitted one-vs-rest classifier.
multi_class_model.classes_

array([0, 1, 2, 3, 4, 5], dtype=int32)

The coefficients in the decision function:

In [65]:
# Shape of the first per-class estimator's coefficient array after transposing.
multi_class_model.estimators_[0].coef_.T.shape

(45768, 1)

Each feature has a coefficient that represents its weight in the decision function.
<br>
In this case, we will rank the features by the sum of the absolute values of their coefficients across the one-vs-rest classifiers (one per cluster). 

In [109]:
# One DataFrame per fitted one-vs-rest estimator, each with two columns:
#   cluster_<i>            - the estimator's coefficients (transposed to one row each)
#   abs_weight_cluster_<i> - the absolute value of each coefficient
# All frames keep the default integer index, so they align row-for-row.
frame_lst = [
    (
        pd.DataFrame(
            estimator.coef_.T,
            columns=[f"cluster_{i}"]
        )
        .assign(**{
            # Vectorised absolute value instead of a per-element Python call.
            f"abs_weight_cluster_{i}": lambda df: df[f"cluster_{i}"].abs()
        })
    )
    for i, estimator in enumerate(multi_class_model.estimators_)
]

In [110]:
# Start the combined coefficient frame from the first estimator's frame.
coeff_frame = frame_lst[0]

In [111]:
# Combine all per-estimator frames side by side on their shared index.
# Building the result directly from frame_lst (rather than merging into the existing
# coeff_frame) means re-running this cell gives the same frame every time.
# verify_integrity=True raises if two frames contribute the same column name.
coeff_frame = pd.concat(
    frame_lst,
    axis=1,
    join="inner",
    verify_integrity=True,
)


In [112]:
# Add each row's total absolute weight across all estimators and sort so the
# largest totals come first.
coeff_frame = (
    coeff_frame
    .assign(
        # Vectorised row sum over every abs_weight_cluster_<i> column; this avoids a
        # row-wise apply and does not hard-code the number of clusters.
        sum_abs_weights=lambda df: (
            df.filter(regex=r"^abs_weight_cluster_\d+$").sum(axis=1)
        ),
    )
    .sort_values(
        by='sum_abs_weights',
        ascending=False,
    )
)

In [None]:
# Display the coefficient frame, ordered by descending sum_abs_weights.
coeff_frame

In [114]:
# Index labels of the first 100 rows of coeff_frame, as a plain list.
top_genes = coeff_frame.head(100).index.to_list()