***Setting up the notebook***

In [None]:
import os
import numpy as np
import holoviews as hv
import upsetplot
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

import GSForge as gsf

# Load the Bokeh plotting extension for HoloViews.
hv.extension("bokeh")

***Declare paths used***

In [None]:
# OS-independent path management.
from os import fspath, environ
from pathlib import Path

In [None]:
# Root of the demo data: read from the GSFORGE_DEMO_DATA environment variable
# when it is set, otherwise a default under the user's home directory
# (expanduser() resolves the leading `~`).
OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", default="~/GSForge_demo_data/osfstorage")).expanduser()
HYDRO_GEM_PATH = OSF_PATH.joinpath("AnnotatedGEMs", "oryza_sativa_hydro_raw.nc")

# Folders passed to GeneSetCollection.from_folder further down.
LIT_DGE_GSC_PATH = OSF_PATH.joinpath("Collections", "literature", "DGE")
LIT_TF_PATH = OSF_PATH.joinpath("Collections", "literature", "TF")
BORUTA_GSC_PATH = OSF_PATH.joinpath("Collections", "boruta")

# Fail early, and report which path was tried, when the GEM file is missing.
assert HYDRO_GEM_PATH.exists(), (
    f"AnnotatedGEM file not found at {HYDRO_GEM_PATH}. "
    "Set the GSFORGE_DEMO_DATA environment variable to the demo data directory."
)

***Load an AnnotatedGEM***

In [None]:
# Construct the AnnotatedGEM from the file at HYDRO_GEM_PATH.
agem = gsf.AnnotatedGEM(HYDRO_GEM_PATH)
# Bare name as the last expression so the notebook displays the object.
agem

***Load GeneSetCollections***

In [None]:
# Gene sets stored under the Collections/literature/DGE folder, tied to `agem`.
lit_dge_coll = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=LIT_DGE_GSC_PATH,
    name="Literature DGE",
)
lit_dge_coll

In [None]:
# Gene sets stored under the Collections/literature/TF folder, tied to `agem`.
lit_tf_coll = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=LIT_TF_PATH,
    name="Literature TF",
)
lit_tf_coll

In [None]:
# Gene sets stored under the Collections/boruta folder, tied to `agem`.
boruta_gsc = gsf.GeneSetCollection.from_folder(
    gem=agem,
    target_dir=BORUTA_GSC_PATH,
    name="Boruta Results",
)
boruta_gsc

### Create a Merged Collection

Since there are so many sets within the transcription factor collection, we will combine them into a single set.

In [None]:
# Merge every gene set of the literature TF collection into one GeneSet.
tf_geneset = gsf.GeneSet.from_GeneSets(
    *lit_tf_coll.gene_sets.values(),
    name="transcription factors",
)
tf_geneset

In [None]:
# Combine the Boruta and literature DGE gene sets into one collection.
# With dict unpacking, a later mapping wins when two sets share a name.
combined_gsc = gsf.GeneSetCollection(
    gem=agem,
    gene_sets={
        **boruta_gsc.gene_sets,
        **lit_dge_coll.gene_sets,
        # 'transcription factors': tf_geneset,  # currently excluded from the merged collection
    },
)
combined_gsc

### View Collection Overlaps

In [None]:
# Two views of the overlaps within the combined collection: one with the
# plot's default mode, one with mode="percent".
overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(combined_gsc)
percent_overlap_heatmap = gsf.plots.collections.WithinCollectionOverlapHeatMap(
    combined_gsc,
    mode="percent",
)

# Show both plots together, applying the same size options to the heatmaps.
(overlap_heatmap + percent_overlap_heatmap).opts(
    hv.opts.HeatMap(width=550, height=500)
)

In [None]:
# UpSet plot built from the combined collection's dictionary form
# (presumably gene set name -> member genes; verify against as_dict()).
upsetplot.UpSet(
    upsetplot.data.from_contents(combined_gsc.as_dict()),
    orientation="horizontal",
    subset_size="count",
)

## Compare Gene Selections using Machine-Learning Models

We can estimate how well a given subset of genes 'describes' a sample (phenotype) label by comparing how well they perform using a given machine learning model.

In the example below, a simple random forest is used.

In [None]:
# results = {}


# for collection in ["Boruta_Treatment", "deseq2_treatment"]:
    
#     counts, treatment = gsf.get_gem_data(combined_gsc, selected_gene_sets=[collection], annotation_variables="Treatment")
    
#     x_train, x_test, y_train, y_test = model_selection.train_test_split(counts, treatment)
    
#     rf_cls_model = RandomForestClassifier(
#         class_weight='balanced',
#         n_estimators=1000, 
#         n_jobs=-1)

#     rf_cls_model.fit(x_train, y_train)
    
#     results[collection] = rf_cls_model.score(x_test, y_test)

Train on the complete gene set.

In [None]:
# gsf.get_gem_data(combined_gsc, annotation_variables="Treatment")

In [None]:
# `results` maps a gene selection name to its test-set score. The only cell
# that would create it is commented out, so create it here when it is missing;
# without this the assignment at the end raises NameError on a fresh kernel.
# Existing entries are kept if an earlier cell already populated it.
if "results" not in globals():
    results = {}

# Count data and the "Treatment" annotation for the combined collection.
counts, treatment = gsf.get_gem_data(combined_gsc, annotation_variables=["Treatment"])

# Hold out a test split using train_test_split's default proportions.
x_train, x_test, y_train, y_test = model_selection.train_test_split(counts, treatment)

rf_cls_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=1000,
    n_jobs=-1)

rf_cls_model.fit(x_train, y_train)

# Record the score on the held-out split under the "complete" key.
results["complete"] = rf_cls_model.score(x_test, y_test)

In [None]:
results

In [None]:
# Bar chart of the scores in `results`, drawn with the axes swapped.
hv.Bars(
    results,
    kdims=["Gene Selection Group"],
).opts(
    xrotation=90,
    invert_axes=True,
    width=600,
    ylim=(0, 1.1),
)

This does not really mean that one set is more complete, or better, than another.
Keep in mind:

+ A similar model was used to 'select' the features as was used to 'test'.
+ This has nothing (directly) to do with biology.
+ Model scores may not be stable.

With the above (and more) considerations in mind:
+ A 'good' selection should perform better than using the entire dataset.
+ Our selection should be better than guessing, or a random gene set of similar size.