In [None]:
from pybeataml.load_data_from_synpase import load_file, load_excel
from magine.enrichment.enrichr import Enrichr, _valid_libs, db_types

In [None]:
# Load resource 'syn26642544'; it will be used to map patient IDs to clusters.
# NOTE(review): the ID looks like a Synapse accession (loader module name) — confirm.
mapping = load_file('syn26642544')
# Preview via head(5) as the cell output.
mapping.head(5)

In [None]:
# Load resource 'syn26532699' through the Excel loader and preview it.
# NOTE(review): presumably the sample metadata sheet, judging by the name `meta` — confirm.
meta =  load_excel('syn26532699')
meta.head(5)

In [None]:
# Only the k=5 ('syn26718017') and k=8 ('syn26718020') feature sets are active.
# Disabled entries, kept for reference:
#   'syn26718015', 'syn26718016', 'syn26718018', 'syn26718019'
list_of_gene_sets = ['syn26718017', 'syn26718020']

In [None]:
# Enrichr engine: pass it a gene list (or a list of gene lists) and it
# collects the enrichment results.
# Per the original note, results come back as a MAGINE enrichment-result
# object, a data class worth exploring.
e = Enrichr()

In [None]:
# organize output, gather cluster and data type together
def get_genes_per_cluster(feature_array):
    """Collect the unique features of every (cluster, data type) group.

    Parameters
    ----------
    feature_array : pandas.DataFrame
        Table with at least the columns ``Cluster``, ``data_type`` and
        ``feature``.

    Returns
    -------
    dict
        Maps ``(cluster, data_type)`` to the sorted list of unique,
        non-missing features in that group. Each ``'Phospho'`` group also
        gets a ``(cluster, 'phospho_gene')`` entry with the sorted unique
        text found before the first ``'-'`` of every feature (assumed to be
        the gene part of a ``GENE-site`` identifier -- TODO confirm).
    """
    genes_by_group = {}
    grouped = feature_array.groupby(['Cluster', 'data_type'])['feature']
    for (cluster, data_type), features in grouped:
        # Missing values can neither be split nor sorted alongside strings.
        features = features.dropna()
        # Sort so the lists are reproducible; the iteration order of a set of
        # strings changes between interpreter runs.
        genes_by_group[(cluster, data_type)] = sorted(set(features))
        if data_type == 'Phospho':
            genes_by_group[(cluster, 'phospho_gene')] = sorted(
                {site.split('-')[0] for site in features}
            )
    return genes_by_group

In [None]:
# Resource IDs of the k=5 and k=8 feature tables.
k_equal_5, k_equal_8 = 'syn26718017', 'syn26718020'

# Load both tables.
k5, k8 = load_file(k_equal_5), load_file(k_equal_8)

# Group each table's features by (cluster, data type).
k5_clusters, k8_clusters = (get_genes_per_cluster(table) for table in (k5, k8))

In [None]:
# Spot-check: features stored under cluster 1 / data type 'Global' for k=5.
k5_clusters[1, 'Global']

In [None]:
# Number of non-missing `feature` entries per cluster for k=5.
k5.groupby('Cluster')['feature'].count()

In [None]:
# Number of non-missing `feature` entries per cluster for k=8.
k8.groupby('Cluster')['feature'].count()

In [None]:
# Label every gene list as '<cluster>_<data type>'; the dict keys are
# (cluster, data type) tuples and line up with the values below.
k5_sample_names = [f'{cluster}_{d_type}' for cluster, d_type in k5_clusters]
k5_samples = list(k5_clusters.values())

# Run the Enrichr query for all k=5 samples against 'Reactome_2016'.
enrichment = e.run_samples(k5_samples, k5_sample_names, gene_set_lib='Reactome_2016')

# Shorten term names to the text before the first underscore.
enrichment.term_name = enrichment.term_name.str.split('_').str[0]

# Drop redundant terms, then draw the heatmap with row and column
# clustering switched on; the trailing ';' hides the returned object.
enrichment.remove_redundant(sort_by='combined_score', level='dataframe').heatmap(
    figsize=(4, 8),
    linewidths=.01,
    cluster_row=True,
    cluster_col=True,
    y_tick_labels=True,
);

In [None]:
# Label every gene list as '<cluster>_<data type>'; the dict keys are
# (cluster, data type) tuples and line up with the values below.
k8_sample_names = [f'{cluster}_{d_type}' for cluster, d_type in k8_clusters]
k8_samples = list(k8_clusters.values())

# Run the Enrichr query for all k=8 samples against 'Reactome_2016'.
k8_enrichment = e.run_samples(k8_samples, k8_sample_names, gene_set_lib='Reactome_2016')

# Shorten term names to the text before the first underscore.
k8_enrichment.term_name = k8_enrichment.term_name.str.split('_').str[0]

In [None]:
# Heatmap of the k=8 enrichment after dropping redundant terms, with row
# and column clustering switched on; the trailing ';' hides the returned object.
k8_enrichment.remove_redundant(sort_by='combined_score', level='dataframe').heatmap(
    figsize=(6, 12),
    linewidths=.01,
    cluster_row=True,
    cluster_col=True,
    y_tick_labels=True,
);