Look at distances

In [3]:
import igraph as ig
import pandas as pd
import numpy as np
import seaborn as sns
from compress_pickle import load, dump
from statistics import mean, median
from c_means_clustering import network_c_means
from copy import deepcopy
import itertools as it
import matplotlib.pyplot as plt
from math import log10
from ease import EASE


plt.style.use('bmh')

def open_pickle(file):
    with open(path+file, 'rb') as pickle_file:
        return load(path=pickle_file, compression='infer')

path = './files/'

regulome_network_edges = pd.read_csv(filepath_or_buffer=path+'/human_regulome_pd.gz', compression='infer')
regulome_network_edges = regulome_network_edges.astype({'ProteinAid': 'str', 'ProteinBid':'str'})

regulome_graph = ig.Graph.DataFrame(regulome_network_edges, directed=False, use_vids=False)
proteins = pd.concat([regulome_network_edges['ProteinAid'], regulome_network_edges['ProteinBid']]).unique()

clusters_dict = open_pickle("c_means_leiden_clusters.gz")
clusters_dict = {cluster_id: cluster for (cluster_id, cluster) in clusters_dict.items() if len(cluster) > 10}

msigdb_c3_tft_dict = open_pickle('msigdb_c3_tft_dict.pkl')
results_99 = open_pickle("c_means_6iter_edge_99.gz")
results_98 = open_pickle("c_means_6iter_edge_98.gz")

In [None]:
protein_edges = pd.concat([regulome_network_edges['ProteinAid'], regulome_network_edges['ProteinBid']]).value_counts()
most_connected_proteins = tuple(protein_edges.index[:5])
least_connected_proteins = tuple(protein_edges.index[-5:])

In [None]:
from itertools import combinations

iter1_clusters = results_99[0][3]
clusters = [cluster for cluster in iter1_clusters.values() if len(clusters) if len(cluster) < 2000]
print(len(clusters))

overlap_coefficients = list()

for set_A, set_B in combinations(clusters, 2):
    ov_coeff = (len(set_A.intersection(set_B)))/(min(len(set_A), len(set_B)))
    overlap_coefficients.append(ov_coeff)

fig, ax = plt.subplots(1,1,figsize=[8,4], sharex=True)
title = fig.suptitle(f"Overlap coefficient of c-means clusters after 1 iteration (percentile = 99)", fontsize=15, y=1)

sns.histplot(overlap_coefficients, ax=ax)
ax.set_xlabel("Overlap Coefficient")
# ax.set_ylim([0, 2000])


plt.savefig('./images/c_means_overlap_coeff_edge_99_2.png', dpi=fig.dpi, bbox_inches='tight',bbox_extra_artists=[title])


In [None]:
gene_sets = pd.Series([set(gene_set) for gene_set in msigdb_c3_tft_dict.values()])  # some gene sets have no mapping aka have 0 genes
overlap_coefficients = list()

for set_A, set_B in combinations(gene_sets, 2):
    if len(set_A) == 0 or len(set_B) == 0: # tft 3 sets with 0
        continue
    ov_coeff = (len(set_A.intersection(set_B)))/(min(len(set_A), len(set_B)))
    overlap_coefficients.append(ov_coeff)

fig, ax = plt.subplots(1,1,figsize=[8,4], sharex=True)
title = fig.suptitle(f"Overlap coefficient of MSigDB c3 tft gene sets", fontsize=15, y=1)

sns.histplot(overlap_coefficients, ax=ax)
ax.set_xlabel("Overlap Coefficient")
ax.set_ylim([0, 10000])

plt.savefig('./images/c_means_overlap_coeff_tft.png', dpi=fig.dpi, bbox_inches='tight',bbox_extra_artists=[title])

In [47]:
def enrichment(clusters, gene_set):
    enrichment_dict = dict()
    for (clusterid, cluster) in clusters.items():

        enrichment = EASE(query_set=cluster, genesets=gene_set, PT= regulome_graph.vcount())
        enrichment_dict[clusterid] = (enrichment, len(cluster), cluster) # maybe save this as a namedtuple with enrihcment and subgraph
    return enrichment_dict

threshold = 0.01

def calculate_CCS(enrichment_dict, threshold=0.01):
    enrichments = [cluster[0][:][['pathway_id', 'FDR']] for cluster in enrichment_dict.values()]
    enrichments = pd.concat(enrichments)
    enriched_df = enrichments[enrichments['FDR'] < threshold]
    n_enriched_clusters = len(enrichments['pathway_id'].unique())
    avg_set_per_cluster = len(enriched_df) / n_enriched_clusters
    avg_cluster_per_set = enriched_df.groupby('pathway_id')['pathway_id'].count().mean()
    CCS = avg_set_per_cluster / avg_cluster_per_set

    return CCS, n_enriched_clusters, avg_set_per_cluster, avg_cluster_per_set

In [50]:
for iteration, clusters in enumerate(results_99[0]):
    CCS_df = pd.DataFrame()

    enrichment = enrichment(clusters, msigdb_c3_tft_dict)
    
    CCS_values = calculate_CCS(enrichment)
    CCS_df = pd.concat([CCS_df, pd.DataFrame({'iter': [iteration], 'n_enriched':[CCS_values[1]], 'avg_s/c': [CCS_values[2]], 'avg_c/s':[CCS_values[3]], 'CCS': []})])

    


ValueError: All arrays must be of the same length

Unnamed: 0,pathway_id,FDR
0,ADA2_TARGET_GENES,1.0
1,ADCYAP1_TARGET_GENES,1.0
2,ADNP_TARGET_GENES,1.0
3,AEBP2_TARGET_GENES,1.0
4,AHRR_TARGET_GENES,1.0
...,...,...
1110,TATAAA_TATA_01,1.0
1111,TGANNYRGCA_TCF11MAFG_01,1.0
1112,WGGAATGY_TEF1_Q6,1.0
1113,GCCATNTTG_YY1_Q6,1.0


In [16]:
enrichment_99

{0: (                   pathway_id  Overlap    Pvalue  FDR  FWER
  0           ADA2_TARGET_GENES       12  0.554180  1.0   1.0
  1        ADCYAP1_TARGET_GENES        0  1.000000  1.0   1.0
  2           ADNP_TARGET_GENES        6  0.882056  1.0   1.0
  3          AEBP2_TARGET_GENES       14  0.715053  1.0   1.0
  4           AHRR_TARGET_GENES       17  0.492761  1.0   1.0
  ...                       ...      ...       ...  ...   ...
  1110           TATAAA_TATA_01       14  0.907941  1.0   1.0
  1111  TGANNYRGCA_TCF11MAFG_01        5  0.593819  1.0   1.0
  1112         WGGAATGY_TEF1_Q6        5  0.761472  1.0   1.0
  1113         GCCATNTTG_YY1_Q6        6  0.706961  1.0   1.0
  1114                 ER_Q6_01        1  1.000000  1.0   1.0
  
  [1115 rows x 5 columns],
  205,
  {'330933',
   '331071',
   '331217',
   '331276',
   '331606',
   '331613',
   '331656',
   '331702',
   '331861',
   '331912',
   '331980',
   '332026',
   '332044',
   '332129',
   '332150',
   '332225',
   '3325