In [None]:
import os 
import itertools
import pandas as pd
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Both inputs are converted to sets first, so duplicates are ignored.

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        A value in [0, 1]. ``0.0`` is returned when either input is empty,
        which also avoids a division by zero when both are empty.
    """
    s1 = set(list1)
    s2 = set(list2)
    if not s1 or not s2:
        # Always return a float so callers get a consistent numeric type.
        return 0.0
    return len(s1 & s2) / len(s1 | s2)

In [None]:
def fraction_from_min_set(list1, list2):
    """Return the shared fraction of the smaller set: |A ∩ B| / min(|A|, |B|).

    Both inputs are converted to sets first, so duplicates are ignored.

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        A value in [0, 1]. ``0.0`` is returned when either input is empty,
        which also avoids a division by zero.
    """
    s1 = set(list1)
    s2 = set(list2)
    if not s1 or not s2:
        # Always return a float so callers get a consistent numeric type.
        return 0.0
    return len(s1 & s2) / min(len(s1), len(s2))

#### Compare the following networks

In [None]:
# Networks whose module solutions are compared pairwise below.
networks = [
    "PCNet",
    "STRING",
    "DIP",
    "HuRI",
]

#### Load modules' genes

In [None]:
# For every network, read its modules file line by line. Each line becomes
# one module: the first and last characters are dropped (presumably the
# enclosing brackets - confirm against the file format) and the remainder
# is split on ", " into gene names.
modules_dict = {}
for cur_network in networks:
    modules_path = os.path.join(f'ASD_{cur_network}', "modules", "modules.out")
    with open(modules_path) as f:
        modules_dict[cur_network] = [
            module_line.strip()[1:-1].split(", ") for module_line in f
        ]

#### Calculate gene similarity between modules (Jaccard index and overlap fraction of the smaller set)

In [None]:
# Compare the gene content of every module of one network against every
# module of each other network. Rows are collected in plain Python lists and
# the DataFrame is built once at the end; growing a DataFrame cell by cell
# with .loc is far slower and leaves dtype inference to chance.
#
# NOTE(review): "sol1_network" and "sol2_network_name" are named
# inconsistently; the names are kept unchanged because they are part of the
# displayed output.
gene_columns = [
    "sol1_network", "sol1_module_index", "sol1_module_size",
    "sol2_network_name", "sol2_module_index", "sol2_module_size",
    "jaccard_genes", "overlap_fraction_from_min_gene_set",
]
min_module_size = 3  # only pairs where both modules are larger than this are kept

gene_row_labels = []
gene_rows = []
for nwk1, nwk2 in itertools.combinations(networks, 2):
    for (idx1, genes1), (idx2, genes2) in itertools.product(
        enumerate(modules_dict[nwk1], start=1),
        enumerate(modules_dict[nwk2], start=1),
    ):
        # Row label uses 1-based module numbers.
        gene_row_labels.append(f"{nwk1}_m{idx1}_{nwk2}_m{idx2}")
        gene_rows.append({
            "sol1_network": nwk1,
            "sol1_module_index": idx1,
            "sol1_module_size": len(genes1),
            "sol2_network_name": nwk2,
            "sol2_module_index": idx2,
            "sol2_module_size": len(genes2),
            "jaccard_genes": jaccard_similarity(genes1, genes2),
            "overlap_fraction_from_min_gene_set": fraction_from_min_set(genes1, genes2),
        })

# Passing `columns` explicitly keeps the frame well-formed (and the astype /
# filter below working) even when there are no module pairs at all.
df_gene_similarity = pd.DataFrame(gene_rows, index=gene_row_labels, columns=gene_columns)
df_gene_similarity = df_gene_similarity.astype({
    'sol1_module_index': int,
    'sol1_module_size': int,
    'sol2_module_index': int,
    'sol2_module_size': int,
})

# Keep only overlapping pairs of sufficiently large modules, best match first.
keep_gene_pair = (
    (df_gene_similarity["jaccard_genes"] != 0)
    & (df_gene_similarity["sol1_module_size"] > min_module_size)
    & (df_gene_similarity["sol2_module_size"] > min_module_size)
)
df_gene_similarity = df_gene_similarity[keep_gene_pair].sort_values(by='jaccard_genes', ascending=False)

#### Load modules' enriched GO terms 

In [None]:
# For every network, collect the significant GO terms of each module.
# The modules file is only used to count modules: its n-th line corresponds
# to the enrichment table "module_go_<n>.tsv" (1-based).
go_dict = {}
for cur_network in networks:
    network_dir = f'ASD_{cur_network}'
    enriched_terms_per_module = []
    with open(os.path.join(network_dir, "modules", "modules.out")) as f:
        for module_number, _module_line in enumerate(f, start=1):
            go_table = pd.read_csv(
                os.path.join(network_dir, "go", f"module_go_{module_number}.tsv"),
                sep='\t',
            )
            # Keep only terms whose q-value is at most 0.05.
            is_significant = go_table.loc[:, "qval"] <= 0.05
            enriched_terms_per_module.append(
                list(go_table[is_significant].loc[:, "GO term"].values)
            )
    go_dict[cur_network] = enriched_terms_per_module

#### Calculate GO-term similarity between modules (Jaccard index and overlap fraction of the smaller set)

In [None]:
# Compare the enriched GO-term sets of every module of one network against
# every module of each other network. Rows are collected in plain Python
# lists and the DataFrame is built once at the end; growing a DataFrame cell
# by cell with .loc is far slower and leaves dtype inference to chance.
go_columns = [
    "sol1_n_go_terms", "sol2_n_go_terms",
    "jaccard_GO", "overlap_fraction_from_min_GO_set",
]

go_row_labels = []
go_rows = []
for nwk1, nwk2 in itertools.combinations(networks, 2):
    for (idx1, terms1), (idx2, terms2) in itertools.product(
        enumerate(go_dict[nwk1], start=1),
        enumerate(go_dict[nwk2], start=1),
    ):
        # Row label uses 1-based module numbers.
        go_row_labels.append(f"{nwk1}_m{idx1}_{nwk2}_m{idx2}")
        go_rows.append({
            "sol1_n_go_terms": len(terms1),
            "sol2_n_go_terms": len(terms2),
            "jaccard_GO": jaccard_similarity(terms1, terms2),
            "overlap_fraction_from_min_GO_set": fraction_from_min_set(terms1, terms2),
        })

# Passing `columns` explicitly keeps the frame well-formed (and the astype /
# filter below working) even when there are no module pairs at all.
df_GO_similarity = pd.DataFrame(go_rows, index=go_row_labels, columns=go_columns)
df_GO_similarity = df_GO_similarity.astype({'sol1_n_go_terms': int, 'sol2_n_go_terms': int})

# Keep only pairs that share at least one GO term, best match first.
df_GO_similarity = df_GO_similarity[df_GO_similarity["jaccard_GO"] != 0].sort_values(by='jaccard_GO', ascending=False)

#### Summary

In [None]:
# Join the gene-level and GO-level tables on their shared row labels
# (default inner join, so only pairs present in both tables remain), then
# rank by gene similarity with GO similarity as the tie-breaker.
df_summary = (
    df_gene_similarity
    .merge(df_GO_similarity, left_index=True, right_index=True)
    .sort_values(by=['jaccard_genes', 'jaccard_GO'], ascending=False)
)
display(df_summary)