# PS enrichment and PPI-density measurement of gene clusters --python
# (data of pituitary as an example)

## load packages 

In [None]:
import pandas as pd
import numpy as np
import os,re
import scipy.stats as stats
import math
import csv
import pickle
import random
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import sys

import networkx as nx
import json
import networkx.algorithms.community as nx_comm
import seaborn as sns
from collections import defaultdict
import gseapy
from gseapy.plot import gseaplot

## load data

In [3]:
# Annotation table; it is reused later for the 'Gene' and 'saps' columns.
uni_gene = pd.read_csv('/data/uniprot_gene_saps.csv')

# Lookup built from two positional columns: column 5 supplies the keys and
# column 1 the values (presumably gene symbol -> UniProt accession; confirm
# against the CSV header). When a key occurs more than once, the last row wins.
gene_uni_dict = dict(zip(uni_gene.iloc[:, 5], uni_gene.iloc[:, 1]))

In [19]:
# Read every cluster table in the directory and group its genes by cluster.
# Results are keyed by cell type, then by 'cluster<id>':
#   celltp_clu_uni  -> values looked up in gene_uni_dict
#   celltp_clu_gene -> the gene entries themselves
path = '/data/gene_cluster/'
files = os.listdir(path)
celltp_clu_uni = {}
celltp_clu_gene = {}
for fname in files:
    # The cell type is the part of the file name before '_GeneOrder'.
    celltype = fname.split('_GeneOrder')[0]
    clu_table = pd.read_csv(path+'/'+fname)
    # Keep only rows whose 'pearson' value exceeds 0.8.
    clu_table = clu_table[clu_table['pearson']>0.8]

    uni_by_cluster = defaultdict(list)
    gene_by_cluster = defaultdict(list)
    # Columns are addressed by position: column 2 holds the gene entry and
    # column 3 the cluster id.
    for gene, cluster_id in zip(clu_table.iloc[:, 2], clu_table.iloc[:, 3]):
        if gene not in gene_uni_dict:
            # Genes without an entry in the lookup are dropped from both maps.
            continue
        key = 'cluster'+str(cluster_id)
        uni_by_cluster[key].append(gene_uni_dict[gene])
        gene_by_cluster[key].append(gene)

    celltp_clu_uni[celltype] = uni_by_cluster
    celltp_clu_gene[celltype] = gene_by_cluster

In [None]:
# PPI edge table used by the network functions below, which read its
# 'pro1_gene' and 'pro2_gene' columns. The first CSV column becomes the index.
human_ppi_f = pd.read_csv(
    '/data/human_ppi_filter_exp_db.csv',
    index_col=0,
)

## enrichment of proteins with high PS tendency

In [None]:
# Preranked GSEA: for every cell type, test each of its gene clusters against
# the gene ranking given by the 'saps' column (presumably the phase-separation
# score; confirm against the annotation table).
data_df = uni_gene[['Gene','saps']]
pre_df = pd.DataFrame(data=None,columns=['Name','Term','ES','NES','NOM p-val','FDR q-val','FWER p-val','Tag %','Gene %','Lead_genes'])
for i in tqdm(celltp_clu_gene):
    # gene_sets is the {'cluster<id>': [genes]} mapping of this cell type
    genesets = celltp_clu_gene[i]
    pre_res = gseapy.prerank(rnk=data_df, gene_sets=genesets,
                     processes=4,
                     outdir='prerank_report_kegg', format='png', seed=6)
    # Tag every result row with the cell type it came from.
    pre_res.res2d['cell type'] = i
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the per-cell-type results and works on older versions too.
    pre_df = pd.concat([pre_df, pre_res.res2d], ignore_index=True)
    # Written on every iteration so partial results survive an interrupted run.
    pre_df.to_csv('/data/all_gene_pre_res_origin_80.csv')

In [None]:
# NOTE(review): `cluster_gsea` is not assigned anywhere in the visible file;
# presumably it is the prerank result table (pre_df or the CSV written from
# it). Confirm and load it before this cell.
# Keep only rows with nominal p-value below 0.05 and renumber them from 0.
significant = cluster_gsea['NOM p-val'] < 0.05
cluster_gsea = cluster_gsea[significant].reset_index(drop=True)

# Terms of the significant rows, in table order.
cell_cluster_gsea = list(cluster_gsea['Term'])

## PPI density and largest connected subnetwork

In [None]:
def count_commu(protein_set):
    """Return the average clustering coefficient of the PPI network of a gene set.

    The network is built from the rows of the module-level ``human_ppi_f``
    table whose two endpoints (columns ``pro1_gene`` and ``pro2_gene``) are
    both members of ``protein_set``.

    Args:
        protein_set: Iterable of identifiers comparable with the values stored
            in ``pro1_gene`` / ``pro2_gene``.

    Returns:
        The value of ``nx.average_clustering`` for the network, or ``-1`` when
        no interaction connects two members of ``protein_set``.
    """
    members = set(protein_set)

    # Interactions with both endpoints inside the set.
    both_inside = (human_ppi_f['pro1_gene'].isin(members)
                   & human_ppi_f['pro2_gene'].isin(members))
    dataGO = human_ppi_f[both_inside]

    # Build the network. Edges are read column-wise rather than through
    # label lookups, which stop returning scalars if the index has duplicates.
    # Adding an edge also adds its two endpoints as nodes.
    G = nx.Graph()
    G.add_edges_from(zip(dataGO['pro1_gene'], dataGO['pro2_gene']))

    # Drop degree-1 nodes that are not part of the set. With the filter above
    # every node is a member, so nothing is removed; kept as a safeguard in
    # case the edge filter is ever relaxed.
    dele_node = [n for n in G.nodes if G.degree(n) == 1 and n not in members]
    G.remove_nodes_from(dele_node)

    # average_clustering divides by the node count, so an empty graph is
    # reported with the sentinel -1 instead of raising.
    if G.number_of_nodes() == 0:
        return -1
    return nx.average_clustering(G)  # average clustering coefficient

In [None]:
def count_subnet(protein_set):
    """Return the largest connected component of the PPI network of a gene set.

    The network is built from the rows of the module-level ``human_ppi_f``
    table whose two endpoints (columns ``pro1_gene`` and ``pro2_gene``) are
    both members of ``protein_set``.

    Args:
        protein_set: Iterable of identifiers comparable with the values stored
            in ``pro1_gene`` / ``pro2_gene``.

    Returns:
        A list with the nodes of the largest connected component, or an empty
        list when no interaction connects two members of ``protein_set``.
    """
    members = set(protein_set)

    # Interactions with both endpoints inside the set.
    both_inside = (human_ppi_f['pro1_gene'].isin(members)
                   & human_ppi_f['pro2_gene'].isin(members))
    dataGO = human_ppi_f[both_inside]

    # Build the network. Edges are read column-wise rather than through
    # label lookups, which stop returning scalars if the index has duplicates.
    # Adding an edge also adds its two endpoints as nodes.
    G = nx.Graph()
    G.add_edges_from(zip(dataGO['pro1_gene'], dataGO['pro2_gene']))

    # Drop degree-1 nodes that are not part of the set. With the filter above
    # every node is a member, so nothing is removed; kept as a safeguard in
    # case the edge filter is ever relaxed.
    dele_node = [n for n in G.nodes if G.degree(n) == 1 and n not in members]
    G.remove_nodes_from(dele_node)

    # `default` covers the empty graph, where max() would otherwise raise and
    # leave nothing to return.
    largest_cc = max(nx.connected_components(G), key=len, default=set())
    return list(largest_cc)

In [None]:
# Compute the average PPI density (the value returned by count_commu) of each
# GSEA-significant cluster and keep the clusters scoring above 0.3.
# NOTE(review): `celltp_clu_gene1` is not assigned anywhere in the visible
# file; presumably it maps each term in cell_cluster_gsea to its gene list.
# Confirm and define it before this cell.
cell_clu_avgclu = []
for term in cell_cluster_gsea:
    print(term)
    # Only clusters with more than 20 genes are scored.
    if len(celltp_clu_gene1[term]) > 20 and count_commu(celltp_clu_gene1[term]) > 0.3:
        cell_clu_avgclu.append(term)
    print('-----')

In [None]:
# For every cluster that passed the density filter, store the result of
# count_subnet on its gene list, keyed by the cluster term.
cell_geneset_subnet = {
    term: count_subnet(celltp_clu_gene1[term])
    for term in tqdm(cell_clu_avgclu)
}

## functional enrichment

In [None]:
# Over-representation analysis of every retained subnetwork against a local
# GMT file. An Enrichr library name such as ['GO_Biological_Process_2023'] can
# be passed instead (see https://maayanlab.cloud/Enrichr/#libraries), but the
# term clean-up below is written for the GMT file's naming.
gene_sets = 'data/c5.go.bp.v2023.1.Hs.symbols.gmt'
export_dir = 'data/PS_module/'
# to_csv does not create missing directories.
os.makedirs(export_dir, exist_ok=True)

for i in tqdm(cell_geneset_subnet):
    genelist = cell_geneset_subnet[i]
    if not genelist:
        # Nothing to test for an empty subnetwork.
        continue

    # The package is imported as `gseapy`; no `gp` alias exists in this file.
    enr = gseapy.enrichr(gene_list=genelist,
             gene_sets=gene_sets,
             organism='Human',  # set this to the organism being analysed, e.g. Yeast
             outdir=None,
             cutoff=1,  # 1 keeps every term; use a lower value in (0, 1) to filter
            )
    # Drop the first five characters of each term (presumably the 'GOBP_'
    # prefix of this GMT file; confirm) and turn underscores into spaces.
    enr.results['Term'] = enr.results['Term'].apply(lambda x: x[5:].replace('_', ' '))
    export_path = export_dir+i+'_GO_2023.csv'
    enr.results.to_csv(export_path)