## Importing modules

In [17]:
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
from gseapy.parser import Biomart
import os
import numpy as np
import seaborn as sns
from gseapy.plot import gseaplot
import fnmatch

In [18]:
def ConvertPairsToMatrix_SN(bayesian_metabol_df):
    bayesian_metabol_df.set_axis(['Gene1','Gene2','weight'], axis=1,inplace=True)
    a = np.unique(bayesian_metabol_df['Gene1'])
    b = np.unique(bayesian_metabol_df['Gene2'])
    c = np.union1d(a,b);
    data = np.zeros((len(c), len(c)));
    output_df = pd.DataFrame(data, index=c, columns=c)
    for values in bayesian_metabol_df.values: 
        output_df[values[0]][values[1]] = values[2];
        output_df[values[1]][values[0]]=values[2];
    np.fill_diagonal(output_df.values,1)
    return output_df

# Converts Wormbase IDs to gene IDs


In [19]:
def wb_to_gene(matrix):
    mapper_df=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv", 
                          header='infer',index_col=1)
    mapper_df=mapper_df.loc[mapper_df.index.dropna()]
    wb_to_gene = {};
    for wb in mapper_df.index:
        wb_to_gene[wb] = str(mapper_df.loc[wb]['GeneName']);
    matrix=matrix.rename(index=wb_to_gene,columns=wb_to_gene)
    return matrix

def gene_to_wb(matrix):
    mapper_df=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv",
                          header='infer',index_col=2)
    mapper_df=mapper_df.loc[mapper_df.index.dropna()]
    gene_to_wb = {};
    for gene in mapper_df.index:
        gene_to_wb[gene] = str(mapper_df.loc[gene]['WormBaseID']);
    matrix=matrix.rename(index=gene_to_wb,columns=gene_to_wb)
    return matrix

def SeqToWB(output_df):
    mapper_df=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv",
                          header='infer',index_col=3)
    mapper_df=mapper_df.loc[mapper_df.index.dropna()]
    Seq_to_Wb = {};
    mapper_df=mapper_df[mapper_df.index!=np.nan]
    for seq in mapper_df.index:
        Seq_to_Wb[seq] = str(mapper_df.loc[seq]['WormBaseID']);
    matrix=matrix.rename(index=Seq_to_Wb,columns=Seq_to_Wb)
    return matrix

def SeqToGene(matrix):
    mapper_df=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv", 
                          header='infer',index_col=3)
    mapper_df=mapper_df.loc[mapper_df.index.dropna()]
    Seq_to_Gene = {};
    mapper_df=mapper_df[mapper_df.index!=np.nan]
    for seq in mapper_df.index:
        Seq_to_Gene[seq] = str(mapper_df.loc[seq]['GeneName']);
    matrix=matrix.rename(index=Seq_to_Gene,columns=Seq_to_Gene)
    return matrix

def GeneToSeq(matrix):
    mapper_df=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv", 
                          header='infer',
                          index_col=2)
    mapper_df=mapper_df.loc[mapper_df.index.dropna()]
    Gene_to_Seq = {};
    mapper_df=mapper_df[mapper_df.index!=np.nan]
    for gene in mapper_df.index:
        Gene_to_Seq[gene] = str(mapper_df.loc[gene]['SequenceID']);
    matrix=matrix.rename(index=Gene_to_Seq,columns=Gene_to_Seq)
    return matrix

def PreRank(genes, outdir,gene_sets):
#     print("Genes: {}".format(genes));
    print("Length of genes:{}".format(len(genes)))
    genes=pd.DataFrame(genes)
    genes.set_index([0],inplace=True)
    genes=SeqToGene(genes)
    genes=list(genes.index)
    intersection_list = list(set(metabolic_corr_df.index).intersection(set(genes)))
#     print("intersection_list:{}".format(intersection_list))
    missing_genes=list(set(genes).difference(set(intersection_list)))
#     print("IntersectionList: {}".format(intersection_list));
#     print("Length of intersection list:{}".format(len(intersection_list)))
#     print('Missing genes:{}\n{}'.format(len(missing_genes),missing_genes))
    if(len(missing_genes) == len(genes)):
        return;
    Combined=metabolic_corr_df[intersection_list];
    Mean=Combined.mean(axis=1)
#    print("Mean before scaling:{}".format(Mean))
#     Mean=(Mean*2)-1
#     print("Mean after scaling to lie between -1 and +1:{}".format(Mean))
    Mean.dropna(inplace=True)
    rnk=Mean.sort_values(ascending=False)
    plt.rcParams["font.family"] = "Arial"
#     print("Rank: {}".format(rnk))    
    pre_res = gp.prerank(rnk=rnk, gene_sets=gene_sets, processes=4,min_size=2, outdir=outdir, format='svg', 
                         weighted_score_type=1,verbose=True)
    plt.close()
    return pre_res

def _is_regulated_pathway_(pre_res, pathway):
#     print('Hello There: {}'.format(pre_res));
#     print('Shivani Here: {}'.format(pre_res.res2d))
    if(pathway not in pre_res.res2d.index):
        return "NaN"
    else:
        pathway_pre_res = pre_res.res2d.loc[pathway];
#     is_regulated_pathway = pathway_pre_res.es >= 0.70 and pathway_pre_res.fdr <= 0.05
        is_regulated_pathway =  (pathway_pre_res.fdr <= 0.05) and (pathway_pre_res.nes>0) and (pathway_pre_res.nes!=np.inf) and (pathway_pre_res.es>0)
    return is_regulated_pathway;

def PlotEnrichment(pre_res,pathway, outdir):
    Sorted_values=pre_res.res2d.sort_values(ascending=False,by=['nes'])[0:40]
    fig = plt.figure(figsize=(8,15))
    df = pd.DataFrame({'Enrichment Score': Sorted_values.es,
                   'p-value': Sorted_values.pval,'FDR':Sorted_values.fdr}, index=Sorted_values.index)
    ax = df.plot.barh(rot=0)
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    plt.rcParams["font.family"] = "Arial"
    plt.savefig("{}/{}_plot.svg".format(outdir, pathway))
    plt.show()
    plt.close()
    
def PlotGSEA(pre_res, pathway, outdir,term):
    terms = pre_res.res2d.sort_values(by=['es'],ascending=False).index
#     print(terms[17])
    print("term is: {}".format(term))
    fig=gseaplot(rank_metric=pre_res.ranking,term=term, **pre_res.results[term],ofname='{}/{}_gsea.svg'.format(outdir,term))
    plt.show()
    

In [20]:
# ! mkdir /data/nandas/Coflux_matrix/FluxRed_102220/Product_Matrix_041122_4_4/


## Setting base directory

In [21]:
Base_dir='/data/nandas/Coflux_matrix/FluxRed_102220/Product_Matrix_062122/'
output_dir='/data/nandas/Coflux_matrix/FluxRed_102220/Product_Matrix_062122/'
os.chdir(Base_dir)

In [22]:
MetabolicPairs=pd.read_csv("/data/nandas/Combined_coexp/MetabolicCorrPairs_062321.dat",
                           sep='\t',header=None)

In [23]:
# !mkdir /data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/OverallPathwayEnrichment062421/

In [24]:
# metabolic_corr_df=ConvertPairsToMatrix_SN(MetabolicPairs)

## Reading required files: GeneSets(gmt), PathwayToGenes and Gene Correlations 

In [25]:
# pathway_filename1 = '/data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/Genesets_NAME_090320_LEVEL_1.gmt';
# pathway_filename2 = '/data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/Genesets_NAME_090320_LEVEL_2.gmt';
# pathway_filename3 = '/data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/Genesets_NAME_090320_LEVEL_3.gmt';
# # pathway_filename4 = '/data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/Genesets_NAME_090320_LEVEL_4.gmt';
# # metabolic_corr_df=pd.read_csv("/data/nandas/Combined_coexp/Sleipnir/Final_data_080620/UMN/MetabolicCorrMatrix_083120.csv",index_col=0,header='infer')
# Pathway_df1=pd.read_csv(pathway_filename1,index_col=0,sep='\t')
# Pathway_df2=pd.read_csv(pathway_filename2,index_col=0,sep='\t')
# Pathway_df3=pd.read_csv(pathway_filename3,index_col=0,sep='\t')
# # Pathway_df4=pd.read_csv(pathway_filename4,index_col=0,sep='\t')

In [26]:
pathway_filename4="/data/nandas/Coflux_matrix/FluxRed_102220/Product_Matrix_062122/ProductMatrix_SelectedClusterSets_062722_6_3.gmt"
Pathway_df4=pd.read_csv(pathway_filename4,sep='\t',index_col=0)

In [27]:
# for i in Pathway_df4.columns:
#     Pathway_df4.rename(columns={'{}'.format(i):'Gene{}'.format(i)},inplace=True)

In [28]:
Pathway_df4.dropna(axis=1,how='all',inplace=True)

In [81]:
x=pd.read_csv("/data/nandas/WormBase_282/MasterProteinCodingGenesAnnotation_WS282.csv",index_col=0)

In [83]:
x=x[x.Type=='protein_coding_gene']
x=x[x.Status=='Live']

In [96]:
pd.DataFrame(x['WormBaseID']).to_csv("/data/nandas/WormBase_282/AllCElegansLiveProteinCodingGenesWS282.csv",
                                     index=False,header=False)

In [93]:
# pd.DataFrame(x['WormBaseID'])

KeyError: 0

In [95]:
x

Unnamed: 0,WormBaseID,GeneName,SequenceID,Status,Type
0,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
1,WBGene00000002,aat-1,F27C8.1,Live,protein_coding_gene
2,WBGene00000003,aat-2,F07C3.7,Live,protein_coding_gene
3,WBGene00000004,aat-3,F52H2.2,Live,protein_coding_gene
4,WBGene00000005,aat-4,T13A10.10,Live,protein_coding_gene
...,...,...,...,...,...
52080,WBGene00306070,W02A2.12,W02A2.12,Live,protein_coding_gene
52084,WBGene00306076,Y53C10A.25,Y53C10A.25,Live,protein_coding_gene
52087,WBGene00306080,Y71F9AM.11,Y71F9AM.11,Live,protein_coding_gene
52088,WBGene00306081,T21G5.12,T21G5.12,Live,protein_coding_gene


In [30]:
# Pathway_df4=Pathway_df4[['Mean Silhouette Values','Gene0', 'Gene1', 'Gene2', 'Gene3', 'Gene4', 'Gene5', 'Gene6', 'Gene7',
#        'Gene8', 'Gene9', 'Gene10', 'Gene11', 'Gene12', 'Gene13', 'Gene14',
#        'Gene15', 'Gene16', 'Gene17',
#        'GeneIsRegulated']]

In [31]:
# for i in Pathway_df4.index:
#     Pathway_df4.at[i,'ClusterNumber']="Cluster_{}".format(i)
    

In [32]:
# Pathway_df4.set_index(['ClusterNumber'],inplace=True)

In [33]:
Pathway_df4

Unnamed: 0_level_0,Mean Silhoutte Values,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_28,Gene_29,Gene_30,Gene_31,Gene_32,Gene_33,Gene_34,Gene_35,Gene_36,Gene_37
ClusterName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster_1,0.477038,nduo-4,ndfl-4,ctc-1,nduo-5,ctc-2,nduo-6,ctb-1,nduo-3,nduo-1,...,,,,,,,,,,
Cluster_2,0.361924,C31H2.4,gst-43,hgo-1,gst-42,Y53G8B.1,fah-1,hpd-1,,,...,,,,,,,,,,
Cluster_3,0.326963,sdha-1,sdhd-1,sdhb-1,cox-4,isp-1,mev-1,,,,...,,,,,,,,,,
Cluster_4,0.286806,gfat-1,T03F6.3,hxk-1,gln-6,gln-5,gfat-2,,,,...,,,,,,,,,,
Cluster_5,0.28172,spe-5,vha-17,vha-11,vha-19,vha-14,vha-4,vha-13,vha-5,vha-12,...,,,,,,,,,,
Cluster_6,0.239749,nduf-6,nuo-6,T20H4.5,F53F4.10,lpd-5,Y54F10AM.5,D2030.4,nuo-3,F42G8.10,...,,,,,,,,,,
Cluster_7,0.234638,fmo-5,fmo-2,fmo-1,argn-1,fmo-3,fmo-4,,,,...,,,,,,,,,,
Cluster_8,0.230072,cysl-4,T25D3.3,F59A7.7,mpst-4,mpst-5,mpst-6,mpst-2,,,...,,,,,,,,,,
Cluster_9,0.222639,gss-1,E01A2.1,gcs-1,lap-2,lap-1,pcs-1,,,,...,,,,,,,,,,
Cluster_10,0.211155,atp-1,atp-5,asg-2,ctps-1,atp-2,Y69A2AR.18,T26E3.7,asb-2,asg-1,...,,,,,,,,,,


In [34]:
# for i in Pathway_df4.index:
#     print(i)
#     Pathway_df4=Pathway_df4.rename(index={'{}'.format(i):'Cluster{}'.format(i)})

In [35]:
Pathway_df4

Unnamed: 0_level_0,Mean Silhoutte Values,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_28,Gene_29,Gene_30,Gene_31,Gene_32,Gene_33,Gene_34,Gene_35,Gene_36,Gene_37
ClusterName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster_1,0.477038,nduo-4,ndfl-4,ctc-1,nduo-5,ctc-2,nduo-6,ctb-1,nduo-3,nduo-1,...,,,,,,,,,,
Cluster_2,0.361924,C31H2.4,gst-43,hgo-1,gst-42,Y53G8B.1,fah-1,hpd-1,,,...,,,,,,,,,,
Cluster_3,0.326963,sdha-1,sdhd-1,sdhb-1,cox-4,isp-1,mev-1,,,,...,,,,,,,,,,
Cluster_4,0.286806,gfat-1,T03F6.3,hxk-1,gln-6,gln-5,gfat-2,,,,...,,,,,,,,,,
Cluster_5,0.28172,spe-5,vha-17,vha-11,vha-19,vha-14,vha-4,vha-13,vha-5,vha-12,...,,,,,,,,,,
Cluster_6,0.239749,nduf-6,nuo-6,T20H4.5,F53F4.10,lpd-5,Y54F10AM.5,D2030.4,nuo-3,F42G8.10,...,,,,,,,,,,
Cluster_7,0.234638,fmo-5,fmo-2,fmo-1,argn-1,fmo-3,fmo-4,,,,...,,,,,,,,,,
Cluster_8,0.230072,cysl-4,T25D3.3,F59A7.7,mpst-4,mpst-5,mpst-6,mpst-2,,,...,,,,,,,,,,
Cluster_9,0.222639,gss-1,E01A2.1,gcs-1,lap-2,lap-1,pcs-1,,,,...,,,,,,,,,,
Cluster_10,0.211155,atp-1,atp-5,asg-2,ctps-1,atp-2,Y69A2AR.18,T26E3.7,asb-2,asg-1,...,,,,,,,,,,


In [36]:
# metabolic_corr_df.to_csv("MetabolicCorrMatrix_021921.csv")

In [37]:
metabolic_corr_df=pd.read_csv("/data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/OrphanGenes/MetabolicCorrMatrix062321.csv",
                              index_col=0)

In [38]:
## To ignore self-correlations, remove diagonals of correlation matrix
np.fill_diagonal(metabolic_corr_df.values,np.nan)

In [39]:
metabolic_corr_df.min().min()

-0.36478

In [40]:
metabolic_corr_df.shape

(2505, 2505)

In [41]:
metabolic_corr_df=wb_to_gene(metabolic_corr_df)

In [42]:
metabolic_corr_df.to_csv("MetabolicCorrMatrix_GeneID_062822.csv")

In [43]:
# metabolic_corr_df=(metabolic_corr_df+1)/2

In [44]:
# metabolic_corr_df.min().min()

In [45]:
metabolic_corr_df['haly-1'].sort_values(ascending=False)[0:20]

Y51H4A.7     0.597110
pah-1        0.528318
F55A11.6     0.485382
hpd-1        0.453570
bus-8        0.447316
Y38C1AB.1    0.446178
vha-5        0.438398
sams-1       0.437492
ads-1        0.420730
dpyd-1       0.419898
K07E3.4      0.416554
F08F3.4      0.412088
faah-1       0.412026
argn-1       0.408304
aass-1       0.406326
gyg-1        0.406008
tatn-1       0.405482
ugt-54       0.394044
got-1.2      0.394002
T25B9.1      0.392174
Name: haly-1, dtype: float64

### Setting default coregulated state of pathway

In [46]:
# Pathway_df1['IsRegulated'] = False

# Pathway_df2['IsRegulated'] = False
# Pathway_df3['IsRegulated'] = False
Pathway_df4['IsRegulated'] = False


In [47]:
metabolic_corr_df=metabolic_corr_df[~metabolic_corr_df.index.duplicated(keep='first')]

## PreRank Gene set enrichment analyses for custom pathway annotations

In [48]:
metabolic_corr_df=gene_to_wb(metabolic_corr_df)
metabolic_corr_df=wb_to_gene(metabolic_corr_df)
metabolic_corr_df=SeqToGene(metabolic_corr_df)

In [49]:
# New_df1 = pd.DataFrame([])
# for pathway in Pathway_df1.index:
#     #pathway = 'GLY_CLEAVAGE_SYSTEM';
#     print(pathway)
# #     pathway = 'ALA_ASP_AND_GLU_METABOLISM';
#     genes = list(Pathway_df1_withoutIsRegulated.loc[pathway].dropna());
#     pre_res = PreRank(genes, pathway,gene_sets=pathway_filename1);
#     if(pre_res is None):
#         continue; 
#     Pathway_df1.at[pathway, 'IsRegulated'] = _is_regulated_pathway_(pre_res, pathway);
#     print("{} is regulated:{}".format(pathway,_is_regulated_pathway_(pre_res, pathway)))
#     PlotEnrichment(pre_res, pathway, outdir=pathway)
#     if(pathway in pre_res.res2d.index):
#         PlotGSEA(pre_res, pathway,pathway)
#         gsea_result_df=pre_res.res2d.loc[pathway];
#         New_df1=New_df1.append(gsea_result_df)
# # Pathway_df1.to_csv("Pathway_Regulation_status1.csv")
# New_df1.to_csv("Final_pathway1_gsea.csv")




In [50]:
# New_df2 = pd.DataFrame([])
# for pathway in Pathway_df2.index:
#     #pathway = 'GLY_CLEAVAGE_SYSTEM';
#     print(pathway)
# #     pathway = 'ALA_ASP_AND_GLU_METABOLISM';
#     genes = list(Pathway_df2_withoutIsRegulated.loc[pathway].dropna());
#     pre_res = PreRank(genes, pathway,gene_sets=pathway_filename2);
#     if(pre_res is None):
#         continue; 
#     Pathway_df2.at[pathway, 'IsRegulated'] = _is_regulated_pathway_(pre_res, pathway);
#     print("{} is regulated:{}".format(pathway,_is_regulated_pathway_(pre_res, pathway)))
#     PlotEnrichment(pre_res, pathway, outdir=pathway)
#     if(pathway in pre_res.res2d.index):
#         PlotGSEA(pre_res, pathway,pathway)
#         gsea_result_df=pre_res.res2d.loc[pathway];
#         New_df2=New_df2.append(gsea_result_df)
# # Pathway_df2.to_csv("Pathway_Regulation_status2.csv")
# New_df2.to_csv("Final_pathway2_gsea.csv")


In [51]:
# New_df3 = pd.DataFrame([])
# for pathway in Pathway_df3.index:
#     #pathway = 'GLY_CLEAVAGE_SYSTEM';
#     print(pathway)
# #     pathway = 'ALA_ASP_AND_GLU_METABOLISM';
#     genes = list(Pathway_df3_withoutIsRegulated.loc[pathway].dropna());
#     pre_res = PreRank(genes, pathway,gene_sets=pathway_filename3);
#     if(pre_res is None):
#         continue; 
#     Pathway_df3.at[pathway, 'IsRegulated'] = _is_regulated_pathway_(pre_res, pathway);
#     print("{} is regulated:{}".format(pathway,_is_regulated_pathway_(pre_res, pathway)))
#     PlotEnrichment(pre_res, pathway, outdir=pathway)
#     if(pathway in pre_res.res2d.index):
#         PlotGSEA(pre_res, pathway,pathway)
#         gsea_result_df=pre_res.res2d.loc[pathway];
#         New_df3=New_df3.append(gsea_result_df)
# # Pathway_df3.to_csv("Pathway_Regulation_status3.csv")
# New_df3.to_csv("Final_pathway3_gsea.csv")

In [52]:
Pathway_df4

Unnamed: 0_level_0,Mean Silhoutte Values,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_29,Gene_30,Gene_31,Gene_32,Gene_33,Gene_34,Gene_35,Gene_36,Gene_37,IsRegulated
ClusterName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster_1,0.477038,nduo-4,ndfl-4,ctc-1,nduo-5,ctc-2,nduo-6,ctb-1,nduo-3,nduo-1,...,,,,,,,,,,False
Cluster_2,0.361924,C31H2.4,gst-43,hgo-1,gst-42,Y53G8B.1,fah-1,hpd-1,,,...,,,,,,,,,,False
Cluster_3,0.326963,sdha-1,sdhd-1,sdhb-1,cox-4,isp-1,mev-1,,,,...,,,,,,,,,,False
Cluster_4,0.286806,gfat-1,T03F6.3,hxk-1,gln-6,gln-5,gfat-2,,,,...,,,,,,,,,,False
Cluster_5,0.28172,spe-5,vha-17,vha-11,vha-19,vha-14,vha-4,vha-13,vha-5,vha-12,...,,,,,,,,,,False
Cluster_6,0.239749,nduf-6,nuo-6,T20H4.5,F53F4.10,lpd-5,Y54F10AM.5,D2030.4,nuo-3,F42G8.10,...,,,,,,,,,,False
Cluster_7,0.234638,fmo-5,fmo-2,fmo-1,argn-1,fmo-3,fmo-4,,,,...,,,,,,,,,,False
Cluster_8,0.230072,cysl-4,T25D3.3,F59A7.7,mpst-4,mpst-5,mpst-6,mpst-2,,,...,,,,,,,,,,False
Cluster_9,0.222639,gss-1,E01A2.1,gcs-1,lap-2,lap-1,pcs-1,,,,...,,,,,,,,,,False
Cluster_10,0.211155,atp-1,atp-5,asg-2,ctps-1,atp-2,Y69A2AR.18,T26E3.7,asb-2,asg-1,...,,,,,,,,,,False


In [53]:
# !mkdir /data/nandas/Combined_coexp/Pathway_enrichment/NewSets_090420/ClusterSetsEnrichment_070721/

In [54]:
Pathway_df4.drop(columns=['Mean Silhoutte Values'],inplace=True)

In [55]:
Pathway_df4[0:20]

Unnamed: 0_level_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,Gene_10,...,Gene_29,Gene_30,Gene_31,Gene_32,Gene_33,Gene_34,Gene_35,Gene_36,Gene_37,IsRegulated
ClusterName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster_1,nduo-4,ndfl-4,ctc-1,nduo-5,ctc-2,nduo-6,ctb-1,nduo-3,nduo-1,nduo-2,...,,,,,,,,,,False
Cluster_2,C31H2.4,gst-43,hgo-1,gst-42,Y53G8B.1,fah-1,hpd-1,,,,...,,,,,,,,,,False
Cluster_3,sdha-1,sdhd-1,sdhb-1,cox-4,isp-1,mev-1,,,,,...,,,,,,,,,,False
Cluster_4,gfat-1,T03F6.3,hxk-1,gln-6,gln-5,gfat-2,,,,,...,,,,,,,,,,False
Cluster_5,spe-5,vha-17,vha-11,vha-19,vha-14,vha-4,vha-13,vha-5,vha-12,vha-1,...,,,,,,,,,,False
Cluster_6,nduf-6,nuo-6,T20H4.5,F53F4.10,lpd-5,Y54F10AM.5,D2030.4,nuo-3,F42G8.10,nuo-2,...,,,,,,,,,,False
Cluster_7,fmo-5,fmo-2,fmo-1,argn-1,fmo-3,fmo-4,,,,,...,,,,,,,,,,False
Cluster_8,cysl-4,T25D3.3,F59A7.7,mpst-4,mpst-5,mpst-6,mpst-2,,,,...,,,,,,,,,,False
Cluster_9,gss-1,E01A2.1,gcs-1,lap-2,lap-1,pcs-1,,,,,...,,,,,,,,,,False
Cluster_10,atp-1,atp-5,asg-2,ctps-1,atp-2,Y69A2AR.18,T26E3.7,asb-2,asg-1,asb-1,...,,,,,,,,,,False


In [56]:
# Pathway_df1_withoutIsRegulated = Pathway_df1.drop(['IsRegulated'], axis=1);
# Pathway_df2_withoutIsRegulated = Pathway_df2.drop(['IsRegulated'], axis=1);
# Pathway_df3_withoutIsRegulated = Pathway_df3.drop(['IsRegulated'], axis=1);
# Pathway_df4_withoutIsRegulated = Pathway_df4.drop(['IsRegulated'], axis=1);

In [57]:
# genes = list(Pathway_df4.loc[pathway].dropna());

In [58]:
# !rm Pathway_self_enrichment_Cluster_*_6_3.csv

In [59]:
New_df4 = pd.DataFrame([])
Final_gsea=pd.DataFrame([])
# count=1
for pathway in Pathway_df4.index:
#     pathway = 'Cluster_8';
#     count=count+1
    print(pathway)
#     pathway = 'ALA_ASP_AND_GLU_METABOLISM';
    genes = list(Pathway_df4.loc[pathway].dropna());
    file_exist = False;
    for file in os.listdir(output_dir):
        if fnmatch.fnmatch(file, "Pathway_self_enrichment_{}_6_3.csv".format(pathway)):
            print("File: {} found, skipping!!!".format(file))
            file_exist = True;
    if(not file_exist):
        pre_res = PreRank(genes=genes, outdir="{}/{}".format(output_dir,pathway),gene_sets=pathway_filename4);
        if(pre_res is None):
            continue; 
        Pathway_df4.at[pathway, 'IsRegulated'] = _is_regulated_pathway_(pre_res, pathway);
        print("{} is regulated:{}".format(pathway,_is_regulated_pathway_(pre_res, pathway)))
        PlotEnrichment(pre_res, pathway,"./{}".format(pathway))
        if(pathway in pre_res.res2d.index):
            PlotGSEA(pre_res, pathway,outdir="./{}".format(pathway),term=pathway)
            gsea_result_df=pre_res.res2d.loc[pathway];
            gsea_result_df=pd.DataFrame(gsea_result_df)
            gsea_result_df.to_csv("Pathway_self_enrichment_{}_6_3.csv".format(pathway))
            gsea_result_df=gsea_result_df.transpose()
#             gsea_result_df['Pathway_Main']=pathway
            New_df4=New_df4.append(gsea_result_df)
#             New_df4.at[pathway,'Pathway_main']=pathway
#             New_df4.to_csv("{}/Pathway_{}.csv".format(output_dir,pathway))
#         if count>2:
#     break;
# Pathway_df4.to_csv("Pathway_Regulation_status4.csv")
# New_df4.to_csv("{}/Final_pathway4_gsea.csv".format(output_dir))


Cluster_1
File: Pathway_self_enrichment_Cluster_1_6_3.csv found, skipping!!!
Cluster_2
File: Pathway_self_enrichment_Cluster_2_6_3.csv found, skipping!!!
Cluster_3
File: Pathway_self_enrichment_Cluster_3_6_3.csv found, skipping!!!
Cluster_4
File: Pathway_self_enrichment_Cluster_4_6_3.csv found, skipping!!!
Cluster_5
File: Pathway_self_enrichment_Cluster_5_6_3.csv found, skipping!!!
Cluster_6
File: Pathway_self_enrichment_Cluster_6_6_3.csv found, skipping!!!
Cluster_7
File: Pathway_self_enrichment_Cluster_7_6_3.csv found, skipping!!!
Cluster_8
File: Pathway_self_enrichment_Cluster_8_6_3.csv found, skipping!!!
Cluster_9
File: Pathway_self_enrichment_Cluster_9_6_3.csv found, skipping!!!
Cluster_10
File: Pathway_self_enrichment_Cluster_10_6_3.csv found, skipping!!!
Cluster_11
File: Pathway_self_enrichment_Cluster_11_6_3.csv found, skipping!!!
Cluster_12
File: Pathway_self_enrichment_Cluster_12_6_3.csv found, skipping!!!
Cluster_13
File: Pathway_self_enrichment_Cluster_13_6_3.csv found, ski

In [60]:
Pathway_df4[70:107]

Unnamed: 0_level_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,Gene_10,...,Gene_29,Gene_30,Gene_31,Gene_32,Gene_33,Gene_34,Gene_35,Gene_36,Gene_37,IsRegulated
ClusterName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [61]:
x=pd.read_csv("Pathway_self_enrichment_Cluster_19_6_3.csv",index_col=0)

In [62]:
x

Unnamed: 0,Cluster_19
es,0.7722177742193796
nes,1.7270182101372364
pval,0.0044004400440044
fdr,0.022204727806056423
geneset_size,7
matched_size,7
genes,secs-1;mett-10;nsun-5;pstk-1;seld-1;pps-1;trxr-1
ledge_genes,secs-1;mett-10;nsun-5;pstk-1;seld-1;pps-1;trxr-1


In [63]:
# New_df4=pd.read_csv("{}/Final_pathway4_gsea.csv".format(output_dir),index_col=0)

In [64]:
# New_df4.index=New_df4.index.str.replace("_"," ")

In [65]:
# New_df4.index=New_df4.index.str.title()

In [66]:
New_df4.to_csv("Final_pathway4_gsea.csv")

In [67]:
New_df4

In [68]:
Pathway_df4=Pathway_df4['IsRegulated']

In [69]:
Pathway_df4=pd.DataFrame(Pathway_df4)

In [70]:
Regulated=Pathway_df4[Pathway_df4['IsRegulated']==True]

In [71]:
NotRegulated=Pathway_df4[Pathway_df4['IsRegulated']!=True]

In [72]:
Enrichment=pd.read_csv("{}/Final_pathway4_gsea.csv".format(output_dir),index_col=0)

In [73]:
Regulated

Unnamed: 0_level_0,IsRegulated
ClusterName,Unnamed: 1_level_1


In [74]:
SelectedClusters=pd.read_csv("/data/nandas/Coflux_matrix/FluxRed_102220/Product_Matrix_063021/ProductMatrix_SelectedClusterSets_063021.gmt",
                            sep='\t',index_col=0)

In [75]:
for i in SelectedClusters.index:
    SelectedClusters.at[i,'ClusterNumber']="Cluster_{}".format(i)

In [76]:
SelectedClusters.set_index(['ClusterNumber'],inplace=True)

In [77]:
intersect=list(set(Regulated.index).intersection(set(SelectedClusters.index)))

In [78]:
RegulatedSelectedClusters=SelectedClusters.loc[intersect]

In [79]:
RegulatedSelectedClusters

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,Mean Silhouette Values
ClusterNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [80]:
Enrichment_result=New_df4[['es','nes','fdr']]

KeyError: "None of [Index(['es', 'nes', 'fdr'], dtype='object')] are in the [columns]"

In [None]:
Enrichment_result=Enrichment_result[Enrichment_result.fdr<=0.05]

In [None]:
Enrichment_result.sort_values(by=['fdr'],inplace=True,ascending=False)

In [None]:
Enrichment_result.rename(columns={'es':'Enrichment Score','nes':'Normalized Enrichment Score','fdr':'FDR'},inplace=True)

In [None]:
ax=plt.figure(figsize=(40,180))
plt.rcParams["font.family"] = "Arial"
# Enrichment_result=pd.DataFrame({'es':'Enrichment Score','nes':'Normalized Enrichment Score','fdr':'FDR'},index=Enrichment_result.index)
Enrichment_result.plot.barh()
plt.savefig("EnrichmentPlot.svg",dpi=300,format='svg')
plt.show()

In [None]:
Enrichment_result.shape

In [None]:
def Pathway_Enrichment_Of_Regulated_Genes(Base_dir,Pathway_df4,level):
    path =Base_dir
    print(path)
    dfs = []
    for x in Pathway_df4.index:
        files='{}/Pathway_{}.csv'.format(path,x)
        df=pd.read_csv(files)
        df['Pathway_main']=x;
#         df['Class']=RegulatedMetabolic.loc[x]['Class']
        df.set_index(['Pathway_main'],inplace=True)
        df.to_csv(files)
        print(files)
        dfs.append(pd.read_csv(files))

    # # Concatenate all data into one DataFrame

    All_TFs = pd.concat(dfs, ignore_index=False)
    # # Filtering out negative or not applicable enrichment¶
    All_TFs=All_TFs[All_TFs.nes!=np.inf]
    All_TFs=All_TFs[All_TFs.nes>=0]
    All_TFs.to_csv("Combined_TF_Pathway_Enrichement_{}.csv".format(level))
    return dfs,All_TFs

In [None]:
df4,All_TFs4=Pathway_Enrichment_Of_Regulated_Genes(Base_dir=".",Pathway_df4=Pathway_df4,level='Level_4')

In [None]:
New_df4=pd.read_csv("Combined_TF_Pathway_Enrichement_Level_4.csv",index_col=0)

In [None]:
# New_df4.reset_index(inplace=True)
Significant_Pathways4=New_df4[New_df4.fdr<=0.05]
Significant_Pathways4=Significant_Pathways4[Significant_Pathways4.nes>2]
Significant_Pathways4=Significant_Pathways4[['Pathway_main','Term','fdr']]

In [None]:
# Significant_Pathways4['fdr']=1-(Significant_Pathways4.fdr)

In [None]:
Significant_Pathways4.sort_values(by=['fdr'],ascending=True)

In [None]:
def ConvertPairsToMatrix(bayesian_metabol_df):
    a = np.unique(bayesian_metabol_df['Pathway_main'])
    b = np.unique(bayesian_metabol_df['Term'])
#     c = np.union1d(a,b);
    data = np.zeros((len(a), len(b)));
    output_df = pd.DataFrame(data, index=a, columns=b)
    for values in bayesian_metabol_df.values: 
        output_df.loc[values[0]][values[1]] = values[2];
#         output_df[values[1]][values[0]]=values[2];
#     np.fill_diagonal(output_df.values,1)
    return output_df

In [None]:
Significant_PathwaysMatrix4=ConvertPairsToMatrix(Significant_Pathways4)
Significant_PathwaysMatrix4=Significant_PathwaysMatrix4.transpose()

In [None]:
len(np.unique(Significant_Pathways4['Pathway_main']))

In [None]:
Significant_PathwaysMatrix4=Significant_PathwaysMatrix4.loc[Significant_PathwaysMatrix4.sum(axis=1)!=0]
#     Significant_PathwaysMatrix2=Significant_PathwaysMatrix2.loc[(Significant_PathwaysMatrix2.sum(axis=1)!=0).index]
# #     Significant_PathwaysMatrix2=Significant_PathwaysMatrix2.loc[Significant_PathwaysMatrix2.index.str.contains('nhr')==True]
# #     Significant_PathwaysMatrix2=Significant_PathwaysMatrix2[(Significant_PathwaysMatrix2.sum()!=0).index]
Significant_PathwaysMatrix4=Significant_PathwaysMatrix4.transpose()
# #     print(Significant_PathwaysMatrix2.shape)
#     Significant_PathwaysMatrix2=Significant_PathwaysMatrix2.loc[(Significant_PathwaysMatrix2.sum()!=0).index]
# #     Significant_PathwaysMatrix2.drop(index=Categories,inplace=True)
# #     print(Significant_PathwaysMatrix2.shape)
# # # Pathways1.set_index(['Gene'],inplace=True)
sns.clustermap(Significant_PathwaysMatrix4,figsize=(28, 28),method='average',cbar_kws={'label':'FDR'},col_cluster=True,
                  yticklabels=True,xticklabels=True)
plt.savefig("PathwayCluster_Leve4.png")

In [None]:
Significant_Pathways4

In [None]:
Significant_PathwaysMatrix4=0.05-Significant_PathwaysMatrix4

In [None]:
Significant_PathwaysMatrix4

In [None]:
status1=Pathway_df1['IsRegulated']
status1=pd.DataFrame(status1)
status2=Pathway_df1['IsRegulated']
status2=pd.DataFrame(status2)
status3=Pathway_df3['IsRegulated']
status3=pd.DataFrame(status3)
status4=Pathway_df4['IsRegulated']
status4=pd.DataFrame(status4)

In [None]:
status1.to_csv("Pathway_Regulation_status1.csv")
status2.to_csv("Pathway_Regulation_status2.csv")
status3.to_csv("Pathway_Regulation_status3.csv")
status4.to_csv("Pathway_Regulation_status4.csv")

In [None]:
status1=pd.read_csv("Pathway_Regulation_status1.csv",index_col=0)
status2=pd.read_csv("Pathway_Regulation_status2.csv",index_col=0)
status3=pd.read_csv("Pathway_Regulation_status3.csv",index_col=0)
status4=pd.read_csv("Pathway_Regulation_status4.csv",index_col=0)

In [None]:
status1.IsRegulated.groupby(status1.IsRegulated).count()
status2.IsRegulated.groupby(status2.IsRegulated).count()
status3.IsRegulated.groupby(status3.IsRegulated).count()
status4.IsRegulated.groupby(status4.IsRegulated).count()

In [None]:
#Status_1
my_labels='Non-Regulated','Regulated'
sums1 = status1.IsRegulated.groupby(status1.IsRegulated).count()
def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct,v=val)
    return my_autopct

#Status_2
my_labels='Non-Regulated','Regulated'
sums2 = status2.IsRegulated.groupby(status2.IsRegulated).count()
def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct,v=val)
    return my_autopct

#Status_3
my_labels='Non-Regulated','Regulated'
sums3 = status3.IsRegulated.groupby(status3.IsRegulated).count()
def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct,v=val)
    return my_autopct



In [None]:
#Status_4
my_labels='Non-Regulated','Regulated'
sums4 = status4.IsRegulated.groupby(status4.IsRegulated).count()
def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct,v=val)
    return my_autopct

In [None]:
sums4

In [None]:
# Level_1
fig, ax = plt.subplots(figsize=(12,10))
size = 0.3
plt.pie(sums1,labels=my_labels,autopct=make_autopct(sums1))
plt.savefig("Overall_result_piechart_level_1.png")
plt.show()


In [None]:
# Level_2
fig, ax = plt.subplots(figsize=(12,10))
size = 0.3
plt.pie(sums2,labels=my_labels,autopct=make_autopct(sums2))
plt.savefig("Overall_result_piechart_level_2.png")


In [None]:
# Level 3
fig, ax = plt.subplots(figsize=(12,10))
size = 0.3
plt.pie(sums3,labels=my_labels,autopct=make_autopct(sums3))
plt.savefig("Overall_result_piechart_level_3.png")


In [None]:
# Level 4
fig, ax = plt.subplots(figsize=(12,10))
size = 0.3
plt.pie(sums4,labels=my_labels,autopct=make_autopct(sums4))
plt.savefig("Overall_result_piechart_level_4.png")
