Xavier Castellanos-Girouard <br>
Date First Created: Oct 22 2023 <br>
Date Last Modified: Jun 26 2024 <br>

# Import Libraries

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import math
from timeit import default_timer as timer
import pickle
import multiprocessing as mp
from itertools import repeat
from functools import reduce
from itertools import combinations
from itertools import product
from itertools import chain
import random
from tqdm import tqdm
import scipy
import sys
import statistics
import time

In [20]:
from platform import python_version
print(python_version())

3.9.19


# Define Functions

In [21]:
# Calculate Jaccard index for two sets of elements
def Jaccard(a, b):
    set_a = set(a)
    set_b = set(b)
    intersect_ab = set_a.intersection(set_b)
    union_ab = set_a.union(set_b)
    
    try:
        Jaccard_index = len(intersect_ab)/len(union_ab)
    
    except:
        Jaccard_index = np.nan
    
    return(Jaccard_index)

# Import and Format Data

In [22]:
### PPI Network Dataframes
# Import PPI network
Modular_PPI_Network_DF = pd.read_csv("../results/Yeast_PPI_Network_CPX.csv", index_col = 0)

# Convert PPI Complex strings to list
Modular_PPI_Network_DF['source_Complex'] = Modular_PPI_Network_DF['source_Complex'].str.split("|")
Modular_PPI_Network_DF['target_Complex'] = Modular_PPI_Network_DF['target_Complex'].str.split("|")

# Reset index
Modular_PPI_Network_DF = Modular_PPI_Network_DF.reset_index(drop = True)

### GI Network Dataframes
GI_Epsi_network_DF = pd.read_csv("../../Python_GI_Network_Formatting/data/Costanzo_GI_processed/costanzo_2016_longer_withoutReps.csv")


print(Modular_PPI_Network_DF.head(5))
print(GI_Epsi_network_DF.head(5))

  source_locus target_locus source_Complex target_Complex Interaction_Type
0      YNR010W      YPR070W     [CPX-3226]     [CPX-3226]      IntraModule
1      YNR010W      YPR168W     [CPX-3226]     [CPX-3226]      IntraModule
2      YNR010W      YOL135C     [CPX-3226]     [CPX-3226]      IntraModule
3      YNR010W      YOR174W     [CPX-3226]     [CPX-3226]      IntraModule
4      YNR010W      YOL051W     [CPX-3226]     [CPX-3226]      IntraModule
   pval  scores  ORF_query strain_query ORF_array strain_array
0   0.0 -0.0783  YNL138W-A       sn4590   YER061C      dma1363
1   0.0 -0.0126    YDR142C        sn312   YDL222C       dma813
2   0.0 -0.0598    YDL143W       tsq729   YIL036W      dma2279
3   0.0  0.0319    YIL076W        sn202   YBR269C       dma421
4   0.0 -0.6502    YPL233W       tsq450   YGL093W       tsa446


In [23]:
## Import other network attributes
GI_Modules = pd.read_excel("../data/Costanzo2016_DataFileS6.xlsx")

# Assign proper names to columns
GI_Modules.columns = list(GI_Modules.iloc[0].values)

# Remove row containing column names. Reset index
GI_Modules = GI_Modules.drop(index = 0, axis = 0, inplace = False).reset_index(drop = True)

# Remove duplicates
GI_Modules = GI_Modules[~GI_Modules.duplicated(['Systematic array ORF  name', 'Pathway/complex level (PCC > 0.4)'])]
GI_Modules = GI_Modules[~GI_Modules.duplicated(['Systematic array ORF  name'])].reset_index(drop = True)

GI_Modules.head()

Unnamed: 0,Systematic array ORF name,Standard array gene/allele,Array strain ID,Cell compartment level ( PCC <0.05),Bioprocess level (PCC > 0.2),Pathway/complex level (PCC > 0.4),Cell compartment enriched term,Select GO bioprocess enriched terms,Select Pathway/protein complex enriched terms
0,YJL115W,ASF1,dma2518,6,33,27,nucleus-enriched cluster,DNA repair (GO:0006281); DNA metabolic process...,DNA homologous recombination
1,YLR320W,MMS22,dma3369,6,33,27,nucleus-enriched cluster,DNA repair (GO:0006281); DNA metabolic process...,DNA homologous recombination
2,YER095W,RAD51,dma1402,6,33,27,nucleus-enriched cluster,DNA repair (GO:0006281); DNA metabolic process...,DNA homologous recombination
3,YML032C,RAD52,dma3531,6,33,27,nucleus-enriched cluster,DNA repair (GO:0006281); DNA metabolic process...,DNA homologous recombination
4,YDR076W,RAD55,dma885,6,33,27,nucleus-enriched cluster,DNA repair (GO:0006281); DNA metabolic process...,DNA homologous recombination


## Sort PPI ORFs into modules

In [24]:
## Get list of PPI modules

# Initiate list:
PPI_module_list = []

# Flatten list from interactorA complexes, and add to list
PPI_module_list.extend([CPX for sublist in Modular_PPI_Network_DF["source_Complex"] for CPX in sublist])

# Flatten list from interactorB complexes, and add to list
PPI_module_list.extend([CPX for sublist in Modular_PPI_Network_DF["target_Complex"] for CPX in sublist])

# Get unique list
PPI_module_list = list(np.unique(PPI_module_list))

# Make dictionary keys from list items
PPI_module_dict = {key: [] for key in PPI_module_list}

PPI_module_dict

# * Note: 'None' value is purpousfully kept in dict, it serves as a list of connectors

{'CPX-1015': [],
 'CPX-1016': [],
 'CPX-1018': [],
 'CPX-1021': [],
 'CPX-1022': [],
 'CPX-1026': [],
 'CPX-1028': [],
 'CPX-1036': [],
 'CPX-1037': [],
 'CPX-1039': [],
 'CPX-1040': [],
 'CPX-1041': [],
 'CPX-1042': [],
 'CPX-1043': [],
 'CPX-1044': [],
 'CPX-1045': [],
 'CPX-1047': [],
 'CPX-1052': [],
 'CPX-1053': [],
 'CPX-1068': [],
 'CPX-1071': [],
 'CPX-1101': [],
 'CPX-1102': [],
 'CPX-1103': [],
 'CPX-112': [],
 'CPX-1125': [],
 'CPX-1140': [],
 'CPX-1141': [],
 'CPX-1142': [],
 'CPX-1143': [],
 'CPX-1149': [],
 'CPX-1150': [],
 'CPX-1151': [],
 'CPX-1152': [],
 'CPX-1153': [],
 'CPX-1155': [],
 'CPX-1156': [],
 'CPX-1157': [],
 'CPX-1162': [],
 'CPX-1165': [],
 'CPX-1166': [],
 'CPX-1167': [],
 'CPX-1178': [],
 'CPX-1179': [],
 'CPX-1185': [],
 'CPX-1186': [],
 'CPX-1187': [],
 'CPX-1188': [],
 'CPX-1189': [],
 'CPX-1190': [],
 'CPX-1191': [],
 'CPX-1192': [],
 'CPX-1193': [],
 'CPX-1197': [],
 'CPX-1198': [],
 'CPX-1200': [],
 'CPX-1229': [],
 'CPX-1230': [],
 'CPX-1231': []

In [25]:
## Sort ORFs from interactorA column

# Pair ORF to list of complexes it is part of:
InteractorA_assign = zip(Modular_PPI_Network_DF["source_locus"], Modular_PPI_Network_DF["source_Complex"])

for ORF, CPX_ls in InteractorA_assign: # For every ORF and list of complexes
    for CPX in CPX_ls: # For every complex in list of complexes
        if (ORF not in PPI_module_dict[CPX]): # If the Complexes exists (i.e. ORF is in a complex)
            PPI_module_dict[CPX].append(ORF) # Assign ORF to complex in dictionnary

## Sort ORFs from interactorB column

# Pair ORF to list of complexes it is part of:
InteractorB_assign = zip(Modular_PPI_Network_DF["target_locus"], Modular_PPI_Network_DF["target_Complex"])

for ORF, CPX_ls in InteractorB_assign: # For every ORF and list of complexes
    for CPX in CPX_ls: # For every complex in list of complexes
        if (ORF not in PPI_module_dict[CPX]): # If the Complexes exists (i.e. ORF is in a complex) and ORF is not in dict
            PPI_module_dict[CPX].append(ORF) # Assign ORF to complex in dictionnary
            
PPI_module_dict

{'CPX-1015': ['YNL103W'],
 'CPX-1016': ['YNL103W'],
 'CPX-1018': ['YAL026C', 'YCR094W'],
 'CPX-1021': ['YER166W', 'YNL323W'],
 'CPX-1022': ['YDR093W', 'YNL323W'],
 'CPX-1026': ['YMR162C', 'YNR048W'],
 'CPX-1028': ['YDR141C', 'YBR164C', 'YNL297C'],
 'CPX-1036': ['YCR092C', 'YOL090W'],
 'CPX-1037': ['YDR097C', 'YOL090W'],
 'CPX-1039': ['YPL138C',
  'YHR119W',
  'YLR015W',
  'YBR258C',
  'YDR469W',
  'YAR003W',
  'YBR175W',
  'YKL018W'],
 'CPX-1040': ['YGL213C', 'YLR398C', 'YPR189W'],
 'CPX-1041': ['YDR016C',
  'YKR083C',
  'YKR037C',
  'YGL061C',
  'YKL052C',
  'YBR233W-A',
  'YDR320C-A',
  'YGR113W',
  'YDR201W',
  'YKL138C-A'],
 'CPX-1042': ['YML051W'],
 'CPX-1043': ['YML051W'],
 'CPX-1044': ['YML051W'],
 'CPX-1045': ['YML067C', 'YAL042W'],
 'CPX-1047': ['YNR046W', 'YCR047C'],
 'CPX-1052': ['YML014W', 'YNR046W'],
 'CPX-1053': ['YNL317W',
  'YPR107C',
  'YLR277C',
  'YKR002W',
  'YLR115W',
  'YKL018W',
  'YKL059C',
  'YGR156W',
  'YJR093C',
  'YDR301W',
  'YAL043C',
  'YDR195W',
  'YOR1

## Sort GI ORFs into clusters/modules

In [26]:
# Add ORF from correlation network
GI_ORF_list = []
GI_ORF_list.extend(GI_Epsi_network_DF['ORF_query'])
GI_ORF_list.extend(GI_Epsi_network_DF['ORF_array'])

# Make list unique
GI_ORF_list = list(np.unique(GI_ORF_list))


# Get list for only EpsiGI
all_EpsiGI_ORFs = []
all_EpsiGI_ORFs.extend(GI_Epsi_network_DF['ORF_query'])
all_EpsiGI_ORFs.extend(GI_Epsi_network_DF['ORF_array'])

all_EpsiGI_ORFs = list(np.unique(all_EpsiGI_ORFs))

GI_ORF_list

['YAL001C',
 'YAL002W',
 'YAL004W',
 'YAL005C',
 'YAL007C',
 'YAL008W',
 'YAL009W',
 'YAL010C',
 'YAL011W',
 'YAL012W',
 'YAL013W',
 'YAL014C',
 'YAL015C',
 'YAL016C-B',
 'YAL017W',
 'YAL018C',
 'YAL019W',
 'YAL020C',
 'YAL021C',
 'YAL022C',
 'YAL023C',
 'YAL024C',
 'YAL025C',
 'YAL027W',
 'YAL028W',
 'YAL029C',
 'YAL030W',
 'YAL031C',
 'YAL032C',
 'YAL033W',
 'YAL034C',
 'YAL034W-A',
 'YAL036C',
 'YAL037C-A',
 'YAL037W',
 'YAL038W',
 'YAL039C',
 'YAL040C',
 'YAL041W',
 'YAL042C-A',
 'YAL042W',
 'YAL043C',
 'YAL044C',
 'YAL044W-A',
 'YAL045C',
 'YAL046C',
 'YAL048C',
 'YAL049C',
 'YAL051W',
 'YAL053W',
 'YAL054C',
 'YAL055W',
 'YAL056C-A',
 'YAL056W',
 'YAL058W',
 'YAL059W',
 'YAL060W',
 'YAL061W',
 'YAL062W',
 'YAL063C',
 'YAL063C-A',
 'YAL064C-A',
 'YAL064W',
 'YAL065C',
 'YAL066W',
 'YAL067C',
 'YAL067W-A',
 'YAL068C',
 'YAR002C-A',
 'YAR002W',
 'YAR003W',
 'YAR007C',
 'YAR014C',
 'YAR015W',
 'YAR018C',
 'YAR019C',
 'YAR020C',
 'YAR023C',
 'YAR027W',
 'YAR028W',
 'YAR029W',
 'YAR030

In [27]:
## Get list of GI clusters/modules

# Get modules from module dataframe. Make list unique
GI_Modules_list = list(np.unique(GI_Modules['Pathway/complex level (PCC > 0.4)']))

# Make empty dictionary with keys as cluster number
GI_module_dict = {key: [] for key in GI_Modules_list}

# Make 'None' cluster to store connector ORFs
GI_module_dict['None'] = []


# Sort ORFs into dictionnary of GI modules
for ORF in GI_ORF_list:
    if ORF in list(GI_Modules['Systematic array ORF  name']): # If the ORF is a module
        
        # Find cluster to which the ORF belongs:
        cluster = GI_Modules[GI_Modules['Systematic array ORF  name'] == ORF]['Pathway/complex level (PCC > 0.4)'].values
        
        # Make sure ORF belongs to only one cluster
        assert(len(cluster) == 1)
        cluster = cluster[0] # Get value from array containing 1 item
        
        
        # If the ORF has not yet been sorted into this cluster:
        if ORF not in GI_module_dict[cluster]:
            GI_module_dict[cluster].append(ORF) # Add ORF to cluster in dict
        
        else:
            continue
        
    # If ORF is not in a cluster (not in a module):
    elif ORF not in list(GI_Modules['Systematic array ORF  name']):
        GI_module_dict['None'].append(ORF) # Add to non-module list in dict

GI_module_dict

{1: ['YDL077C', 'YML001W'],
 2: ['YBR288C',
  'YGR261C',
  'YHR012W',
  'YJL024C',
  'YJL053W',
  'YJL154C',
  'YLL040C',
  'YLR360W',
  'YMR004W',
  'YOR068C',
  'YOR132W',
  'YPL120W',
  'YPL195W'],
 3: ['YBR198C',
  'YDR145W',
  'YGL112C',
  'YML015C',
  'YML098W',
  'YMR005W',
  'YMR236W',
  'YOR194C'],
 4: ['YBR244W', 'YBR280C'],
 5: ['YNR017W', 'YPL063W'],
 6: ['YDL087C', 'YGR013W', 'YLR298C'],
 7: ['YGL031C', 'YKR027W', 'YOR167C'],
 8: ['YFL003C', 'YOL116W'],
 9: ['YDL020C', 'YDL126C', 'YDL190C'],
 10: ['YBR139W', 'YGR271W', 'YLL048C', 'YOL056W', 'YOR248W'],
 11: ['YBR215W', 'YJR140C', 'YMR028W', 'YMR073C', 'YNL091W', 'YOR014W'],
 12: ['YLR398C', 'YOR076C', 'YPR189W'],
 13: ['YFR028C', 'YKL022C'],
 14: ['YML126C', 'YNR043W'],
 15: ['YBR247C', 'YDL208W', 'YDR064W', 'YDR339C', 'YMR229C', 'YPL266W'],
 16: ['YBL087C',
  'YBR048W',
  'YBR181C',
  'YDL062W',
  'YDL083C',
  'YDL130W',
  'YDL136W',
  'YDR496C',
  'YFR031C-A',
  'YFR032C-A',
  'YGR034W',
  'YGR085C',
  'YGR118W',
  'YGR1

# Map GI modules to PPI modules

In [28]:
### Find Overlap between individual GI modules and PPI modules
GI_PPI_module_Overlap_matrix = pd.DataFrame(columns = list(PPI_module_dict.keys()), index = list(GI_module_dict.keys()))

# For every cell in matrix, calculate jaccard index between GI module (index) and PPI module (column)
for i in list(GI_module_dict.keys()):
    for j in list(PPI_module_dict.keys()):
        GI_PPI_module_Overlap_matrix.loc[i,j] = Jaccard(PPI_module_dict[j], GI_module_dict[i])

# Convert all values to numeric
for col_name in GI_PPI_module_Overlap_matrix.columns:
    GI_PPI_module_Overlap_matrix[col_name] = pd.to_numeric(GI_PPI_module_Overlap_matrix[col_name])
        
GI_PPI_module_Overlap_matrix

Unnamed: 0,CPX-1015,CPX-1016,CPX-1018,CPX-1021,CPX-1022,CPX-1026,CPX-1028,CPX-1036,CPX-1037,CPX-1039,...,CPX-950,CPX-956,CPX-961,CPX-962,CPX-963,CPX-964,CPX-977,CPX-995,CPX-999,None
1,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000456
2,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000453
3,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
4,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
5,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
110,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
111,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
112,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000


In [29]:
### Make heatmap of GI to PPI module overlap matrix

#fig, ax = plt.subplots(figsize=(48,24))         # Sample figsize in inches
#sns.heatmap(GI_PPI_module_Overlap_matrix, annot=False, ax = ax, linewidths=.5, vmin = 0, vmax = 1, cmap = "viridis")
#sns.heatmap(GI_PPI_module_Overlap_matrix, annot=False, ax = ax, linewidths=0, vmin = 0, vmax = 1, cmap = "viridis")
#plt.savefig("../graphs/GIvsPPI_module_overlap_matrix.eps", format = "eps")
#plt.show()
#plt.close()

In [30]:
## Alternative visualization
# Note: Remove rows or columns with no overlap.

# Make a matrix that will only be used for visualization
#GI_PPI_module_Overlap_matrix_ForViz = GI_PPI_module_Overlap_matrix.copy()

#GI_PPI_module_Overlap_matrix_ForViz = GI_PPI_module_Overlap_matrix_ForViz.drop('None', axis = 0)
#GI_PPI_module_Overlap_matrix_ForViz = GI_PPI_module_Overlap_matrix_ForViz.drop('None', axis = 1)

#GI_PPI_module_Overlap_matrix_ForViz = GI_PPI_module_Overlap_matrix_ForViz[GI_PPI_module_Overlap_matrix_ForViz.sum(axis=1, numeric_only=True) != 0]
#GI_PPI_module_Overlap_matrix_ForViz_T = GI_PPI_module_Overlap_matrix_ForViz.T

#GI_PPI_module_Overlap_matrix_ForViz_T = GI_PPI_module_Overlap_matrix_ForViz_T[GI_PPI_module_Overlap_matrix_ForViz_T.sum(axis=1, numeric_only=True) != 0]

#GI_PPI_module_Overlap_matrix_ForViz = GI_PPI_module_Overlap_matrix_ForViz_T.T

#GI_PPI_module_Overlap_matrix_ForViz

In [31]:
#fig, ax = plt.subplots(figsize=(34,20))         # Sample figsize in inches
#sns.heatmap(GI_PPI_module_Overlap_matrix, annot=False, ax = ax, linewidths=.5, vmin = 0, vmax = 1, cmap = "viridis")
#sns.heatmap(GI_PPI_module_Overlap_matrix_ForViz, annot=False, ax = ax, linewidths=0, vmin = 0, vmax = 1, cmap = "viridis")
#plt.savefig("../graphs/GIvsPPI_module_overlap_matrix.eps", format = "eps")
#plt.show()
#plt.close()

In [32]:
### Find best PPI complex for each GI cluster

# Get list of complexes, in the same order as columns of matrix
PPI_modules_matrix_list = GI_PPI_module_Overlap_matrix.columns.tolist()

# For every GI cluster, get the column index for which overlap is maximal
GI_PPI_optimal_overlap_DF = pd.DataFrame(GI_PPI_module_Overlap_matrix.apply(lambda x: x.tolist().index(max(x)), axis = 1), columns = ["CPX_index"])

# Get complex ID from index
GI_PPI_optimal_overlap_DF['CPX_ID'] = [PPI_modules_matrix_list[index] for index in GI_PPI_optimal_overlap_DF['CPX_index']]

# Make column for Jaccard index
GI_PPI_optimal_overlap_DF['Jaccard_index'] = GI_PPI_optimal_overlap_DF.apply(lambda x: GI_PPI_module_Overlap_matrix.loc[x.name, x['CPX_ID']], axis = 1)

# Make list of common ORF between GI and PPI modules for each pair
GI_PPI_optimal_overlap_DF['Common_Module_ORFs'] = [set(PPI_module_dict[CPX_ID]).intersection(set(GI_module_dict[cluster])) for CPX_ID, cluster in zip(GI_PPI_optimal_overlap_DF['CPX_ID'].tolist(), list(GI_PPI_optimal_overlap_DF.index))]

GI_PPI_optimal_overlap_DF

Unnamed: 0,CPX_index,CPX_ID,Jaccard_index,Common_Module_ORFs
1,153,CPX-1625,0.142857,{YDL077C}
2,480,CPX-535,0.307692,"{YJL024C, YGR261C, YBR288C, YPL195W}"
3,170,CPX-1642,0.437500,"{YMR005W, YGL112C, YML015C, YML098W, YBR198C, ..."
4,427,CPX-3254,0.200000,{YBR280C}
5,524,CPX-6127,1.000000,"{YNR017W, YPL063W}"
...,...,...,...,...
109,82,CPX-1315,1.000000,"{YGR072W, YHR077C, YMR080C}"
110,376,CPX-2948,1.000000,"{YMR264W, YMR022W}"
111,470,CPX-465,1.000000,"{YKR084C, YNL001W}"
112,569,CPX-867,1.000000,"{YDL017W, YDR052C}"


# Add PPI and GI Module ORFs

In [33]:
GI_PPI_optimal_overlap_DF['Cluster_ID'] = GI_PPI_optimal_overlap_DF.index
GI_PPI_optimal_overlap_DF = GI_PPI_optimal_overlap_DF.reset_index(drop = True)

# Add list of ORFs for GI modules
GI_PPI_optimal_overlap_DF['GI_Module_ORFs'] = [GI_module_dict[key_] for key_ in GI_PPI_optimal_overlap_DF['Cluster_ID']]

# Add list of ORFs for PPI modules
GI_PPI_optimal_overlap_DF['PPI_Module_ORFs'] = [PPI_module_dict[key_] for key_ in GI_PPI_optimal_overlap_DF['CPX_ID']]

In [34]:
## Export
GI_PPI_optimal_overlap_DF.to_csv('../results/GI_PPI_optimal_module_overlap.csv')

In [35]:
GI_PPI_optimal_overlap_DF = GI_PPI_optimal_overlap_DF[GI_PPI_optimal_overlap_DF['Common_Module_ORFs'].apply(lambda x: len(x) > 0)]
GI_PPI_optimal_overlap_DF = GI_PPI_optimal_overlap_DF[GI_PPI_optimal_overlap_DF['CPX_ID'] != 'None']
GI_PPI_optimal_overlap_DF

Unnamed: 0,CPX_index,CPX_ID,Jaccard_index,Common_Module_ORFs,Cluster_ID,GI_Module_ORFs,PPI_Module_ORFs
0,153,CPX-1625,0.142857,{YDL077C},1,"[YDL077C, YML001W]","[YDR080W, YDL077C, YLR396C, YMR231W, YLR148W, ..."
1,480,CPX-535,0.307692,"{YJL024C, YGR261C, YBR288C, YPL195W}",2,"[YBR288C, YGR261C, YHR012W, YJL024C, YJL053W, ...","[YGR261C, YJL024C, YBR288C, YPL195W]"
2,170,CPX-1642,0.437500,"{YMR005W, YGL112C, YML015C, YML098W, YBR198C, ...",3,"[YBR198C, YDR145W, YGL112C, YML015C, YML098W, ...","[YPL129W, YER148W, YPL011C, YMR227C, YML015C, ..."
3,427,CPX-3254,0.200000,{YBR280C},4,"[YBR244W, YBR280C]","[YDR054C, YDR328C, YDL132W, YBR280C]"
4,524,CPX-6127,1.000000,"{YNR017W, YPL063W}",5,"[YNR017W, YPL063W]","[YPL063W, YNR017W]"
...,...,...,...,...,...,...,...
106,240,CPX-1717,0.333333,{YNL006W},108,[YNL006W],"[YNL006W, YOL078W, YKL203C]"
107,82,CPX-1315,1.000000,"{YGR072W, YHR077C, YMR080C}",109,"[YGR072W, YHR077C, YMR080C]","[YGR072W, YMR080C, YHR077C]"
108,376,CPX-2948,1.000000,"{YMR264W, YMR022W}",110,"[YMR022W, YMR264W]","[YMR022W, YMR264W]"
109,470,CPX-465,1.000000,"{YKR084C, YNL001W}",111,"[YKR084C, YNL001W]","[YNL001W, YKR084C]"


In [36]:
## Export
GI_PPI_optimal_overlap_DF.to_csv('../results/GI_PPI_optimal_module_overlap_clean.csv')

# Randomization to get p-values for individual module pairings

In [107]:
Cluster_ID_iter_ls = []
Jaccard_index_iter_ls = []
pool_of_PPI_Module_ORFs_stable = [x for list_ in PPI_module_dict.values() for x in list_]

In [108]:
for i in tqdm(range(0,1000)):
    
    pool_of_PPI_Module_ORFs_random = pool_of_PPI_Module_ORFs_stable.copy()
    
    random.shuffle(pool_of_PPI_Module_ORFs_random)
    
    ## Map GI modules to PPI modules 
    ### Find Overlap between individual GI modules and PPI modules
    GI_PPI_module_Overlap_matrix_iter = pd.DataFrame(columns = list(PPI_module_dict.keys()), index = list(GI_module_dict.keys()))
    
    
    k = 0
    # For every cell in matrix, calculate jaccard index between GI module (index) and PPI module (column)
    for j in list(PPI_module_dict.keys()):
        #print(j)
        size = len(PPI_module_dict[j])
        random_PPI_module = pool_of_PPI_Module_ORFs_random[k:k+size]
        k = k+size
        for i in list(GI_module_dict.keys()):
            #print(i)
            GI_PPI_module_Overlap_matrix_iter.loc[i,j] = Jaccard(random_PPI_module, GI_module_dict[i])
            #print(k, size, random_PPI_module, Jaccard(random_PPI_module, GI_module_dict[i]))
        #time.sleep(60)
    
    # Convert all values to numeric
    for col_name in GI_PPI_module_Overlap_matrix_iter.columns:
        GI_PPI_module_Overlap_matrix_iter[col_name] = pd.to_numeric(GI_PPI_module_Overlap_matrix_iter[col_name])
        
    ### Find best PPI complex for each GI cluster
    
    # Get list of complexes, in the same order as columns of matrix
    PPI_modules_matrix_list_iter = GI_PPI_module_Overlap_matrix_iter.columns.tolist()
    
    # For every GI cluster, get the column index for which overlap is maximal
    GI_PPI_optimal_overlap_DF_iter = pd.DataFrame(GI_PPI_module_Overlap_matrix_iter.apply(lambda x: x.tolist().index(max(x)), axis = 1), columns = ["CPX_index"])
    
    # Get complex ID from index
    GI_PPI_optimal_overlap_DF_iter['CPX_ID'] = [PPI_modules_matrix_list_iter[index] for index in GI_PPI_optimal_overlap_DF_iter['CPX_index']]
    
    # Make column for Jaccard index
    GI_PPI_optimal_overlap_DF_iter['Jaccard_index'] = GI_PPI_optimal_overlap_DF_iter.apply(lambda x: GI_PPI_module_Overlap_matrix_iter.loc[x.name, x['CPX_ID']], axis = 1)
    
    # Make list of common ORF between GI and PPI modules for each pair
    #GI_PPI_optimal_overlap_DF_iter['Common_Module_ORFs'] = [set(PPI_module_dict[CPX_ID]).intersection(set(GI_module_dict[cluster])) for CPX_ID, cluster in zip(GI_PPI_optimal_overlap_DF_iter['CPX_ID'].tolist(), list(GI_PPI_optimal_overlap_DF_iter.index))]
    
    Cluster_ID_iter_ls.extend(GI_PPI_optimal_overlap_DF_iter.index.copy().tolist())
    Jaccard_index_iter_ls.extend(GI_PPI_optimal_overlap_DF_iter['Jaccard_index'].copy().tolist())

100%|███████████████████████████████████████████████████████████| 1000/1000 [1:19:10<00:00,  4.75s/it]


In [113]:
randomized_module_overlap_DF = pd.DataFrame({"Cluster_ID":Cluster_ID_iter_ls, "Jaccard_index":Jaccard_index_iter_ls})
randomized_module_overlap_DF

Unnamed: 0,Cluster_ID,Jaccard_index
0,1,0.000977
1,2,0.066667
2,3,0.111111
3,4,0.333333
4,5,0.333333
...,...,...
111995,109,0.090909
111996,110,0.000989
111997,111,0.083333
111998,112,0.000989


In [118]:
randomized_module_overlap_DF.to_csv("../results/GI_PPI_randomized_ModuleOverlap.csv")

### Calculate a z-score for every overlapping module

In [158]:
# Make a dictionary holding random overlaps for each GI Cluster
random_overlap_dict = dict.fromkeys(list(GI_module_dict.keys()))

for key in list(random_overlap_dict.keys()):
    random_overlap_dict[key] = randomized_module_overlap_DF['Jaccard_index'][randomized_module_overlap_DF['Cluster_ID'] == key].tolist()

In [165]:
zscores_ls = []
for i in GI_PPI_optimal_overlap_DF.index:
    zscores_ls.append((GI_PPI_optimal_overlap_DF['Jaccard_index'][i] - np.mean(random_overlap_dict[i]))/statistics.stdev(random_overlap_dict[i]))

  zscores_ls.append((GI_PPI_optimal_overlap_DF['Jaccard_index'][i] - np.mean(random_overlap_dict[i]))/statistics.stdev(random_overlap_dict[i]))


In [175]:
GI_PPI_optimal_overlap_DF['zscore'] = zscores_ls
GI_PPI_optimal_overlap_DF['pvalue'] = GI_PPI_optimal_overlap_DF['zscore'].apply(lambda x: scipy.stats.norm.sf(abs(x))) 

significant_ls = 

GI_PPI_optimal_overlap_DF['Significant'] = GI_PPI_optimal_overlap_DF['pvalue'].apply(lambda x: True if x < 0.05 else False)
GI_PPI_optimal_overlap_DF

Unnamed: 0,CPX_index,CPX_ID,Jaccard_index,Common_Module_ORFs,zscore,pvalue,Significant
1,153,CPX-1625,0.142857,{YDL077C},-0.034911,4.860752e-01,False
2,480,CPX-535,0.307692,"{YBR288C, YPL195W, YGR261C, YJL024C}",24.814872,3.097648e-136,True
3,170,CPX-1642,0.437500,"{YML015C, YMR236W, YBR198C, YGL112C, YDR145W, ...",21.043517,1.310953e-98,True
4,427,CPX-3254,0.200000,{YBR280C},0.841149,2.001321e-01,False
5,524,CPX-6127,1.000000,"{YNR017W, YPL063W}",6.312815,1.370022e-10,True
...,...,...,...,...,...,...,...
109,82,CPX-1315,1.000000,"{YGR072W, YMR080C, YHR077C}",9.279429,8.519473e-21,True
110,376,CPX-2948,1.000000,"{YMR022W, YMR264W}",6.190406,3.000477e-10,True
111,470,CPX-465,1.000000,"{YKR084C, YNL001W}",6.472446,4.821459e-11,True
112,569,CPX-867,1.000000,"{YDR052C, YDL017W}",6.546603,2.943022e-11,True


In [179]:
#GI_PPI_optimal_overlap_DF.to_csv('../results/GI_PPI_optimal_module_overlap.csv')

[{'YDL077C'}, {'YBR288C', 'YPL195W', 'YGR261C', 'YJL024C'}, {'YML015C', 'YMR236W', 'YBR198C', 'YGL112C', 'YDR145W', 'YML098W', 'YMR005W'}, {'YBR280C'}, {'YNR017W', 'YPL063W'}, {'YLR298C', 'YDL087C', 'YGR013W'}, {'YKR027W'}, set(), {'YDL190C', 'YDL126C'}, {'YGR271W'}, {'YJR140C', 'YBR215W'}, {'YPR189W', 'YLR398C'}, {'YFR028C'}, {'YNR043W'}, {'YDL208W'}, {'YFR032C-A'}, {'YOR326W', 'YBR109C'}, {'YMR049C', 'YGR103W'}, {'YKL203C'}, {'YBR103W', 'YCR033W', 'YGL194C', 'YKR029C'}, {'YDL105W'}, {'YGL087C'}, {'YPR070W', 'YGL151W'}, {'YLR430W'}, {'YJL124C', 'YCR077C'}, {'YPL047W', 'YMR223W'}, {'YJL047C'}, {'YBR255W'}, {'YPR055W', 'YLR166C', 'YGL233W'}, {'YMR299C', 'YKR054C', 'YDR424C'}, {'YBR195C', 'YML102W', 'YPR018W'}, {'YBR079C', 'YOR361C'}, {'YLR378C'}, set(), {'YMR135C', 'YGL227W'}, {'YHL025W'}, {'YMR153W', 'YAR002W', 'YML103C', 'YLR335W', 'YJL039C', 'YJR042W', 'YKR082W', 'YLR018C', 'YIL115C'}, {'YDL074C'}, {'YPR179C', 'YNL021W'}, {'YMR186W'}, {'YKR058W', 'YJL155C'}, {'YER162C'}, {'YKL216W'},

In [180]:
GI_PPI_optimal_overlap_sig_DF = GI_PPI_optimal_overlap_DF[GI_PPI_optimal_overlap_DF['Significant']==True].copy()
GI_PPI_optimal_overlap_sig_DF

Unnamed: 0,CPX_index,CPX_ID,Jaccard_index,Common_Module_ORFs,zscore,pvalue,Significant
2.0,480,CPX-535,0.307692,"{YBR288C, YPL195W, YGR261C, YJL024C}",24.814872,3.097648e-136,True
3.0,170,CPX-1642,0.4375,"{YML015C, YMR236W, YBR198C, YGL112C, YDR145W, ...",21.043517,1.310953e-98,True
5.0,524,CPX-6127,1.0,"{YNR017W, YPL063W}",6.312815,1.370022e-10,True
9.0,89,CPX-1323,0.5,"{YDL190C, YDL126C}",3.980045,3.445105e-05,True
11.0,59,CPX-124,0.25,"{YJR140C, YBR215W}",4.332681,7.365224e-06,True
12.0,10,CPX-1040,0.5,"{YPR189W, YLR398C}",3.851019,5.88137e-05,True
17.0,341,CPX-2225,0.285714,"{YOR326W, YBR109C}",6.690953,1.108612e-11,True
18.0,296,CPX-1862,0.125,"{YMR049C, YGR103W}",6.856812,3.520702e-12,True
20.0,94,CPX-1342,0.363636,"{YBR103W, YCR033W, YGL194C, YKR029C}",13.859889,5.542312999999999e-44,True
22.0,351,CPX-2541,0.5,{YGL087C},3.716913,0.0001008359,True


In [181]:
GI_PPI_optimal_overlap_sig_DF.shape[0]

54