In [1]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from itertools import combinations
import requests
import re

## How to obtain information about the pathways of a given gene
First, the list of genes is loaded, along with the index of the counts table. The counts index is used to filter out genes that I identified in papers but that are not in our database.

In [2]:
# Candidate gene names: the 'name' column of genes.csv, as a plain Python list.
genes = pd.read_csv('genes.csv')['name'].tolist()
# Row labels of the counts table (its first CSV column becomes the index);
# only the labels are kept, for membership checks against gene codes.
index = pd.read_csv(
    'pe-rna-counts.csv',
    index_col=0,
).index

Then a dictionary is created to convert each common gene name to its Ensembl code, and the genes that are not in our database are filtered out, leaving 78 genes

Then duplicates are removed to yield 73 unique genes

In [3]:
# Map each common gene name to its Ensembl code ('name' -> 'code' columns of genes.csv).
to_ensembl = pd.read_csv('genes.csv').set_index("name")["code"].to_dict()
# Keep only the genes whose Ensembl code is present in the counts index.
genes = [g for g in genes if to_ensembl[g] in index]
print("should be 78", len(genes))
# De-duplicate while preserving first-seen order. list(set(...)) would iterate the
# strings in a hash-dependent order that changes between kernel restarts, making
# the order of every downstream dict and pair list non-reproducible.
genes = list(dict.fromkeys(genes))
print("should be 73", len(genes))

should be 78 78
should be 73 73


So now we have our list of genes that we need to find pathways of

Then, a dictionary is created with genes as keys and lists of pathways as values, using the PathwayCommons API. This takes around 20 seconds because one HTTP request is made per gene

In [4]:
# Build d: gene name -> list of pathway names reported by PathwayCommons'
# top_pathways search for that gene (one HTTP request per gene).
d = {}
# One Session so the underlying connection is reused across all requests.
with requests.Session() as session:
    for g in tqdm(genes):
        response = session.get(
            "https://www.pathwaycommons.org/pc2/top_pathways",
            params={"q": g},  # let requests URL-encode the query value
            timeout=30,       # never hang the notebook on a stalled connection
        )
        # Fail loudly on an HTTP error: otherwise an error page would simply
        # not match the regex and the gene would be recorded as having no pathways.
        response.raise_for_status()
        # Pull every "name" value that is immediately followed by a "dataSource" field.
        d[g] = re.findall(r'"name":"(.*?)","dataSource":', response.text)
d

  0%|          | 0/73 [00:00<?, ?it/s]

100%|██████████| 73/73 [00:22<00:00,  3.22it/s]


{'SH3BP5': ['BCR signaling pathway'],
 'ARNT2': [],
 'FN1': ['Extracellular matrix organization',
  'Beta3 integrin cell surface interactions',
  'Angiopoietin receptor Tie2-mediated signaling',
  'Alpha4 beta1 integrin signaling events',
  'Syndecan-4-mediated signaling events',
  'Integrin signaling pathway ( Integrin signaling pathway )',
  'VEGFR3 signaling in lymphatic endothelium',
  'Integrins in angiogenesis',
  'Beta5 beta6 beta7 and beta8 integrin cell surface interactions',
  'Integrin signalling pathway',
  'Beta1 integrin cell surface interactions',
  'Urokinase-type plasminogen activator (uPA) and uPAR-mediated signaling',
  'Syndecan-2-mediated signaling events',
  'Alpha9 beta1 integrin signaling events',
  'Hemostasis'],
 'RPL34': ['Protein Synthesis: Glutamic Acid',
  'Protein Synthesis: Isoleucine',
  'Protein Synthesis: Serine',
  'Protein Synthesis: Threonine',
  'Protein Synthesis: Valine',
  'Protein Synthesis: Alanine',
  'Protein Synthesis: Tryptophan',
  'Prot

This is backwards from what we need though - what we need is a dictionary that maps pathways to genes, not genes to pathways, so we create the `pathways_to_genes` dict

In [5]:
# Invert d (gene -> pathways) into pathway -> genes.
pathways_to_genes = defaultdict(list)
for gene in d:
    for pathway in d[gene]:
        # The defaultdict supplies an empty list the first time a pathway is seen.
        pathways_to_genes[pathway].append(gene)
# Plain-dict copy of the mapping (the gene lists themselves are shared, not copied).
path_to_genes = {pathway: members for pathway, members in pathways_to_genes.items()}
path_to_genes

{'BCR signaling pathway': ['SH3BP5', 'LYN'],
 'Extracellular matrix organization': ['FN1', 'SERPINE1', 'TNF'],
 'Beta3 integrin cell surface interactions': ['FN1', 'VEGFA', 'ITGB3'],
 'Angiopoietin receptor Tie2-mediated signaling': ['FN1', 'NOS3', 'TNF'],
 'Alpha4 beta1 integrin signaling events': ['FN1'],
 'Syndecan-4-mediated signaling events': ['FN1'],
 'Integrin signaling pathway ( Integrin signaling pathway )': ['FN1', 'ITGB3'],
 'VEGFR3 signaling in lymphatic endothelium': ['FN1', 'VEGFC'],
 'Integrins in angiogenesis': ['FN1', 'VEGFA', 'CSF1', 'IGF1', 'ITGB3'],
 'Beta5 beta6 beta7 and beta8 integrin cell surface interactions': ['FN1'],
 'Integrin signalling pathway': ['FN1'],
 'Beta1 integrin cell surface interactions': ['FN1', 'VEGFA'],
 'Urokinase-type plasminogen activator (uPA) and uPAR-mediated signaling': ['FN1',
  'SERPINE1',
  'FPR3',
  'ITGB3'],
 'Syndecan-2-mediated signaling events': ['FN1'],
 'Alpha9 beta1 integrin signaling events': ['FN1', 'VEGFA', 'VEGFC'],
 'Hem

Now we need to get all the pairs which share pathways and all the pairs that do not - which are the two groups, A and B, that you mentioned a few days ago

In [6]:
# Classify every unordered pair of genes:
#   A - pairs that share at least one pathway
#   B - pairs that share none
# NOTE(review): a gene whose pathway list is empty (e.g. 'ARNT2' in the output
# above) is disjoint from every other gene, so all of its pairs land in B.
# Confirm that this is the intended treatment of genes with no known pathways.

# Build each gene's pathway set once, rather than twice per pair inside the loop.
pathway_sets = {gene: set(pathways) for gene, pathways in d.items()}

A = []
B = []
# Iterating the dict yields its keys in the same order as d.keys().
key_pairs = combinations(pathway_sets, 2)
for key1, key2 in key_pairs:
    if pathway_sets[key1].isdisjoint(pathway_sets[key2]):
        B.append((key1, key2))
    else:
        A.append((key1, key2))
print(len(A))
print(len(B))

170
2458
