In this notebook we download orthologous sequences for all human protein-coding genes from Ensembl using its REST API (downloaded in June 2022).

In [8]:
import requests, sys
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
import concurrent.futures
from os import listdir
from os.path import isfile, join
from typing import Dict, Tuple


In [7]:
# Load the list of genes (all human protein-coding genes).
# The pickled dataframe contains gene symbol, Ensembl gene id, transcript id,
# chromosome, strand and CCDS id; it was obtained using BioMart.
genes_df_path = "../co_trans_data/gene_basic_info_df.pickle"
# Keep only the 'gene_id' column and discard rows where it is missing.
genes = pd.read_pickle(genes_df_path)['gene_id'].dropna()


### Download orthologs per gene

In [4]:
def _fetch_homology(gene: str, query: str, timeout: float) -> Dict:
    """Return the decoded JSON of one Ensembl ``/homology/id/<gene>`` request.

    Args:
        gene: Ensembl gene id placed in the URL path.
        query: Raw query string appended after ``?``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    url = "https://rest.ensembl.org" + "/homology/id/" + gene + "?" + query
    response = requests.get(
        url,
        headers={"Content-Type": "application/json"},
        timeout=timeout,  # without a timeout a stalled connection blocks the worker forever
    )
    # raise_for_status() is a no-op on success and raises on any error status,
    # so no separate `ok` check (or sys.exit) is needed.
    response.raise_for_status()
    return response.json()


def get_orthologs_single_gene(gene: str, timeout: float = 120.0) -> None:
    """Download all available orthologous sequences of *gene* from Ensembl.

    Two requests are made to the Ensembl REST homology endpoint: one asking
    for the cDNA sequences of the orthologues and one using the endpoint's
    default sequence type (intended to yield the amino-acid sequences). Only
    when both requests succeed are the decoded responses pickled to
    ``../Data/AllGenes/orthologs/cdna_dict/cdna_<gene>.pickle`` and
    ``../Data/AllGenes/orthologs/aa_dict/aa_<gene>.pickle``.

    This is a best-effort worker: any failure is reported with ``print`` and
    swallowed, so one bad gene does not abort a bulk download.

    Args:
        gene: Ensembl gene id (e.g. an ``ENSG...`` identifier).
        timeout: Seconds passed to each HTTP request as its timeout.

    Returns:
        None. Results are written to disk, not returned.
    """
    try:
        # cDNA sequence of all orthologs of this gene
        cdna_decoded = _fetch_homology(
            gene, "type=orthologues;aligned=0;sequence=cdna", timeout
        )
        # amino-acid sequence of all orthologs of this gene
        aa_decoded = _fetch_homology(gene, "type=orthologues;aligned=0;", timeout)

        # NOTE(review): a later cell reads "cdna_<gene>.pickle.gz"; the files
        # written here are uncompressed ".pickle" -- presumably they are
        # compressed in a separate step; confirm.
        with open(f'../Data/AllGenes/orthologs/cdna_dict/cdna_{gene}.pickle', 'wb') as handle:
            pickle.dump(cdna_decoded, handle)
        with open(f'../Data/AllGenes/orthologs/aa_dict/aa_{gene}.pickle', 'wb') as handle:
            pickle.dump(aa_decoded, handle)

    except Exception as e:
        # Deliberately broad: the printed message is the only failure signal.
        print(f"Error in gene {gene}! {e}")

In [40]:
# Download the orthologs of every gene concurrently.
# The work is network-bound, so threads are sufficient; unlike a process pool
# they also do not require the worker function (defined in this notebook) to
# be importable from a child process.
# NOTE(review): Ensembl REST applies per-client rate limits -- confirm that 60
# concurrent workers stay within them.
with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
    futures = [
        executor.submit(get_orthologs_single_gene, gene=gene) for gene in genes
    ]
    # Drain the futures to show progress and to surface any error that the
    # worker's own exception handler did not catch.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        future.result()

### Get the protein_id of all the genes (to know which version of the CDS we are using throughout our analysis)

In [23]:
# "genes_for_msa.pickle" contains the list of genes that we were able to
# download orthologous sequences for.
genes = pd.read_pickle("../Data/AllGenes/genes_for_msa.pickle")

# For every gene, read its saved cDNA homology response and record the
# protein id found under 'source' of the first homology entry.
gene_protein = {}
for gene in tqdm(genes):
    cdna_response = pd.read_pickle(f"../Data/AllGenes/orthologs/cdna_dict/cdna_{gene}.pickle.gz")
    first_record = cdna_response['data'][0]
    first_homology = first_record['homologies'][0]
    gene_protein[gene] = first_homology['source']['protein_id']

100%|██████████| 17946/17946 [05:50<00:00, 51.19it/s] 


In [25]:
# Persist the dictionary that maps each gene id (ENSG) to the protein id
# (ENSP) of the CDS used for it.
with open('../Data/AllGenes/gene_protein_dict.pickle', 'wb') as out_file:
    pickle.dump(gene_protein, out_file)
