In order to calculate the CAI weights we need to find a group of HEG (highly expressed genes) for each of our organisms. We will get protein abundance (PA) measurements for organisms that have them (from PAXdb). 
Then, for the organisms that do not, we will get mRNA abundance levels where they are available (from the Expression Atlas). 
For organisms that have neither PA nor mRNA measurements we will compute ENC to get the HEG, in the notebook "get_CAI_from_ENC.ipynb". The CAI weights for all organisms are saved in "../Results/codon_usage_tables/". 

*this notebook was run on my local computer, not on power*


## Imports

In [1]:
import pandas as pd
import os
import re
import mygene
import requests, sys
from tqdm import tqdm
import numpy as np
from CAI import CAI, relative_adaptiveness
import pickle
import xlrd
from Bio import SeqIO
import re



## Get a list of all species:

In [2]:
# Collect the 'species' column of every per-gene orthologs table into one flat list.
# Names may repeat across tables; they are deduplicated in the cells that follow.
species = []
for filename in os.listdir("../Data/orthologs_df_per_gene"):
    cur_df = pd.read_pickle('../Data/orthologs_df_per_gene/' + filename)
    cur_species = cur_df['species'].to_list()
    species += cur_species


In [3]:
# we have a total of 283 species in our data!
# (number of distinct names collected in `species`)
len(set(species))

283

In [206]:
# Deduplicated species names, displayed in alphabetical order.
# The built-in sorted() returns plain Python strings, so the listing reads the same on
# every NumPy version (NumPy >= 2 prints NumPy string scalars as np.str_('...')).
unique_species = list(set(species))
sorted(unique_species)

['acanthochromis_polyacanthus',
 'accipiter_nisus',
 'ailuropoda_melanoleuca',
 'amazona_collaria',
 'amphilophus_citrinellus',
 'amphiprion_ocellaris',
 'amphiprion_percula',
 'anabas_testudineus',
 'anas_platyrhynchos',
 'anas_platyrhynchos_platyrhynchos',
 'anas_zonorhyncha',
 'anolis_carolinensis',
 'anser_brachyrhynchus',
 'anser_cygnoides',
 'aotus_nancymaae',
 'apteryx_haastii',
 'apteryx_owenii',
 'apteryx_rowi',
 'aquila_chrysaetos_chrysaetos',
 'astatotilapia_calliptera',
 'astyanax_mexicanus',
 'astyanax_mexicanus_pachon',
 'athene_cunicularia',
 'balaenoptera_musculus',
 'betta_splendens',
 'bison_bison_bison',
 'bos_grunniens',
 'bos_indicus_hybrid',
 'bos_mutus',
 'bos_taurus',
 'bos_taurus_hybrid',
 'bubo_bubo',
 'buteo_japonicus',
 'caenorhabditis_elegans',
 'cairina_moschata_domestica',
 'calidris_pugnax',
 'calidris_pygmaea',
 'callithrix_jacchus',
 'callorhinchus_milii',
 'camarhynchus_parvulus',
 'camelus_dromedarius',
 'canis_lupus_dingo',
 'canis_lupus_familiaris'

## Get CUB weights for organisms with known PA levels (downloaded from PAXdb):

In [358]:
# get top 15% proteins-
# Read the PAXdb abundance table (its first 11 lines are skipped) and order the rows
# from the most to the least abundant protein.
data = (
    pd.read_csv('../Data/codon_usage/PA/sus_scrofa.txt', skiprows=11, sep='\t')
    .sort_values(by='abundance', ascending=False)
)

In [359]:
# 85th percentile of the abundances: proteins at or above it are the top 15%.
threshold = data['abundance'].quantile(q=0.85)

In [360]:
# External (STRING) ids of the proteins whose abundance reaches the threshold.
highly_expressed_proteins = data.loc[data['abundance'] >= threshold, 'string_external_id']

In [361]:
# Number of proteins selected as highly expressed.
highly_expressed_proteins.size

444

In [364]:
def retrieve_ensembl_id(ID):
    """Return the part of an external id that follows its first dot.

    The ids handled here have the form '<prefix>.<ensembl protein id>'
    (e.g. '<prefix>.ENSSSCP00000013714' -> 'ENSSSCP00000013714'). Everything
    after the first dot is kept, including any further dots.

    Parameters
    ----------
    ID : str
        The dotted external id.

    Returns
    -------
    str
        The text after the first dot.

    Raises
    ------
    ValueError
        If ``ID`` contains no dot, so there is nothing to extract.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    match = re.search(r"\.(.*)", ID)
    if match is None:
        raise ValueError(f"expected an id of the form '<prefix>.<id>', got {ID!r}")
    return match[1]

In [365]:
# Strip the prefix from every selected protein id and keep the results as a plain list.
ensemble_ids = [retrieve_ensembl_id(protein_id) for protein_id in highly_expressed_proteins]

Now we have the protein ids, and we need their sequences *in nucleotides*. For some reason we can't retrieve these from Ensembl directly:
we need to convert the protein ids to transcript ids and then take the CDS sequence of each transcript. 

In [366]:
# Convert the protein ids to transcript ids through MyGene.
# PROBLEM: several transcripts can be related to the same protein because of differences
# in the UTR. Fortunately this does not matter here, as only the CDS is needed.
mg = mygene.MyGeneInfo()
out = mg.querymany(
    ensemble_ids,
    scopes='ensembl.protein',
    fields='ensembl.transcript',
    returnall=True,
)


querying 1-444...done.
Finished.
2 input query terms found dup hits:
	[('ENSSSCP00000013714', 2), ('ENSSSCP00000000402', 2)]
106 input query terms found no hit:
	['ENSSSCP00000021419', 'ENSSSCP00000006164', 'ENSSSCP00000025444', 'ENSSSCP00000021888', 'ENSSSCP000


In [367]:
# Fetch the CDS of one transcript per MyGene hit from the Ensembl REST API.
# Rows are collected in a plain list and turned into a DataFrame once at the end,
# instead of growing the DataFrame one row at a time.
server = "https://rest.ensembl.org"
cds_records = []
for hit in tqdm(out['out']):
    # Start from "missing" for every hit, so a hit without a transcript or without a
    # sequence is stored as NaN and never picks up values from an earlier hit.
    transcript_id = np.nan
    cds_sequence = np.nan

    ensembl_info = hit.get('ensembl')
    if isinstance(ensembl_info, dict):
        transcript_id = ensembl_info.get('transcript', np.nan)
    if isinstance(transcript_id, list):  # it's a str only if there is a single transcript
        # Take the first transcript of the protein; all transcripts of one protein are
        # supposed to have the same CDS sequence.
        transcript_id = transcript_id[0] if transcript_id else np.nan

    if isinstance(transcript_id, str):
        ext = "/sequence/id/" + transcript_id + "?type=cds"
        try:
            r = requests.get(server + ext, headers={"Content-Type": "application/json"}, timeout=60)
            if r.ok:
                # Transcripts that have no sequence in Ensembl keep cds_sequence = NaN.
                cds_sequence = r.json().get('seq', np.nan)
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: leave the sequence missing.
            pass

    # Hits that have no transcript in Ensembl are stored as [query, NaN, NaN].
    cds_records.append([hit['query'], transcript_id, cds_sequence])

cds_df = pd.DataFrame(cds_records, columns=["protein_id", "transcript_id", "cds_sequence"])


100%|██████████| 446/446 [00:56<00:00,  7.90it/s]


In [369]:
# Keep only the rows for which a CDS sequence was retrieved.
cds_df_with_seqs = cds_df.loc[cds_df['cds_sequence'].notna()].copy()
# Check for errors in CDS length: a CDS is made of whole codons, so length % 3 must be 0.
cds_df_with_seqs['remainder_devision_3'] = cds_df_with_seqs['cds_sequence'].map(len) % 3
cds_df_with_seqs = cds_df_with_seqs.loc[cds_df_with_seqs['remainder_devision_3'] == 0]
# See if we are left with enough sequences for the analysis to be significant.
print(cds_df_with_seqs.shape)
seqs = cds_df_with_seqs['cds_sequence'].to_list()

(68, 4)


In [370]:
# Compute the weights from the retained highly-expressed CDS sequences with
# relative_adaptiveness (imported from the CAI package).
weights = relative_adaptiveness(sequences=seqs)

In [372]:
# Store the sus_scrofa weights as a pickle in the codon-usage-tables results folder.
with open('../Results/codon_usage_tables/sus_scrofa.pickle', 'wb') as pickle_file:
    pickle.dump(weights, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

## Get CUB weights for organisms with known mRNA levels (downloaded from the Expression Atlas):

In [4]:
# get top 15% mRNAs-
# Read the expression table (its first 4 lines are skipped), index it by gene id and
# treat missing measurements as 0.
data = (
    pd.read_csv('../Data/codon_usage/MRNA/ovis_aries.tsv', skiprows=4, sep='\t')
    .drop(columns=['Gene Name'])
    .set_index(['Gene ID'])
    .fillna(0)
)
# Mean over all remaining columns of each gene, then order from highest to lowest mean.
data['averge'] = data.mean(axis=1)
data = data.sort_values(by='averge', ascending=False)

In [5]:
# 85th percentile of the per-gene averages: genes at or above it are the top 15%.
threshold = data['averge'].quantile(q=0.85)

In [6]:
# Gene ids (the index of `data`) whose average reaches the threshold.
highly_expressed_genes = data.loc[data['averge'] >= threshold].index.tolist()

In [7]:
# Fetch the CDS of every highly expressed gene from the Ensembl REST API.
# Rows are collected in a plain list and turned into a DataFrame once at the end,
# instead of growing the DataFrame one row at a time.
# NOTE(review): the ids sent here are gene ids; Ensembl's sequence endpoint documents a
# `multiple_sequences` option for genes with several transcripts - confirm whether the
# lookups that come back without a sequence are caused by that.
server = "https://rest.ensembl.org"
cds_records = []
for gene_id in tqdm(highly_expressed_genes):
    cds_sequence = np.nan  # stays NaN when no sequence is returned for this id

    if isinstance(gene_id, str):
        ext = "/sequence/id/" + gene_id + "?type=cds"
        try:
            r = requests.get(server + ext, headers={"Content-Type": "application/json"}, timeout=60)
            if r.ok:
                cds_sequence = r.json().get('seq', np.nan)
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON: leave the sequence missing.
            pass

    cds_records.append([gene_id, cds_sequence])

cds_df = pd.DataFrame(cds_records, columns=["gene_id", "cds_sequence"])
        

100%|██████████| 3073/3073 [2:01:31<00:00,  2.37s/it]  


In [13]:
# Dimensions of the fetched table as (rows, columns): one row per requested gene id.
cds_df.shape

(3073, 2)

In [14]:
# Drop the genes for which no CDS sequence was retrieved.
cds_df_with_seqs = cds_df.dropna(subset=['cds_sequence']).copy()
# Check for errors in CDS length: a CDS is made of whole codons, so length % 3 must be 0.
cds_df_with_seqs['remainder_devision_3'] = cds_df_with_seqs['cds_sequence'].map(len) % 3
cds_df_with_seqs = cds_df_with_seqs.loc[cds_df_with_seqs['remainder_devision_3'] == 0]
# See if we are left with enough sequences for the analysis to be significant.
print(cds_df_with_seqs.shape)
seqs = cds_df_with_seqs['cds_sequence'].to_list()

(2729, 3)


In [15]:
# Compute the weights from the retained highly-expressed CDS sequences with
# relative_adaptiveness (imported from the CAI package).
weights = relative_adaptiveness(sequences=seqs)

In [16]:
# Store the ovis_aries weights as a pickle in the codon-usage-tables results folder.
with open('../Results/codon_usage_tables/ovis_aries.pickle', 'wb') as pickle_file:
    pickle.dump(weights, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)