In this notebook we find positions with significantly low/high CAI.


## Imports

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle as pickle
import io
from Utils_Tal import SynonymousCodons, AAs, reverse_dict, aa_positions_to_nt_positions

from Bio import AlignIO
from Bio import SeqIO
import concurrent.futures
import scipy.stats
import statsmodels.stats.multitest



## Functions

In [2]:
def create_CAI_dict_all_organisms(codon_usage_path: str) -> dict:
    """Collect the per-organism CAI weight tables stored in one directory.

    Every file in ``codon_usage_path`` whose name ends with ``.pickle`` is
    loaded with ``pandas.read_pickle``; all other files are ignored.

    Args:
        codon_usage_path: Directory holding the pickled weight tables. It may
            be given with or without a trailing path separator.

    Returns:
        A dict mapping the organism name (the file name up to its first
        ``.``) to the object unpickled from that file.
    """
    weight_dicts_all_organisms = {}
    for filename in os.listdir(codon_usage_path):
        if not filename.endswith(".pickle"):
            continue
        organism = filename.split(".")[0]
        # os.path.join rather than string concatenation, so a directory
        # passed without a trailing separator still resolves correctly.
        weight_dicts_all_organisms[organism] = pd.read_pickle(
            os.path.join(codon_usage_path, filename)
        )
    return weight_dicts_all_organisms

In [3]:
def get_original_codons_array(alignment_NT) -> np.ndarray:
    """Convert a nucleotide alignment into an array of codons.

    Args:
        alignment_NT: Anything ``np.array`` turns into a 2-D array of single
            characters, one row per sequence and one column per nucleotide
            position (the caller passes the alignment returned by
            ``AlignIO.read``).

    Returns:
        An object array of shape ``(n_sequences, n_columns // 3)`` whose cells
        are the three-character strings obtained by merging consecutive,
        non-overlapping triplets of columns.

    Raises:
        ValueError: If the number of alignment columns is not a multiple of 3
            (previously this surfaced as an IndexError in the middle of the
            conversion).
    """
    nuc_array = np.array(alignment_NT)  # each column is a nucleotide position
    num_rows, num_columns = nuc_array.shape[0], nuc_array.shape[1]
    if num_columns % 3 != 0:
        raise ValueError(
            f"alignment has {num_columns} nucleotide columns, "
            "which is not a multiple of 3"
        )

    codons_array = np.empty((num_rows, num_columns // 3), dtype=object)
    if num_columns:
        # Columns 0,3,6,... / 1,4,7,... / 2,5,8,... hold the first / second /
        # third nucleotide of every codon, so one element-wise concatenation
        # of the three strided views builds all codons at once.
        first = nuc_array[:, 0::3]
        second = nuc_array[:, 1::3]
        third = nuc_array[:, 2::3]
        codons_array[:] = np.char.add(np.char.add(first, second), third)
    return codons_array

In [4]:
def get_randomized_codons_array(randomization_path:str, original_codons_array:np.ndarray, num_rands:int) -> np.ndarray:
    """Rebuild the randomized MSAs of a gene from the original MSA plus a diff.

    To save storage only the differences between each randomization and the
    original codon MSA are kept on disk, as a pickled dictionary with two keys:

    1. 'locations': index into the (num_rands, rows, columns) stack selecting
       the cells that differ from the original.
    2. 'codons': the codons the randomizations hold at those cells.

    Args:
        randomization_path: Path of the pickled diff dictionary.
        original_codons_array: Codon MSA of the original alignment.
        num_rands: Number of randomizations to rebuild.

    Returns:
        An array holding ``num_rands`` stacked copies of the original codon
        array with the stored differences written in. The input array is not
        modified.
    """
    differences = pd.read_pickle(randomization_path)
    # np.tile allocates the stack, so the original array is left untouched.
    stacked = np.tile(original_codons_array, (num_rands, 1, 1))
    stacked[differences['locations']] = differences['codons']
    return stacked

In [6]:
def create_ind_to_name_dict(path_NT:str) -> dict:
    """Map each MSA row index to the organism it stands for.

    Every FASTA record description is split at its first underscore: the part
    before it is parsed as the integer row index, the remainder is taken as
    the organism name.

    Args:
        path_NT: Path of an (uncompressed) FASTA file.

    Returns:
        A dict ``{row_index: organism_name}``.

    Raises:
        ValueError: If the text before the first underscore of a description
            is not an integer.
    """
    ind_to_name = {}
    # Context manager so the file handle is closed once parsing is done
    # (the handle used to be left open).
    with open(path_NT) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            index_text, _, name = record.description.partition("_")
            ind_to_name[int(index_text)] = name
    return ind_to_name

In [7]:
def from_codon_to_CAI(codons_list:list, slice_cur_ortho:np.ndarray, weight_dict:dict) -> np.ndarray:
    """Translate an array of codons into the matching array of CAI weights.

    Args:
        codons_list: Codons to look for.
        slice_cur_ortho: Array of codon strings (any shape).
        weight_dict: Mapping codon -> CAI weight.

    Returns:
        A float array shaped like ``slice_cur_ortho``. A cell holds the weight
        of its codon when that codon appears in both ``codons_list`` and
        ``weight_dict``, and NaN otherwise.
    """
    # Start from all-NaN so anything without a weight stays NaN.
    cai_weights = np.full(slice_cur_ortho.shape, np.nan)
    codons_with_weight = (codon for codon in codons_list if codon in weight_dict)
    for codon in codons_with_weight:
        cai_weights[slice_cur_ortho == codon] = weight_dict[codon]
    return cai_weights

In [10]:
def get_scores(original_scores: np.ndarray, random_scores: np.ndarray, good_mask: np.ndarray) -> pd.DataFrame:
    """Return z-scores and p-values for each CDS position.

    The randomizations are summarised per position by their mean and standard
    deviation (taken over axis 0), and the original score is standardised
    against them.

    Args:
        original_scores: One score per position for the original data.
        random_scores: Scores of the randomizations; axis 0 runs over
            randomizations.
        good_mask: Per-position flag, stored as-is in the output. It does not
            affect the computation.

    Returns:
        A DataFrame indexed by position (index name 'position of CDS') with
        the columns "z-score", "p-value", "corrected p-value" and
        "good_position". "p-value" is the upper-tail probability of |z| under
        the standard normal distribution; "corrected p-value" is its
        Benjamini-Hochberg FDR correction, computed over the positions whose
        p-value is finite and NaN elsewhere.
    """
    # Parameters of the normal distribution fitted to the randomizations.
    miu = np.mean(random_scores, axis=0)
    sigma = np.std(random_scores, axis=0)

    # sigma == 0 is expected wherever all randomizations agree: the z-score is
    # then NaN (original equals the mean) or +/-inf (it differs), so the
    # division warnings carry no information and are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = (original_scores - miu) / sigma

    # One-sided p-values.
    p_vals = scipy.stats.norm.sf(np.abs(z_scores))

    # The FDR routine cannot take NaNs, so only finite p-values are corrected
    # and the rest stay NaN. Infinite z-scores give p = 0, which is finite
    # and therefore takes part in the correction.
    mask = np.isfinite(p_vals)
    pval_corrected = np.full(p_vals.shape, np.nan)
    if mask.any():
        # Guarded: with no finite p-value there is nothing to correct, and
        # multipletests is not meant to be called on an empty array.
        pval_corrected[mask] = statsmodels.stats.multitest.multipletests(p_vals[mask], method='fdr_bh')[1]

    res_df = pd.DataFrame({
        "z-score": z_scores,
        "p-value": p_vals,
        "corrected p-value": pval_corrected,
        "good_position": good_mask,
    })
    res_df.index.name = 'position of CDS'
    return res_df

In [11]:
def do_for_single_batch(single_batch_genes: list, num_randomizations:int) -> None:
    """Run ``get_cai_zscores_single_gene`` on every gene of one batch, in order.

    Args:
        single_batch_genes: Genes making up the batch.
        num_randomizations: Passed through unchanged for every gene.
    """
    for gene_name in single_batch_genes:
        get_cai_zscores_single_gene(gene_name, num_randomizations)
        

In [12]:
def get_cai_zscores_single_gene(gene:str, num_randomizations:int) -> None:
    """Compute per-position CAI z-scores for one gene and pickle the results.

    Steps:
      1. Decompress and read the gene's nucleotide MSA, convert it to codons
         and read the row-index -> organism mapping from its FASTA headers.
      2. Rebuild the column and vertical randomizations of the codon MSA.
      3. Drop the MSA columns where row 0 (the human ortholog) is '---', so
         that positions follow the human CDS coordinates.
      4. Replace every codon by its CAI weight, using the weight table of the
         organism the row belongs to.
      5. Average the weights over orthologs per position (NaNs ignored) and
         flag the positions considered valid.
      6. Score the original means against each randomization set with
         ``get_scores`` and pickle one DataFrame per randomization type.

    Reads the module-level names ``CAI_all_organisms`` (organism name ->
    weight table) and ``reverse_dict`` (its keys are used as the codon list).

    Any exception is caught and appended, one line per failure, to
    ``error_z_scores.txt`` in the output directory; nothing is raised.

    Args:
        gene: Gene identifier used to build all input and output paths.
        num_randomizations: Number of randomizations to rebuild per type.
    """
    import gzip
    import shutil

    try:
        # (1) Load data
        path_NT = f"../co_trans_data/orthologs/nt_after_msa/{gene}.fasta.gz" #MSA in nucleotides
        local_path = f"./{gene}.fasta" #uncompressed copy on the machine we are running on
        try:
            # Decompress without touching the zipped original. Done in Python
            # rather than through a shell command, so the gene name is never
            # interpreted by a shell.
            with gzip.open(path_NT, "rb") as zipped, open(local_path, "wb") as unzipped:
                shutil.copyfileobj(zipped, unzipped)
            alignment_NT = AlignIO.read(local_path, "fasta")
            # The organisms' names are not in the codon matrices, but rows are
            # ordered as in the alignment, so map row index -> organism name
            # from the FASTA headers.
            ind_to_name = create_ind_to_name_dict(local_path)
        finally:
            # Remove the local copy even when reading fails.
            if os.path.exists(local_path):
                os.remove(local_path)
        codons_array = get_original_codons_array(alignment_NT) #from nt to codons

        # Column and vertical randomizations, stacked along axis 0
        path_col_rand = f"../Results/AllGenes/column_permutations/{gene}.pickle.gz"
        codons_array_col = get_randomized_codons_array(path_col_rand, codons_array, num_randomizations)
        path_ver_rand = f"../Results/AllGenes/vertical_permutations/{gene}.pickle.gz"
        codons_array_ver = get_randomized_codons_array(path_ver_rand, codons_array, num_randomizations)

        # Remove MSA positions where the human ortholog (row 0) has an indel:
        # they are of no interest here and removing them makes the coordinate
        # system that of the human CDS.
        human_indels_locs = np.where(codons_array[0,:] == '---')[0]
        codons_array = np.delete(codons_array, human_indels_locs, axis = 1)
        codons_array_col = np.delete(codons_array_col, human_indels_locs, axis = 2)
        codons_array_ver = np.delete(codons_array_ver, human_indels_locs, axis = 2)

        # Codons -> CAI weights. Weights differ between orthologs, so each row
        # is translated with its own organism's table. Cells whose codon has
        # no weight (indels, ambiguous alphabet) are NaN.
        num_orthologs = codons_array_col.shape[1]
        orig_CAI = np.full(codons_array.shape, np.nan)
        col_CAI = np.full(codons_array_col.shape, np.nan)
        ver_CAI = np.full(codons_array_ver.shape, np.nan)

        codons_list = list(reverse_dict.keys()) #list of all codons (same for every ortholog)

        for cur_ortho in range(num_orthologs):
            cur_ortho_name = ind_to_name[cur_ortho]
            cur_ortho_weights = CAI_all_organisms[cur_ortho_name] #weights of the current ortholog

            orig_CAI[cur_ortho,:] = from_codon_to_CAI(codons_list, codons_array[cur_ortho,:], cur_ortho_weights)
            col_CAI[:,cur_ortho,:] = from_codon_to_CAI(codons_list, codons_array_col[:,cur_ortho,:], cur_ortho_weights)
            ver_CAI[:,cur_ortho,:] = from_codon_to_CAI(codons_list, codons_array_ver[:,cur_ortho,:], cur_ortho_weights)

        # Mean CAI per position, averaging over orthologs, for the original
        # and for every randomization.
        mean_cai_orig = np.nanmean(orig_CAI, axis = 0)
        mean_cai_col = np.nanmean(col_CAI, axis = 1)
        mean_cai_ver = np.nanmean(ver_CAI, axis = 1)

        # Valid positions: fewer than 50% of the orthologs lack a weight, and
        # the spread between randomizations is not practically zero.
        percent_nans = np.sum(np.isnan(orig_CAI),axis = 0) / num_orthologs
        allowed_nans = 0.5
        good_positions_mask1 = percent_nans < allowed_nans

        # "Practically zero" = the std rounds to 0 at 5 decimal places.
        good_positions_mask_2v = np.round(np.std(mean_cai_ver,axis = 0),5) != 0
        good_positions_mask_2c = np.round(np.std(mean_cai_col,axis = 0),5) != 0

        good_positions_mask_v = good_positions_mask1 & good_positions_mask_2v
        good_positions_mask_c = good_positions_mask1 & good_positions_mask_2c

        # z-score and p-value for each position
        res_ver = get_scores(mean_cai_orig, mean_cai_ver, good_positions_mask_v)
        res_col = get_scores(mean_cai_orig, mean_cai_col, good_positions_mask_c)

        # Save scores
        #column
        path = f"../Results/AllGenes/CAI/z-scores/{gene}_column.pickle"
        with open(path, 'wb') as handle:
            pickle.dump(res_col, handle)
        #vertical
        path = f"../Results/AllGenes/CAI/z-scores/{gene}_vertical.pickle"
        with open(path, 'wb') as handle:
            pickle.dump(res_ver, handle)

    except Exception as e:
        # Deliberate best-effort: one failing gene must not stop its batch,
        # so the failure is recorded and the function returns normally.
        error_path = "../Results/AllGenes/CAI/z-scores/error_z_scores.txt"
        with open(error_path, 'a') as file_object:
            # Trailing newline so each failure gets its own line in the log.
            file_object.write(f"gene {gene} failed with error: {e}\n")


## Main

In [24]:
# Build the organism -> CAI weight table lookup once, at module level;
# get_cai_zscores_single_gene reads it as the global CAI_all_organisms.
cai_path = '../Results/CAI_tables/' # directory with one pickled per-codon CAI table per organism in our orthologs group
CAI_all_organisms = create_CAI_dict_all_organisms(cai_path)

In [12]:
# Split the genes into batches to improve performance: each batch is handed
# to the process pool as a single task.
genes_list = pd.read_pickle("../co_trans_data/genes_with_rands.pickle") #genes with rands: a pickle with a list of genes for which we have randomizations and that did not fail in the pipeline

# Create at most `num_wanted_cpus` batches of genes
num_wanted_cpus = 30
num_genes = len(genes_list)
# Ceiling division, floored at 1. Rounding to nearest could give a batch size
# of 0 for short gene lists (range() then rejects the zero step) and could
# produce more batches than wanted when it rounded down.
num_genes_per_batch = max(1, int(np.ceil(num_genes / num_wanted_cpus)))
batches_of_genes = [genes_list[x:x+num_genes_per_batch] for x in range(0, num_genes, num_genes_per_batch)]



In [None]:
# Run in parallel: one task per batch of genes.
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = [
        executor.submit(do_for_single_batch, single_batch_genes = single_batch, num_randomizations = 101)
        for single_batch in batches_of_genes
    ]
    # Failures of individual genes are logged by get_cai_zscores_single_gene
    # itself. What is reported here is a task that failed as a whole (for
    # example a worker process that died), which used to go unnoticed because
    # the futures were never inspected. Reporting instead of raising keeps the
    # remaining batches running.
    for batch_index, future in enumerate(futures):
        error = future.exception() # blocks until this task has finished
        if error is not None:
            print(f"batch {batch_index} failed with error: {error!r}")
                      