In this notebook we want to create randomizations for an MSA. The randomizations have to preserve certain qualities. Here, we will choose two columns of codons with the same dominant amino-acid and shuffle the synonymous codons of that amino-acid that are in the same row. 
For example, column_1 = ['TAT'(Y),'TAC'(Y),'CCT'(P),'TAT'(Y)], column_2 = ['TAC'(Y),'TAC'(Y),'ATG'(M),'TAC'(Y)]. The dominant amino-acid here is Y. so, we swap the synonynous codons coding Y, whom are at the same position -> column1_swaped = ['TAC','TAC','CCT','TAC'],
column2_swaped = ['TAT','TAC','ATG',TAT'].

In the first version of this code we chose two columns until all columns were shuffled. This was'nt random enough and created biases. Here, we shufle many more times (10Xlength of protein sequence for each aa)

## Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from Bio import AlignIO
from scipy import stats
import concurrent.futures
from pathlib import Path
from Utils_co_trans import SynonymousCodons, AAs, reverse_dict
from tqdm import tqdm


## Functions

In [4]:
def permute_between_pair(aa_array: np.ndarray,codons_array_cur_rand: np.ndarray, pair: np.ndarray, cur_AA:str) -> np.ndarray:
    '''
    This function takes two columns with the same dominant amino-acid and swaps between synonymous codons coding
    for that amino-acid on the same row in the other column. It returns a the MSA in codons, after the randomizations of the two columns. 
    aa_array: a matrix containing the MSA in AAs (2D numpy array)
    codons_array: a matrix containing the MSA in codons (2D numpy array)
    pair: the pair of column indices chosen to be permuted (1D numpy array with two elements)
    cur_AA: the amino-acid that is dominated in both of these columns (str)
    '''
    #find positions which have the current AA for both of these columns:
    pos_this_pair_this_AA = (aa_array[:,pair[0]] == cur_AA) & (aa_array[:,pair[1]] == cur_AA)
    temp_copy = codons_array_cur_rand[:,pair[0]].copy() #save a copy of the original first column
    codons_array_cur_rand[:,pair[0]][pos_this_pair_this_AA] = codons_array_cur_rand[:,pair[1]][pos_this_pair_this_AA] #switch the codons in the first random column with the codons in the second random column for this AA
    codons_array_cur_rand[:,pair[1]][pos_this_pair_this_AA] = temp_copy[pos_this_pair_this_AA] #switch the codons in the second random column with the codons in the first random column for this AA
    return(codons_array_cur_rand)

In [5]:
def create_single_randomization(codons_array:np.ndarray, AAs:list, positions_dominant_AA:dict, len_protein:float, aa_array:np.ndarray) -> np.ndarray:
    ''' 
    Create a single randomized MSA. for each amino acid, we choose two columns whose dominant amino acid is the current one and
    swap between their synonymous codons. We perform len_protein * 10 swaps for each AA.
    '''
    codons_array_cur_rand = codons_array.copy()
    for cur_AA in AAs:
        positions_this_AA = positions_dominant_AA[cur_AA]
        if len(positions_this_AA) > 1:
            #get all positions of columns for which the current AA is dominant 
            #we now need to choose random pairs of these columns and then permute synonymous codons between them.
            #each time we will choose two columns randomly and permute their codons of this_AA. 
            #we will do this 10X(len_protein) times
            for shuffle in range(int(len_protein * 10)):
                pair = np.random.choice(positions_this_AA, 2, replace=False) #choose two columns of this dominant AA, randomly
                codons_array_cur_rand = permute_between_pair(aa_array,codons_array_cur_rand,pair,cur_AA)
    return(codons_array_cur_rand)


In [6]:
def compress_rands(randomizations: np.ndarray, codons_array_original:np.ndarray) -> dict:
    '''
    This function receives the randomizations of the msa of a single gene and the original msa of that gene, and returns 
    a dictionary with the positions where the randomizations and the original gene differ, and the difference. 
    Saving only the differences saves a lot of memory.
    '''
    locations_of_diff = np.where(randomizations != codons_array_original)
    codons_of_diff = randomizations[locations_of_diff]
    diff_dict = {}
    diff_dict['locations'] = locations_of_diff
    diff_dict['codons'] = codons_of_diff
    return(diff_dict)
    

In [8]:
def create_randomizations_single_gene(gene:str, num_randomizations:int) -> None:
    '''
    Create all randomized MSAs for a single gene
    '''
    try:
        #get nt and aa msas of the current gene 
        path_AA = f"../co_trans_data/orthologs/aa_after_msa/{gene}.fasta.gz" #MSA in amino-acids
        path_NT = f"../co_trans_data/orthologs/nt_after_msa/{gene}.fasta.gz" #MSA in nucleotides
        local_aa_path = f"./{gene}_aa.fasta"
        local_nt_path = f"./{gene}_nt.fasta"
        !gzip -c -d $path_NT > $local_nt_path 
        !gzip -c -d $path_AA > $local_aa_path 
        
        alignment_AA = AlignIO.read(local_aa_path, "fasta")
        aa_array = np.array(alignment_AA) #changing to numpy for code efficiency
        alignment_NT = AlignIO.read(local_nt_path, "fasta")
        os.remove(local_aa_path)
        os.remove(local_nt_path)

        #create codon array from nucleotide array
        nuc_array = np.array(alignment_NT) # an array where each column is a nucleotide position
        #lets merge every three columns into one, to obtain a numpy of codon positions instead of nucleotide positions
        codons_array = np.zeros((nuc_array.shape[0],nuc_array.shape[1]//3),dtype=object)
        for col in range(0,nuc_array.shape[1],3):
            merged_to_codon = np.char.add(np.char.add(nuc_array[:,col],nuc_array[:,col+1]),nuc_array[:,col+2]) 
            codons_array[:,col//3] = merged_to_codon
            
        #discard positions for which we know the AA but do not know the NTs. 
        codons_array_translation = codons_array.copy()
        for cur_codon in reverse_dict.keys():
            codons_array_translation[codons_array_translation == cur_codon] = reverse_dict[cur_codon] #translate codons to AAs
        mask = codons_array_translation != aa_array #our translation will not translate "GCN" to Ala but "aa_array" will have Ala
        aa_array[mask] = '-'
        codons_array[mask] = '---'

        num_organisms,num_positions = aa_array.shape

        vals,counts = stats.mode(aa_array) 
        #vals: a 1D numpy array of length "num_positions" (codon positions) holding the dominant amino-acid of each column. *could also be a deletion*
        #counts: a 1D numpy array of the same length, indicating how many times the dominant aa appeared in the column.

        percent_required_same_AA = 0.5 #take only columns where more than 50% of the organisms have the same AA
        percent_in_reality = counts/num_organisms

        mask = percent_in_reality > percent_required_same_AA # a 1D boolian vector of length "num_positions".
        #True indicating columns where the dominant amino-acid is more prevalent than "percent_required_same_AA" 
        vals[~mask] = '-' #we dont care about the AAs that are *less* common than the required threshold

        #creating a dictionary that holds the positions where each AA is the dominant AA
        positions_dominant_AA = {}
        for cur_AA in AAs:
            positions_dominant_AA[cur_AA] = np.where(np.isin(vals,cur_AA))[1]
            
        # creating randomizations of the gene
        randomizations = np.zeros((num_randomizations,num_organisms,num_positions),dtype = 'object')
        protein_data = pd.read_pickle(f"../co_trans_data/orthologs/aa_dict/aa_{gene}.pickle.gz")
        len_protein = len(protein_data['data'][0]['homologies'][0]['source']['seq']) 
        
        for cur_rand in range(num_randomizations):
            this_rand = create_single_randomization(codons_array,AAs,positions_dominant_AA, len_protein, aa_array)
            randomizations[cur_rand,:,:] = this_rand 
        
        #save a dictionary with *the differences* of each randomization from the original msa (memory efficient)
        compressed_rands = compress_rands(randomizations, codons_array)
            
        #save the results of the current gene in a pickle
        output_path = f"../Results/column_permutations/{gene}.pickle"
        with open(output_path, 'wb') as handle:
            pickle.dump(compressed_rands, handle)
        #zip it
        !gzip $output_path
        

    except Exception as e:
        file_object = open(f"../Results/error_genes_column_permutations.txt", 'a')
        file_object.write(f"gene {gene} failed with error: {e}")
        file_object.close()

            
            



In [9]:
def do_for_single_batch(single_batch_genes: list, num_randomizations:int) -> None:
    ''' This function calls "create_randomizations_single_gene" sequentialy for each gene in the batch ''' 
    for gene in single_batch_genes:
        create_randomizations_single_gene(gene, num_randomizations)
        

## Main

In [12]:
genes_list = pd.read_pickle('../co_trans_data/genes_for_msa.pickle') 

'''Create batches of genes'''
num_wanted_cpus = 50
num_genes = len(genes_list)
num_genes_per_batch = int(np.round(num_genes /num_wanted_cpus))
batches_of_genes = [genes_list[x:x+num_genes_per_batch] for x in range(0, num_genes, num_genes_per_batch)]


In [13]:
''' Call the batches parallely '''
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = []
    for single_batch in batches_of_genes:
        futures.append(executor.submit(do_for_single_batch, single_batch_genes = single_batch, num_randomizations = 101))
                      

gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000204403.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000271503.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000172752.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000250913.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000169246.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000146707.fasta.gz: No such file or directory
gzip: ../Data/AllGenes/orthologs/aa_after_msa/ENSG00000211454.fasta.gz: No such file or directory
