In this notebook we want to create randomizations for an MSA. The randomizations have to preserve certain qualities. Here, we will only permute synonymous codons in each column of the MSA. 

The output of this notebook is the set of randomized versions of each MSA. To save space, for each gene we save only a dictionary holding the locations where the randomizations differ from the original MSA, together with the codons at those locations.  


*this notebook should run on the power cluster*

## Imports

In [1]:
#general imports, common to all projects
import numpy as np
import pandas as pd
import os
import sys
import pickle
from tqdm import tqdm

In [2]:
#import relevant for this notebook
from Bio import AlignIO
import random
import concurrent.futures
import gc
from Bio import SeqIO
from pathlib import Path
import io
from Utils_Tal import SynonymousCodons, AAs, reverse_dict
from scipy import stats


## Functions

In [3]:
def get_mask_dominant_aa(aa_array:np.ndarray, threshold_same_AA:float, num_organisms:int,num_randomizations:int) -> np.ndarray:
    '''Flag the MSA columns (codon positions) that have a dominant amino-acid.

    A column has a dominant amino-acid when its most common character is not a deletion ('-')
    and it appears in more than "threshold_same_AA" of the orthologs.

    aa_array: 2D array (orthologs x codon positions) holding one character per cell.
    threshold_same_AA: the fraction (counts / num_organisms) the dominant character must exceed.
    num_organisms: the denominator used to turn the counts into fractions.
    num_randomizations: the size of the leading axis of the returned mask.

    Returns a boolian array of shape (num_randomizations, aa_array.shape[0], aa_array.shape[1]).
    Every cell of a column holds the same value: True if the column has a dominant amino-acid.
    '''
    num_orthologs, num_positions = aa_array.shape

    # The most common character of each column is counted with np.unique instead of scipy.stats.mode,
    # because recent scipy versions only accept numeric arrays in stats.mode.
    vals = np.zeros(num_positions, dtype=aa_array.dtype)  #the dominant character of each column. *could also be a deletion*
    counts = np.zeros(num_positions, dtype=int)  #how many times the dominant character appeared in the column
    for position in range(num_positions):
        symbols, symbol_counts = np.unique(aa_array[:, position], return_counts=True)
        if symbols.size == 0:  #no orthologs at all: nothing to count
            continue
        # np.unique returns sorted symbols and argmax takes the first maximum,
        # so ties go to the smallest symbol (the same tie-breaking stats.mode uses)
        top = np.argmax(symbol_counts)
        vals[position] = symbols[top]
        counts[position] = symbol_counts[top]

    counts[vals == '-'] = 0 #dont consider positions where the dominant character is a deletion

    percent_in_reality = counts/num_organisms
    mask_per_position = percent_in_reality > threshold_same_AA # a 1D boolian vector of length "num_positions".

    #broadcast to the shape of the randomizations; the copy makes the result an ordinary writable array
    full_shape = (num_randomizations, num_orthologs, num_positions)
    return np.broadcast_to(mask_per_position, full_shape).copy()


In [9]:
def permute_non_zeros(threeD_array: np.ndarray, i:list, j:list, random_nums:np.ndarray) -> np.ndarray:
    '''
    Shuffle, independently for every randomization, the selected elements of each row of a 3D array.

    threeD_array: a 3D numpy array whose first axis is the randomization axis. Each 2D slice
    threeD_array[d] holds the elements to shuffle at the positions (i, j); the caller in this
    notebook passes the codons of a single amino-acid, with 0 in all the other cells, after
    transposing so that the rows are codon positions (MSA columns) and the columns are orthologs.
    For example- lets say this is our MSA: ['ATG','TAT','AAG'],['ATG','TAT','TAC'] (2 orthologs, 3 codons),
    in amino acids it is ['M','Y','K'],['M','Y','Y'], and we are now considering amino-acid Y-
    then the codons matrix is [0,'TAT',0],[0,'TAT','TAC'], and only the synonymous codons of Y that
    share an MSA column are permuted among themselves.
    i: the row indices (in each 2D slice) of the elements to shuffle, grouped by row as np.nonzero returns them.
    j: the matching column indices. Must support indexing by an index array (e.g. a numpy array).
    random_nums: random numbers in [0,1), one row per randomization (numpy array of size (num_randomizations,len(i))).

    The array is modified in place and is also returned.
    This solution is based on [https://stackoverflow.com/questions/45764955/shuffling-non-zero-elements-of-each-row-in-an-array-python-numpy]
    and it was done so we can shuffle in a vectorized manner.
    '''
    for depth, layer in enumerate(threeD_array): #each layer is a view of one randomization
        jitter = random_nums[depth, :] #the random numbers assigned for this randomization
        # adding a fraction to the integer row index keeps the rows grouped,
        # so argsort only reorders elements that share a row
        shuffled_order = np.argsort(i + jitter)
        layer[i, j] = layer[i, j[shuffled_order]]
    return threeD_array

In [10]:
def compress_rands(randomizations: np.ndarray, codons_array_original:np.ndarray) -> dict:
    '''Keep only the cells in which the randomizations differ from the original msa of the gene.

    randomizations: the randomized versions of the msa of a single gene.
    codons_array_original: the original msa of that gene; it is compared against the randomizations with numpy broadcasting.

    Returns a dictionary with two entries:
    'locations' - the index arrays (one per axis of the comparison) of the differing cells.
    'codons' - the content of the randomizations in those cells.
    Saving only the differences saves a lot of memory.'''
    differing_cells = np.nonzero(randomizations != codons_array_original)
    return {
        'locations': differing_cells,
        'codons': randomizations[differing_cells],
    }
    

In [11]:
def create_randomizations_single_gene(gene:str, num_randomizations:int, threshold_same_AA:float = 0.5) -> None:
    ''' Create all randomized MSAs for a single gene and save them, compressed, to disk.

    The amino-acid and nucleotide MSAs of the gene are read from
    "../co_trans_data/orthologs/aa_after_msa/{gene}.fasta.gz" and
    "../co_trans_data/orthologs/nt_after_msa/{gene}.fasta.gz".
    In every randomization the synonymous codons of each amino-acid are permuted within their MSA column.
    Codons whose translation (by "reverse_dict") disagrees with the amino-acid MSA are treated as deletions,
    and columns without a dominant amino-acid are masked with '-' before the permutation.

    gene: the gene name used in the input and output file names.
    num_randomizations: how many randomized MSAs to create.
    threshold_same_AA: the fraction of orthologs the dominant amino-acid of a column must exceed (see "get_mask_dominant_aa").

    Only the differences from the original MSA (see "compress_rands") are pickled, into
    "../Results/AllGenes/vertical_permutations/{gene}.pickle.gz".
    Nothing is returned. Any exception is caught and appended, one line per failure, to
    "error_genes_vertical_permutations.txt" in the same results directory.
    '''
    import gzip #imported here because the import cells of the notebook do not load gzip

    try:
        path_AA = f"../co_trans_data/orthologs/aa_after_msa/{gene}.fasta.gz" #MSA in amino-acids
        path_NT = f"../co_trans_data/orthologs/nt_after_msa/{gene}.fasta.gz" #MSA in nucleotides

        # read the gzipped alignments directly (no shell call and no temporary decompressed copies
        # that would be left behind if the parsing fails)
        with gzip.open(path_AA, "rt") as aa_handle:
            alignment_AA = AlignIO.read(aa_handle, "fasta")
        with gzip.open(path_NT, "rt") as nt_handle:
            alignment_NT = AlignIO.read(nt_handle, "fasta")
        aa_array = np.array(alignment_AA) #changing to numpy for code efficiency

        #create codon array from nucleotide array
        nuc_array = np.array(alignment_NT) # an array where each column is a nucleotide position
        num_nt_positions = nuc_array.shape[1]
        if num_nt_positions % 3 != 0: #otherwise the merge below fails with an unclear index error
            raise ValueError(f"nucleotide alignment length ({num_nt_positions}) is not a multiple of 3")
        #lets merge every three columns into one, to obtain a numpy of codon positions instead of nucleotide positions
        codons_array = np.zeros((nuc_array.shape[0],num_nt_positions//3),dtype=object)
        for col in range(0,num_nt_positions,3):
            merged_to_codon = np.char.add(np.char.add(nuc_array[:,col],nuc_array[:,col+1]),nuc_array[:,col+2])
            codons_array[:,col//3] = merged_to_codon

        #remove positions of AAs with ambigious nts:
        codons_array_translation = codons_array.copy()
        for cur_codon, cur_aa in reverse_dict.items():
            codons_array_translation[codons_array_translation == cur_codon] = cur_aa #translate valid codons to AAs
        mask = codons_array_translation != aa_array #our translation will not translate "GCN" to Ala but "aa_array" will have Ala in that position. this will find ambigious codons.
        aa_array[mask] = '-' #put this where there are ambigious nts (meaning, we are treating them the same as deletions)
        codons_array[mask] = '---' #again, treat as deletions in the nt version of the matrix as well

        mask_dominant_aa = get_mask_dominant_aa(aa_array, threshold_same_AA = threshold_same_AA, num_organisms = aa_array.shape[0], num_randomizations = num_randomizations)

        # duplicating the aa array and nucleotide array "num_randomizations" times for the creations of the randomizations.
        repeated_AAs = np.tile(aa_array,(num_randomizations,1,1))
        repeated_codons = np.tile(codons_array,(num_randomizations,1,1))
        repeated_AAs[~mask_dominant_aa] = '-' #for columns without a dominant aa: replace the amino-acids with "-"

        # for each AA, get the locations of the codons coding for that AA and then perform a vertical permutation between them.
        for AA in tqdm(AAs):
            mask_this_AA = np.isin(repeated_AAs,AA) # True where there are synonymous codons of the current amino acid
            repeated_this_AA = repeated_codons.copy() #create a copy of the codons matrix
            repeated_this_AA[~mask_this_AA] = 0 #keep only the synonymous codons of this AA (the rest are zeros)

            # now we want to shuffle all non-zero element column-wise.
            # this solution is based on https://stackoverflow.com/questions/45764955/shuffling-non-zero-elements-of-each-row-in-an-array-python-numpy
            #but is changed a bit to fit a 3D array
            repeated_this_AA = repeated_this_AA.transpose(0,2,1) #we transpose because we know how to perform a shuffle in-row and not in-column
            i, j = np.nonzero(repeated_this_AA[0].astype(bool)) # i, j are the same for all randomizations as the locations of the non-zeros elements are the same for them
            random_nums = np.random.rand(i.size * num_randomizations) #create the random numbers that are needed for all the "num_randomizations" permutations (faster then doing it "num_randomizations" times)
            random_nums = random_nums.reshape((num_randomizations, i.size)) #reshape to split it to the "num_randomizations" randomizations
            repeated_this_AA = permute_non_zeros(repeated_this_AA,i,j,random_nums) # perform the shuffling
            repeated_this_AA = repeated_this_AA.transpose(0,2,1) #tranpose back
            repeated_codons[mask_this_AA] = repeated_this_AA[mask_this_AA] #assign our randomized version of this AA's codons to the original data

        #keep only *the differences* of each randomization from the original msa (memory efficient)
        compressed_rands = compress_rands(repeated_codons, codons_array)

        #save the results of the current gene in a gzipped pickle
        output_path = f"../Results/AllGenes/vertical_permutations/{gene}.pickle"
        with gzip.open(f"{output_path}.gz", 'wb', compresslevel=6) as handle: #level 6 is the default of the gzip command line tool
            pickle.dump(compressed_rands, handle)

    except Exception as e:
        # best effort: a failing gene is logged and skipped so the rest of the batch keeps running
        error_path = "../Results/AllGenes/vertical_permutations/error_genes_vertical_permutations.txt"
        with open(error_path, 'a') as file_object:
            file_object.write(f"gene {gene} failed with error: {e}\n") #one failure per line



In [3]:
def do_for_single_batch(single_batch_genes: list, num_randomizations:int) -> None:
    ''' Process one batch of genes: "create_randomizations_single_gene" is called for each gene
    of the batch, one after the other, with the same "num_randomizations". Nothing is returned. '''
    for current_gene in single_batch_genes:
        create_randomizations_single_gene(current_gene, num_randomizations)
        

## Main

In [None]:
'''Create batches of genes: split "genes_list" into consecutive slices of (roughly) equal size'''
num_wanted_cpus = 50
num_genes = len(genes_list)
# at least one gene per batch: when the genes are far fewer than "num_wanted_cpus" the rounded
# size is 0, and a step of 0 makes range() below raise a ValueError
num_genes_per_batch = max(1, int(np.round(num_genes /num_wanted_cpus)))
batches_of_genes = [genes_list[x:x+num_genes_per_batch] for x in range(0, num_genes, num_genes_per_batch)]


In [None]:
''' Call the batches parallely: every batch is submitted as one task to a pool of processes '''
# NOTE(review): "num_wanted_cpus" is not passed as max_workers, so the pool uses its default size - confirm this is intended.
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = []
    for single_batch in batches_of_genes:
        futures.append(executor.submit(do_for_single_batch, single_batch_genes = single_batch, num_randomizations = 101))

    # inspect every future: otherwise a batch that fails (e.g. a worker process that died) goes unnoticed.
    # failures are reported and not raised, so the remaining batches still finish.
    for batch_index, future in enumerate(futures):
        batch_error = future.exception() #waits for the batch to finish
        if batch_error is not None:
            print(f"batch {batch_index} failed with error: {batch_error!r}", file=sys.stderr)
                      