In this notebook we calculate the minimum free energy (mfe) of 39-nt-long sliding windows for a CDS. 
A single CDS has X orthologs, and each ortholog has 200 randomizations (100 column, 100 vertical). We perform the mfe sliding-window calculation for all of these. 

We will do the following:
1. Download the sequences (both original and randomized MSAs of the gene)
2. Preprocess the sequences as needed for the mfe calculation (DNA to RNA, remove deletions). 
3. Find invalid windows: some of our sequences contain ambiguous characters (such as "N", positions where we do not know the nts). *The mfe of windows containing ambiguous characters will be nan.* 
4. Calculate mfe per window for all valid windows, for both original sequences and randomizations


## Imports 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import io
import RNA #the ViennaRNA package
from Bio import SeqIO, AlignIO
import gzip
import concurrent.futures
import time
import re
from Utils_Tal import aa_positions_to_nt_positions


## Functions 

In [9]:
def get_original_codons_array(alignment_NT) -> np.ndarray:
    """Convert a nucleotide alignment into an array of codons.

    ``alignment_NT`` is converted with ``np.array`` into a 2-D array with one
    single-character entry per nucleotide position. Every three consecutive
    columns are concatenated into one codon string, so the result has the
    same number of rows and a third as many columns (object dtype).
    """
    nucleotides = np.array(alignment_NT)
    n_rows, n_cols = nucleotides.shape
    codons = np.zeros((n_rows, n_cols // 3), dtype=object)
    # Walk over the first nucleotide of each codon and glue its three columns together.
    for codon_idx, first in enumerate(range(0, n_cols, 3)):
        first_two = np.char.add(nucleotides[:, first], nucleotides[:, first + 1])
        codons[:, codon_idx] = np.char.add(first_two, nucleotides[:, first + 2])
    return codons

In [10]:
def get_randomized_codons_array(randomization_path:str, original_codons_array:np.ndarray, num_rands:int) -> np.ndarray:
    """Rebuild the randomized codon arrays from the original MSA plus a saved diff.

    To save memory only the original msa is stored, together with a pickled
    dictionary describing where each randomization differs from it:

    * ``'locations'``: index into the stacked (randomization, row, column)
      array of the positions that differ from the original msa
    * ``'codons'``: the codons of the randomizations at those positions

    Returns an array holding ``num_rands`` copies of ``original_codons_array``
    stacked along a new first axis, with the differing positions overwritten.
    The input array itself is not modified.
    """
    differences = pd.read_pickle(randomization_path)
    # np.tile allocates a new array, so the original codons stay untouched.
    stacked = np.tile(original_codons_array, (num_rands, 1, 1))
    replacement_codons = differences['codons']
    changed_positions = differences['locations']
    stacked[changed_positions] = replacement_codons
    return stacked

In [11]:
def preprocess_original_sequence(sequence:np.ndarray) -> (str,int,list):
    """Prepare one aligned codon sequence for the mfe calculation.

    Parameters
    ----------
    sequence : array of codon strings (one row of the codon alignment).

    Returns
    -------
    (processed_sequence, length, deletion_positions_nt)
        * processed_sequence: the codons joined into one upper-case RNA
          string with all gap characters ("-") removed,
        * length: the length of the processed sequence,
        * deletion_positions_nt: the result of ``aa_positions_to_nt_positions``
          applied to the indices of the fully deleted codons ("---")
          (presumably the matching nucleotide positions; see Utils_Tal).
    """
    # locations of the deleted codons in the aligned sequence
    del_locs_aa = np.where(sequence == '---')[0]
    del_locs_nt = aa_positions_to_nt_positions(del_locs_aa)
    # Remove gaps, upper-case, then transcribe DNA to RNA. Upper-casing must come
    # BEFORE the T->U replacement, otherwise a lower-case "t" is left as "T".
    sequence = ''.join(sequence).replace("-","").upper().replace("T","U")
    len_sequence = len(sequence) # sequence length after the deletions were removed
    return(sequence,len_sequence,del_locs_nt)

In [12]:
def find_valid_windows(sequence:str, window_size:int, valid_chars:set) -> list:
    """Return the start indices of all windows made only of valid characters.

    Sequences may contain ambiguous characters such as "N" (a position that
    could be A/C/G/U). The mfe is not calculated for windows containing such
    characters, so this function lists the windows that are safe to fold:
    every start index ``i`` for which ``sequence[i:i+window_size]`` contains
    no character outside ``valid_chars``.
    """
    last_start = len(sequence) - window_size
    return [
        start
        for start in range(last_start + 1)
        # an empty set difference means no ambiguous character in this window
        if not (set(sequence[start:start + window_size]) - valid_chars)
    ]

In [2]:
def calc_mfe_single_sequence(sequence:str, max_num_windows:int, window_size:int, valid_windows:list) -> np.ndarray:
    """Calculate the MFE of the valid ``window_size``-long windows of one sequence.

    Parameters
    ----------
    sequence : the sequence to fold.
    max_num_windows : number of columns of the returned vector.
    window_size : length of each window.
    valid_windows : start indices of the windows to fold.

    Returns
    -------
    Array of shape ``(1, max_num_windows)``. Entry ``[0, w]`` holds the MFE
    returned by ``RNA.fold`` for the window starting at ``w`` for every ``w``
    in ``valid_windows``; all other entries are NaN.

    Raises
    ------
    ValueError : if a listed window does not contain exactly ``window_size``
        characters (i.e. it runs past the end of ``sequence``).
    """
    # initialize results vector with NaN so that skipped windows stay NaN
    mfe_single_seq = np.full((1, max_num_windows), np.nan)
    # iterate over windows and calculate mfe
    for window in valid_windows:
        window_seq = sequence[window:window + window_size]
        # Check against the requested window size (not a hard-coded 39), and do it
        # with a real exception so the check survives running with "python -O".
        if len(window_seq) != window_size:
            raise ValueError(
                f"window starting at {window} has length {len(window_seq)}, expected {window_size}"
            )
        (_, mfe) = RNA.fold(window_seq)
        mfe_single_seq[0, window] = mfe
    return mfe_single_seq


In [3]:
def calc_mfe_original(sequences:list, gene:str, num_orthologs:int, max_num_windows:int, window_size:int, valid_windows:list) -> np.ndarray:
    """Calculate the MFE-per-window scores for the original CDS sequences of all orthologs.

    Parameters
    ----------
    sequences : the preprocessed sequence of every ortholog.
    gene : gene identifier (not used by the calculation itself).
    num_orthologs : number of orthologs, i.e. rows of the result.
    max_num_windows : number of columns of the result.
    window_size : length of each window.
    valid_windows : for every ortholog, the start indices of the windows to fold.

    Returns
    -------
    Array of shape ``(num_orthologs, max_num_windows)``; windows that were not
    folded are NaN.
    """
    mfe_results = np.full((num_orthologs, max_num_windows), np.nan) # initialize results matrix

    for cur_ortholog in range(num_orthologs):
        # calc_mfe_single_sequence is the per-sequence routine of this notebook
        mfe_results[cur_ortholog,:] = calc_mfe_single_sequence(sequences[cur_ortholog], max_num_windows, window_size, valid_windows[cur_ortholog])
    return mfe_results


In [4]:
def zip_and_save_results(gene:str, mfe_orig:np.ndarray, mfe_col:np.ndarray, mfe_ver:np.ndarray) -> None:
    """Save the window mfe scores of the original and randomized MSAs as gzipped pickles.

    Writes three files named ``{gene}.pickle.gz`` under
    ``../Results/AllGenes/mfe/window_mfe_scores/`` in the sub-folders
    ``original``, ``column`` and ``vertical`` (for ``mfe_orig``, ``mfe_col``
    and ``mfe_ver`` respectively). An existing file is overwritten.
    """
    results_by_folder = (
        ("original", mfe_orig),
        ("column", mfe_col),
        ("vertical", mfe_ver),
    )
    for folder, scores in results_by_folder:
        output_path = f"../Results/AllGenes/mfe/window_mfe_scores/{folder}/{gene}.pickle.gz"
        # Write the compressed pickle directly instead of pickling and then shelling
        # out to gzip: no dependency on IPython or a gzip binary, no unquoted path
        # in a shell command, and re-runs replace stale results.
        with gzip.open(output_path, 'wb') as handle:
            pickle.dump(scores, handle, protocol = 4)

In [5]:
def _codons_to_rna(codons) -> str:
    """Join a row of codons into one upper-case RNA string with the gap characters removed."""
    # Upper-case BEFORE the T->U replacement so that a lower-case "t" is transcribed too.
    return ''.join(codons).replace("-","").upper().replace("T","U")


def calc_mfe_windows_single_gene(gene:str, num_randomizations:int, window_size:int) -> None:
    """Calculate the MFE-per-window scores of the original and permuted MSAs of a single gene.

    Steps:
      1. Load the original nucleotide MSA and rebuild the column / vertical
         randomizations from their saved diffs.
      2. Preprocess all sequences (remove deletions, upper-case, DNA to RNA)
         and save the per-ortholog sequence lengths and deletion positions.
      3. Find the valid windows (no ambiguous characters) of every ortholog,
         once, using the original sequences.
      4. Calculate the mfe of every valid window for the original and the
         randomized sequences and save the three result arrays.

    Any exception is caught and appended, one line per failure, to
    ``error_window_mfe.txt``; nothing is returned.
    """
    try:
        # (1) Load data
        # original MSA in nucleotides - read straight from the gzipped file, so no
        # temporary unzipped copy can be left behind when parsing fails
        path_NT = f"../co_trans_data/orthologs/nt_after_msa/{gene}.fasta.gz" #MSA in nucleotides
        with gzip.open(path_NT, "rt") as handle:
            alignment_NT = AlignIO.read(handle, "fasta")
        codons_array = get_original_codons_array(alignment_NT)

        # randomizations
        path_col_rand = f"../Results/AllGenes/column_permutations/{gene}.pickle.gz"
        codons_array_col = get_randomized_codons_array(path_col_rand, codons_array, num_randomizations)
        path_ver_rand = f"../Results/AllGenes/vertical_permutations/{gene}.pickle.gz"
        codons_array_ver = get_randomized_codons_array(path_ver_rand, codons_array, num_randomizations)

        # (2) Preprocess original and random sequences and get needed info
        # (sequence lengths, deletion locations...).
        num_orthologs = codons_array_col.shape[1]
        # preprocess original
        len_sequences_dict, deletion_pos_dict, original_sequences, len_sequences = {}, {}, [], []
        for cur_ortholog in range(num_orthologs):
            processed_seq, length_seq, dels = preprocess_original_sequence(codons_array[cur_ortholog,:])
            len_sequences_dict[cur_ortholog] = length_seq
            deletion_pos_dict[cur_ortholog] = dels
            original_sequences.append(processed_seq)
            len_sequences.append(length_seq)
        # save dicts needed for "mfe_positions_zscores.ipynb"
        with open(f"../co_trans_data/cds_lengths/{gene}.pickle", 'wb') as handle:
            pickle.dump(len_sequences_dict, handle)
        with open(f"../co_trans_data/del_positions_orig/{gene}.pickle", 'wb') as handle:
            pickle.dump(deletion_pos_dict, handle)
        # preprocess randomizations: rand_sequences_dict[type][randomization][ortholog] -> sequence
        rand_sequences_dict = {'column': {}, 'vertical': {}}
        for rand in range(num_randomizations):
            rand_sequences_dict['column'][rand] = {}
            rand_sequences_dict['vertical'][rand] = {}
            for cur_ortholog in range(num_orthologs):
                rand_sequences_dict['column'][rand][cur_ortholog] = _codons_to_rna(codons_array_col[rand,cur_ortholog,:])
                rand_sequences_dict['vertical'][rand][cur_ortholog] = _codons_to_rna(codons_array_ver[rand,cur_ortholog,:])

        # (3) Finding valid windows: windows that do not contain any ambiguous characters.
        # The locations of the ambiguous characters are the same between the original and
        # the randomized msas, so we find the valid windows only once, using the original
        # msa, and use them for all sequences.
        valid_chars = {'A','C','G','U'}
        valid_windows = [
            find_valid_windows(original_sequence, window_size, valid_chars)
            for original_sequence in original_sequences
        ]

        # (4) Calculate mfe per window for all valid windows, for both original
        # sequences and randomizations
        max_num_windows = max(len_sequences) - window_size + 1
        # original
        mfe_orig = np.full((num_orthologs,max_num_windows), np.nan) # initialize results matrix
        for cur_ortholog in range(num_orthologs):
            mfe_orig[cur_ortholog,:] = calc_mfe_single_sequence(original_sequences[cur_ortholog], max_num_windows, window_size, valid_windows[cur_ortholog])
        # column and vertical randomizations
        mfe_col = np.full((num_randomizations,num_orthologs,max_num_windows), np.nan)
        mfe_ver = mfe_col.copy()
        for cur_rand in range(num_randomizations):
            for cur_ortholog in range(num_orthologs):
                mfe_col[cur_rand,cur_ortholog,:] = calc_mfe_single_sequence(rand_sequences_dict['column'][cur_rand][cur_ortholog], max_num_windows, window_size, valid_windows[cur_ortholog])
                mfe_ver[cur_rand,cur_ortholog,:] = calc_mfe_single_sequence(rand_sequences_dict['vertical'][cur_rand][cur_ortholog], max_num_windows, window_size, valid_windows[cur_ortholog])

        zip_and_save_results(gene, mfe_orig, mfe_col, mfe_ver)

    except Exception as e:
        # Best effort: record the failure and let the remaining genes continue.
        # The trailing newline keeps every failure on its own line of the log.
        with open("../Results/AllGenes/mfe/window_mfe_scores/error_window_mfe.txt", 'a') as file_object:
            file_object.write(f"gene {gene} failed with error: {e}\n")



## Main

In [6]:
# Load the collection of genes to process; below it is sliced into batches and
# iterated gene by gene (presumably the genes that have randomizations - confirm).
genes_list = pd.read_pickle("../co_trans_data/genes_with_rands.pickle")

In [8]:
# Create batches of genes: consecutive slices of genes_list, roughly one per wanted cpu.
num_wanted_cpus = 30
num_genes = len(genes_list)
# Keep at least one gene per batch: for short gene lists the rounded ratio is 0,
# and a step of 0 would make range() raise a ValueError.
num_genes_per_batch = max(1, int(np.round(num_genes / num_wanted_cpus)))
batches_of_genes = [genes_list[x:x+num_genes_per_batch] for x in range(0, num_genes, num_genes_per_batch)]


In [9]:
def do_for_single_batch(single_batch_genes: list, num_randomizations:int, window_size:int) -> None:
    """Run the main function "calc_mfe_windows_single_gene" on every gene of one batch, one after the other."""
    for current_gene in single_batch_genes:
        calc_mfe_windows_single_gene(current_gene, num_randomizations, window_size)
        

In [None]:
# Run the batches in parallel, one task per batch of genes.
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = {
        executor.submit(do_for_single_batch, single_batch_genes = single_batch, num_randomizations = 101, window_size = 39): batch_index
        for batch_index, single_batch in enumerate(batches_of_genes)
    }
    # Inspect every finished task so that a failure inside a worker is reported
    # instead of being dropped silently together with its future.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"batch {futures[future]} failed with error: {error!r}")
                      