We have the ITGA2 example in the MDR1 paper. We created random synonymous G->T changes in this gene. Here we calculate the MFE change that each of them causes and compare it to the actual change caused by the mutation.

## Import

In [2]:
import pandas as pd
import numpy as np
import pickle
from Utils_MDR1 import *


## Functions

In [3]:
def get_delta_mfe_single_rand(nt_CDS_mutated: str, mfe_positions: np.ndarray, chosen_pos: int) -> float:
    '''
    Return the change in per-position mfe at "chosen_pos" caused by a single mutation.

    The mutated sequence is received ready-made (this function does not create the
    mutation); its per-position mfe profile is calculated and compared, at the
    mutated position only, with the profile of the original sequence.

    * nt_CDS_mutated: the mutated CDS sequence
    * mfe_positions: the mfe scores of the original sequence, indexed as [0, position]
    * chosen_pos: the mutated position in 0-base

    Returns: mfe of the mutated sequence minus mfe of the original sequence at "chosen_pos"
    '''

    # calculate mfe per window for the mutated version of the gene
    # NOTE(review): 39 is presumably the window length and 38 a related offset - confirm in Utils_MDR1
    mfe_windows_mutated = calc_windows_mfe(nt_CDS_mutated, 39)
    # convert the window scores to mfe per position for the mutated version of the gene
    mfe_positions_mutated = calc_mfe_per_position(mfe_windows_mutated, 'original', 38, 39)

    # positive value -> the mfe at the mutated position is higher after the mutation
    delta_mfe = mfe_positions_mutated[0, chosen_pos] - mfe_positions[0, chosen_pos]

    return delta_mfe

In [4]:
def get_delta_mfes_all_rands(path_to_rands: str, num_randomizations: int, mfe_positions: np.ndarray) -> np.ndarray:
    '''
    Sample "num_randomizations" pre-computed mutated sequences and calculate the delta mfe
    caused by each of them.

    * path_to_rands: path to a pickled DataFrame of mutated sequences; every row must provide
      the columns "Sequence" and "CDS_position_0_based"
    * num_randomizations: the number of rows to sample from the DataFrame
    * mfe_positions: the mfe scores of the original sequence

    Returns: an array with one delta mfe per sampled row
    '''
    # contains sequences with all possible substitutions according to the selected criterion.
    # NOTE: unpickling can execute arbitrary code - only load files produced by this project.
    random_df = pd.read_pickle(path_to_rands)
    num_mutated_sequences = random_df.shape[0]

    # choose rows from the df randomly.
    # np.random.choice samples WITH replacement by default, so the same row can be drawn more than once.
    chosen_rows = np.random.choice(num_mutated_sequences, num_randomizations)
    random_df = random_df.iloc[chosen_rows]  # keep only the chosen rows

    # calculate the delta mfe caused by each random change
    random_delta_mfes = random_df.apply(
        lambda x: get_delta_mfe_single_rand(x.Sequence, mfe_positions, int(x.CDS_position_0_based)),
        axis=1,
    )

    return random_delta_mfes.values
        

## Main

In [5]:
''' Get the true delta mfe caused by our position '''

# The variant under study:
#  ENSG00000100320:chr22:35768281:Silent:G:T - RBFOX2

cds_position = 732 # position of the variant in the CDS, 1-based
# allele to insert, strand sensitive
# NOTE(review): the variant line above lists G:T and "A" would be the complement of T - confirm the strand
mut_allele = "A"

# Get the CDS sequence
gene_id = "ENSG00000100320" #RBFOX2
genes_dict = pd.read_pickle(f"../Data/cdna_{gene_id}.pickle.gz")
# drop the last 3 nucleotides (the stop codon): the msa was on amino acids and translated back, so there is no info on stop codons
nt_CDS = genes_dict['data'][0]['homologies'][0]['source']['seq'][:-3]

# get the mfe profile of the original sequence
mfe_windows = calc_windows_mfe(nt_CDS, 39) # calculate mfe per window
mfe_positions = calc_mfe_per_position(mfe_windows, 'original', 38, 39) # change to mfe per position

# get the mfe profile of the mutated sequence (same arguments as for the original sequence)
# cds_position is passed as-is; presumably mutate_cds_sequence expects a 1-based position - TODO confirm
nt_CDS_mut = mutate_cds_sequence(nt_CDS, cds_position, mut_allele) # mutate the sequence
mfe_windows_mut = calc_windows_mfe(nt_CDS_mut, 39) # calculate mfe per window
mfe_positions_mut = calc_mfe_per_position(mfe_windows_mut, 'original', 38, 39) # change to mfe per position

# mutated minus original mfe at the variant position; "cds_position - 1" converts to the 0-based array index
delta_mfe = (mfe_positions_mut[0,cds_position - 1] - mfe_positions[0,cds_position - 1])



In [6]:
# Display the delta mfe caused by the real mutation
delta_mfe

3.756410274750147

In [8]:
''' Get the delta mfes caused by 100 random synonymous G->T variants (to compare with G636T) '''

# NOTE(review): the label above says G->T / G636T while the input file is named "synonymous_C2A";
# presumably the same change seen from the opposite strand - confirm and update the label if needed.
num_randomizations = 100
path_to_rands = "../Results/RBFOX2/synonymous_C2A_RBFOX2.pickle"

# mfe_positions (the mfe profile of the original sequence) is a required argument of
# get_delta_mfes_all_rands; omitting it raises a TypeError.
random_deltas = get_delta_mfes_all_rands(path_to_rands, num_randomizations, mfe_positions)

# save the random deltas so they can be reused without recomputing them
with open("../Results/RBFOX2/random_delta_mfe_C2A_syn_RBFOX2.pickle",'wb') as f:
    pickle.dump(random_deltas, f)


In [None]:
''' Calculate the p-values for each mutation ''' 

In [9]:
# Compare the delta mfe of the real mutation with the delta mfes of the random variants.
# NOTE(review): get_pvalue is not defined in this notebook (presumably it comes from the star import
# of Utils_MDR1); confirm that its third argument is meant to be mfe_positions.
pval = get_pvalue(delta_mfe, random_deltas, mfe_positions)
pval



0.0