In this notebook we compare the change in MFE caused by T1236C, T2677G and T3435C with the delta MFE caused by similar random changes, in order to calculate an empirical p-value for each variant.
We use the DataFrames of random mutations created in the notebook "create_MDR1_valid_mutations_dfs.ipynb".



## Import

In [10]:
import pandas as pd
import numpy as np
import pickle
from Utils_MDR1 import *



## Functions

In [11]:
def get_delta_mfe_single_rand(nt_CDS_mutated: str, mfe_positions: np.array, chosen_pos: int) -> float:
    ''' Return the delta mfe that a single, already applied mutation causes at its own position.

    The mutated sequence is re-scored (mfe per window, then mfe per position) and the
    original score at "chosen_pos" is subtracted from the mutated score at "chosen_pos".

    * nt_CDS_mutated: the mutated CDS sequence
    * mfe_positions: the per-position mfe scores of the original sequence, indexed as [0, position]
    * chosen_pos: the mutated position, 0-based

    NOTE(review): the literals 39, 'original' and 38 are passed straight to the scoring helpers;
    their meaning is defined by those helpers and should be confirmed there. '''

    # mfe per window for the mutated version of the gene
    mutated_window_scores = calc_windows_mfe(nt_CDS_mutated, 39)
    # mfe per position for the mutated version of the gene
    mutated_position_scores = calc_mfe_per_position(mutated_window_scores, 'original', 38, 39)

    score_after_mutation = mutated_position_scores[0, chosen_pos]
    score_before_mutation = mfe_positions[0, chosen_pos]
    return score_after_mutation - score_before_mutation



In [12]:
def get_delta_mfes_all_rands(rand_type: str, num_randomizations: int) -> np.array:
    ''' Draw "num_randomizations" random mutations of the requested type and return the
    delta mfe caused by each of them.

    * rand_type: name of the pickled table in ../Data/random_mutations_for_pvals/ (without ".pickle")
    * num_randomizations: number of rows to draw from that table

    Returns the `.values` of the per-row delta mfe results.

    NOTE: the scores of the original sequence are read from the module-level variable
    "mfe_positions", which therefore has to exist before this function is called.
    NOTE(review): np.random.choice is used with its default settings, so a row can be
    drawn more than once - confirm that sampling with replacement is intended. '''

    # table of mutated sequences; contains all possible substitutions according to the selected criterion
    candidates = pd.read_pickle(f"../Data/random_mutations_for_pvals/{rand_type}.pickle")

    # pick row numbers at random out of all the rows of the table, and keep only those rows
    picked_rows = np.random.choice(candidates.shape[0], num_randomizations)
    sampled = candidates.iloc[picked_rows]

    def delta_for_row(row):
        # delta mfe caused by the random change stored in this row
        return get_delta_mfe_single_rand(row.Sequence, mfe_positions, int(row.CDS_position_0_based))

    return sampled.apply(delta_for_row, axis=1).values
        

## Main

In [13]:
''' Load the MDR1 CDS sequence (without its stop codon) '''
gene = 'ENSG00000085563'  # MDR1/ABCB1 gene
genes_dict = pd.read_pickle(f"../Data/cdna_{gene}.pickle.gz")

# the sequence sits under the 'source' entry of the first homology of the first data record
source_record = genes_dict['data'][0]['homologies'][0]['source']
# removing the stop codon (last 3 nucleotides) -> msa was on aas and translated back, so no info on stop codons
nt_CDS = source_record['seq'][:-3]


In [14]:
''' Get the mfe scores for the original sequence '''
# mfe_positions is read as a global by get_delta_mfes_all_rands and is also used when the
# true delta mfes are computed, so this cell has to run before either of them.
# NOTE(review): calc_windows_mfe / calc_mfe_per_position presumably come from Utils_MDR1;
# the meaning of the arguments 39, 'original' and 38 should be confirmed there.
mfe_windows = calc_windows_mfe(nt_CDS, 39) #calculate mfe per window
mfe_positions = calc_mfe_per_position(mfe_windows, 'original', 38, 39) #calculate mfe per position from the window scores


In [15]:
''' Get the true delta mfe caused by our three positions '''

# The three variants of interest, looked up once each. Their order must match the
# unpacking at the end of the cell (T1236C, T2677G, T3435C).
variants_of_interest = [variant_info[key] for key in (1, 2, 3)]
positions_of_interest = [variant["cds_position"] - 1 for variant in variants_of_interest] #our positions, 0-based.
changed_to = [variant["change_to"] for variant in variants_of_interest] #the nucleotide change that occurred there

true_delta_mfes = []
for chosen_pos, mutation in zip(positions_of_interest, changed_to):
    # mutate_cds_sequence is given chosen_pos + 1, i.e. the un-shifted "cds_position" value
    nt_CDS_mutated = mutate_cds_sequence(nt_CDS, chosen_pos + 1, mutation)
    # use the same helper as the random mutations, so the true and the random deltas
    # are guaranteed to be computed in exactly the same way
    delta_mfe = get_delta_mfe_single_rand(nt_CDS_mutated, mfe_positions, chosen_pos)
    true_delta_mfes.append(delta_mfe)

delta_T1236C, delta_T2677G, delta_T3435C = true_delta_mfes


In [13]:
''' Random background for T1236C and T3435C: the delta mfes caused by 100 random synonymous T->C variants, saved to disk '''

rand_type = "synonymous_T2C"
num_randomizations = 100
random_deltas = get_delta_mfes_all_rands(rand_type, num_randomizations)

# keep the result on disk, the p-value cells read it back from this path
output_path = f"../Results/mfe/delta_mfe/random_delta_mfe_{rand_type}.pickle"
with open(output_path, 'wb') as f:
    pickle.dump(random_deltas, f)


In [16]:
''' Random background for T2677G: the delta mfes caused by 100 random non-synonymous T->G variants, saved to disk '''

rand_type = "nonsynonymous_T2G"
num_randomizations = 100
random_deltas = get_delta_mfes_all_rands(rand_type, num_randomizations)

# keep the result on disk, the p-value cell reads it back from this path
output_path = f"../Results/mfe/delta_mfe/random_delta_mfe_{rand_type}.pickle"
with open(output_path, 'wb') as f:
    pickle.dump(random_deltas, f)


In [None]:
''' Calculate the p-values for each mutation ''' 

In [49]:
# T1236C - compared with the random synonymous T->C changes
rand_type = "synonymous_T2C"
random_deltas_path = f"../Results/mfe/delta_mfe/random_delta_mfe_{rand_type}.pickle"
random_deltas = pd.read_pickle(random_deltas_path)  # the saved random delta mfes of this type

pval_1236 = get_pvalue(delta_T1236C, random_deltas)

# save the p-value
with open("../Results/mfe/delta_mfe/pval_T1236C.pickle", 'wb') as f:
    pickle.dump(pval_1236, f)



In [50]:
# display the p-value obtained for T1236C
pval_1236

0.010000000000000009

In [17]:
# T2677G - compared with the random non-synonymous T->G changes
rand_type = "nonsynonymous_T2G"
random_deltas_path = f"../Results/mfe/delta_mfe/random_delta_mfe_{rand_type}.pickle"
random_deltas = pd.read_pickle(random_deltas_path)  # the saved random delta mfes of this type

pval_2677 = get_pvalue(delta_T2677G, random_deltas)

# save the p-value
with open("../Results/mfe/delta_mfe/pval_T2677G.pickle", 'wb') as f:
    pickle.dump(pval_2677, f)


In [18]:
# display the p-value obtained for T2677G
pval_2677

0.020000000000000018

In [53]:
# T3435C - compared with the random synonymous T->C changes (same background as T1236C)
rand_type = "synonymous_T2C"
random_deltas_path = f"../Results/mfe/delta_mfe/random_delta_mfe_{rand_type}.pickle"
random_deltas = pd.read_pickle(random_deltas_path)  # the saved random delta mfes of this type

pval_3435 = get_pvalue(delta_T3435C, random_deltas)

# save the p-value
with open("../Results/mfe/delta_mfe/pval_T3435C.pickle", 'wb') as f:
    pickle.dump(pval_3435, f)


In [54]:
# display the p-value obtained for T3435C
pval_3435

0.040000000000000036