In this notebook we compare the changes in CAI, FPTC and tAI caused by T1236C, T2677G and T3435C to the changes caused by similar random variants, in order to calculate an empirical p-value for each variant.
We use the DataFrames of random mutations that were created in the notebook "create_MDR1_valid_mutations_dfs.ipynb".


In [12]:
import pickle
from typing import Tuple, Union

import numpy as np
import pandas as pd

from Utils_MDR1 import variant_info, get_codon_and_freq, get_pvalue


## Functions

In [2]:
def delta_true_variants(weights_dict: dict, variant_info: dict) -> Tuple[float, float, float]:
    """Calculate the change in a codon measure caused by each of the three variants.

    Args:
        weights_dict: Maps a codon to its score in the measure of interest.
        variant_info: Maps the keys 1, 2 and 3 (T1236C, T2677G and T3435C,
            respectively) to a mapping holding the entries "codon_before"
            and "codon_after".

    Returns:
        A tuple ``(delta_T1236C, delta_T2677G, delta_T3435C)``, where each delta
        is ``weights_dict[codon_after] - weights_dict[codon_before]``.

    Raises:
        KeyError: If a variant key, a codon entry, or a codon score is missing.
    """

    def codon_delta(variant_key: int) -> float:
        # score of the mutated codon minus the score of the original codon
        codons = variant_info[variant_key]
        return weights_dict[codons["codon_after"]] - weights_dict[codons["codon_before"]]

    return (codon_delta(1), codon_delta(2), codon_delta(3))


In [3]:
def get_delta_cub_single_rand(nt_CDS: str, cds_position_0_based: int, changed_to: str, changed_from: str, cub_dict: dict) -> float:
    """Calculate the change in a codon measure caused by a single random variant.

    The codon containing the variant is located by taking the reading frame to
    start at index 0 of ``nt_CDS``. The nucleotide at the variant position is
    replaced by ``changed_to`` and the scores of the mutated and the original
    codons are compared.

    Args:
        nt_CDS: Nucleotide sequence of the CDS.
        cds_position_0_based: 0-based position of the variant in ``nt_CDS``.
        changed_to: Nucleotide introduced by the variant.
        changed_from: Nucleotide expected in ``nt_CDS`` at the variant position.
        cub_dict: Maps a codon to its score.

    Returns:
        ``cub_dict[mutated_codon] - cub_dict[original_codon]``, or ``np.nan``
        when the original or the mutated codon is a stop codon.

    Raises:
        IndexError: If ``cds_position_0_based`` is outside ``nt_CDS``.
        ValueError: If ``nt_CDS`` does not hold ``changed_from`` at the position.
        KeyError: If a (non-stop) codon is missing from ``cub_dict``.
    """
    if not 0 <= cds_position_0_based < len(nt_CDS):
        # a negative index would silently wrap around to the end of the sequence
        raise IndexError(
            f"position {cds_position_0_based} is outside the CDS (length {len(nt_CDS)})"
        )

    # Explicit check instead of `assert`, which is removed when Python runs with -O
    if nt_CDS[cds_position_0_based] != changed_from:
        raise ValueError(
            f"nt_CDS has {nt_CDS[cds_position_0_based]!r} at position "
            f"{cds_position_0_based}, expected {changed_from!r}"
        )

    # is the mutation in the first, second or third position of the codon?
    position_in_codon = cds_position_0_based % 3
    codon_start = cds_position_0_based - position_in_codon

    original_codon = nt_CDS[codon_start: codon_start + 3]
    mutated_codon = (
        original_codon[:position_in_codon]
        + changed_to
        + original_codon[position_in_codon + 1:]
    )

    stop_codons = ("TAG", "TGA", "TAA")
    if original_codon in stop_codons or mutated_codon in stop_codons:
        # stop codons are not scored here
        return np.nan

    return cub_dict[mutated_codon] - cub_dict[original_codon]


In [4]:
def get_delta_cub_all_rands(rand_type: str, num_randomizations: int, cub_dict: dict, nt_CDS: str) -> np.ndarray:
    """Calculate the change in a codon measure for "num_randomizations" random variants.

    "rand_type" is either "synonymous_T2C" or "nonsynonymous_T2G" and selects the
    table of candidate variants to draw from. For example, T1236C is a synonymous
    T->C variant, so "synonymous_T2C" variants are used to create similar random
    changes.

    Args:
        rand_type: Name of the candidate table; it is loaded from
            "../Data/random_mutations_for_pvals/<rand_type>.pickle".
        num_randomizations: Number of rows to draw from the table.
        cub_dict: Maps a codon to its score.
        nt_CDS: Nucleotide sequence of the CDS.

    Returns:
        Array with one delta per drawn variant, as computed by
        ``get_delta_cub_single_rand``.
    """
    # table of candidate substitutions of the requested type
    candidates = pd.read_pickle(f"../Data/random_mutations_for_pvals/{rand_type}.pickle")

    # draw row positions at random (np.random.choice samples with replacement by default)
    drawn_rows = np.random.choice(candidates.shape[0], num_randomizations)
    sampled = candidates.iloc[drawn_rows]

    def delta_for_row(row):
        # delta of the measure for the substitution described by this row
        return get_delta_cub_single_rand(
            nt_CDS, row.CDS_position_0_based, row.Changed_to, row.Changed_from, cub_dict
        )

    return sampled.apply(delta_for_row, axis=1).values
    


## Main 

In [5]:
# Load the MDR1 CDS sequence
gene = 'ENSG00000085563'  # MDR1 gene
genes_dict = pd.read_pickle(f"../Data/cdna_{gene}.pickle.gz")
mdr1_source_seq = genes_dict['data'][0]['homologies'][0]['source']['seq']
# Remove the stop codon (last three nucleotides): the MSA was built on amino acids
# and translated back, so it holds no information on stop codons
nt_CDS = mdr1_source_seq[:-3]


### CUB

In [8]:
# Human codon -> CAI dictionary (used below for T1236C and T3435C)
CAI_human = pd.read_pickle("../Data/homo_sapiens_CAI.pickle")

# Human codon frequency per 1000 codons (used below for T2677G)
codon_usage_csv = pd.read_csv(
    "../Data/Human_codon_frequency.txt",
    skiprows=5,
    header=None,
)  # human
codons_and_freq1000 = codon_usage_csv[0].apply(get_codon_and_freq)
# The csv is in the RNA alphabet; convert U -> T so the codons match the DNA
# alphabet of the CAI dictionary
freq1000_human = {
    codon.replace("U", "T"): float(freq)
    for codon, freq in codons_and_freq1000
}


In [9]:
# CUB deltas of the true variants:
# CAI is used for T1236C and T3435C, FPTC for T2677G
true_deltas_cai = delta_true_variants(CAI_human, variant_info)
delta_T1236C_cai = true_deltas_cai[0]
delta_T3435C_cai = true_deltas_cai[2]

delta_T2677G_fptc = delta_true_variants(freq1000_human, variant_info)[1]

In [10]:
# Parameters for the randomizations
possible_rand_types = [
    "synonymous_T2C",
    "nonsynonymous_T2G",
]  # names of the random-variant tables
num_randomizations = 100  # number of random variants drawn per comparison


In [13]:
# Delta CUB for the randomizations
random_delta_cai = get_delta_cub_all_rands(
    rand_type="synonymous_T2C",
    num_randomizations=num_randomizations,
    cub_dict=CAI_human,
    nt_CDS=nt_CDS,
)
random_delta_fptc = get_delta_cub_all_rands(
    rand_type="nonsynonymous_T2G",
    num_randomizations=num_randomizations,
    cub_dict=freq1000_human,
    nt_CDS=nt_CDS,
)

# Compare the random deltas to the real deltas to get a p-value per variant
pval_1236 = get_pvalue(delta_T1236C_cai, random_delta_cai)
pval_3435 = get_pvalue(delta_T3435C_cai, random_delta_cai)
pval_2677 = get_pvalue(delta_T2677G_fptc, random_delta_fptc)



In [80]:
# Save each p-value to its own pickle file
for _out_path, _pval in (
    ("../Results/CAI/delta_CAI/pval_1236.pickle", pval_1236),
    ("../Results/CAI/delta_CAI/pval_3435.pickle", pval_3435),
    ("../Results/FPTC/delta_FPTC/pval_2677.pickle", pval_2677),
):
    with open(_out_path, 'wb') as f:
        pickle.dump(_pval, f)

### tAI

In [14]:
# tAI dictionary, indexed by tissue name
tai_human = pd.read_pickle("../Data/tAI_human_tissue_specific.pickle")

# Tissues in organs where MDR1 is naturally highly expressed
tissues_mdr1 = [
    "KIRP",
    "KIRC",
    "KICH",
    "LIHC",
    "GBM_",
    "COAD",
]


In [86]:
# Analysis per tissue: compare the true delta tAI of each variant to the delta tAI
# of random variants of the same type and save the resulting p-values.
# Note: the loop rebinds pval_1236 / pval_3435 / pval_2677 on every iteration.
for tissue in tissues_mdr1:
    tai_tissue = tai_human[tissue]

    # true delta tAI of the three variants
    delta_T1236C_tai, delta_T2677G_tai, delta_T3435C_tai = delta_true_variants(
        tai_tissue, variant_info
    )

    # delta tAI of random (similar) changes
    random_delta_tai_synonynous = get_delta_cub_all_rands(
        "synonymous_T2C", num_randomizations, tai_tissue, nt_CDS
    )
    random_delta_tai_nonsynonynous = get_delta_cub_all_rands(
        "nonsynonymous_T2G", num_randomizations, tai_tissue, nt_CDS
    )

    # p-value per variant
    pval_1236 = get_pvalue(delta_T1236C_tai, random_delta_tai_synonynous)
    pval_3435 = get_pvalue(delta_T3435C_tai, random_delta_tai_synonynous)
    pval_2677 = get_pvalue(delta_T2677G_tai, random_delta_tai_nonsynonynous)

    # one pickle file per variant, inside the directory of the tissue
    tissue_outputs = (
        (f"../Results/tAI/delta_tAI/{tissue}/pval_1236.pickle", pval_1236),
        (f"../Results/tAI/delta_tAI/{tissue}/pval_3435.pickle", pval_3435),
        (f"../Results/tAI/delta_tAI/{tissue}/pval_2677.pickle", pval_2677),
    )
    for _out_path, _pval in tissue_outputs:
        with open(_out_path, 'wb') as f:
            pickle.dump(_pval, f)

