We have the SMC1A example in the MDR1 paper. Here we create random synonymous G->A changes in this gene to 
assess how significant the actual MFE change caused by the real variant is. 

## Imports

In [1]:
import pandas as pd
from Utils_MDR1 import *

## Main

## NM_006306.4(SMC1A):c.1923G>A (p.Leu641=)

Let's get information that is relevant to multiple dfs / all dfs, such as the nucleotide MDR1 sequence, the synonymous substitution matrix of MDR1, the TCGA mutations of MDR1 and the conserved regions.  

In [2]:
''' Load the coding sequence (CDS) of the gene '''
gene = 'ENSG00000072501'
genes_dict = pd.read_pickle(f"../Data/cdna_{gene}.pickle.gz")
# The MSA was built on amino acids and translated back to nucleotides, so it carries
# no information on stop codons; drop the last three nucleotides (the stop codon).
source_seq = genes_dict['data'][0]['homologies'][0]['source']['seq']
nt_CDS = source_seq[:-3]

In [None]:
''' Build a binary table of shape (4, gene_length) marking, for every position in the gene,
which substitutions are synonymous. Each row corresponds to one nucleotide of the DNA
alphabet, sorted alphabetically. For example, a "1" at [2, 300] means that changing the
nucleotide at position 300 to a "G" would result in a synonymous substitution. '''

possible_syn_replacements_for_gene = get_possib_syn_sub_per_positon(
    nt_CDS,
    codons_syn_maps_dict,
)

In [4]:
''' Load the mapping from CDS-relative positions to chromosome-relative positions '''

mapping_dict = pd.read_pickle("../Data/AllGenes/cds_to_chrom_dict_with_protein_id.pickle")
gene_to_protein_dict = pd.read_pickle("../Data/AllGenes/gene_protein_dict.pickle")

# The mapping is keyed by (gene id, protein id); look up the protein id of our gene first,
# then pull out the mapping that belongs to this specific gene.
protein_id = gene_to_protein_dict[gene]
mapping_cur_gene = mapping_dict[(gene, protein_id)]


## (1) A df with positions of all possible G->A synonymous mutations

In [7]:
''' Get the pool of possible synonymous G->A changes in SMC1A '''

changed_from = "G"
change_to = "A"

# 0-based CDS positions whose nucleotide is "G"
pos_G_nuc = [position for position, nucleotide in enumerate(nt_CDS) if nucleotide == changed_from]
# 0-based CDS positions that can be *synonymously* changed to "A"
pos_can_change_to_A = np.where(possible_syn_replacements_for_gene[nucs_dict[change_to], :] == 1)[0]
# Membership tests against a NumPy array scan the whole array each time; use a set of
# plain Python ints so that building the intersection is linear instead of quadratic.
syn_positions_lookup = set(pos_can_change_to_A.tolist())
# The intersection (kept in ascending CDS order) is our pool to choose from
positions_pool = [pos for pos in pos_G_nuc if pos in syn_positions_lookup]


In [8]:
# Size of the pool of candidate G->A synonymous positions built above
len(positions_pool)

457

In [10]:
''' Create a df with the positions (CDS-relative and chromosome-relative) and the
resulting mutated sequence of every candidate substitution '''

# Build all columns in one shot straight from the pool. Only the CDS position is needed to
# derive the other columns, so there is no reason to run a row-wise DataFrame.apply
# (which is slow and returns a DataFrame instead of a Series when the pool is empty).
df_syn_G_A = pd.DataFrame({
    "CDS_position_0_based": positions_pool,
    # +1 converts the looked-up chromosome coordinate to the 1-based convention of the column
    "Chromosome_position_1_based": [mapping_cur_gene[pos] + 1 for pos in positions_pool],
    # mutate_cds_sequence is given pos + 1, as in the original call
    # (presumably it expects a 1-based position - confirm in Utils_MDR1)
    "Sequence": [
        mutate_cds_sequence(sequence = nt_CDS, position = pos + 1, change_to = change_to)
        for pos in positions_pool
    ],
    "Changed_from": changed_from, #strand sensitive
    "Changed_to": change_to, #strand sensitive
})
print(f"There are {df_syn_G_A.shape[0]} possible synonymous G->A substitutions in the SMC1A gene")
display(df_syn_G_A.head())

There are 457 possible synonymous G->A substitutions in the SMC1A gene


Unnamed: 0,CDS_position_0_based,Chromosome_position_1_based,Sequence,Changed_from,Changed_to
0,5,53422595,ATGGGATTCCTGAAACTGATTGAGATTGAGAACTTTAAGTCGTACA...,G,A
1,11,53422589,ATGGGGTTCCTAAAACTGATTGAGATTGAGAACTTTAAGTCGTACA...,G,A
2,17,53422583,ATGGGGTTCCTGAAACTAATTGAGATTGAGAACTTTAAGTCGTACA...,G,A
3,23,53422577,ATGGGGTTCCTGAAACTGATTGAAATTGAGAACTTTAAGTCGTACA...,G,A
4,29,53422571,ATGGGGTTCCTGAAACTGATTGAGATTGAAAACTTTAAGTCGTACA...,G,A


In [11]:
# c.1923G>A is the real variant: 1923 is its 1-based CDS position, while the df stores
# 0-based positions, hence the -1. Drop it so only the other candidates remain.
to_remove = [1923 - 1]
is_real_variant = df_syn_G_A["CDS_position_0_based"].isin(to_remove)
df_syn_G_A = df_syn_G_A.loc[~is_real_variant]

In [13]:
''' Persist the candidate G->A substitutions to a pickle file '''
output_pickle_path = "../Results/SMC1A/synonymous_G2A_SMC1A.pickle"
df_syn_G_A.to_pickle(output_pickle_path)
