Post-transcriptional editing, such as A-to-I changes and methylation can be affected by the variants' effect on the mRNA secondary structure. 
In this notebook we check whether T1236C, T2677G or T3435C are predicted to change A-to-I sites, m6A methylation sites or RNA binding protein binding sites. 


In [2]:
import pandas as pd
import numpy as np
from Utils_MDR1 import mutate_cds_sequence, reverse_complement
from Bio import SeqIO


## Functions

In [3]:
''' Extract a part of a sequence from a fasta file '''

def extract_sequence(fasta_file: str, start: int, end:int) -> str:
    for record in SeqIO.parse(fasta_file, "fasta"):
        return str(record.seq[start-1:end])
    return None


## Predict A-to-I sites with AIRliner (http://alpha.dmi.unict.it/airliner/)

In [5]:
''' Get the reference sequence '''

fasta_file_chr7 = "../Data/Genomes/Human/human_hg38/Chromosome/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa"
transcript_start = 87503859 #obtained from biomart for transcript ENST00000265724
transcript_end = 87713323 #obtained from biomart for transcript ENST00000265724

pre_mrna = extract_sequence(fasta_file_chr7, transcript_start, transcript_end) #extracting the pre-mRNA of MDR1 from chr 7

In [6]:
''' Create the mutated sequences '''

transcript_len = transcript_end - transcript_start + 1
assert(transcript_len == len(pre_mrna))

#grch38 variant positions
pos_1236 = 87550285
pos_2677 = 87541302
pos_3435 = 87509329

transcript_pos_1236 = pos_1236 - transcript_start
transcript_pos_2677 = pos_2677 - transcript_start
transcript_pos_3435 = pos_3435 - transcript_start

assert(pre_mrna[transcript_pos_1236] == "A") # A on the forward is T on the reverse
assert(pre_mrna[transcript_pos_2677] == "A") # A on the forward is T on the reverse
assert(pre_mrna[transcript_pos_3435] == "A") # A on the forward is T on the reverse

margin = 50  #margin around the mutations

#T1236C
change_to = "G" #G on forward - > C on reverse
mut_1236 = mutate_cds_sequence(pre_mrna, transcript_pos_1236 + 1, change_to)
mut_1236 = mut_1236[transcript_pos_1236 - margin: transcript_pos_1236 + margin + 1]
ref_1236 = pre_mrna[transcript_pos_1236 - margin: transcript_pos_1236 + margin + 1]

#T2677G
change_to = "C" #C on forward - > G on reverse
mut_2677 = mutate_cds_sequence(pre_mrna, transcript_pos_2677 + 1, change_to)
mut_2677 = mut_2677[transcript_pos_2677 - margin: transcript_pos_2677 + margin + 1]
ref_2677 = pre_mrna[transcript_pos_2677 - margin: transcript_pos_2677 + margin + 1]

#T3435C
change_to = "G" #G on forward - > C on reverse
mut_3435 = mutate_cds_sequence(pre_mrna, transcript_pos_3435 + 1, change_to)
mut_3435 = mut_3435[transcript_pos_3435 - margin: transcript_pos_3435 + margin + 1]
ref_3435 = pre_mrna[transcript_pos_3435 - margin: transcript_pos_3435 + margin + 1]


In [7]:
''' Preprocess to fit the Airliner predictor ''' 

# reverse complement and transcribe
ref_1236 = reverse_complement(ref_1236).replace("T", "U").upper()
mut_1236 = reverse_complement(mut_1236).replace("T", "U").upper()
ref_2677 = reverse_complement(ref_2677).replace("T", "U").upper()
mut_2677 = reverse_complement(mut_2677).replace("T", "U").upper()
ref_3435 = reverse_complement(ref_3435).replace("T", "U").upper()
mut_3435 = reverse_complement(mut_3435).replace("T", "U").upper()


In [8]:
''' 
We used the web interface to run the model on all 6 sequences (ref and mut for the three variants)
and downloaded the results. Now we will analyze the predictions
'''

preds_ref_1236 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/ref_1236_output.csv", sep = ",")
preds_ref_1236 = preds_ref_1236.reset_index()
preds_ref_1236 = preds_ref_1236.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Ref_1236_ENST00000265724": "flanks"})

preds_mut_1236 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/mut_1236_output.csv", sep = ",")
preds_mut_1236 = preds_mut_1236.reset_index()
preds_mut_1236 = preds_mut_1236.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Mutant_1236_ENST00000265724": "flanks"})



### T1236C

In [9]:
delta = preds_mut_1236["site_probability"] - preds_ref_1236["site_probability"]

In [11]:
delta[163]

0.05955388068651901

### T2677G

In [15]:
preds_ref_2677 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/ref_2677_output.csv", sep = ",")
preds_ref_2677 = preds_ref_2677.reset_index()
preds_ref_2677 = preds_ref_2677.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Ref_2677_ENST00000265724": "flanks"})

preds_mut_2677 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/mut_2677_output.csv", sep = ",")
preds_mut_2677 = preds_mut_2677.reset_index()
preds_mut_2677 = preds_mut_2677.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Mutant_2677_ENST00000265724": "flanks"})



In [16]:
delta = preds_ref_2677["site_probability"] - preds_mut_2677["site_probability"]

In [17]:
np.argmax(delta)

152

In [18]:
delta[152]

0.005096134843388889

In [20]:
preds_ref_3435 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/ref_3435_output.csv", sep = ",")
preds_ref_3435 = preds_ref_3435.reset_index()
preds_ref_3435 = preds_ref_3435.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Ref_3435_ENST00000265724": "flanks"})

preds_mut_3435 = pd.read_csv("../Results/post_transcriptional_modifications/Airliner/mut_3435_output.csv", sep = ",")
preds_mut_3435 = preds_mut_3435.reset_index()
preds_mut_3435 = preds_mut_3435.rename(columns={"level_0": "index_of_A", "level_1": "site_probability", ">Mutant_3435_ENST00000265724": "flanks"})



In [21]:
delta = preds_ref_3435["site_probability"] - preds_mut_3435["site_probability"]

In [22]:
delta[np.argmax(delta)]

0.01608705706229796

We see that the larget change is ~5%, which is not very high. we can conclude that it is not likely to effect A2I sites. 

## Find whether there is a change in ribosome binding protein sites
A sequence of 101 nts surrounding the variant was the input to the catRAPID omics web-tool (http://service.tartaglialab.com/page/catrapid_group)

### Analyze the results

The z-score is the normalized probability of binding of a pair of RNA-protein. 
We require a change that is larger than 1. 

In [24]:
ref_1236 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_ref_1236_len101.txt", sep = "\t")
mut_1236 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_mut_1236_len101.txt", sep = "\t")

delta_zscores = mut_1236["Z-score"] - ref_1236["Z-score"]
np.sum(abs(delta_zscores) > 1)

0

In [25]:
ref_2677 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_ref_2677_len101.txt", sep = "\t")
mut_2677 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_mut_2677_len101.txt", sep = "\t")

delta_zscores = mut_2677["Z-score"] - ref_2677["Z-score"]
np.sum(abs(delta_zscores) > 1)

0

In [26]:
ref_3435 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_ref_3435_len101.txt", sep = "\t")
mut_3435 = pd.read_csv("../Results/post_transcriptional_modifications/rna_binding_proteins/catrapid_mut_3435_len101.txt", sep = "\t")

delta_zscores = mut_3435["Z-score"] - ref_3435["Z-score"]
np.sum(abs(delta_zscores) > 1)

0

## Predict whether a sequence contains m6A sites (Deepm6ASeq)

In [52]:
ref_1236 = "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/ref_1236_len101"
ref_1236 = pd.read_csv(ref_1236, header = None, sep = "\t")
ref_1236 = ref_1236.rename(columns={0: "sequence_name", 1: "probability_m6A"})

mut_1236 = "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/mut_1236_len101"
mut_1236 = pd.read_csv(mut_1236, header = None, sep = "\t")
mut_1236 = mut_1236.rename(columns={0: "sequence_name", 1: "probability_m6A"})

res_1236 = mut_1236["probability_m6A"].values[0] - ref_1236["probability_m6A"].values[0] >= 0.5

if res_1236:
    print("There are changed m6A sites near T1236C")
else:
    print("No changed m6A sites detected")


No changed m6A sites detected


In [55]:
ref_2677= "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/ref_2677_len101"
ref_2677 = pd.read_csv(ref_2677, header = None, sep = "\t")
ref_2677 = ref_2677.rename(columns={0: "sequence_name", 1: "probability_m6A"})

mut_2677 = "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/mut_2677_len101"
mut_2677 = pd.read_csv(mut_2677, header = None, sep = "\t")
mut_2677 = mut_2677.rename(columns={0: "sequence_name", 1: "probability_m6A"})

res_2677= mut_2677["probability_m6A"].values[0] - ref_2677["probability_m6A"].values[0] >= 0.5

if res_2677:
    print("There are changed m6A sites near T2677G")
else:
    print("No changed m6A sites detected")


No changed m6A sites detected


In [56]:
ref_3435= "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/ref_3435_len101"
ref_3435 = pd.read_csv(ref_3435, header = None, sep = "\t")
ref_3435 = ref_3435.rename(columns={0: "sequence_name", 1: "probability_m6A"})

mut_3435 = "../Results/post_transcriptional_modifications/DeepM6ASeq/predictions/mut_3435_len101"
mut_3435 = pd.read_csv(mut_3435, header = None, sep = "\t")
mut_3435 = mut_3435.rename(columns={0: "sequence_name", 1: "probability_m6A"})

res_3435= mut_3435["probability_m6A"].values[0] - ref_3435["probability_m6A"].values[0] >= 0.5

if res_3435:
    print("There are changed m6A sites near T3435C")
else:
    print("No changed m6A sites detected")


No changed m6A sites detected
