In this notebook we check whether the mutations may cause changes in splicing (by removing or creating a donor/acceptor site). 
For this purpose we use SpliceAI, comparing its predictions for the reference and the mutated sequence of each mutation. 

In [64]:
import pandas as pd
import numpy as np
import os, sys, re
import pickle
import matplotlib.pyplot as plt
import glob

from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
from Bio import SeqIO
from scipy import stats
from tqdm import tqdm


## Functions

In [65]:
def get_context_for_position(position_info: pd.Series, chromosome_path: str, wanted_context_length: int, wanted_segment_length :int) -> tuple:
    '''
    Obtains the reference and the mutated sequence around a mutation, to be used as input of SpliceAI.

    SpliceAI's input is a vector that is built in this form:
    [M positions for context, N positions to predict on, X mutated positions,
    N positions to predict on, M positions for context].
    The flanks are pulled from the fasta file containing the full chromosome sequence. Flanks that
    would run past either end of the chromosome are padded with "N" so that the mutation always
    stays at the same offset inside the returned sequences.

    Note: the flank extraction is written for substitutions (SNPs), where the reference and the
    mutated allele are plain nucleotide strings of the same length.

    Parameters
    ----------
    position_info : a single mutation row; the fields "Chromosome", "Start_Position",
        "End_Position" (both 1-based), "is_forward", "Reference_Allele" and "Tumor_Seq_Allele2" are read.
    chromosome_path : directory holding the "Homo_sapiens.GRCh38.dna_sm.chromosome.<Chromosome>.fa" files.
    wanted_context_length : M, number of context positions on each side.
    wanted_segment_length : N, number of positions to predict on, on each side.

    Returns
    -------
    (reference_w_context, mut_w_context) : two strings, reverse-complemented when "is_forward" is False.

    Raises
    ------
    ValueError : if the fasta file holds no sequence record.
    AssertionError : if "Reference_Allele" does not match the chromosome sequence at the given position.
    '''
    cur_chrom_path = os.path.join(chromosome_path, f"Homo_sapiens.GRCh38.dna_sm.chromosome.{position_info['Chromosome']}.fa")
    mutation_start = int(position_info["Start_Position"]) #1-based
    mutation_end = int(position_info["End_Position"]) #1-based
    forward_strand = position_info["is_forward"]
    reference_allele = position_info["Reference_Allele"]
    mut_allele = position_info["Tumor_Seq_Allele2"]

    # get sequence of entire chromosome (the file is closed as soon as it has been read);
    # as before, if the file holds several records the last one is used
    this_chr_seq = None
    with open(cur_chrom_path) as fasta_handle:
        for fasta in SeqIO.parse(fasta_handle, 'fasta'):
            this_chr_seq = fasta.seq.upper()  # upper-case: the "dna_sm" files are soft-masked
    if this_chr_seq is None:
        raise ValueError(f"No sequence record found in {cur_chrom_path}")

    assert reference_allele == this_chr_seq[mutation_start - 1:mutation_end], \
        f"Reference allele {reference_allele} does not match the chromosome sequence at {mutation_start}-{mutation_end}"

    # get context before and after mutated position ("wanted_context_length" + "wanted_segment_length" nts from each side)
    flank_length = wanted_segment_length + wanted_context_length

    # a negative slice start would silently be interpreted relative to the chromosome end,
    # so clamp it to 0 and fill the missing positions with N
    before_start = mutation_start - 1 - flank_length
    context_before = this_chr_seq[max(before_start, 0) : mutation_start - 1]
    if before_start < 0:
        context_before = "N" * (-before_start) + context_before

    # slicing past the chromosome end silently truncates, so pad up to the requested length
    context_after = this_chr_seq[mutation_end : mutation_end + flank_length]
    missing_after = flank_length - len(context_after)
    if missing_after > 0:
        context_after = context_after + "N" * missing_after

    reference_w_context = context_before + reference_allele + context_after
    mut_w_context = context_before + mut_allele + context_after

    #if we are on the reverse strand - reverse complement
    if not forward_strand:
        reference_w_context = reference_w_context.reverse_complement()
        mut_w_context = mut_w_context.reverse_complement()

    return (str(reference_w_context), str(mut_w_context))
     
    

In [66]:
def SpliceAI_on_single_seq(input_sequence: str,models: list) -> tuple:
    '''
    Runs SpliceAI on the input sequence with every model (weights set) in "models", averages the
    predictions and returns, for each position in the prediction range, the probability of being
    an acceptor site and the probability of being a donor site.

    Parameters
    ----------
    input_sequence : nucleotide sequence (context + prediction range), as a string.
    models : the loaded SpliceAI models; all of them are used (the notebook loads 5).

    Returns
    -------
    (acceptor_prob, donor_prob) : channel 1 and channel 2 of the averaged model output.

    Raises
    ------
    ValueError : if "models" is empty.
    '''
    if not models:
        raise ValueError("At least one SpliceAI model is required")

    # add a leading batch axis: the models are called on a batch holding this single sequence
    x = one_hot_encode(input_sequence)[None, :]

    # average over the given models instead of assuming that there are exactly 5 of them
    y = np.mean([model.predict(x) for model in models], axis=0)

    acceptor_prob = y[0, :, 1]
    donor_prob = y[0, :, 2]

    return (acceptor_prob, donor_prob)

In [67]:
def location_new_and_missed_sites(acceptor_delta: np.array,donor_delta: np.array, threshold: float = 0.5) -> tuple:
    '''
    Finds the positions in the sequence (relative to the predicted range) of new/missed donor and acceptor sites.
    These are positions whose probability of being a donor/acceptor site changed by more than "threshold".

    Sign convention: the deltas are (mutated probability - reference probability).
    A "missed" site is one whose probability dropped by more than "threshold" (delta < -threshold);
    a "new" site is one whose probability rose by more than "threshold" (delta > threshold).

    Parameters
    ----------
    acceptor_delta : per-position change in acceptor probability (mutated - reference).
    donor_delta : per-position change in donor probability (mutated - reference).
    threshold : minimal absolute change that is considered significant (default 0.5).

    Returns
    -------
    (missed_acceptor_sites, new_acceptor_sites, missed_donor_sites, new_donor_sites) : index arrays.
    '''
    # a site is lost when the mutation lowers its probability ...
    missed_acceptor_sites = np.where(acceptor_delta < -threshold)[0]
    # ... and gained when the mutation raises it
    new_acceptor_sites = np.where(acceptor_delta > threshold)[0]
    missed_donor_sites = np.where(donor_delta < -threshold)[0]
    new_donor_sites = np.where(donor_delta > threshold)[0]
    return (missed_acceptor_sites, new_acceptor_sites, missed_donor_sites, new_donor_sites)


## Main

In [68]:
# Directory holding the per-chromosome FASTA files; get_context_for_position looks inside it
# for "Homo_sapiens.GRCh38.dna_sm.chromosome.<Chromosome>.fa".
# NOTE(review): absolute, machine-specific path - adjust it when running on another machine.
chromosomes_path = "/tamir2/lab_resources/Genomes/Human/human_hg38/Chromosome"

In [69]:
# A dataframe with the basic info about the three mutations that is needed for running SpliceAI
# (one row per mutation).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
mutations_df = pd.read_pickle("../Data/MDR1_3_muts_df.pickle")
display(mutations_df)

Unnamed: 0,mut_id,gene_affected,Gene,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,Variant_Type,Variant_Classification,Transcript_ID,is_forward,cds_position
0,87509329:87509329:chr7:A:G,ENSG00000085563,ABCB1,7,87509329,87509329,A,G,SNP,Silent,ENST00000622132,False,3435
1,87550285:87550285:chr7:A:G,ENSG00000085563,ABCB1,7,87550285,87550285,A,G,SNP,Silent,ENST00000622132,False,1236
2,87531302:87531302:chr7:A:C,ENSG00000085563,ABCB1,7,87531302,87531302,A,C,SNP,Missense_Mutation,ENST00000622132,False,2677


In [70]:
# For every mutation, build the reference and the mutated sequence that serve as input for the
# SpliceAI model: 5500 nts on one side + the reference/mutated nt + 5500 nts on the other side
# (5000 nts of context and 500 nts to predict on, per side). Both are stored as new columns.

wanted_segment_length = 500
wanted_context_length = 5000

for mutation_ind in tqdm(mutations_df.index.to_list()):
    cur_mut = mutations_df.loc[mutation_ind]
    ref_seq, mut_seq = get_context_for_position(
        position_info=cur_mut,
        chromosome_path=chromosomes_path,
        wanted_context_length=wanted_context_length,
        wanted_segment_length=wanted_segment_length,
    )
    mutations_df.loc[mutation_ind, 'reference_w_context'] = ref_seq
    mutations_df.loc[mutation_ind, 'mut_w_context'] = mut_seq

100%|██████████| 3/3 [00:04<00:00,  1.59s/it]


In [71]:
# Load the five SpliceAI models (spliceai1.h5 ... spliceai5.h5), resolved as resources of the spliceai package
models = []
for model_number in range(1, 6):
    weights_file = resource_filename('spliceai', 'models/spliceai{}.h5'.format(model_number))
    models.append(load_model(weights_file))




In [72]:
threshold = 0.5 # minimal change in donor/acceptor probability that is considered significant

### T1236C

In [73]:
# Pick the row of this mutation and take its reference and mutated input sequences
cur_mut = mutations_df.loc[mutations_df["cds_position"] == 1236]
ref_seq = cur_mut["reference_w_context"].iloc[0]
mut_seq = cur_mut["mut_w_context"].iloc[0]

# SpliceAI gives, for each position, the probability of being an acceptor or a donor site
ref_acceptor, ref_donor = SpliceAI_on_single_seq(ref_seq, models)
mut_acceptor, mut_donor = SpliceAI_on_single_seq(mut_seq, models)

# Change in the probabilities that occurred because of the mutation (mutated - reference)
delta_acceptor, delta_donor = mut_acceptor - ref_acceptor, mut_donor - ref_donor

# Find donor & acceptor sites that were canceled or created because of the mutation
# (these could change the splicing and hence the protein).
# A position is reported when its probability changed by more than the threshold (0.5). For example, a position
# that had a probability of 0.98 of being a donor site and has a probability smaller than 0.48 after
# the mutation is what we consider a missed donor site.
missed_acceptor_sites, new_acceptor_sites, missed_donor_sites, new_donor_sites = location_new_and_missed_sites(
    acceptor_delta=delta_acceptor, donor_delta=delta_donor, threshold=threshold
)

print(f"Missed acceptors: {missed_acceptor_sites},\nNew acceptors: {new_acceptor_sites},\nMissed donors: {missed_donor_sites},\nNew donors: {new_donor_sites}")
    

Missed acceptors: [],
New acceptors: [],
Missed donors: [],
New donors: []


### T2677G

In [74]:
# Pick the row of this mutation and take its reference and mutated input sequences
cur_mut = mutations_df.loc[mutations_df["cds_position"] == 2677]
ref_seq = cur_mut["reference_w_context"].iloc[0]
mut_seq = cur_mut["mut_w_context"].iloc[0]

# SpliceAI gives, for each position, the probability of being an acceptor or a donor site
ref_acceptor, ref_donor = SpliceAI_on_single_seq(ref_seq, models)
mut_acceptor, mut_donor = SpliceAI_on_single_seq(mut_seq, models)

# Change in the probabilities that occurred because of the mutation (mutated - reference)
delta_acceptor, delta_donor = mut_acceptor - ref_acceptor, mut_donor - ref_donor

# Find donor & acceptor sites that were canceled or created because of the mutation
# (these could change the splicing and hence the protein).
# A position is reported when its probability changed by more than the threshold (0.5). For example, a position
# that had a probability of 0.98 of being a donor site and has a probability smaller than 0.48 after
# the mutation is what we consider a missed donor site.
missed_acceptor_sites, new_acceptor_sites, missed_donor_sites, new_donor_sites = location_new_and_missed_sites(
    acceptor_delta=delta_acceptor, donor_delta=delta_donor, threshold=threshold
)

print(f"Missed acceptors: {missed_acceptor_sites},\nNew acceptors: {new_acceptor_sites},\nMissed donors: {missed_donor_sites},\nNew donors: {new_donor_sites}")
 

Missed acceptors: [],
New acceptors: [],
Missed donors: [],
New donors: []


### T3435C

In [75]:
# Pick the row of this mutation and take its reference and mutated input sequences
cur_mut = mutations_df.loc[mutations_df["cds_position"] == 3435]
ref_seq = cur_mut["reference_w_context"].iloc[0]
mut_seq = cur_mut["mut_w_context"].iloc[0]

# SpliceAI gives, for each position, the probability of being an acceptor or a donor site
ref_acceptor, ref_donor = SpliceAI_on_single_seq(ref_seq, models)
mut_acceptor, mut_donor = SpliceAI_on_single_seq(mut_seq, models)

# Change in the probabilities that occurred because of the mutation (mutated - reference)
delta_acceptor, delta_donor = mut_acceptor - ref_acceptor, mut_donor - ref_donor

# Find donor & acceptor sites that were canceled or created because of the mutation
# (these could change the splicing and hence the protein).
# A position is reported when its probability changed by more than the threshold (0.5). For example, a position
# that had a probability of 0.98 of being a donor site and has a probability smaller than 0.48 after
# the mutation is what we consider a missed donor site.
missed_acceptor_sites, new_acceptor_sites, missed_donor_sites, new_donor_sites = location_new_and_missed_sites(
    acceptor_delta=delta_acceptor, donor_delta=delta_donor, threshold=threshold
)

print(f"Missed acceptors: {missed_acceptor_sites},\nNew acceptors: {new_acceptor_sites},\nMissed donors: {missed_donor_sites},\nNew donors: {new_donor_sites}")


Missed acceptors: [],
New acceptors: [],
Missed donors: [],
New donors: []


No new or missed donor/acceptor sites are predicted by SpliceAI for any of the three mutations (using a threshold of 0.5 on the change in probability). 