<a href="https://colab.research.google.com/github/StuteePatil/Masters-Dissertation/blob/master/Model_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Upload the pre-trained model ('model .h5' - note the space in the file name) and the
# tokenisers (aa_tokenizer.pkl and cd_tokenizer.pkl) from the 'Pre-trained model' folder.
# The loading cell further down opens the files by exactly these names.
from google.colab import files
file = files.upload()

Saving aa_tokenizer.pkl to aa_tokenizer.pkl
Saving cd_tokenizer.pkl to cd_tokenizer.pkl
Saving model .h5 to model .h5


In [2]:
# function to convert a nucleotide sequence into a sequence of codons 
# by grouping three consecutive nucleotides to form a codon

clen = 3    # number of nucleotides that make up one codon

def codon(seq):
    """Split a nucleotide sequence into consecutive, non-overlapping codons.

    Parameters
    ----------
    seq : str
        Nucleotide sequence (any sliceable sequence works).

    Returns
    -------
    list
        Slices of ``seq`` of length ``clen``, in order. Trailing nucleotides
        that do not fill a complete codon are dropped; an input shorter than
        ``clen`` yields an empty list.
    """
    # Both the number of codons and the slice bounds derive from clen,
    # so the two can never disagree.
    n_codons = len(seq) // clen
    return [seq[i * clen:(i + 1) * clen] for i in range(n_codons)]

In [3]:
# function to translate a sequence of codons into the corresponding sequence of amino acids

def translate(seq):
    """Map every codon in ``seq`` to the one-letter code of its amino acid.

    Parameters
    ----------
    seq : iterable of str
        Codons written with upper-case A/C/G/T.

    Returns
    -------
    list of str
        One amino-acid letter per codon, in input order; the three stop
        codons (TAA, TAG, TGA) map to '_'.

    Raises
    ------
    KeyError
        If an element of ``seq`` is not one of the 64 codons in the table.
    """
    # codon -> amino acid lookup, laid out in alphabetical codon order
    codon_to_aa = {
        'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
        'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
        'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'TAA':'_', 'TAC':'Y', 'TAG':'_', 'TAT':'Y',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TGA':'_', 'TGC':'C', 'TGG':'W', 'TGT':'C',
        'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F',
    }

    return [codon_to_aa[triplet] for triplet in seq]

In [4]:
# load the pre-trained models to make predictions on new test data
from pickle import load
from keras.models import load_model

model = load_model('model .h5')                             # loads bi-directional LSTM model

# SECURITY NOTE: unpickling can execute arbitrary code - only load tokenizer
# files that come from a trusted source.
# The context managers close the pickle files as soon as they have been read
# instead of leaving the handles open for the lifetime of the kernel.
with open('aa_tokenizer.pkl', 'rb') as aa_file:
    aa_tokenizer = load(aa_file)                            # loads amino acid tokenizer
with open('cd_tokenizer.pkl', 'rb') as cd_file:
    cd_tokenizer = load(cd_file)                            # loads codon tokenizer

In [5]:
from keras.preprocessing.sequence import pad_sequences
import statistics
import numpy as np

# method to calculate similarity between true and predicted sequences
def cal_acc(ytrue, ypred):
    """Return the fraction of positions in ``ytrue`` that ``ypred`` reproduces.

    Elements are compared position by position over the overlapping prefix of
    the two sequences; the number of matches is divided by ``len(ytrue)``, so
    positions missing from a shorter ``ypred`` count as mismatches.

    Parameters
    ----------
    ytrue : sequence
        Reference sequence.
    ypred : sequence
        Predicted sequence; may be shorter or longer than ``ytrue``.

    Returns
    -------
    float
        Value in [0, 1]. An empty ``ytrue`` yields 0.0 rather than raising
        ``ZeroDivisionError``.
    """
    if len(ytrue) == 0:
        return 0.0
    # zip stops at the shorter sequence, i.e. at min(len(ytrue), len(ypred))
    matches = sum(1 for true_item, pred_item in zip(ytrue, ypred) if true_item == pred_item)
    return matches / len(ytrue)

# method to reverse map token to its corresponding codon 
def token_to_word(yhat):
    """Return the word whose index in ``cd_tokenizer.word_index`` equals ``yhat``.

    The mapping is scanned in iteration order and the first match is returned;
    ``None`` is returned when no word has that index.
    """
    matching_words = (
        candidate
        for candidate, token in cd_tokenizer.word_index.items()
        if token == yhat
    )
    return next(matching_words, None)

# method that uses pre-trained sequence-to-sequence model for making predictions on new test sequences
def model_predict(seq):
    """Predict synonymous codons for a nucleotide sequence and print a summary.

    The sequence is split into codons, translated into amino acids, tokenised
    with ``aa_tokenizer``, padded to ``MAX_LEN`` and passed to the pre-trained
    ``model``. The codon with the highest predicted probability is taken at
    each position. The function prints the predicted sequence, the percentage
    of codons that differ from the input, and the percentage of positions
    whose amino acid is unchanged.

    Parameters
    ----------
    seq : str
        Nucleotide sequence in any letter case. Whitespace is ignored, and
        trailing nucleotides that do not complete a codon are dropped by
        ``codon``.

    Raises
    ------
    ValueError
        If the sequence contains no complete codon, or more than ``MAX_LEN``
        codons (the model input cannot hold them).
    KeyError
        If a codon is not present in the table used by ``translate``.
    """
    MAX_LEN = 2000      # standard length of input as defined by the ML model

    # Pasted input often carries spaces or line breaks; remove all whitespace
    # so it cannot end up inside a codon.
    cleaned_seq = ''.join(seq.split()).upper()

    test_cds = codon(cleaned_seq)       # converts the nucleotide sequence into a sequence of codons
    if not test_cds:
        raise ValueError('The sequence must contain at least one complete codon (3 nucleotides).')
    if len(test_cds) > MAX_LEN:
        # pad_sequences would silently truncate a longer input (from the front
        # by default), leaving the predictions misaligned with test_cds and the
        # reported percentages meaningless.
        raise ValueError(
            'The sequence has {} codons; the model accepts at most {}.'.format(len(test_cds), MAX_LEN)
        )
    test_aas = translate(test_cds)      # translates the codon sequence into a sequence of amino acids

    test_seq = aa_tokenizer.texts_to_sequences([test_aas])          # tokenises test sequence into integers
    padded_test_sequence = pad_sequences(test_seq, maxlen=MAX_LEN)  # pads the sequence to a length of MAX_LEN

    y_hat = model.predict(padded_test_sequence)         # predicts synonymous codons for the given test sequence

    y_hat_cds = []
    for position_probs in y_hat[0]:
        token = np.argmax(position_probs)   # selects codon with maximum probability at this position
        if token == 0:
            # index 0 is skipped - presumably the padding token, since Keras
            # tokenizers never assign index 0 to a word
            continue
        y_hat_cds.append(token_to_word(token))  # reverse map integer token to its respective codon

    yhat_aa = translate(y_hat_cds)      # translate predicted codon sequence into a sequence of amino acids

    cds_acc = cal_acc(test_cds, y_hat_cds)  # similarity between true and predicted codon sequence
    aas_acc = cal_acc(test_aas, yhat_aa)    # similarity between true and predicted amino acid sequence

    print('Optimised sequence: ', ''.join(y_hat_cds))
    print('Percentage of codons changed: {}%'.format(round(100-(cds_acc*100), 2)))
    print('Accuracy of predicting synonymous codons (same amino acid sequence): {}%'.format(round((aas_acc*100), 2)))


In [6]:
# Ask the user for a nucleotide sequence and pass it to model_predict, which
# prints the optimised sequence together with the two similarity percentages.
seq = input('Enter the nucleotide sequence: ')  
model_predict(seq)

Enter the nucleotide sequence: atgaagaagtggcaatgcgtggtgtgtggactgatctatgacgaggccaaaggctggccggaagaaggcatcgaggcgggaacgcgctgggaagacgtgcctgaagactggctgtgccccgactgcggcgtcggcaagctggacttcgagatgatcgaaatcggctga
Optimised sequence:  ATGAAAAAATGGCAATGTGTTGTTTGTGGTTTGATTTATGATGAAGCTAAAGGTTGGCCAGAAGAAGGTATTGAAGCTGGTACTAGATGGGAAGATGTTCCAGAAGATTGGTTGTGTCCAGATTGTGGTGTTGGTAAATTGGATTTTGAAATGATTGAAATTGGTTAA
Percentage of codons changed: 73.21%
Accuracy of predicting synonymous codons (same amino acid sequence): 100.0%
