### Inference Using N-mer Frequencies

In [1]:
# Class labels for the classifier. The list is indexed by the position of the
# winning model output, so the order here must stay aligned with the model.
target_names = [
    'Human Adenovirus',
    'MERS-CoV',
    'Parainfluenza virus',
    'Rhinovirus-HRV',
    'SARS-CoV-2',
    'Zaire Ebolavirus',
    'Zika virus',
]

In [2]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from Bio import SeqIO

In [3]:
# N-mers Frequency function
def n_mers(Text, n):
    """Count every overlapping n-mer (substring of length ``n``) in ``Text``.

    Parameters
    ----------
    Text : str
        Sequence to scan. Substrings are compared exactly, so counting is
        case-sensitive.
    n : int
        Window length; must be at least 1.

    Returns
    -------
    dict
        Maps each n-mer that occurs in ``Text`` to its number of occurrences,
        with keys in sorted order. N-mers that never occur are simply absent
        (they are NOT present with a count of 0). The dict is empty when
        ``n`` is larger than ``len(Text)``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (a non-positive window has no meaningful
        n-mers; previously this silently returned counts of empty or
        truncated slices).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    # Slide a window of width n over Text; the last valid start is len(Text) - n.
    counts = Counter(Text[i:i + n] for i in range(len(Text) - n + 1))
    return dict(sorted(counts.items()))

In [4]:
# The 64 DNA 3-mers, in the column order used to build the model input vector.
# NOTE(review): the order is hand-written and not fully systematic -- "TTT" sits
# at the very end instead of between "TTC" and "TTG". Presumably this mirrors
# the feature order the model was trained with; confirm before reordering.
dna_ordered_list = [
    "AAA", "AAC", "AAT", "AAG", "ACA", "ACC", "ACT", "ACG",
    "ATA", "ATC", "ATT", "ATG", "AGA", "AGC", "AGT", "AGG",
    "CAA", "CAC", "CAT", "CAG", "CCA", "CCC", "CCT", "CCG",
    "CTA", "CTC", "CTT", "CTG", "CGA", "CGC", "CGT", "CGG",
    "GAA", "GAC", "GAT", "GAG", "GCA", "GCC", "GCT", "GCG",
    "GTA", "GTC", "GTT", "GTG", "GGA", "GGC", "GGT", "GGG",
    "TAA", "TAC", "TAT", "TAG", "TCA", "TCC", "TCT", "TCG",
    "TTA", "TTC", "TTG", "TGA", "TGC", "TGT", "TGG", "TTT",
]

### Load model

In [5]:
# Load the saved Keras model from "model2.h5" (a relative path, resolved against
# the notebook's working directory). Later cells call model.predict on a (1, 64) array.
model = tf.keras.models.load_model("model2.h5")

In [6]:
input_data = "J01917.1.fasta"
sequences = []
seq_IDs = []
# Read every record in the FASTA file, keeping sequences and IDs in parallel lists.
for record in SeqIO.parse(input_data, "fasta"):
    sequences.append(record.seq)
    seq_IDs.append(record.id)

# Fail with a clear message instead of a bare IndexError on sequences[0].
if not sequences:
    raise ValueError("No FASTA records found in {!r}".format(input_data))

# Only the first record is analysed. The sequence is upper-cased so that
# lower-case (e.g. soft-masked) input still matches the upper-case 3-mers
# in dna_ordered_list.
result = n_mers(str(sequences[0]).upper(), 3)

In [7]:
# Build the feature vector in the fixed dna_ordered_list order. n_mers() only
# returns 3-mers that actually occur in the sequence, so a 3-mer that is absent
# is counted as 0 instead of raising KeyError. Keys of `result` that are not in
# dna_ordered_list (e.g. 3-mers containing ambiguous bases) are not used.
new_arr = [result.get(kmer, 0) for kmer in dna_ordered_list]

In [8]:
# Turn the 64 counts into a single-row batch (1 sample x 64 features); the
# explicit 64 makes the reshape fail if the vector has any other length.
x = np.array(new_arr).reshape(1, 64)
x.shape

(1, 64)

In [9]:
# Run the model on the single sample and round every class score to the
# nearest integer (zero decimals). For scores in [0, 1] this yields a 0/1
# indicator per class; a score of exactly 0.5 or below becomes 0.
new_pred = np.around(model.predict(x), decimals=0)
new_pred



array([[1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [10]:
# Pick the label with the highest raw model score. Taking argmax of the rounded
# vector (new_pred) would silently return index 0 ('Human Adenovirus') whenever
# no score rounds up to 1, because argmax of an all-zero row is 0.
target_names[int(np.argmax(model.predict(x, verbose=0)[0]))]

'Human Adenovirus'