# Implementation of Boyer Moore Searching Algorithm on Genome Sequence
Note: Run all cells in this notebook in order

### 1. File Handling
>#### Functions to read in and clean genome (DNA/RNA) sequence
>Note: Data cleaning function works with .fna files from https://www.ncbi.nlm.nih.gov/genome/

In [1]:
# To read .fna file

def readSourceFile(filename):
    '''Return the entire content of the text file `filename` as one string.

    filename -- path of the file to read (e.g. a .fna file)

    The content is returned unchanged, i.e. the header line and the newline
    characters are still present.
    '''
    # `with` closes the file even when read() raises
    with open(filename, 'r') as source_file:
        return source_file.read()

In [2]:
# to get rid of title and newline characters in the .fna file
# return a clean genome sequence

def cleanSourceFile(filename):
    '''Return the sequence stored in the .fna file `filename` as one upper-case string.

    filename -- path of the .fna (FASTA) file to read

    The first line (the title) is always dropped. Any later line starting with
    '>' is dropped as well, so the headers of additional records in a
    multi-record file do not end up inside the sequence; the records are
    simply concatenated. Line breaks and whitespace around each line are
    removed.
    '''
    # `with` closes the file even when reading raises
    with open(filename, 'r') as source_file:
        lines = source_file.readlines()[1:]  # to remove title

    # Keep sequence lines only, without their newline / surrounding whitespace
    sequence_lines = [line.strip() for line in lines if not line.startswith('>')]
    return "".join(sequence_lines).upper()


# This function is only applicable to .fna files downloaded from https://www.ncbi.nlm.nih.gov/genome/
# .fna files from other resources may have different formats, thus this function needs to be varied accordingly

### 2. Function Class for Boyer Moore Search
Contains:
- Preprocessing code in initialisation (construction of the bad match table)
- Search Function

In [3]:
class BMH(object):
    '''Boyer-Moore-Horspool exact string search.

    The constructor preprocesses the pattern into a bad-match (shift) table;
    search() then scans the text and reports every occurrence of the pattern.
    '''

    def __init__(self, text, pattern, alphabet):
        '''Store the inputs and build the bad-match table from the pattern.

        text     -- sequence to search in
        pattern  -- sequence to search for
        alphabet -- characters that get a column in bad_match_table,
                    e.g. "ACGT"; characters outside it are still handled
                    by search()
        '''
        self.text = text
        self.pattern = pattern
        self.pattern_len = len(pattern)
        self.text_len = len(text)

        # Column of every alphabet character in bad_match_table,
        # e.g. "ACGT" -> {"A": 0, "C": 1, "G": 2, "T": 3}
        self.alphabet = alphabet
        self.alphabet_map = {}
        for column, char in enumerate(alphabet):
            self.alphabet_map[char] = column

        # Bad-match table, one entry per alphabet character.
        # Eg. pattern "AGCTTC" with alphabet "ACGT":
        #  A  C  G  T
        # [5, 3, 4, 1]
        # Characters that do not occur in the pattern keep the value
        # len(pattern): no part of the pattern can match them, so the
        # pattern may shift completely past such a character.
        self.bad_match_table = [self.pattern_len] * len(alphabet)

        # Same shifts keyed by character. This is what search() uses, so that
        # characters outside `alphabet` (e.g. "N") need no table column.
        self._shift_by_char = {}

        # The last pattern character is deliberately skipped:
        # a) if it does not occur earlier, its shift stays pattern_len
        # b) if it does, its shift stays that of the earlier occurrence
        for i in range(self.pattern_len - 1):
            char = self.pattern[i]
            # shift = pattern_len - greatest_index_of_this_char - 1
            shift = self.pattern_len - i - 1
            self._shift_by_char[char] = shift
            column = self.alphabet_map.get(char)
            if column is not None:
                self.bad_match_table[column] = shift

    def search(self):
        '''Return the 1-based start positions of all occurrences of the pattern.

        Overlapping occurrences are all reported. An empty list is returned
        when there is no occurrence, when the pattern is longer than the text,
        or when the pattern is empty.
        '''
        outcomes = []  # Records all found patterns

        # An empty pattern has no last character to compare and would give a
        # shift of 0, so there is nothing meaningful to search for.
        if self.pattern_len == 0:
            return outcomes

        last_start = self.text_len - self.pattern_len
        text_index = 0
        while text_index <= last_start:
            # Compare from the right end of the pattern towards the left
            pattern_index = self.pattern_len - 1
            while (pattern_index >= 0 and
                   self.text[text_index + pattern_index] == self.pattern[pattern_index]):
                pattern_index -= 1

            # Every pattern character matched
            if pattern_index < 0:
                outcomes.append(text_index + 1)

            # The shift always depends on the text character aligned with the
            # LAST pattern position, wherever the mismatch happened.
            # Eg. Text:    ACCCTTTT (mismatch at index 0, but the lookup uses
            #     Pattern: CCC       the C at index 2)
            #     Bad_Match_Table: A  C  G  T
            #                     [3, 1, 3, 3]
            window_last_char = self.text[text_index + self.pattern_len - 1]

            # Characters that never occur in the pattern shift by its full length
            text_index += self._shift_by_char.get(window_last_char, self.pattern_len)

        return outcomes

### 3. Execution
#### Searching UI starts here
_Make sure the sequence contains no characters other than "ACGT" for DNA or "ACGU" for RNA (for example, no "N")._

In [4]:
# Ask for the .fna file to search and for the type of sequence it holds (DNA or RNA).
# Surrounding whitespace is stripped so that a pasted path with a trailing
# space, or an answer such as "1 ", is still accepted.
file = input("Please enter the source file.fna: ").strip()
type_of_sequence = input("Please input type of sequence. (1) DNA, (2) RNA (key in 1 or 2): ").strip()

# Show the raw content of the .fna file (title line and newlines included)
print("The input file: ")
file_read = readSourceFile(file)
file_read

Please enter the source file.fna: test.fna
Please input type of sequence. (1) DNA, (2) RNA (key in 1 or 2): 1
The input file: 


'>NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome\nATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA\nCGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC\nTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG\nTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC\nCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC\nGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG\nCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT\nGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC\nGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT\nTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA\nGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG\nTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG\nCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTT

In [5]:
# Clean the file chosen above with cleanSourceFile and keep the result in `source`,
# the text that is searched later on.
# The bare `source` on the last line makes the notebook display the cleaned sequence.
print("Complete genome sequence after cleaning: ")
source = cleanSourceFile(file)
source

Complete genome sequence after cleaning: 


'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTC

In [6]:
# Preprocess the query sequence and search for it in the cleaned genome
import datetime

# Alphabet used for each answer to the "type of sequence" prompt
ALPHABETS = {"1": "ACGT", "2": "ACGU"}
# Dividing a timedelta by this gives its TOTAL length in whole microseconds;
# timedelta.microseconds is only the sub-second part and misreports any run
# that takes one second or longer.
ONE_MICROSECOND = datetime.timedelta(microseconds=1)

# prompt the user to enter a query sequence
# The cleaned genome is upper-case, so the query is normalised the same way
target = input("Please enter the query sequence to be found: ").strip().upper()

print("Preprocessing...")
preprocessing_start_time = datetime.datetime.now()
sequence_type = type_of_sequence.strip()
if sequence_type not in ALPHABETS:
    # Fail with a clear message instead of a NameError on `alpha` further down
    raise ValueError("Unknown type of sequence " + repr(type_of_sequence) +
                     ": expected 1 (DNA) or 2 (RNA)")
alpha = ALPHABETS[sequence_type]

bmh = BMH(source, target, alpha)
preprocessing_time = datetime.datetime.now() - preprocessing_start_time
print("Finished Prepocesing. Proprocessing took", preprocessing_time // ONE_MICROSECOND, "microseconds")

print()
print("Searching now...")
search_start_time = datetime.datetime.now()
result = bmh.search()
search_time = datetime.datetime.now() - search_start_time
print("The DNA segment can be found at the position(s) of "+str(result)+ " , there are/is "+str(len(result))+" occurrence(s) in total")
print("Search took", search_time // ONE_MICROSECOND, "microseconds")

Please enter the query sequence to be found: CTCTTGAAACTGCTCAAAATTCTG
Preprocessing...
Finished Prepocesing. Proprocessing took 0 microseconds

Searching now...
The DNA segment can be found at the position(s) of [1917] , there are/is 1 occurrence(s) in total
Search took 7972 microseconds


### 4. References
- https://www.youtube.com/watch?v=IdRUrQxVlf4