# Naive exact matching plus reverse complement

The goal is to implement the naive exact matching algorithm that is strand-aware. Meaning, it will look for occurrences of the target sequence in the genome, along with the reverse complement of the sequence. 

In [5]:
def readFastq(filename):
    '''Parses read and quality strings from FASTQ file format.'''
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities


In [34]:
def reverseComplement(sequence):
    '''Creates reverse complement sequence from input sequence'''
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    reversed_seq = ''
    for base in sequence:
        reversed_seq = complement[base] + reversed_seq
    return reversed_seq

In [7]:
def readGenome(filename):
    '''Parses DNA sequence from FASTA file format.'''
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome

In [18]:
def naive_with_rc(pattern, text):
    '''Execute exact matching algorithm on given pattern and its reverse complement.'''
    occurrences = []
    rev_comp = reverseComplement(p)
    for i in range(len(text) - len(pattern) + 1):  # loop over alignments
        match = True  # initialize to true
        for j in range(len(pattern)):  # loop over characters
            if text[i+j] != pattern[j] and text[i+j] != rev_comp[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences

In [19]:
# test scenario 1
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)

[10, 23]


In [20]:
# test scenario 2
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)

[10, 24]


In [25]:
# test scenario 3
# Phi-X genome
!wget http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa

--2022-03-07 01:14:38--  http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa
Resolving d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)... 54.239.153.90, 54.239.153.155, 54.239.153.105, ...
Connecting to d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)|54.239.153.90|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5528 (5.4K) [application/octet-stream]
Saving to: ‘phix.fa’


2022-03-07 01:14:38 (11.2 MB/s) - ‘phix.fa’ saved [5528/5528]



In [29]:
phix_genome = readGenome('phix.fa')

In [30]:
occurrences = naive_with_rc('ATTA', phix_genome)

In [32]:
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

offset of leftmost occurrence: 7
# occurrences: 285


In [33]:
# download and parse lambda virus
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

--2022-03-07 01:17:51--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 18.67.79.175, 18.67.79.43, 18.67.79.64, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|18.67.79.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: ‘lambda_virus.fa’


2022-03-07 01:17:51 (2.58 MB/s) - ‘lambda_virus.fa’ saved [49270/49270]

