### Compute the Number of Times a Pattern Appears in a Text

In [0]:
def occurance_of_pattern(text, pattern):
  """Count how many times ``pattern`` occurs in ``text``, overlaps included.

  Every start index at which a window of ``len(pattern)`` characters still
  fits inside ``text`` is examined, so overlapping matches are each counted
  (for example 'GCG' is found twice in 'GCGCG').

  Args:
    text: the sequence to search in.
    pattern: the sequence to look for.

  Returns:
    The number of start positions whose window equals ``pattern``.
  """
  width = len(pattern)
  last_start = len(text) - width
  count = 0
  for start in range(last_start + 1):
    if text[start:start + width] == pattern:
      count += 1
  return count

In [0]:
occurance_of_pattern('GCGCG','GCG')

2

### Find the Most Frequent Words in a String

In [0]:
from collections import defaultdict

In [0]:
def most_frequent(text, k):
  """Return the most frequent substrings of length ``k`` in ``text``.

  Args:
    text: the sequence to scan.
    k: length of the substrings (k-mers) to count.

  Returns:
    A list of every k-mer whose occurrence count equals the maximum count,
    in order of first appearance in ``text``. The list is empty when
    ``text`` contains no window of length ``k`` (e.g. it is shorter than
    ``k``).
  """
  mp = defaultdict(int)
  for idx in range(len(text) - k + 1):
    mp[text[idx:idx+k]] += 1
  # default=0 keeps max() from raising ValueError when no k-mer was counted;
  # the comprehension below then yields an empty list.
  mx = max(mp.values(), default=0)
  return [key for key, val in mp.items() if val == mx]

In [0]:
print(*most_frequent('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4))

GCAT CATG


### Find the Reverse Complement of a String

In [0]:
def rev_complement(seq):
  """Return the reverse complement of the DNA string ``seq``.

  Each base is replaced by its partner (A<->T, C<->G) and the resulting
  sequence is then reversed.

  Args:
    seq: iterable of the characters 'A', 'C', 'G' and 'T'.

  Returns:
    The reverse-complemented sequence as a string.

  Raises:
    KeyError: if ``seq`` contains a character other than A, C, G or T.
  """
  pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
  complemented = []
  for base in seq:
    complemented.append(pairing[base])
  complemented.reverse()
  return ''.join(complemented)
  

In [0]:
print(rev_complement('AAAACCCGGT'))

ACCGGGTTTT


### Find All Occurrences of a Pattern in a String

In [0]:
def find_pos(pattern, text):
  """List every 0-based start index at which ``pattern`` occurs in ``text``.

  Overlapping occurrences are all reported.

  Args:
    pattern: the sequence to look for.
    text: the sequence to search in.

  Returns:
    The matching start positions in increasing order.
  """
  width = len(pattern)
  positions = []
  for start in range(len(text) - width + 1):
    if text[start:start + width] == pattern:
      positions.append(start)
  return positions

In [0]:
print(*find_pos('ATAT','GATATATGCATATACTT'))

1 3 9


### Find Patterns Forming Clumps in a String

In [0]:
def clump_find(seq, k, L, t):
  """Find the k-mers that form an (L, t)-clump in ``seq``.

  A k-mer forms a clump when it occurs at least ``t`` times inside some
  window of ``L`` consecutive characters of ``seq``. The counts for the
  first window are built once; the window is then slid one position at a
  time, removing the k-mer that leaves on the left and adding the one that
  enters on the right.

  Args:
    seq: the sequence to scan.
    k: length of the k-mers.
    L: length of the window; when it exceeds ``len(seq)`` the whole
      sequence is treated as the single window.
    t: minimum number of occurrences inside one window.

  Returns:
    The clump-forming k-mers, each listed once, in the order in which they
    were first detected. Empty when no window can hold a k-mer.
  """
  # A window can never be longer than the sequence itself; clamping avoids
  # counting truncated/empty slices past the end of ``seq``.
  window = min(L, len(seq))
  if window < k:
    # No complete k-mer fits inside a window, so nothing can form a clump.
    return []

  mp = defaultdict(int)
  ans = dict()  # used as an insertion-ordered set of detected k-mers

  # Count the k-mers of the first window.
  for idx in range(window - k + 1):
    kmer = seq[idx:idx+k]
    mp[kmer] += 1
    if mp[kmer] >= t:
      ans[kmer] = True

  start = 0          # start index of the left-most k-mer in the window
  end = window - k   # start index of the right-most k-mer in the window

  while end < len(seq) - k:
    # Drop the k-mer that slides out on the left (indexed by ``start``).
    mp[seq[start:start+k]] -= 1
    start += 1
    end += 1
    # Add the k-mer that slides in on the right; only its count grew, so it
    # is the only candidate that can newly reach the threshold.
    kmer = seq[end:end+k]
    mp[kmer] += 1
    if mp[kmer] >= t:
      ans[kmer] = True

  return list(ans)

In [0]:
print(*clump_find('CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC', 5, 75, 4))

GAAGA CGACA AATGT


### Find a Position in a Genome Minimizing the Skew

In [0]:
def skew(seq):
  """Return the positions at which the G-C skew of ``seq`` is minimal.

  The skew at position ``i`` is the number of 'G' minus the number of 'C'
  among the first ``i`` characters; position 0 (the empty prefix) has
  skew 0 and is included in the search.

  Args:
    seq: the sequence to scan.

  Returns:
    All prefix lengths (0 .. len(seq)) whose skew equals the minimum, in
    increasing order.
  """
  running = 0
  profile = [running]
  for base in seq:
    if base == 'G':
      running += 1
    elif base == 'C':
      running -= 1
    profile.append(running)

  lowest = min(profile)
  positions = []
  for pos, value in enumerate(profile):
    if value == lowest:
      positions.append(pos)
  return positions

In [0]:
print(*skew('CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG'))

53 97


### Compute the Hamming Distance Between Two Strings

In [0]:
def hamming_distance(seq1, seq2):
  """Count the positions at which ``seq1`` and ``seq2`` differ.

  The comparison walks over ``seq1`` and looks up the same index in
  ``seq2``; any characters of ``seq2`` beyond ``len(seq1)`` are ignored.

  Args:
    seq1: first sequence; its length drives the comparison.
    seq2: second sequence, indexed position by position.

  Returns:
    The number of mismatching positions.

  Raises:
    IndexError: if ``seq2`` is shorter than ``seq1``.
  """
  mismatches = 0
  for idx, ch in enumerate(seq1):
    if seq2[idx] != ch:
      mismatches += 1
  return mismatches

In [0]:
print(hamming_distance('GGGCCGTTGGT','GGACCGTTGAC'))

3


### Find All Approximate Occurrences of a Pattern in a String

In [0]:
def find_position_allowing_hamming(pattern, text, d):
  """List the start positions where ``pattern`` approximately occurs.

  A position matches when the window of ``len(pattern)`` characters
  starting there is within Hamming distance ``d`` of ``pattern``.

  Args:
    pattern: the sequence to look for.
    text: the sequence to search in.
    d: maximum number of mismatches allowed.

  Returns:
    The matching 0-based start positions in increasing order.
  """
  width = len(pattern)
  hits = []
  for pos in range(len(text) - width + 1):
    window = text[pos:pos + width]
    if hamming_distance(window, pattern) <= d:
      hits.append(pos)
  return hits

In [0]:
print(*find_position_allowing_hamming('ATTCTGGA','CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC', 3))

6 7 26 27 78


### Find the Most Frequent Words with Mismatches in a String

In [0]:
from itertools import product


In [0]:

def find_frequent(text, k, d):
  """Return the most frequent k-mers of ``text`` allowing ``d`` mismatches.

  Every possible k-mer over the alphabet A/C/G/T is compared against every
  length-``k`` window of ``text``; a window counts for a k-mer when their
  Hamming distance is at most ``d``. The candidate k-mers need not occur
  exactly in ``text``.

  Note: all 4**k candidates are enumerated, so this is only practical for
  small ``k``.

  Args:
    text: the sequence to scan.
    k: length of the k-mers.
    d: maximum number of mismatches allowed.

  Returns:
    The k-mers with the highest approximate-occurrence count, in
    lexicographic order. Empty when no candidate matches any window
    (e.g. ``text`` is shorter than ``k``).
  """
  all_gen = [''.join(x) for x in product('ACGT', repeat=k)]
  all_sub = [text[pos:pos+k] for pos in range(len(text)-k+1)]
  mp = defaultdict(int)
  for string in all_gen:
    for sub in all_sub:
      if hamming_distance(string, sub) <= d:
        mp[string] += 1
  # default=0 keeps max() from raising ValueError when nothing matched; the
  # comprehension below then yields an empty list.
  mx = max(mp.values(), default=0)
  return [key for key, val in mp.items() if val == mx]

In [0]:
print(*find_frequent('ACGTTGCATGTCGCATGATGCATGAGAGCT',4,1))

ATGC ATGT GATG


### Find Frequent Words with Mismatches and Reverse Complements

In [0]:
def find_frequent_with_rev(text, k, d):
  """Return the most frequent k-mers counting mismatches and reverse complements.

  For every possible k-mer over A/C/G/T the score is the number of
  length-``k`` windows of ``text`` within Hamming distance ``d`` of the
  k-mer, plus the number of windows within distance ``d`` of the k-mer's
  reverse complement.

  Note: all 4**k candidates are enumerated, so this is only practical for
  small ``k``.

  Args:
    text: the sequence to scan.
    k: length of the k-mers.
    d: maximum number of mismatches allowed.

  Returns:
    The k-mers with the highest combined score. Empty when no candidate
    matches any window (e.g. ``text`` is shorter than ``k``).
  """
  all_gen = [''.join(x) for x in product('ACGT', repeat=k)]
  all_sub = [text[pos:pos+k] for pos in range(len(text)-k+1)]
  mp = defaultdict(int)
  # Pass 1: approximate occurrences of the k-mer itself.
  for string in all_gen:
    for sub in all_sub:
      if hamming_distance(string, sub) <= d:
        mp[string] += 1
  # Pass 2: approximate occurrences of its reverse complement. Kept as a
  # separate pass so the insertion order of ``mp`` (and thus the order of
  # the result) is unchanged.
  for string in all_gen:
    # The reverse complement does not depend on ``sub``; compute it once
    # per candidate instead of once per window.
    rc = rev_complement(string)
    for sub in all_sub:
      if hamming_distance(rc, sub) <= d:
        mp[string] += 1
  # default=0 keeps max() from raising ValueError when nothing matched.
  mx = max(mp.values(), default=0)
  return [key for key, val in mp.items() if val == mx]

In [0]:
print(*find_frequent_with_rev('ACGTTGCATGTCGCATGATGCATGAGAGCT',4,1))

ACAT ATGT


### Generate the Frequency Array of a String

In [0]:
def find_frequency_array(text, k):
  """Return the frequency array of k-mers in ``text``.

  Entry ``i`` of the result is the number of occurrences (overlaps
  included) of the ``i``-th k-mer over A/C/G/T in lexicographic order.
  Windows of ``text`` that are not made solely of A/C/G/T do not
  contribute to any entry.

  Args:
    text: the sequence to scan.
    k: length of the k-mers.

  Returns:
    A list of 4**k counts.
  """
  k_mers = [''.join(letters) for letters in product('ACGT', repeat=k)]
  # Start every possible k-mer at zero so absent ones are reported as 0.
  counts = dict.fromkeys(k_mers, 0)
  for start in range(len(text) - k + 1):
    window = text[start:start + k]
    if window in counts:
      counts[window] += 1
  return [counts[kmer] for kmer in k_mers]

In [0]:
print(*find_frequency_array('ACGCGGCTCTGAAA',2))

2 1 0 0 0 0 2 2 1 2 1 0 0 1 1 0


### Implement PatternToNumber

In [0]:
def p2num(text):
  """Convert a DNA pattern to its index in lexicographic k-mer order.

  The pattern is read as a base-4 number with digits A=0, C=1, G=2, T=3,
  most significant digit first.

  Args:
    text: any iterable of the characters 'A', 'C', 'G' and 'T'.

  Returns:
    The integer encoded by the pattern (0 for an empty pattern).

  Raises:
    KeyError: if ``text`` contains a character other than A, C, G or T.
  """
  mp = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
  ans = 0
  for ch in text:
    # Shift the digits accumulated so far one base-4 place to the left.
    ans = ans * 4 + mp[ch]
  return ans

In [0]:
p2num('AGT')

11

### Implement NumberToPattern

In [0]:
def num2p(val,k):
  """Convert the number ``val`` to the DNA pattern of length ``k`` it encodes.

  ``val`` is written in base 4 using exactly ``k`` digits (A=0, C=1, G=2,
  T=3), most significant digit first. Only the ``k`` least significant
  base-4 digits of ``val`` are used.

  Args:
    val: the number to decode.
    k: length of the pattern to produce.

  Returns:
    The decoded pattern as a string of length ``k``.
  """
  symbols = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
  digits = []
  remaining = val
  for _ in range(k):
    # Peel off the least significant base-4 digit.
    remaining, digit = divmod(remaining, 4)
    digits.append(symbols[digit])
  # Digits were collected least significant first.
  digits.reverse()
  return ''.join(digits)

In [0]:
print(num2p(45,4))

AGTC
