In [0]:
from collections import defaultdict
import copy

# Implement MotifEnumeration

In [0]:
def hamming_distance(seq1, seq2):
  return sum(1 for idx,ch in enumerate(seq1) if seq2[idx] != ch)

In [0]:
def is_ok(seq, motif, k, d):
  return any(hamming_distance(seq[idx:idx+k],motif) <= d  for idx in range(len(seq) - k + 1))


In [0]:
from itertools import product

In [0]:
def motif_enumeration(k, d, Dna):
  all_k_mers = [''.join(k_mer) for k_mer in product('ACGT', repeat=k)]
  ans = []
  for k_mer in all_k_mers:
    if all(is_ok(s,k_mer, k, d) for s in Dna):
      ans.append(k_mer)
    #endif
  #endfor
  return ans

In [0]:
print(*motif_enumeration(3, 1 , ['ATTTGGC','TGCCTTA','CGGTATC','GAAAATT']))

ATA ATT GTT TTT


# Find a Median String

In [0]:
def score_calculate(k, k_mer, Dna):
  score = 0
  for d in Dna:
    all_substring = [d[idx:idx+k] for idx in range(len(d) - k + 1)]
    score += min(hamming_distance(k_mer, s) for s in all_substring)
  #endfor
  return score


In [0]:
def find_median(k, Dna):
  k_mers = [''.join(x) for x in product('ACGT', repeat=k)]
  scores = [(k_mer, score_calculate(k, k_mer, Dna)) for k_mer in k_mers]
  return min(scores, key = lambda x: x[1])[0]

In [0]:
print(find_median(3, ['AAATTGACGCAT','GACGACCACGTT','CGTCAGCGCCTG','GCTGAGCACCGG','AGTACGGGACAG']))

ACG


# Find a Profile-most Probable k-mer in a String

In [0]:
def get_most_probable_k_mer(seq, k, profile_matrix):
  mp = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
  ans = []
  for idx in range(len(seq)-k+1):
    s = seq[idx:idx+k]
    score = 1.0
    for col, ch in enumerate(s):
      row = mp[ch]
      score *= profile_matrix[row][col]
    #endfor
    ans.append((score, idx))
  #endfor
  
  score, index = max(ans, key = lambda x: (x[0], -x[1]))
  return seq[index:index+k]

In [0]:
print(get_most_probable_k_mer('ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT', 5, [[0.2, 0.2, 0.3, 0.2, 0.3],[0.4, 0.3, 0.1, 0.5, 0.1],[0.3, 0.3, 0.5, 0.2, 0.4],[0.1, 0.2, 0.1, 0.1 ,0.2]]))

CCGAG


# Implement GreedyMotifSearch

In [0]:
def calculate_motif_set_score(t, k, motif_set):
  score = 0
  for idx in range(k):
    mp = defaultdict(int)
    for s in motif_set:
      mp[s[idx]] += 1
    score += (t - max([val for key,val in mp.items()]))
  return score

In [0]:
def get_profile(k, motifs):
  t = len(motifs)

  profile = [[], [], [], []]
  mp = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
  for i in range(k):
    for key in mp:
      val = sum(1 for x in motifs if x[i] == key)
      profile[mp[key]].append(val/t)
    #endfor
  #endfor
  return profile

In [0]:
def find_and_add_in_motif(k, seq, temp_motifs):
  profile = get_profile(k, temp_motifs)
  temp_motifs.append(get_most_probable_k_mer(seq, k, profile))


In [0]:
def greedy_motif_search(k, t, Dna):
  best_motifs = [d[0:k] for d in Dna]
  print()
  for idx in range(len(Dna[0]) - k +1):
    k_mer = Dna[0][idx:idx+k]
    temp_motifs = [k_mer]
    for i in range(1,t):
      find_and_add_in_motif(k, Dna[i], temp_motifs)
    if calculate_motif_set_score(t, k, best_motifs) > calculate_motif_set_score(t, k, temp_motifs):
      best_motifs = copy.copy(temp_motifs)
    #endfor
  #endfor
  return best_motifs


In [0]:
ans = greedy_motif_search(3,5,['GGCGTTCAGGCA','AAGAATCAGTCA','CAAGGAGTTCGC','CACGTCAATCAC','CAATAATATTCG'])




In [0]:
for x in ans:
  print(x)

CAG
CAG
CAA
CAA
CAA


# Implement GreedyMotifSearch with Pseudocounts

In [0]:
def get_profile_with_pseudo(k, motifs):
  t = len(motifs)

  profile = [[], [], [], []]
  mp = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
  for i in range(k):
    for key in mp:
      val = sum(1 for x in motifs if x[i] == key)
      val += 1
      profile[mp[key]].append(val/(t+4))
    #endfor
  #endfor
  return profile

In [0]:
def find_and_add_in_motif_pseudo(k, seq, temp_motifs):
  profile = get_profile_with_pseudo(k, temp_motifs)
  temp_motifs.append(get_most_probable_k_mer(seq, k, profile))

In [0]:
def greedy_motif_search_with_pseudo(k, t, Dna):
  best_motifs = [d[0:k] for d in Dna]
  print()
  for idx in range(len(Dna[0]) - k +1):
    k_mer = Dna[0][idx:idx+k]
    temp_motifs = [k_mer]
    for i in range(1,t):
      find_and_add_in_motif_pseudo(k, Dna[i], temp_motifs)
    if calculate_motif_set_score(t, k, best_motifs) > calculate_motif_set_score(t, k, temp_motifs):
      best_motifs = copy.copy(temp_motifs)
    #endfor
  #endfor
  return best_motifs

In [0]:
ans = greedy_motif_search_with_pseudo(3,5,['GGCGTTCAGGCA','AAGAATCAGTCA','CAAGGAGTTCGC','CACGTCAATCAC','CAATAATATTCG'])





In [0]:
for x in ans:
  print(x)

TTC
ATC
TTC
ATC
TTC


# Implement RandomizedMotifSearch 

In [0]:
import random
random.seed(31)

In [0]:
def random_motif_search(Dna, k, t):
  n = len(Dna[0])
  motifs = []
  for idx,motif in enumerate(Dna):
    r = random.randint(0,n-k)
    motifs.append(motif[r:r+k])
  #endfor
  while True:
    profile = get_profile_with_pseudo(k, motifs)
    temp = [get_most_probable_k_mer(x, k, profile) for x in Dna]
    if calculate_motif_set_score(t, k, motifs) > calculate_motif_set_score(t, k, temp):
      motifs = temp.copy()
    else:
      return motifs
  #endwhile

In [0]:
score = 100000000000000000
ans = None

In [0]:
for _ in range(1000):
  motif_set = random_motif_search(['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA','GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG','TAGTACCGAGACCGAAAGAAGTATACAGGCGT','TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'], 8, 5)
  if score > calculate_motif_set_score(5, 8, motif_set):
    score = calculate_motif_set_score(5, 8, motif_set)
    ans = motif_set.copy()

In [0]:
for x in ans:
  print(x)

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


# Implement GibbsSampler

In [0]:
def get_profile_for_gibs(k, i, motifs):
  
  del motifs[i]
  t = len(motifs)

  profile = [[], [], [], []]
  mp = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
  for i in range(k):
    for key in mp:
      val = sum(1 for x in motifs if x[i] == key)
      val += 1
      profile[mp[key]].append(val/(t+4))
    #endfor
  #endfor
  return profile

In [0]:
def gibss_sampler(k, t, N, Dna):
  n = len(Dna[0])
  motifs = []
  for idx,motif in enumerate(Dna):
    r = random.randint(0,n-k)
    motifs.append(motif[r:r+k])
  #endfor
  for _ in range(N):
    i = random.randint(0, len(motifs)-1)
    profile = get_profile_for_gibs(k, i, motifs.copy())
    temp = motifs.copy()
    temp[i] = get_most_probable_k_mer(Dna[i], k, profile)
    if(calculate_motif_set_score(t, k, temp) < calculate_motif_set_score(t, k, motifs)):
      motifs = temp.copy()
  #endfor
  return motifs

In [0]:
ans = gibss_sampler(8,5,100,['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA','GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG','TAGTACCGAGACCGAAAGAAGTATACAGGCGT','TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'])

In [0]:
for x in ans:
  print(x)

GTAAACGG
GTGTAAGT
ATACAGGC
GTGCACGT
CTCCACGT


# Implement DistanceBetweenPatternAndStrings

In [0]:
def calculate_total_distance(pattern, Dna):
  k = len(pattern)
  distance = 0
  for s in Dna:
    distance += min(hamming_distance(pattern, s[i:i+k]) for i in range(len(s)-k+1))
  return distance

In [0]:
calculate_total_distance('AAA', ['TTACCTTAAC','GATATCTGTC','ACGGCGTTCG','CCCTAAAGAG','CGTCAGAGGT'])

5