In [0]:
from itertools import product
from collections import Counter
import numpy as np

# Implement MotifEnumeration

In [0]:
def hamming_distance(s1, s2):
  """Count the positions at which s1 and s2 hold different symbols.

  Every position of s1 is compared with the same index of s2, so s2 must be
  at least as long as s1 (a shorter s2 raises IndexError); symbols of s2
  beyond len(s1) are never looked at.
  """
  mismatches = 0
  for pos, symbol in enumerate(s1):
    if symbol != s2[pos]:
      mismatches += 1
  return mismatches

In [0]:
def substr_match_with_mismatch(text, pattern, k, d):
  """Tell whether some length-k window of text is within d mismatches of pattern.

  Windows are taken at every start index from 0 to len(text) - k. Returns
  True as soon as one window has hamming_distance(window, pattern) <= d,
  and False when no window qualifies (including when text is shorter than k).
  """
  last_start = len(text) - k
  return any(
    hamming_distance(text[start:start+k], pattern) <= d
    for start in range(last_start + 1)
  )

In [0]:
def k_d_motif_finder(k,d, Dna):
  """List every k-mer over 'ACGT' that occurs in each string of Dna with at most d mismatches.

  All 4**k candidate k-mers are tried in the order itertools.product yields
  them, and a candidate is kept only when substr_match_with_mismatch accepts
  it for every string in Dna. The result preserves that enumeration order.
  """
  candidates = (''.join(letters) for letters in product('ACGT', repeat=k))
  return [
    candidate
    for candidate in candidates
    if all(substr_match_with_mismatch(s, candidate, k, d) for s in Dna)
  ]

# Find a Median String

In [0]:
def get_score(k, Dna, k_mer):
  """Total distance between k_mer and a collection of strings.

  For each string in Dna the smallest Hamming distance between k_mer and any
  length-k window of that string is taken; the function returns the sum of
  those minima.

  Args:
    k: window length.
    Dna: iterable of strings; they may have different lengths.
    k_mer: the pattern compared against each window.

  Returns:
    The summed minimum distances (0 when Dna is empty).

  Raises:
    ValueError: if a string is shorter than k, because it has no length-k
      window to take a minimum over.
  """
  score = 0
  for s in Dna:
    # The number of windows is derived from this string's own length. Using
    # the first string's length for every string skipped trailing windows of
    # longer strings and compared truncated slices of shorter ones.
    score += min(hamming_distance(s[idx:idx+k], k_mer) for idx in range(len(s)-k+1))
  return score

In [0]:
def median_string(k, Dna):
  """Return a k-mer over 'ACGT' with the lowest get_score against Dna.

  Candidates are enumerated with itertools.product; when several share the
  lowest score, the one enumerated first is returned.
  """
  candidates = [''.join(letters) for letters in product('ACGT', repeat = k)]
  return min(candidates, key = lambda candidate: get_score(k, Dna, candidate))

In [0]:
# Run median_string with k = 3 on five 12-character strings and print the result.
print(median_string(
  3,
  [
    'AAATTGACGCAT',
    'GACGACCACGTT',
    'CGTCAGCGCCTG',
    'GCTGAGCACCGG',
    'AGTACGGGACAG',
  ],
))

ACG


# Find a Profile-most Probable k-mer in a String

In [0]:
def score_for_profile(text, k, profile):
  """Multiply together the profile entries selected by the symbols of text.

  profile is indexed as profile[row][column], with rows 0..3 standing for
  'A', 'C', 'G', 'T' and the column being the symbol's position in text.
  The k argument is accepted but not used. An empty text yields 1; a symbol
  outside 'ACGT' raises KeyError.
  """
  row_of = {'A':0, 'C':1, 'G':2, 'T':3}
  probability = 1
  for col, nucleotide in enumerate(text):
    row = profile[row_of[nucleotide]]
    probability = probability * row[col]
  return probability

In [0]:
def profile_most_k_mer(text, k, profile):
  """Return the length-k window of text with the highest score_for_profile value.

  Windows are scanned left to right; max() keeps the earliest window when
  several tie for the highest score. A text shorter than k has no windows,
  so max() raises ValueError.
  """
  windows = [text[start:start+k] for start in range(len(text)-k+1)]
  return max(windows, key = lambda window: score_for_profile(window, k, profile))

In [0]:
# Input layout read below: line 1 is the text, line 2 is k, and each
# remaining line is one whitespace-separated row of profile values.
with open('rosalind_ba2c.txt','r') as file:
  data = file.read().splitlines()

text = data[0]
k = int(data[1])
profile = []
for line in data[2:]:
  profile.append(list(map(float, line.split())))

print(profile_most_k_mer(text, k, profile))

# Implement GreedyMotifSearch

In [0]:
def motif_set_score(k,t,motif_set):
  """Score the first t motifs by how far each column is from unanimous.

  For each of the first k columns, the symbols of rows 0..t-1 are tallied
  and the column contributes t minus the count of its most frequent symbol.
  The score is the sum of those contributions, so lower is better.
  """
  def column_penalty(col):
    tally = Counter(motif_set[row][col] for row in range(t))
    return t - max(tally.values())

  return sum(column_penalty(col) for col in range(k))

In [0]:
def build_profile_matrix(k,t,motif_set):
  """Build a 4 x k profile from motif_set as a nested list of floats.

  Rows 0..3 stand for 'A', 'C', 'G', 'T'. Each cell holds the number of
  motifs with that symbol in that column, plus a pseudocount of 1, divided
  by t + 4. The divisor comes from the t argument, not from len(motif_set).
  """
  row_of = {'A':0,'C':1,'G':2,'T':3}
  # Starting from ones supplies the pseudocount of 1 for every cell.
  counts = np.ones((4,k))
  for motif in motif_set:
    for col, nucleotide in enumerate(motif):
      counts[row_of[nucleotide], col] += 1
  return (counts / (t+4)).tolist()

In [0]:
def greedy_motif_search(k,t,Dna):
  """Greedy search for one k-mer per string, over the first t strings of Dna.

  Each length-k window of Dna[0] is tried as the first motif. The motif for
  each later string Dna[1] .. Dna[t-1] is then chosen with profile_most_k_mer
  against a profile built from the motifs picked so far. The candidate set
  with the lowest motif_set_score replaces the current best; the initial best
  is the leading k-mer of each of the first t strings.

  Args:
    k: motif length.
    t: number of strings from Dna that take part in the search.
    Dna: indexable, sliceable sequence of strings.

  Returns:
    A list with one motif per participating string.

  Raises:
    IndexError: if Dna is empty, or has fewer than t strings when at least
      one window of Dna[0] is searched.
  """
  # Only the first t strings are searched, so the fallback answer is limited
  # to them too; otherwise surplus entries in Dna (e.g. trailing lines from an
  # input file) would leak into the returned list.
  best_motif_set = [s[0:k] for s in Dna[:t]]
  # Scored on first use and then kept up to date, not recomputed every pass.
  best_score = None
  n = len(Dna[0])
  for idx in range(n-k+1):
    motif_set = [Dna[0][idx:idx+k]]
    for i in range(1,t):
      # NOTE(review): t, not len(motif_set), is passed as the profile size.
      # That scales every profile entry by the same factor, which should not
      # change which k-mer is most probable - confirm before relying on the
      # absolute values.
      profile_matrix = build_profile_matrix(k,t,motif_set)
      motif_set.append(profile_most_k_mer(Dna[i], k, profile_matrix))
    score = motif_set_score(k,t,motif_set)
    if best_score is None:
      best_score = motif_set_score(k,t,best_motif_set)
    if score < best_score:
      # motif_set is rebuilt on every pass, so it can be kept without copying.
      best_motif_set, best_score = motif_set, score
  return best_motif_set

In [0]:
# Input layout read below: line 1 holds k and t, the remaining lines are
# the strings handed to greedy_motif_search.
with open('rosalind_ba2d.txt','r') as file:
  data = file.read().splitlines()

k, t = (int(field) for field in data[0].split())
Dna = data[1:]
ans = greedy_motif_search(k, t, Dna)
for s in ans:
  print(s)

# Implement RandomizedMotifSearch

In [0]:
"""
from random import randint
from random import seed
import numpy as np
from collections import Counter


def build_profile_matrix(motifs, k, t):
    mp = {'A':0,'C':1,'G':2,'T':3}
    matrix = np.zeros((4,k))
    for s in motifs:
        for col, ch in enumerate(s):
            matrix[mp[ch]][col] += 1
    matrix += 1
    matrix /= (t + 4)
    return matrix.tolist()

def get_probability(k_mer, profile):
    mp = {'A':0,'C':1,'G':2,'T':3}
    score = 1.0
    for idx, ch in enumerate(k_mer):
        score *= profile[mp[ch]][idx]
    return score

def profile_most_k_mer(text, profile, k):
    n = len(text)
    scores = [(text[idx:idx+k],get_probability(text[idx:idx+k],profile)) for idx in range(n-k+1)]
    return max(scores, key = lambda x: x[1])[0]

def score_motif_set(motifs, k, t):
    score = 0
    for idx in range(k):
        cnt = Counter([s[idx] for s in motifs])
        score += t - max(cnt.values())
    return score

def random_motif_search(Dna, k, t):
    motifs = []
    n = len(Dna[0])
    for text in Dna:
        idx = randint(0,n-k)
        motifs.append(text[idx:idx+k])
    while True:
        profile = build_profile_matrix(motifs, k, t)
        new_motifs = [profile_most_k_mer(text, profile, k) for text in Dna]
        if score_motif_set(new_motifs,k,t) < score_motif_set(motifs, k, t):
            motifs = new_motifs.copy()
        else:
            return motifs

def main():
    k, t = map(int, input().split())
    Dna = [input() for _ in range(t)]
    ans = []
    for _ in range(1000):
        motifs = random_motif_search(Dna, k, t)
        ans.append((motifs, score_motif_set(motifs, k, t)))
    motif_set = min(ans, key = lambda x : x[1])[0]
    for x in motif_set:
        print(x)



if __name__=='__main__':
    main()
"""

# Implement DistanceBetweenPatternAndStrings

In [0]:
def min_hamming_distance(text, pattern):
  """Smallest mismatch count between pattern and any equally long window of text.

  Windows start at every index from 0 to len(text) - len(pattern). When text
  is shorter than pattern there are no windows and min() raises ValueError.
  """
  width = len(pattern)
  mismatch_counts = []
  for start in range(len(text) - width + 1):
    window = text[start:start+width]
    # Each window has exactly `width` symbols, so pairing it with pattern
    # position by position covers the whole window.
    mismatch_counts.append(sum(1 for a, b in zip(window, pattern) if a != b))
  return min(mismatch_counts)

In [0]:
def distance_between_pattern(pattern, strings):
  """Add up min_hamming_distance(text, pattern) over every text in strings."""
  total = 0
  for text in strings:
    total += min_hamming_distance(text, pattern)
  return total


In [0]:
# Evaluate distance_between_pattern for 'AAA' against five 10-character strings;
# as the cell's last expression, the value is shown as the cell output.
distance_between_pattern(
  'AAA',
  [
    'TTACCTTAAC',
    'GATATCTGTC',
    'ACGGCGTTCG',
    'CCCTAAAGAG',
    'CGTCAGAGGT',
  ],
)

5