## Random Motif Search

In [0]:
import random
from collections import defaultdict

In [0]:
random.seed(335511)

In [0]:
def get_profile_matrix(k, motif_set):
  """Build a 4 x k profile matrix with Laplace (+1) pseudocounts.

  Args:
    k: Number of columns (motif length) to profile.
    motif_set: Sequence of strings, each at least k characters long.

  Returns:
    A list of four rows in A, C, G, T order. Entry [row][col] is
    (occurrences of that base in column col + 1) / (4 + len(motif_set)).
  """
  denominator = 4 + len(motif_set)
  return [
      [
          (sum(1 for motif in motif_set if motif[col] == base) + 1) / denominator
          for col in range(k)
      ]
      for base in 'ACGT'
  ]

In [0]:
def get_most_probable_k_mer(k, seq, profile_matrix):
  """Return the k-mer of ``seq`` that is most probable under ``profile_matrix``.

  Args:
    k: Length of the k-mers to scan.
    seq: String over the alphabet A/C/G/T.
    profile_matrix: Four rows in A, C, G, T order, each indexable by
      column 0..k-1.

  Returns:
    The first k-mer attaining the highest probability. If every k-mer has
    probability 0 the first k-mer of ``seq`` is returned. An empty string is
    returned only when ``seq`` contains no k-mer (it is shorter than k).

  Raises:
    KeyError: If a scanned k-mer contains a character other than A, C, G, T.
  """
  row_of = {
      'A' : 0,
      'C' : 1,
      'G' : 2,
      'T' : 3
  }
  best_kmer = ''
  # Start below any attainable probability so the first k-mer is always
  # accepted, even when the profile gives probability 0 to every k-mer.
  best_prob = -1.0
  for start in range(len(seq)-k+1):
    kmer = seq[start:start+k]
    prob = 1.0
    # Distinct index names: the column index must not shadow the window start.
    for col, ch in enumerate(kmer):
      prob = prob * profile_matrix[row_of[ch]][col]
    # Strict comparison keeps the earliest k-mer on ties.
    if prob > best_prob:
      best_prob = prob
      best_kmer = kmer
  return best_kmer

In [0]:
def calculate_motif_set_score(t, k, motif_set):
  """Score a motif set; lower is better.

  For each of the first k columns, adds ``t`` minus the count of the most
  frequent symbol in that column across ``motif_set``.

  Args:
    t: Value the per-column majority count is subtracted from.
    k: Number of columns to score.
    motif_set: Strings that are each at least k characters long.

  Returns:
    The summed per-column score as an int.
  """
  total = 0
  for col in range(k):
    column = [motif[col] for motif in motif_set]
    majority = max(column.count(symbol) for symbol in set(column))
    total += t - majority
  return total

In [0]:
def random_motif_search(k, t, Dna):
  """Run one randomized motif search from a random starting motif set.

  Starts from one uniformly random k-mer per sequence, then repeatedly
  rebuilds the profile from the current motifs and replaces them with the
  profile-most-probable k-mer of every sequence, until the score stops
  improving.

  Args:
    k: Motif length.
    t: Value passed through to ``calculate_motif_set_score``.
    Dna: List of DNA strings; each must be at least k characters long.

  Returns:
    The list of k-mers (one per sequence) at which the score stopped
    improving, or an empty list when ``Dna`` is empty.
  """
  if not Dna:
    return []

  motifs = []
  for seq in Dna:
    # Use this sequence's own length: sequences need not all be as long as
    # Dna[0], and a start position past the end would give a truncated k-mer.
    start = random.randint(0, len(seq)-k)
    motifs.append(seq[start:start+k])
  #endfor

  # Cache the current score so it is not recomputed on every iteration.
  best_score = calculate_motif_set_score(t, k, motifs)
  while True:
    profile_matrix = get_profile_matrix(k, motifs)
    candidates = [get_most_probable_k_mer(k, seq, profile_matrix) for seq in Dna]
    candidate_score = calculate_motif_set_score(t, k, candidates)
    if candidate_score >= best_score:
      return motifs
    #endif
    motifs = candidates
    best_score = candidate_score
  #endwhile

In [0]:
#random_motif_search(8, 5, ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA','GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG','TAGTACCGAGACCGAAAGAAGTATACAGGCGT','TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'])

['CCCCTCTC', 'CAAGGTGC', 'GAAGTATA', 'CAAGTTTC', 'CCACGTGC']

In [0]:
# Best (lowest) motif-set score seen so far and the motif set that produced it.
# The starting score is a large sentinel so the first search result replaces it.
score, ans = 100000000000000000, None

In [0]:
# Repeat the randomized search and keep the lowest-scoring motif set.
for _ in range(1000):
  motif_set = random_motif_search(8, 5, ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA','GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG','TAGTACCGAGACCGAAAGAAGTATACAGGCGT','TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'])
  motif_set_score = calculate_motif_set_score(5, 8, motif_set)
  if motif_set_score < score:
    score = motif_set_score
    ans = motif_set.copy()
  

In [0]:
# Show the best motif set found, one k-mer per line.
best_motifs = ans
for motif in best_motifs:
  print(motif)

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


## Gibbs Sampler

In [0]:
def gibbs_sampler(Dna, k, t, N):
  """Search for a low-scoring motif set with Gibbs sampling.

  Starts from one uniformly random k-mer per sequence. On each of N
  iterations one sequence index is chosen at random, a profile (with
  pseudocounts) is built from the motifs of all other sequences, and the
  chosen sequence's motif is replaced by a k-mer drawn at random with
  probability proportional to its profile probability. The best-scoring
  motif set seen at any point is remembered and returned.

  Args:
    Dna: List of DNA strings over A/C/G/T; each at least k characters long.
    k: Motif length.
    t: Number of sequences; indices 0..t-1 are resampled, so it must not
      exceed len(Dna). Also passed through to ``calculate_motif_set_score``.
    N: Number of sampling iterations.

  Returns:
    The lowest-scoring list of k-mers encountered (one per sequence), or an
    empty list when ``Dna`` is empty.
  """
  if not Dna:
    return []

  motifs = []
  for seq in Dna:
    # Use this sequence's own length rather than assuming all match Dna[0].
    start = random.randint(0, len(seq)-k)
    motifs.append(seq[start:start+k])

  row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
  best_motifs = motifs.copy()
  best_score = calculate_motif_set_score(t, k, best_motifs)

  for _ in range(N):
    i = random.randint(0, t-1)
    # Profile from every motif except the one being resampled.
    profile_matrix = get_profile_matrix(k, motifs[:i] + motifs[i+1:])

    # Profile probability of every k-mer in the chosen sequence.
    seq = Dna[i]
    weights = []
    for start in range(len(seq)-k+1):
      prob = 1.0
      for col, ch in enumerate(seq[start:start+k]):
        prob = prob * profile_matrix[row_of[ch]][col]
      weights.append(prob)

    # Roulette-wheel draw: pick a start position with probability
    # proportional to its weight. The default guards against float round-off
    # leaving the threshold at or above the final cumulative sum.
    threshold = random.random() * sum(weights)
    chosen = len(weights) - 1
    cumulative = 0.0
    for start, weight in enumerate(weights):
      cumulative += weight
      if threshold < cumulative:
        chosen = start
        break
    motifs[i] = seq[chosen:chosen+k]

    # The chain may wander to worse sets; only the best one seen is kept.
    current_score = calculate_motif_set_score(t, k, motifs)
    if current_score < best_score:
      best_score = current_score
      best_motifs = motifs.copy()
  #ENDFOR
  return best_motifs

In [0]:
# gibbs_sampler takes (Dna, k, t, N): the sequence list comes first.
gibbs_sampler(['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA','GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG','TAGTACCGAGACCGAAAGAAGTATACAGGCGT','TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'], 8, 5, 1000)

# Generate the k-mer Composition of a String

In [0]:
def fun(k, seq):
  """Return the k-mer composition of ``seq``.

  Args:
    k: Substring length.
    seq: The string to decompose.

  Returns:
    A sorted list of every length-k substring of ``seq``, duplicates included.
  """
  kmers = []
  for start in range(len(seq) - k + 1):
    kmers.append(seq[start:start + k])
  kmers.sort()
  return kmers

In [0]:
# Print the 5-mer composition of the sample string, one k-mer per line.
kmer_composition = fun(5, 'CAATCCAAC')
for s in kmer_composition:
  print(s)

AATCC
ATCCA
CAATC
CCAAC
TCCAA


# String Spelled by a Genome Path Problem

In [0]:
def fun(seq):
  """Spell the string described by a genome path.

  Args:
    seq: Non-empty list of strings in path order.

  Returns:
    The first string followed by the last character of each later string.
  """
  head = seq[0]
  tail = ''.join(kmer[-1] for kmer in seq[1:])
  return head + tail

In [0]:
# Spell the genome for the sample path.
genome_path = ['ACCGA','CCGAA','CGAAG','GAAGC','AAGCT']
print(fun(genome_path))

ACCGAAGCT


# Construct the Overlap Graph of a Collection of k-mers

In [0]:
def fun(seq):
  """List the edges of the overlap graph of a collection of strings.

  An edge ``a -> b`` is reported when ``b`` starts with ``a`` minus its first
  character. A string is never paired with itself (same list position).

  Args:
    seq: List of strings.

  Returns:
    A list of 'a -> b' strings, ordered by source position and then by
    target position.
  """
  return [
      source + ' -> ' + target
      for i, source in enumerate(seq)
      for j, target in enumerate(seq)
      if i != j and target.startswith(source[1:])
  ]

In [0]:
# Print every overlap-graph edge of the sample k-mers, one per line.
overlap_edges = fun(['ATGCG','GCATG','CATGC','AGGCA','GGCAT'])
for s in overlap_edges:
  print(s)

GCATG -> CATGC
CATGC -> ATGCG
AGGCA -> GGCAT
GGCAT -> GCATG


# Construct the De Bruijn Graph of a String

In [0]:
def fun(k, seq):
  """Build the de Bruijn graph of ``seq`` as an adjacency mapping.

  Args:
    k: Length of the k-mers taken from ``seq``.
    seq: The string to decompose.

  Returns:
    A ``defaultdict(list)`` mapping each k-mer's prefix (all but its last
    character) to the list of matching suffixes (all but the first
    character), in order of appearance in ``seq``.
  """
  graph = defaultdict(list)
  for start in range(len(seq) - k + 1):
    kmer = seq[start:start + k]
    graph[kmer[:-1]].append(kmer[1:])
  return graph

In [0]:
# Build the de Bruijn graph of the sample string and print its adjacency list.
ans = fun(4,'AAGATTCTCTAC')

for key, successors in ans.items():
  print('{} -> {}'.format(key, ','.join(successors)))

AAG -> AGA
AGA -> GAT
GAT -> ATT
ATT -> TTC
TTC -> TCT
TCT -> CTC,CTA
CTC -> TCT
CTA -> TAC
