In [1]:
import numpy as np
from numpy.random import randint
from numpy.random import choice
from math import pow
from fractions import Fraction

In [2]:
def GibbsSampler(Dna, k, t, N):
  """Search for a low-scoring collection of k-mers, one per string, by Gibbs sampling.

  Starts from one random k-mer per string, then repeats N times: pick a random
  index i in [0, t), remove motif i, build a profile from the remaining motifs,
  and draw a replacement k-mer for string i from that profile.

  Parameters:
    Dna: list of DNA strings.
    k: length of each motif.
    t: exclusive upper bound for the random motif index (the number of strings).
    N: number of resampling iterations.

  Returns:
    The motif list with the lowest Score observed, including the initial one.
  """
  motifs = SelectRandomKmers(Dna, k, t)
  # Keep a snapshot rather than an alias: `motifs` is mutated in place below, and
  # an alias would always compare the current motifs against themselves.
  best_motifs = list(motifs)
  best_score = Score(best_motifs, k)
  for _ in range(N):
    i = randint(t)
    motifs.pop(i)
    profile = GenerateProfile(motifs, k)
    motifs.insert(i, ProfileRandomlyGeneratedKmer(Dna[i], profile, k))
    current_score = Score(motifs, k)
    if current_score < best_score:
      best_motifs = list(motifs)
      best_score = current_score
  return best_motifs

In [3]:
def SelectRandomKmers(Dna, k, t):
  """Pick one uniformly random k-mer from each string in Dna.

  Parameters:
    Dna: list of DNA strings, each at least k characters long.
    k: length of the k-mers to pick.
    t: accepted for interface compatibility; not used here (one k-mer is
       returned per element of Dna).

  Returns:
    A list with one k-mer per input string, in the same order as Dna.

  Raises:
    ValueError: from randint when a string is shorter than k.
  """
  random_kmers = []
  for dna_string in Dna:
    # Draw the start position and slice that window directly instead of
    # materialising every k-mer of the string first. The upper bound is exclusive.
    start = randint(0, len(dna_string) - k + 1)
    random_kmers.append(dna_string[start:start + k])
  return random_kmers

In [4]:
def GenerateProfile(motifs, k):
  """Build a 4 x k nucleotide profile with Laplace pseudocounts.

  Row order is A, C, G, T. Entry [r, c] is
  (occurrences of nucleotide r in column c + 1) / (len(motifs) + 4).

  Parameters:
    motifs: list of strings over 'ACGT'; only the first k characters are read.
    k: number of profile columns.

  Returns:
    A float numpy array of shape (4, k). An empty motif list yields the uniform
    profile (every entry 0.25) instead of failing.

  Raises:
    ValueError: if a motif contains a symbol other than A, C, G or T.
    IndexError: if a motif is shorter than k.
  """
  row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
  counts = np.zeros((4, k))
  for motif in motifs:
    for column in range(k):
      nucleotide = motif[column]
      if nucleotide not in row_of:
        raise ValueError('unexpected nucleotide %r in motif %r' % (nucleotide, motif))
      counts[row_of[nucleotide], column] += 1
  # Laplace's rule of succession: one pseudocount per nucleotide per column.
  return (counts + 1) / (len(motifs) + 4)

In [5]:
def KmerProbability(profile, kmer):
  """Return the product of the profile entries selected by each symbol of kmer.

  For position p holding symbol s, the factor is profile[NucleotideIndex(s)][p].
  An empty kmer gives 1.
  """
  likelihood = 1
  for position, symbol in enumerate(kmer):
    likelihood = likelihood * profile[NucleotideIndex(symbol)][position]
  return likelihood

In [6]:
def ProfileRandomlyGeneratedKmer(deleted_dna_string, profile, k):
  """Draw one k-mer of deleted_dna_string at random, weighted by the profile.

  Each k-mer's weight is KmerProbability(profile, kmer); the weights are
  normalised to sum to 1 and used as the sampling distribution.

  Parameters:
    deleted_dna_string: the DNA string whose motif is being resampled.
    profile: profile matrix indexed as profile[nucleotide_row][column].
    k: k-mer length.

  Returns:
    The sampled k-mer (a substring of deleted_dna_string).

  Raises:
    ValueError: if the string yields no k-mers, or (from numpy's choice) if the
      normalised weights are not a valid probability vector, e.g. all zero.
  """
  kmers = FindAllKmers(deleted_dna_string, k)
  if not kmers:
    raise ValueError('no k-mers of length %d in a string of length %d'
                     % (k, len(deleted_dna_string)))
  weights = np.array([KmerProbability(profile, kmer) for kmer in kmers], dtype=float)
  # Sample the index straight from the normalised weights. Converting the
  # probabilities to integer ratios and expanding them into a list of repeated
  # indices is lossy and can require an arbitrarily large list.
  chosen = choice(len(kmers), p=weights / weights.sum())
  return kmers[chosen]

In [7]:
def NucleotideIndex(nucleotide):
  """Map a nucleotide symbol to its profile row: 'A' -> 0, 'C' -> 1, 'G' -> 2.

  Any other value (including 'T') maps to 3.
  """
  for row, symbol in enumerate(('A', 'C', 'G')):
    if nucleotide == symbol:
      return row
  return 3

In [8]:
def IndexNucleotide(index):
  """Map a profile row index back to its symbol: 0 -> 'A', 1 -> 'C', 2 -> 'G'.

  Any other index maps to 'T'.
  """
  return 'A' if index == 0 else 'C' if index == 1 else 'G' if index == 2 else 'T'

In [9]:
def FindAllKmers(dna_string, k):
  """Return every length-k window of dna_string, ordered by start position.

  The result is empty when dna_string is shorter than k.
  """
  window_count = len(dna_string) - k + 1
  return [dna_string[start:start + k] for start in range(window_count)]

In [10]:
def Score(motifs, k):
  """Return the total Hamming distance between each motif and the consensus.

  The consensus string is derived from the profile built from the same motifs.
  """
  consensus = Consensus(GenerateProfile(motifs, k), k)
  return sum(HammingDistance(consensus, motif) for motif in motifs)

In [11]:
def Consensus(profile, k):
  """Return the k-character string made of the highest-valued row in each column.

  Column c contributes IndexNucleotide(argmax of profile[:, c]); np.argmax
  returns the first maximal row when several rows tie.
  """
  return ''.join(
      IndexNucleotide(np.argmax(profile[:, column])) for column in range(k)
  )

In [12]:
def HammingDistance(string1, string2):
  """Count the positions where the two strings differ.

  Positions are compared over the shorter length; every extra character of the
  longer string counts as one additional mismatch.
  """
  mismatches = sum(1 for left, right in zip(string1, string2) if left != right)
  return mismatches + abs(len(string1) - len(string2))

In [13]:
def DnaToArray(Dna):
  """Convert a list of strings into a 2-D string array, one character per cell.

  The array has len(Dna) rows and len(Dna[0]) columns; row r holds the
  characters of Dna[r].
  """
  row_count, column_count = len(Dna), len(Dna[0])
  dna_array = np.zeros((row_count, column_count), dtype='str')
  for row, dna_string in enumerate(Dna):
    dna_array[row, :] = np.asarray(list(dna_string), dtype='str')
  return dna_array

In [14]:
# Small sample input: five DNA strings to run the sampler on.
Dna = ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA']

In [15]:
# Display the sample strings to confirm the input.
Dna

['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA',
 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG',
 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT',
 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC',
 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA']

In [16]:
# Motif (k-mer) length passed to GibbsSampler.
k = 8

In [17]:
# Number of DNA strings; GibbsSampler draws the index to resample from range(t).
t = 5

In [18]:
# Number of resampling iterations GibbsSampler performs.
N = 100

In [29]:
# Run the sampler on the sample input. No random seed is set in the cells shown,
# so the motifs displayed can differ from run to run.
GibbsSampler(Dna, k, t, N)

['CCCTCTCG', 'GGTGCCAG', 'AAGTATAC', 'AGATCAAG', 'CACCAGCT']

In [142]:
# Motif (k-mer) length for the dataset loaded from file below.
k = 15

In [143]:
# Number of DNA strings; presumably equals the number of lines in the dataset
# file loaded below - verify, since GibbsSampler indexes Dna with values in range(t).
t = 20

In [144]:
# Number of resampling iterations for the dataset run.
N = 2000

In [145]:
# Load the dataset: one DNA string per line, with trailing whitespace removed.
# Blank lines (such as a trailing empty line at the end of the file) are skipped
# so they do not end up in Dna as empty strings that contain no k-mers.
# NOTE(review): the path is an absolute, environment-specific location.
with open('/content/rosalind_ba2g.txt') as task_file:
  Dna = [stripped for stripped in (line.rstrip() for line in task_file) if stripped]

In [146]:
# Display the loaded strings. This echoes the whole dataset; a slice such as
# Dna[:3] keeps the output short when the notebook is shared.
Dna

['GCCCGCATGCCTGCCTCTTGGCCTTAGTGTCTTACTAAAGGGTGCGCCATTATCCTGTGGCATTGCCATTTGTTGGACAAGTTAAAATGCCGTCTCGCCCTAACCGGCGAACCAGCCCAACTCCACCAAACTAATTTACGCTCGTTTTTGTATCCTGAGACTGGCACGGAACCTTTCCCTGCAGCCGGTTGTGAGGCTTCGCGAAGCGCGGACAAGGTTCACGCAAGCGTTGTGCCACCTCTAGTCGAGAGTGCGACTCAGGCCTGTGTTTACCAAAGCGAAGATGAACTAAAACTCGGTTACCGAAGCCCGCATGCCTGCC',
 'TCTTGGCCTTAGTGTCTTACTAAAGGGTGCGCCATTATCCTGTGGCATTGCCATTTGTTGGACAAGTTAAAATGCCGTCTCGCCCTAACCGGCGAACCAGCCCAACTCCACCAAACTAATTTACGCTCGTTTTTGTATCCTGAGACTGGCACGGAACCTGACTTAGTCTCAGCTTTCCCTGCAGCCGGTTGTGAGGCTTCGCGAAGCGCGGACAAGGTTCACGCAAGCGTTGTGCCACCTCTAGTCGAGAGTGCGACTCAGGCCTGTGTTTACCAAAGCGAAGATGAACTAAAACTCGGTTACCGAAGCCCGCATGCCTGCC',
 'AAAGTCCCACCAAATGTAAGGACCGGTTCTCAGCTCCAGGATAGCGGCAGGTCTGGGCAATGAGGGGACTGGTTACCGCACTGCTCCTCGAGGAAGCAAATAGTTCATCTGCTTCCTTAGCGCCTGACAGACCAAAACGGCTAGCGGACAGCAGTGTCAGCTCTATCAAGTCACCCATTGATTCCGGCTTGCATACTCTGATCTGATTTCCTTTGTGAACTTCCAATAGAAAGCTAATTAGAGGGGAAAATTATGAGTGGAGGCCAGTAGGGGGCATCCACAAACGATCGCGGATACCTTTAGATTATCAGCACTGGTGTTG',
 'TGAGTATCAATCGGAAC

In [None]:
# Run the sampler on the dataset loaded from file with k, t and N as set above.
GibbsSampler(Dna, k, t, N)