In [4]:
import numpy as np
from numpy.random import randint
from math import pow

In [5]:
def GibbsSampler(Dna, k, t, N):
  motifs = SelectRandomKmers(Dna, k, t)
  best_motifs = motifs
  for j in range(N):
    i = randint(t)
    motifs.pop(i)
    profile = GenerateProfile(motifs, k)
    motif_i = ProfileRandomlyGeneratedKmer(Dna, profile, i, k)
    if Score(motifs) < Score(best_motifs):
      best_motifs = motifs
  return best_motifs

In [6]:
def SelectRandomKmers(Dna, k, t):
  random_kmers = []
  for dna_string in Dna:
    random_kmers.append(FindAllKmers(dna_string, k)[randint(0, len(dna_string) - k + 1)]) #upper bound not included
  return random_kmers

In [7]:
def GenerateProfile(motifs, k):
  profile = np.zeros((4,k))
  motifs_array = DnaToArray(motifs)
  for i in range(k):
    frequency_dict = {'A':0, 'C':0, 'G':0, 'T':0}
    for j in range(len(np.unique(motifs_array[:,i], return_counts=True)[0])):
      frequency_dict[np.unique(motifs_array[:,i], return_counts=True)[0][j]] = np.unique(motifs_array[:,i], return_counts=True)[1][j]
    profile[:,i] = np.array(list(frequency_dict.values()))
  profile = profile + 1
  profile = profile / (len(motifs_array) + 4)
  return profile

In [8]:
def KmerProbability(profile, kmer):
  probability = 1
  for nucleotide in enumerate(kmer):
    probability = probability * profile[NucleotideIndex(nucleotide[1])][nucleotide[0]]
  return probability

In [9]:
def ProfileRandomlyGeneratedKmer(Dna, profile, i, k):
  kmers = FindAllKmers(Dna[i], k)
  kmers_probabilities = []
  occurence_list = []
  elementary_events_list = []
  for kmer in kmers:
    kmers_probabilities.append(KmerProbability(profile, kmer))
  if sum(kmers_probabilities) < 1:
    largest_denominator = 1
    for i in range(len(kmers_probabilities)):
      kmers_probabilities[i] = kmers_probabilities[i] / sum(kmers_probabilities)
      if kmers_probabilities[i].as_integer_ratio()[1] > largest_denominator:
        largest_denominator = kmers_probabilities[i].as_integer_ratio()[1]
  for i in range(len(kmers_probabilities)):
    occurence_list.append(kmers_probabilities[i].as_integer_ratio()[0] * largest_denominator)
  for i in range(len(kmers_probabilities)):
    for j in range(occurence_list.count(i)):
      elementary_events_list.append(i)
  return elementary_events_list[randint(0,len(elementary_events_list) - 1)]

In [10]:
def NucleotideIndex(nucleotide):
  if nucleotide == 'A':
    return 0
  elif nucleotide == 'C':
    return 1
  elif nucleotide == 'G':
    return 2
  else:
    return 3

In [11]:
def IndexNucleotide(index):
  if index == 0:
    return 'A'
  elif index == 1:
    return 'C'
  elif index == 2:
    return 'G'
  else:
    return 'T'

In [12]:
def FindAllKmers(dna_string, k):
  kmers_list = []
  for kmer_end_index in range(k - 1, len(dna_string)):
    kmers_list.append(dna_string[kmer_end_index + 1 - k:kmer_end_index + 1])
  return kmers_list

In [13]:
def Score(motifs, k):
  score = 0
  profile = GenerateProfile(motifs, k)
  consensus = Consensus(profile, k)
  for motif in motifs:
    score = score + HammingDistance(consensus, motif)
  return score

In [14]:
def Consensus(profile, k):
  consensus = []
  for i in range(k):
    consensus.append(IndexNucleotide(np.argmax(profile[:,i])))
  return ''.join(consensus)

In [15]:
def HammingDistance(string1, string2):
  counter = 0
  if len(string1) > len(string2):
    for i in range(len(string2)):
      if string1[i] != string2[i]:
        counter = counter + 1
    counter = counter + (len(string1) - len(string2))
  else:
    for i in range(len(string1)):
      if string1[i] != string2[i]:
        counter = counter + 1
    counter = counter + (len(string2) - len(string1))
  return counter

In [16]:
def DnaToArray(Dna):
  dna_array = np.zeros((len(Dna), len(Dna[0])), dtype='str')
  for dna_string in enumerate(Dna):
    dna_array[dna_string[0],:] = np.asarray(list(dna_string[1]), dtype='str')
  return dna_array

In [17]:
Dna = ['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC', 'AATCCACCAGCTCCACGTGCAATGTTGGCCTA']

In [18]:
k = 8

In [19]:
t = 5

In [20]:
N = 100

In [21]:
GibbsSampler(Dna, k, t, N)

ValueError: ignored