In [2]:
from itertools import product

def hamming_distance(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def generate_kmers(k):
    return [''.join(p) for p in product("ACGT", repeat=k)]

def get_all_kmers(dna, k):
    return [dna[i:i+k] for i in range(len(dna) - k + 1)]

def motif_enumeration(dna_list, k, d):
    all_kmers = generate_kmers(k)
    motifs = set()
    
    for kmer in all_kmers:
        is_motif = True
        for dna in dna_list:
            kmer_found = False
            
            dna_kmers = get_all_kmers(dna, k)
            
            for dna_kmer in dna_kmers:
                if hamming_distance(kmer, dna_kmer) <= d:
                    kmer_found = True
                    break
            
            if not kmer_found:
                is_motif = False
                break
        
        if is_motif:
            motifs.add(kmer)
    
    return motifs

if __name__ == "__main__":
    with open("../data/rosalind_ba2a.txt", "r") as f:
        k, d = map(int, f.readline().strip().split())
        dna = [line.strip() for line in f]
    for i in motif_enumeration(dna, k, d):
        print(i)

GATCG
ATGAG
CGAAA
CGTGG
AGGTC
GCTGG
TACTG
TGAGC
GTTGG
CCCGG
TAGAG
CTGAG
AATGT
CAGAA
GGTGG
GAATT
GGTAT
TGTTG
CGAAC
ATGGT
GAATG
GGTCG
GAGTT
ATGGC
CTGAC
GCATA
TTTCG
TGGGA
GGGCC
TGCCA
TAGTG
TGACC
AGGTA
TCTGG
CCTGG
GATCT
TGGTT
GTGAC
GGGTC
GACGC
AGATC
CGGGC
TGCGT
TCGTA
GCCCT
GACGA
ACTAA
GACCT
GGGAC
TCGGG
AATGA
TGGAG
CGTGA
TTGCA
CGATA
GCACG
CCATG
ACTGA
ACGAT
GGTTA
CGTGC
GCGTA
CGAAT
CGCTT
AATGC
ACCTA
GCCGA
CCGGA
AAGTT
GGCCC
AGCGA
GAGGT
GGACC
AGCGG
GTCGG
TGGTA
TCTAG
AACTA
GCGAA
GGAAT
AACTC
CTCTG
CAATG
CTCGA
TGAAT
GGACG
TTGTC
CAACT
TGAAC
TTAAT
AGTCG
GCGGG
ATCGT
TCTGA
CGGCA
GCTTA
ACGTA
GCAGG
CGGAG
GAGAT
TACGG
CGTAT
CGGCT
CTTGA
AGCTT
CATGC
AAACG
AGAAG
AGTGT
GATAA
ACGTC
CATGG
GCCAA
ACATG
CTGCA
CATGA
CCTTG
ACCGA
AGGCC
AGCAC
GCCTG
AACGA
GTACT
ATTAA
GAAGG
GGCGA
GACAT
GAGCC
CTTAA
TCAAG
GCACT
AGACC
GAGCG
TAGGG
CTTAG
GTCCT
GTACC
CTTTA
CACGA
GTATC
AAATG
CGCAG
ACGAA
CGACT
TGCGC
GACCA
TGCAC
GGCCA
AGGCG
TGCCT
GTTGA
GGCCT
GCCGG
GGTCC
CTAGG
ATGTT
CCCAG
GGACT
AGTAA
GGCAA
CGAAG
TGGGC
GTCAG
TTACG
GACGG
CCGAA
GGGC

In [1]:
# ^_^ coding:utf-8 ^_^

"""
Implement MotifEnumeration
url: http://rosalind.info/problems/ba2a/

Given: Integers k and d, followed by a collection of strings Dna.
Return: All (k, d)-motifs in Dna.
"""

def neighbour(pattern, mismatch, words):
    if mismatch == 0:
        words.add(pattern)
    else:
        bases = ['A', 'T', 'C', 'G']
        for i in range(len(pattern)):
            for j in range(len(bases)):
                new_pattern = pattern[:i] + bases[j] + pattern[i+1:]
                if mismatch <= 1:
                    words.add(new_pattern)
                else:
                    neighbour(new_pattern, mismatch-1, words)

def MotifEnumeration(dna, k, d):
    patterns = []
    for n in range(len(dna)):
        pattern = set()
        for i in range(len(dna[n]) - k + 1):
            kmerspattern = set()
            neighbour(dna[n][i:i + k], d, kmerspattern)
            for words in kmerspattern:
                pattern.add(words)
        for j in pattern:
            patterns.append(j)
    motifpattern = []
    for element in patterns:
        if patterns.count(element) == len(dna):
            motifpattern.append(element)
    motifpattern = list(set(motifpattern))
    return motifpattern

if __name__ == "__main__":
    with open("../data/rosalind_ba2a.txt", "r") as f:
        k, d = map(int, f.readline().strip().split())
        dna = [line.strip() for line in f]
    for i in MotifEnumeration(dna, k, d):
        print(i)

GATCG
CGAAA
ATGAG
CGTGG
GCTGG
TACTG
AGGTC
TGAGC
TAGAG
GTTGG
CCCGG
CTGAG
AATGT
CAGAA
GGTGG
GAATT
GGTAT
TGTTG
CGAAC
ATGGT
GAATG
GGTCG
GAGTT
ATGGC
CTGAC
GCATA
TTTCG
TGGGA
TGCCA
GGGCC
TAGTG
TGACC
AGGTA
TCTGG
CCTGG
GATCT
TGGTT
GTGAC
GGGTC
GACGC
AGATC
CGGGC
TGCGT
TCGTA
GCCCT
GACGA
ACTAA
GACCT
GGGAC
TCGGG
AATGA
TGGAG
CGTGA
TTGCA
CGATA
GCACG
CCATG
ACTGA
ACGAT
GGTTA
CGTGC
GCGTA
CGAAT
CGCTT
AATGC
ACCTA
GCCGA
CCGGA
AAGTT
GGCCC
AGCGA
GAGGT
GGACC
AGCGG
GTCGG
TGGTA
TCTAG
AACTA
GCGAA
GGAAT
AACTC
CTCTG
CAATG
CTCGA
TGAAT
GGACG
TTGTC
CAACT
TGAAC
TTAAT
AGTCG
GCGGG
ATCGT
TCTGA
CGGCA
GCTTA
ACGTA
GCAGG
CGGAG
GAGAT
TACGG
CGTAT
CGGCT
CTTGA
AGCTT
CATGC
AAACG
AGAAG
AGTGT
GATAA
ACGTC
CATGG
GCCAA
CTGCA
ACATG
CATGA
CCTTG
ACCGA
AGGCC
AGCAC
GCCTG
AACGA
GTACT
ATTAA
GAAGG
GGCGA
GACAT
GAGCC
TCAAG
CTTAA
GCACT
AGACC
GAGCG
TAGGG
CTTAG
GTCCT
GTACC
CTTTA
CACGA
GTATC
AAATG
CGCAG
ACGAA
CGACT
TGCGC
GACCA
TGCAC
GGCCA
AGGCG
TGCCT
GTTGA
GGCCT
GCCGG
GGTCC
CTAGG
ATGTT
CCCAG
GGACT
AGTAA
GGCAA
TGGGC
CGAAG
GTCAG
TTACG
GGGCT
CCGAA
GACG