In [3]:
import random
import sys

def profile_with_pseudocounts(motifs):
    """Build a Laplace-smoothed profile matrix from a list of motifs.

    Every cell starts at a pseudocount of 1.0 and gains one for each motif
    showing that nucleotide at that position. Characters other than
    'A', 'C', 'G', 'T' are ignored. Each cell is then divided by
    ``len(motifs) + 4.0`` (observed motifs plus the four pseudocounts).

    Returns a dict keyed by 'A', 'C', 'G', 'T' whose values are lists with
    one entry per position of the first motif, or ``{}`` when ``motifs`` is
    empty.
    """
    if not motifs:
        return {}

    width = len(motifs[0])  # the first motif fixes the number of columns
    tallies = {base: [1.0] * width for base in 'ACGT'}

    for motif in motifs:
        for pos in range(width):
            base = motif[pos]
            if base in tallies:
                tallies[base][pos] += 1

    denominator = len(motifs) + 4.0
    return {
        base: [cell / denominator for cell in row]
        for base, row in tallies.items()
    }

def consensus(motifs):
    """Return the consensus string of a list of motifs.

    For each position of the first motif, the most frequent nucleotide among
    'A', 'C', 'G', 'T' is chosen; characters outside that alphabet are not
    counted. Ties go to the nucleotide that comes first in A, C, G, T order.
    An empty ``motifs`` yields the empty string.
    """
    if not motifs:
        return ""

    width = len(motifs[0])
    letters = []
    for pos in range(width):
        tally = dict.fromkeys('ACGT', 0)
        for motif in motifs:
            base = motif[pos]
            if base in tally:
                tally[base] += 1
        # max() keeps the first maximal item, giving deterministic
        # tie-breaking in A, C, G, T order.
        letters.append(max('ACGT', key=tally.__getitem__))

    return "".join(letters)

def score(motifs):
    """Return the total number of mismatches between the motifs and their consensus.

    This is the sum, over all motifs, of the Hamming distance to the string
    returned by ``consensus(motifs)``. Only the first ``len(motifs[0])``
    positions of each motif are compared. An empty ``motifs`` scores 0.
    """
    if not motifs:
        return 0

    reference = consensus(motifs)
    width = len(motifs[0])
    return sum(
        1
        for motif in motifs
        for pos in range(width)
        if motif[pos] != reference[pos]
    )

def probability(kmer, profile):
    """Return the probability of ``kmer`` under a profile matrix.

    ``profile`` maps each nucleotide to a list of per-position values; the
    result is the product of ``profile[kmer[j]][j]`` over all positions.
    A character that is not a key of ``profile`` makes the result 0.0, and
    an empty k-mer yields 1.0.
    """
    result = 1.0
    for pos, base in enumerate(kmer):
        if base not in profile:
            # Unexpected symbol: treat the k-mer as impossible.
            return 0.0
        result *= profile[base][pos]
    return result

def profile_most_probable_kmer(text, k, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Windows are scanned left to right and a later window replaces the current
    best only when its probability is strictly greater, so the leftmost of
    equally probable k-mers wins. When ``text`` has no window of length ``k``,
    ``text[0:k]`` is returned unchanged.
    """
    best_kmer, best_prob = text[0:k], -1.0
    windows = (text[start:start + k] for start in range(len(text) - k + 1))

    for window in windows:
        window_prob = probability(window, profile)
        if window_prob > best_prob:
            best_kmer, best_prob = window, window_prob

    return best_kmer

def motifs_from_profile(profile, dna, k):
    """Return, for each string in ``dna``, its profile-most-probable k-mer.

    The result list has one entry per input string, in the same order.
    """
    chosen = []
    for sequence in dna:
        chosen.append(profile_most_probable_kmer(sequence, k, profile))
    return chosen

def random_motifs(dna, k, t):
    """Pick one uniformly random k-mer from each of the first ``t`` strings of ``dna``.

    Returns a list of ``t`` substrings of length ``k``, where entry ``i`` is
    taken from ``dna[i]``.
    """
    def pick(sequence):
        # randint is inclusive on both ends, so the last valid start is len - k.
        offset = random.randint(0, len(sequence) - k)
        return sequence[offset:offset + k]

    return [pick(dna[row]) for row in range(t)]

def randomized_motif_search(dna, k, t):
    """Run one pass of randomized motif search.

    Starts from ``t`` randomly chosen k-mers (one per string in ``dna``), then
    repeatedly builds a pseudocount profile from the current motifs and
    replaces them with the profile-most-probable k-mers. Iteration stops the
    first time the score fails to strictly improve.

    Returns a ``(best_motifs, best_score)`` tuple for the best motif set seen.
    """
    candidate = random_motifs(dna, k, t)
    best_motifs = list(candidate)
    best_score = score(best_motifs)

    improving = True
    while improving:
        candidate = motifs_from_profile(
            profile_with_pseudocounts(candidate), dna, k
        )
        candidate_score = score(candidate)

        improving = candidate_score < best_score
        if improving:
            # Keep an independent copy of the improved motif set.
            best_motifs = list(candidate)
            best_score = candidate_score

    return best_motifs, best_score

def run_randomized_search_multiple_times(dna, k, t, num_runs=1000):
    """Repeat ``randomized_motif_search`` and return the lowest-scoring motifs.

    The search is run ``num_runs`` times with fresh random starts. The motif
    list from the earliest run achieving the minimum score is returned; an
    empty list is returned when no run is performed.
    """
    outcomes = (randomized_motif_search(dna, k, t) for _ in range(num_runs))
    # min() keeps the first minimal item, matching a strict "<" comparison;
    # the default stands in for "no run performed".
    winner_motifs, _winner_score = min(
        outcomes,
        key=lambda outcome: outcome[1],
        default=([], float('inf')),
    )
    return winner_motifs

if __name__ == "__main__":
    # Input layout: the first line holds "k t"; each remaining non-blank line
    # is one DNA string.
    with open("../data/rosalind_ba2f.txt", "r") as handle:
        header = handle.readline().strip().split()
        k = int(header[0])
        t = int(header[1])

        dna = []
        for raw_line in handle:
            sequence = raw_line.strip()
            if sequence:
                dna.append(sequence)

    # Keep the best result across 1000 independent randomized searches and
    # print one motif per line.
    best_motifs_overall = run_randomized_search_multiple_times(dna, k, t, 1000)
    for motif in best_motifs_overall:
        print(motif)

CCAAGCTGTGCGTTC
CCAAATTTTGCTTAG
CCACGATTTGCAGGG
CCCACATTTGCTTAG
CCACGAGGAGCTTAG
CCACGATTGTTTTAG
CCACGATTTGGCCAG
CCACGGAATGCTTAG
CCACTCCTTGCTTAG
CCACGACAGGCTTAG
GGACGATTTGCTTAT
CCATCGTTTGCTTAG
CCACGATCACCTTAG
CCACCCGTTGCTTAG
CCACGTACTGCTTAG
CCACGATTTCACTAG
CGTGGATTTGCTTAG
CCACGATTTGCTATT
ACACGATTTGCTTGA
TGCCGATTTGCTTAG
