## Description

An algorithm that analyzes k-mer frequencies from real DNA sequences and assigns a realness score (0-1) to new sequences:

1. Extracts k-mer frequencies from real Urtecho 2023 promoter sequences.
2. Computes k-mer frequencies for an input sequence.
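3. Scores the input sequence against the real k-mer distribution using one of three metrics: cosine similarity (from 0, dissimilar, to 1, real-like), KL-divergence, or Euclidean distance (for the latter two, lower values mean the input is closer to real sequences).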

In [21]:
import pandas as pd
from collections import defaultdict
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import entropy
from numpy.linalg import norm

In [2]:
# Read the dataset CSV and keep only its promoter sequences as a plain list

file_path = '../Data/Urtecho_2023/Urtecho_2023.csv'
real_sequences = pd.read_csv(file_path)['promoter_sequence'].tolist()

In [18]:
def extract_kmer_frequencies(sequences, k_min=2, k_max=3):
    """
    Builds relative k-mer frequency tables for every k from k_min to k_max (inclusive).
    A single string is treated as a one-element collection; for several sequences the
    counts are pooled before normalising, giving one overall distribution per k.
    Returns {k: {kmer: frequency}}; a k for which no k-mer was counted is left out.
    """
    seq_collection = [sequences] if isinstance(sequences, str) else sequences
    tallies = {k: defaultdict(int) for k in range(k_min, k_max + 1)}

    for seq in seq_collection:
        for k, tally in tallies.items():
            # Slide a window of width k across the sequence
            for start in range(len(seq) - k + 1):
                tally[seq[start:start + k]] += 1

    kmer_freqs = {}
    for k, tally in tallies.items():
        total = sum(tally.values())
        # Skip sizes with no observations (e.g. every sequence shorter than k)
        if total > 0:
            kmer_freqs[k] = {kmer: count / total for kmer, count in tally.items()}

    return kmer_freqs

def cosine_similarity(real_kmer_freqs, input_kmer_freqs):
    """
    Scores how closely the input k-mer distributions match the real ones.
    For each k present in the real distributions, both frequency tables are laid out
    over the union of their k-mers and compared with cosine similarity; the per-k
    values are then averaged. A k whose real or input vector is all zeros scores 0.
    """
    per_k_scores = []

    for k, reference in real_kmer_freqs.items():
        candidate = input_kmer_freqs.get(k, {})
        vocabulary = set(reference) | set(candidate)
        ref_vec = np.array([reference.get(kmer, 0) for kmer in vocabulary])
        cand_vec = np.array([candidate.get(kmer, 0) for kmer in vocabulary])

        # Cosine distance is undefined for a zero vector, so score that case as 0
        if np.any(ref_vec) and np.any(cand_vec):
            per_k_scores.append(1 - cosine(ref_vec, cand_vec))
        else:
            per_k_scores.append(0)

    return np.mean(per_k_scores)


def kl_divergence(real_kmer_freqs, input_kmer_freqs, pseudocount=1e-10):
    """
    Computes the average KL-divergence KL(input || real) between k-mer frequency distributions.
    A lower KL-divergence means the input sequence is more similar to real DNA sequences.

    For each k present in real_kmer_freqs, both frequency tables are laid out over the
    union of their k-mers. A k-mer missing from one table gets `pseudocount` instead of 0,
    which keeps the divergence finite when the input contains a k-mer never seen in the
    real data. The value returned depends strongly on this constant whenever the two
    k-mer sets differ, so it is exposed as a parameter.

    Args:
        real_kmer_freqs: {k: {kmer: frequency}} for the reference sequences.
        input_kmer_freqs: {k: {kmer: frequency}} for the sequence being scored.
        pseudocount: non-negative value substituted for missing k-mers (default 1e-10,
            the value previously hard-coded).

    Returns:
        The mean of the per-k divergences.

    Raises:
        ValueError: if pseudocount is negative.
    """
    if pseudocount < 0:
        raise ValueError("pseudocount must be non-negative")

    kl_scores = []

    for k, real_dist in real_kmer_freqs.items():
        input_dist = input_kmer_freqs.get(k, {})
        all_kmers = set(real_dist) | set(input_dist)
        real_vector = np.array([real_dist.get(kmer, pseudocount) for kmer in all_kmers])
        input_vector = np.array([input_dist.get(kmer, pseudocount) for kmer in all_kmers])
        # scipy's entropy(pk, qk) renormalises both vectors to sum to 1 and
        # returns sum(pk * log(pk / qk)), i.e. KL(input || real)
        kl_scores.append(entropy(input_vector, real_vector))

    return np.mean(kl_scores)


def euclidean_distance(real_kmer_freqs, input_kmer_freqs):
    """
    Measures how far the input k-mer distributions lie from the real ones.
    For each k present in the real distributions, both frequency tables are laid out
    over the union of their k-mers (0 where a k-mer is absent) and the Euclidean norm
    of their difference is taken; the per-k distances are then averaged.
    Smaller values mean the input resembles the real sequences more closely.
    """
    per_k_distances = []

    for k, reference in real_kmer_freqs.items():
        candidate = input_kmer_freqs.get(k, {})
        vocabulary = set(reference) | set(candidate)
        ref_vec = np.array([reference.get(kmer, 0) for kmer in vocabulary])
        cand_vec = np.array([candidate.get(kmer, 0) for kmer in vocabulary])
        per_k_distances.append(norm(ref_vec - cand_vec))

    return np.mean(per_k_distances)


In [10]:
# Build the reference k-mer frequency tables by pooling all real sequences

real_kmer_freqs = extract_kmer_frequencies(sequences=real_sequences)

In [19]:
# Compute the k-mer frequency tables of the sequence to score (pLac)

input_sequence = 'AATACTAGAGGTCTTCCGACTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGGGCGGGAAGACAACTAGGGG'
input_kmer_freqs = extract_kmer_frequencies(sequences=input_sequence)

In [22]:
# Score the input sequence against the real k-mer distributions with all three metrics

cosine_similarity_score = cosine_similarity(real_kmer_freqs, input_kmer_freqs)
kl_divergence_score = kl_divergence(real_kmer_freqs, input_kmer_freqs)
euclidean_distance_score = euclidean_distance(real_kmer_freqs, input_kmer_freqs)

# Report the results to four decimal places
print(f"Cosine similarity: {cosine_similarity_score:.4f}")
print(f"KL-divergence: {kl_divergence_score:.4f}")
print(f"Euclidean distance: {euclidean_distance_score:.4f}")

Cosine similarity: 0.7700
KL-divergence: 0.3662
Euclidean distance: 0.1284
