# Description

Takes the dot product between sequence and sigma-factor consensus regions:

* σ32 -- 5' -- CCCTTTGAA -- 13-15bp CCCGATNT -- 3'
* σ28 -- 5' -- CTAAA -- 15bp GCCGATAA -- 3'
* σ70 -- 5' -- TTGACA -- 16-18bp TATAAT -- 3'
* σ54 -- 5' -- CTGGNA -- 6bp TTGCA -- 3'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras

In [9]:
df = pd.read_csv('../Data/Combined/LaFleur_supp.csv')[['Promoter Sequence']]
df.head()

Unnamed: 0,Promoter Sequence
0,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...
1,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATG...
2,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...
3,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...
4,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...


In [None]:
def one_hot_sequence(sequence):
    mapping = {
        'A': np.array([1, 0, 0, 0]),
        'C': np.array([0, 1, 0, 0]),
        'G': np.array([0, 0, 1, 0]),
        'T': np.array([0, 0, 0, 1]),
        '0': np.array([0, 0, 0, 0]),
        'N': np.array([0.25, 0.25, 0.25, 0.25])
    }
    return np.array([mapping[nucleotide.upper()] for nucleotide in sequence.zfill(150)])

df['Promoter Sequence'] = df['Promoter Sequence'].apply(one_hot_sequence)
df.head()

Unnamed: 0,Promoter Sequence
0,"[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0,..."
1,"[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0,..."
2,"[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0,..."
3,"[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0,..."
4,"[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0,..."


In [None]:
import numpy as np

def score_match(seq, consensus):
    """
    Computes a similarity score between the input sequence and consensus binding site.
    Uses a simple dot product approach where matches contribute positively.
    """
    score = sum(1 for a, b in zip(seq, consensus) if a == b)
    return score

def find_best_binding_site(promoter):
    """
    Identifies the best matching sigma factor binding site in the given promoter sequence.
    """
    sigma_factors = {
        'σ32': [("CCCTTTGAA", 13, 15, "CCCGATNT")],
        'σ28': [("CTAAA", 15, 15, "GCCGATAA")],
        'σ70': [("TTGACA", 16, 18, "TATAAT")],
        'σ54': [("CTGGNA", 6, 6, "TTGCA")]
    }
    
    best_match = None
    best_score = -1
    best_position = -1
    best_sigma = None
    
    for sigma, sites in sigma_factors.items():
        for site1, spacer_min, spacer_max, site2 in sites:
            for i in range(len(promoter) - len(site1) - spacer_min - len(site2) + 1):
                for spacer in range(spacer_min, spacer_max + 1):
                    j = i + len(site1) + spacer
                    if j + len(site2) > len(promoter):
                        break  # Ensures the spacer remains within defined bounds
                    seq1 = promoter[i:i+len(site1)]
                    seq2 = promoter[j:j+len(site2)]
                    
                    score1 = score_match(seq1, site1)
                    score2 = score_match(seq2, site2)
                    total_score = score1 + score2
                    
                    if total_score > best_score:
                        best_score = total_score
                        best_match = (seq1, seq2)
                        best_position = i
                        best_sigma = sigma
    
    return best_sigma, best_position, best_match, best_score

# Example usage:
promoter_seq = "AAACCCCTTTGAATTTGCCGATNTTAAA"
result = find_best_binding_site(promoter_seq)
print(f"Best matching sigma factor: {result[0]}, Position: {result[1]}, Sequence match: {result[2]}, Score: {result[3]}")


Best matching sigma factor: σ28, Position: 1, Sequence match: ('CCTTT', 'CCCGATNT'), Score: 6


In [41]:
import re

def motif_to_regex(motif):
    """Converts a motif substring with 'N' into a regex pattern where 'N' matches any character."""
    return re.compile(re.escape(motif).replace('N', '.'))

def find_closest_motif(s, motifs):
    scores = {}
    best_motif_name = None
    best_score = float('inf')
    best_positions = None

    for name, motif in motifs.items():
        sub1, spacer, sub2 = motif
        sub1_regex = motif_to_regex(sub1)
        sub2_regex = motif_to_regex(sub2)

        sub1_matches = [m.start() for m in sub1_regex.finditer(s)]
        min_motif_score = float('inf')
        best_motif_positions = None

        for sub1_pos in sub1_matches:
            expected_sub2_pos = sub1_pos + len(sub1) + spacer
            closest_sub2_pos = None
            min_spacing_error = float('inf')

            sub2_matches = [m.start() for m in sub2_regex.finditer(s)]
            for sub2_pos in sub2_matches:
                spacing_error = abs(sub2_pos - expected_sub2_pos)
                if spacing_error < min_spacing_error:
                    min_spacing_error = spacing_error
                    closest_sub2_pos = sub2_pos
            
            if closest_sub2_pos is not None:
                penalty_score = min_spacing_error

                if penalty_score < min_motif_score:
                    min_motif_score = penalty_score
                    best_motif_positions = (sub1_pos, closest_sub2_pos)

        scores[name] = min_motif_score if min_motif_score != float('inf') else None

        if min_motif_score < best_score:
            best_score = min_motif_score
            best_motif_name = name
            best_positions = best_motif_positions

    return scores, (best_motif_name, best_score, best_positions)

# Example usage
s = "abcde---xyzabc--xyzabcde"
motifs = {
    'motif_1': ("aNc", 3, "xyz"),  # 'N' allows any character
    'motif_2': ("abc", 2, "xNz"),  # 'N' allows any character in 'xNz'
    'motif_3': ("abcde", 5, "xyz")
}

scores, best_match = find_closest_motif(s, motifs)
print("Scores:", scores)
print("Best Match:", best_match)


Scores: {'motif_1': 1, 'motif_2': 0, 'motif_3': 2}
Best Match: ('motif_2', 0, (11, 16))


This doesnt quite work: the spacer cannot be outside of spacer_min-spacer_max