In [9]:
import os
import re
from collections import Counter
import pandas as pd

# **All Folders Directories**

In [47]:
direcrory_of_folders = 'Results/'
folders = [folder for folder in os.listdir(direcrory_of_folders) if os.path.isdir(os.path.join(direcrory_of_folders, folder))]
folders

['Bioprospector', 'MDScan', 'MotifSampler', 'Streme', 'MEME']

In [48]:
def return_the_files(path):
    txt_files = [file for file in os.listdir(path) if file.endswith(".txt")]
    only_results = []
    for file in txt_files:
        if '_' in file:
            name_parts = file.split("_")
            if len(name_parts) > 1:
                name = name_parts[0]
                only_results.append(name)
    name_counts = Counter(only_results)
    unique_names = [name for name, count in name_counts.items() if count > 1] 
    return  unique_names 

## **For MDScan**

In [49]:
# # Read the MDscan output file
# with open('Results/MDScan/Ada_1.txt', 'r') as file:
#     mdscan_output = file.read()
directory_of_files = 'Results/MDScan/'

# Extract the motifs using regex
motif_pattern = r"Motif\s+(\d+):\s+Wid\s+(\d+);\s+Score\s+([\d.]+);\s+Sites\s+(\d+);\s+Con\s+([ACGT]+);\s+RCon\s+([ACGT]+)"
# motif_info = re.findall(motif_pattern, mdscan_output)

# Extract the site information using regex
site_pattern = r">(\d+-\d+-(?:forward|reverse))\s+Len\s+\d+\s+Site\s+#(\d+)\s+([fr])\s+(\d+)\n([ACGT]+)"
# site_info = re.findall(site_pattern, mdscan_output)

The return_the_files function stores all the unique names of the .txt files except the background.txt file.

The get_the_motifs and get_site_info functions creates a dataframes with vital informations of the motif_info's and the site_info's.

In [50]:
def get_the_motifs_MD(motif_info, file_name):
    motifs = {}

    for motif_match in motif_info:
        motif = {
            'File_name': file_name,
            'Motif_ID': motif_match[0],
            'Width': motif_match[1],
            'Score': motif_match[2],
            'Sites': motif_match[3],
            'con': motif_match[4],
            'rcon': motif_match[5]
        }
        motif_id = motif_match[0]

        if motif_id in motifs:
            motifs[motif_id].update(motif)  # Merge with existing motif dictionary
        else:
            motifs[motif_id] = motif

    # Convert the motifs dictionary into a DataFrame
    df = pd.DataFrame.from_dict(motifs, orient='index')
    return df

In [51]:
def get_site_info(site_info, motif_info, file_name):
    data = []
    motif_id = 0  # Initial motif ID
    initial_position = site_info[0][0]  # Initial position

    for site in site_info:
        site_id, site_number, _, starting_point, motif_sequence = site

        if site_number == '1' and site_id == initial_position:
            motif_id += 1
            # initial_position = site_id

        data.append([site_id, site_number, starting_point, motif_sequence, motif_id, file_name])

    columns = ['Site_ID', 'Site_number', 'Starting_Point', 'Motif_Sequence', 'Motif_ID', 'File_Name']
    df_info = pd.DataFrame(data, columns=columns)
    return df_info

In [52]:
def process_mdscan_output(directory_of_files):
    names = return_the_files(directory_of_files)
    motif_dfs = []
    site_dfs = []

    for name in names:
        file_paths = [file for file in os.listdir(directory_of_files) if file.startswith(f"{name}_")]

        for file_path in file_paths:
            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                mdscan_output = file.read()
            
            motif_info = re.findall(motif_pattern, mdscan_output)
            df_motif = get_the_motifs_MD(motif_info, name)
            motif_dfs.append(df_motif)
            
            site_info = re.findall(site_pattern, mdscan_output)
            df_site = get_site_info(site_info, motif_info, name)
            site_dfs.append(df_site)

    motif_df = pd.concat(motif_dfs, ignore_index=True)
    site_df = pd.concat(site_dfs, ignore_index=True)
    
    site_df['Site_number'] = pd.to_numeric(site_df['Site_number'])
    site_df['Motif_ID'] = pd.to_numeric(site_df['Motif_ID'])
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])
    motif_df['Motif_ID'] = pd.to_numeric(motif_df['Motif_ID'])
    motif_df['Sites'] = pd.to_numeric(motif_df['Sites'])
    motif_df['Score'] = pd.to_numeric(motif_df['Score'])
    motif_df['Width'] = pd.to_numeric(motif_df['Width'])
    
    return motif_df, site_df

# Usage example
motif_df_MD, site_df_MD = process_mdscan_output(directory_of_files)

In [53]:
motif_df_MD.head()

Unnamed: 0,File_name,Motif_ID,Width,Score,Sites,con,rcon
0,Ada,1,15,1.795,14,CGGAACCGCTGGCGG,CCGCCAGCGGTTCCG
1,Ada,2,15,1.777,16,CCGGAAACGATGGCG,CGCCATCGTTTCCGG
2,Ada,3,15,1.764,16,GGAAGCGCTGGCGGC,GCCGCCAGCGCTTCC
3,Ada,4,15,1.754,19,CGCCGCTGGCGGCTG,CAGCCGCCAGCGGCG
4,Ada,5,15,1.749,15,TATCCGTGACGGTGA,TCACCGTCACGGATA


In [54]:
site_df_MD.head()

Unnamed: 0,Site_ID,Site_number,Starting_Point,Motif_Sequence,Motif_ID,File_Name
0,209398-209425-forward,1,151,AAGCGCCGCTGGCGG,1,Ada
1,209398-209425-forward,2,258,CGCCATCGCTTCCGG,1,Ada
2,209398-209425-forward,3,166,CTGAAGCGATGGGTA,1,Ada
3,209398-209425-forward,4,229,CGGAACCACTGGGTG,1,Ada
4,209398-209425-forward,5,259,CGGAAGCGATGGCGG,1,Ada


## **For Bioprospector**

In [55]:
Bioproepector_path = 'Results/Bioprospector/'  # Replace with the actual directory path
motif_pattern = r'Motif\s+#(\d+):\s+\((\w+/\w+)\)\n\*+\nWidth \((\d+), \d+\);\s+Gap \[\d+, \d+\];\s+MotifScore (\d+\.\d+);\s+Sites (\d+)'
site_pattern = r'>(\d+-\d+-\w+)\s+len\s\d+\s+site\s+#(\d+)\s+(\w+)\s+(\d+)\n(\w+)'

In [56]:
def get_the_motifs_BP(motif_info, file_name):
    motifs = {}

    for motif_match in motif_info:
        motif_id = motif_match[0]
        con, rcon = motif_match[1].split('/')
        width = motif_match[2]
        score = motif_match[3]
        sites = motif_match[4]

        motif = {
            'File_name': file_name,
            'Motif_ID': motif_id,
            'Width': width,
            'Score': score,
            'Sites': sites,
            'con': con,
            'rcon': rcon
        }

        if motif_id in motifs:
            motifs[motif_id].update(motif)  # Merge with existing motif dictionary
        else:
            motifs[motif_id] = motif

    # Convert the motifs dictionary into a DataFrame
    df = pd.DataFrame.from_dict(motifs, orient='index')
    return df

In [57]:
def process_Bioprospector_output(directory_of_files):
    names = return_the_files(directory_of_files)
    motif_dfs = []
    site_dfs = []

    for name in names:
        file_paths = [file for file in os.listdir(directory_of_files) if file.startswith(f"{name}_")]

        for file_path in file_paths:
            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                mdscan_output = file.read()
            
            motif_info = re.findall(motif_pattern, mdscan_output)
            df_motif = get_the_motifs_BP(motif_info, name)
            motif_dfs.append(df_motif)
            
            site_info = re.findall(site_pattern, mdscan_output)
            df_site = get_site_info(site_info, motif_info, name)
            site_dfs.append(df_site)

    motif_df = pd.concat(motif_dfs, ignore_index=True)
    site_df = pd.concat(site_dfs, ignore_index=True)
    
    site_df['Site_number'] = pd.to_numeric(site_df['Site_number'])
    site_df['Motif_ID'] = pd.to_numeric(site_df['Motif_ID'])
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])
    motif_df['Motif_ID'] = pd.to_numeric(motif_df['Motif_ID'])
    motif_df['Sites'] = pd.to_numeric(motif_df['Sites'])
    motif_df['Score'] = pd.to_numeric(motif_df['Score'])
    motif_df['Width'] = pd.to_numeric(motif_df['Width'])
    
    return motif_df, site_df


In [58]:
motif_df_BP, site_df_BP = process_Bioprospector_output(Bioproepector_path)

## **For MotifSampler**

In [29]:
with open('Results/MotifSampler/Ada_1.txt', 'r') as file:
    text = file.read()


In [30]:
# Define the patterns
pattern_id_ll = re.compile(r'^#id: (?P<id>\S+).*ll: (?P<ll>\S+)')
pattern_sites = re.compile(r'id "(?P<id>\S+)"; site "(?P<site>\S+)";')

In [31]:
# Initialize the dictionary to store the results
data = {}

# Split the text into lines and process each line
for line in text.split('\n'):
    match_id_ll = pattern_id_ll.match(line)
    match_sites = pattern_sites.search(line)

    # If the line matches the id_ll pattern, store the id and ll in the dictionary
    if match_id_ll:
        id = match_id_ll.group('id')
        ll = float(match_id_ll.group('ll'))
        data[id] = {'ll': ll, 'sites': []}

    # If the line matches the sites pattern, append the site to the list of sites for the appropriate id
    if match_sites:
        id = match_sites.group('id')
        site = match_sites.group('site')
        data[id]['sites'].append(site)

# Print the results
for id, info in data.items():
    print(f"id: {id}, ll: {info['ll']}, sites: {info['sites']}")


id: box_1_1_nnTnTnnGTGAnGGT, ll: 28.81, sites: ['GTTATCGGTGAAGGT', 'GGTATCCGTGACGGT', 'TTTTTGCGTGATGGT']
id: box_1_2_TCACCGTCACssrkA, ll: 25.89, sites: ['TCACCGTAACCCGGA', 'CCACCTTCACCGATA', 'TTACCGTCACGCATG', 'TCACCGGCATGGGGA']
id: box_2_3_TAAmskTTATnCTGA, ll: 28.52, sites: ['TAACGTTTATGCTGA', 'TAAACGTTATTCAGA', 'TAAAGGCTATCCTTA', 'TAGCCTTTAGGCTGC']
id: box_3_1_GnTnnCnGnGAnGGn, ll: 28.9, sites: ['GTTATCGGTGAAGGT', 'GGTATCCGTGACGGT', 'GTTCGCCGGGAAGGG']
id: box_3_2_AAnnnnGnCGnCnAA, ll: 26.91, sites: ['AAGAGCGCCGACAAA', 'AATGCCGACGGCGAA', 'AAGACGGACGGCAAA']
id: box_5_1_kyTTCGyTnTCArCG, ll: 25.01, sites: ['TCGTCGTTATCAGCG', 'GTTGCGCTTTCAACG', 'GCTTCCTTGTCAGCG', 'TTTTCGCTGACAAGG']
id: box_5_2_TnysCGTGmCGGTGA, ll: 27.75, sites: ['TCCCCATGCCGGTGA', 'CATGCGTGACGGTAA', 'TATCCGTGACGGTGA', 'TACTCGGGCCGGAGA', 'TTTGCGTGATGGTGA']
id: box_5_3_TAAmskTTATnCTGA, ll: 28.5, sites: ['TAACGTTTATGCTGA', 'TAAACGTTATTCAGA', 'TAAAGGCTATCCTTA', 'TAGCCTTTAGGCTGC']


## **For MEME**

In [2]:
with open('Results/MEME/Ada_meme0.txt', 'r') as file:
    meme_output = file.read()


meme_motif_pattern = r'MOTIF\s+(?P<motif>\w+)\s+MEME-(?P<index>\d)\s+width\s+=\s+(?P<width>\d+)\s+sites\s+=\s+(?P<sites>\d+).+E-value\s*=\s*(?P<evalue>\d+(?:\.\d+)?(?:e[+-]?\d+)?)'
meme_motif_info = re.finditer(meme_motif_pattern, meme_output)

sites_pattern = r'MEME-(\d+) sites sorted by position p-value\n(?:.*\n){3}((?:(?!-+).*\n)*)'
meme_sites = re.findall(sites_pattern, meme_output)

In [3]:
meme_dict = {'File_name': [],
             'Motif_ID': [],
             'Width': [],
             'Score': [],
             'Sites': [],
             'con': []
             }
for i, match in enumerate(meme_motif_info):
    # print(match)
    meme_dict['File_name'].append('Ada')
    meme_dict['Motif_ID'].append(int(match.group('index')))
    meme_dict['Width'].append(int(match.group('width')))
    meme_dict['Score'].append(float(match.group('evalue')))
    meme_dict['Sites'].append(int(match.group('sites')))
    meme_dict['con'].append(match.group('motif'))

print(meme_dict)

{'File_name': ['Ada', 'Ada', 'Ada', 'Ada', 'Ada'], 'Motif_ID': [1, 2, 3, 4, 5], 'Width': [8, 8, 10, 8, 9], 'Score': [1.3, 920.0, 10000.0, 18000.0, 13000.0], 'Sites': [15, 11, 2, 2, 3], 'con': ['TCTSGCSG', 'TGAAARMG', 'AGMTTTAAAA', 'TACGGTTA', 'GTGAHGGTG']}


In [8]:
meme_motif_df = pd.DataFrame(meme_dict)
meme_motif_df.sort_values(axis=0, by='Score', inplace=True, ascending=False)
meme_motif_df

Unnamed: 0,File_name,Motif_ID,Width,Score,Sites,con
3,Ada,4,8,18000.0,2,TACGGTTA
4,Ada,5,9,13000.0,3,GTGAHGGTG
2,Ada,3,10,10000.0,2,AGMTTTAAAA
1,Ada,2,8,920.0,11,TGAAARMG
0,Ada,1,8,1.3,15,TCTSGCSG


In [27]:
meme_site_pattern = r'(?P<seq_id>\d+-\d+-(?:forward|reverse))\s*(?P<start_number>\d+)\s+\S+\s+(?P<site>\S+)\s+\S+\s+\S+'
# meme_site_pattern = r'(?P<site_id>\S+)\s+(?P<start_number>\d+)\s+\S+\s+(?P<motif_sequence>\S+)\s+\S+\s+\S+'
meme_site_dict = {'Sequence_ID': [],
                  'Site': [],
                  'Starting_Point': [],
                  'Score': [],
                  'Width': [],
                  'File_Name': []
                  }
for i, site in enumerate(meme_sites):
    print(site[0])
    print(site[1])
    # sites_per_motif = re.findall(meme_site_pattern, site[1])
    sites_per_motif = re.finditer(meme_site_pattern, site[1])
    print(sites_per_motif)
    for j, match in enumerate(sites_per_motif):
        meme_site_dict['Sequence_ID'].append(match.group('seq_id'))
        meme_site_dict['Score'].append(j)
        meme_site_dict['Starting_Point'].append(int(match.group('start_number')))
        meme_site_dict['Site'].append(match.group('site'))
        meme_site_dict['Width'].append(i)
        meme_site_dict['File_Name'].append('Ada')

print(meme_site_dict)

1
209398-209425-forward       302  1.61e-05 TGCTGGCGGA TCTGGCCG ATCTCGACGT
2145603-2145630-reverse     322  3.22e-05 TGTTGGGATT TCTCGCCG CCCGTGCGGT
209398-209425-forward       106  3.22e-05 TATTCCGTTA TCTCGCCG GAAGGTTGTG
2145603-2145630-reverse     382  4.82e-05 ATGCCCGTAG TCTGGCGG TGGGCGAATA
209398-209425-forward        36  4.82e-05 TGAAGGTGGT TCTGGCGG TGCGCTGGCG
2308475-2308502-reverse     398  1.29e-04 TTGCCGTCCG TCTTGCCG CGCCAGACAT
2308475-2308502-reverse     361  1.44e-04 GCGAATTCGT TTTCGCCG TGCGTACCAC
2308475-2308502-reverse     111  1.61e-04 GCCGGGAAGG GCTGGCGG TTTATATGAT
209398-209425-forward       293  1.61e-04 AAGCGCAACT GCTGGCGG ATCTGGCCGA
209398-209425-forward       158  1.61e-04 ACAAAGCGCC GCTGGCGG CTGAAGCGAT
2308475-2308502-reverse     386  3.02e-04 CACAGGCATC TTTTGCCG TCCGTCTTGC
2308475-2308502-reverse      97  3.02e-04 TAAAGAGGTT GTTCGCCG GGAAGGGCTG
2308475-2308502-reverse      71  3.82e-04 TTGATGGTAC TCGGGCCG GAGAAAGCTA
209398-209425-forward       311  4.13e-04 ATCTGGC

In [10]:
meme_site_df = pd.DataFrame(meme_site_dict)
meme_site_df

NameError: name 'meme_site_dict' is not defined

In [16]:
def parse_meme_motif(motif_info, filename):
    meme_dict = {'File_name': [],
             'Motif_ID': [],
             'Width': [],
             'Score': [],
             'Sites': [],
             'con': []
             }
    for i, match in enumerate(motif_info):
        # print(match)
        meme_dict['File_name'].append(filename)
        meme_dict['Motif_ID'].append(int(match.group('index')))
        meme_dict['Width'].append(int(match.group('width')))
        meme_dict['Score'].append(float(match.group('evalue')))
        meme_dict['Sites'].append(int(match.group('sites')))
        meme_dict['con'].append(match.group('motif'))
    meme_motif_df = pd.DataFrame(meme_dict)
    # meme_motif_df.sort_values(axis=0, by='Score', inplace=True, ascending=False)
    return meme_motif_df

def parse_meme_sites(sites_info, motif_df, filename):
    meme_site_pattern = r'(?P<seq_id>\d+-\d+-(?:forward|reverse))\s*(?P<start_number>\d+)\s+\S+\s+(?P<site>\S+)\s+\S+\s+\S+'
    # meme_site_pattern = r'(?P<site_id>\S+)\s+(?P<start_number>\d+)\s+\S+\s+(?P<motif_sequence>\S+)\s+\S+\s+\S+'
    meme_site_dict = {'Sequence_ID': [],
                      'Site': [],
                      'Starting_Point': [],
                      'Score': [],
                      'Width': [],
                      'File_Name': []
                      }

    for i, site in enumerate(sites_info):
        # print(site[0])
        # print(site[1])
        # sites_per_motif = re.findall(meme_site_pattern, site[1])
        motif_mask = motif_df['Motif_ID'] == site[0]
        motif = motif_df[motif_mask]

        sites_per_motif = re.finditer(meme_site_pattern, site[1])
        # print(sites_per_motif)
        for j, match in enumerate(sites_per_motif):
            meme_site_dict['Sequence_ID'].append(match.group('seq_id'))
            meme_site_dict['Score'].append(motif.Score)
            meme_site_dict['Starting_Point'].append(int(match.group('start_number')))
            meme_site_dict['Site'].append(match.group('site'))
            meme_site_dict['Width'].append(motif.Width)
            meme_site_dict['File_Name'].append(filename)

    meme_site_df = pd.DataFrame(meme_site_dict)
    return meme_site_df



def parse_meme_files():
    meme_dir = os.path.join(os.getcwd(), 'Results/MEME')
    files = os.listdir(meme_dir)
    filtered_files = list(filter(lambda name: name if name.find('10') == -1 else '', files))
    print(len(filtered_files))

    meme_motif_pattern = r'MOTIF\s+(?P<motif>\w+)\s+MEME-(?P<index>\d)\s+width\s+=\s+(?P<width>\d+)\s+sites\s+=\s+(?P<sites>\d+).+E-value\s*=\s*(?P<evalue>\d+(?:\.\d+)?(?:e[+-]?\d+)?)'
    sites_pattern = r'MEME-(\d+) sites sorted by position p-value\n(?:.*\n){3}((?:(?!-+).*\n)*)'

    results_list = []
    for filename in filtered_files:
        with open(os.path.join(meme_dir, filename), 'r') as file:
            meme_output = file.read()

        meme_motif_info = re.finditer(meme_motif_pattern, meme_output)
        meme_sites = re.findall(sites_pattern, meme_output)

        name = filename.split('_')[0]
        meme_motif_df = parse_meme_motif(meme_motif_info, name)
        meme_sites_df = parse_meme_sites(meme_sites, meme_motif_df, name)

        results_list.append(meme_sites_df)


    results_df = pd.concat(results_list)
    return results_df

In [17]:
parse_meme_files()





610


Unnamed: 0,Sequence_ID,Site,Starting_Point,Score,Width,File_Name
0,4464895-4464912-reverse,AAGCGCCGCA,140,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",PhoP
1,4464895-4464912-reverse,GTTAGGCTCA,261,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",PhoP
2,4464895-4464912-reverse,AGGAGAATCC,157,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",PhoP
3,1189730-1189747-reverse,ACACTATTTT,252,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",PhoP
4,1906840-1906857-reverse,ATATCCGCTG,51,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",PhoP
...,...,...,...,...,...,...
20,328605-328625-forward,GTAACAATAA,268,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",BetI
21,328605-328625-forward,TCTGCGCGAT,62,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",BetI
22,328607-328627-reverse,TGGGCATGCA,353,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",BetI
23,328607-328627-reverse,GCCGTGCAGG,389,"Series([], Name: Score, dtype: float64)","Series([], Name: Width, dtype: int64)",BetI
