In [1]:
import os
import re
from collections import Counter
import pandas as pd

# **All Folder Directories**

In [2]:
# Root folder of the results; only its immediate sub-directories are kept
# (plain files inside it are ignored).
direcrory_of_folders = 'Results/'
folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(direcrory_of_folders, entry)),
        os.listdir(direcrory_of_folders),
    )
)
folders

['Bioprosector', 'MDScan', 'MotifSampler', 'Streme', 'MEME']

## **For MDScan**

In [3]:
# Directory whose files are parsed by the MDScan cells below.
directory_of_files = 'Results/MDScan/'

# Matches one motif summary line. Capture groups, in order:
#   motif number, width (Wid), score, number of sites, consensus (Con) and
#   RCon (looks like the reverse-complement consensus, judging by the recorded output).
# Applied to the whole text of a result file with re.findall.
motif_pattern = r"Motif\s+(\d+):\s+Wid\s+(\d+);\s+Score\s+([\d.]+);\s+Sites\s+(\d+);\s+Con\s+([ACGT]+);\s+RCon\s+([ACGT]+)"

# Matches one site record: a '>' header line followed by the site sequence on the next line.
# Capture groups, in order:
#   sequence identifier (<digits>-<digits>-forward|reverse), site number,
#   a single f/r flag (presumably the strand), a number (used downstream as the
#   starting point) and the ACGT sequence. The "Len" value is matched but not captured.
site_pattern = r">(\d+-\d+-(?:forward|reverse))\s+Len\s+\d+\s+Site\s+#(\d+)\s+([fr])\s+(\d+)\n([ACGT]+)"

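The `return_the_files` function returns the unique name prefixes (the part before the first underscore) of the `.txt` files in a directory. Files without an underscore in their name, such as `background.txt`, are skipped, and a prefix is kept only when more than one `.txt` file shares it.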

In [4]:
def return_the_files(path):
    """Return the name prefixes shared by several ``.txt`` files in ``path``.

    A prefix is the part of a file name before its first underscore. Only
    ``.txt`` files whose name contains an underscore are considered, and a
    prefix is returned only when it occurs for more than one such file.

    Parameters
    ----------
    path : str
        Directory to scan (passed to ``os.listdir``).

    Returns
    -------
    list of str
        Qualifying prefixes, in the order they were first encountered.
    """
    prefixes = (
        entry.split("_")[0]
        for entry in os.listdir(path)
        if entry.endswith(".txt") and "_" in entry
    )
    occurrences = Counter(prefixes)
    # Prefixes seen only once are dropped.
    return [prefix for prefix, total in occurrences.items() if total > 1]

The `get_the_motifs` and `get_site_info` functions build DataFrames holding the key information extracted from the `motif_info` and `site_info` regex matches, respectively.

In [11]:
def get_the_motifs(motif_info, file_name):
    """Build a DataFrame with one row per motif ID found in ``motif_info``.

    Parameters
    ----------
    motif_info : iterable of sequences
        Each item provides, by position: motif ID, width, score, number of
        sites, consensus and reverse consensus (values are stored as given).
    file_name : str
        Label written to the ``File_name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by motif ID, with columns ``File_name``, ``Motif_ID``,
        ``Width``, ``Score``, ``Sites``, ``con`` and ``rcon``. When a motif ID
        occurs more than once, the last occurrence supplies the values.
    """
    records_by_id = {
        fields[0]: {
            'File_name': file_name,
            'Motif_ID': fields[0],
            'Width': fields[1],
            'Score': fields[2],
            'Sites': fields[3],
            'con': fields[4],
            'rcon': fields[5],
        }
        for fields in motif_info
    }

    # Keys become the index, the inner dictionaries become the rows.
    return pd.DataFrame.from_dict(records_by_id, orient='index')

In [12]:
def get_site_info(site_info, motif_info, file_name):
    """Build a DataFrame with one row per site record and assign motif IDs.

    The motif ID is a running counter starting at 0. It is incremented every
    time a record has site number ``'1'`` and the same site identifier as the
    very first record, so all records up to the next such record share an ID.
    NOTE(review): this relies on every motif listing containing site #1 of the
    first sequence - confirm against the MDScan output format.

    Parameters
    ----------
    site_info : sequence of 5-tuples
        ``(site_id, site_number, flag, starting_point, motif_sequence)``;
        the third element is ignored.
    motif_info : any
        Unused; kept so existing callers keep working.
    file_name : str
        Label written to the ``File_Name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Site_ID``, ``Site_number``, ``Starting_Point``,
        ``Motif_Sequence``, ``Motif_ID`` and ``File_Name``. An empty
        ``site_info`` yields an empty frame with these columns instead of
        raising ``IndexError``.
    """
    columns = ['Site_ID', 'Site_number', 'Starting_Point', 'Motif_Sequence', 'Motif_ID', 'File_Name']

    # A file without any site match must not abort the whole aggregation.
    if not site_info:
        return pd.DataFrame(columns=columns)

    first_site_id = site_info[0][0]  # identifier that marks the start of a new motif
    motif_id = 0
    rows = []

    for site_id, site_number, _flag, starting_point, motif_sequence in site_info:
        if site_number == '1' and site_id == first_site_id:
            motif_id += 1
        rows.append([site_id, site_number, starting_point, motif_sequence, motif_id, file_name])

    return pd.DataFrame(rows, columns=columns)

In [13]:
names = return_the_files(directory_of_files)

# Parse every result file exactly once and extract both tables from the same
# text, so the motif matches handed to get_site_info always belong to the file
# currently being processed (two separate loops used to pass stale matches).
motif_dfs = []
site_dfs = []
all_files = os.listdir(directory_of_files)  # list the directory a single time

for name in names:
    file_paths = [file for file in all_files if file.startswith(f"{name}_")]
    for file_path in file_paths:
        with open(os.path.join(directory_of_files, file_path), 'r') as file:
            mdscan_output = file.read()

        motif_info = re.findall(motif_pattern, mdscan_output)
        site_info = re.findall(site_pattern, mdscan_output)

        df_motif = get_the_motifs(motif_info, name)
        motif_dfs.append(df_motif)

        df_site = get_site_info(site_info, motif_info, name)
        site_dfs.append(df_site)

# One combined table per kind, with a fresh 0..n-1 index.
motif_df = pd.concat(motif_dfs, ignore_index=True)
site_df = pd.concat(site_dfs, ignore_index=True)

In [14]:
# The regex captures are strings; turn the numeric site columns into numbers.
for column in ('Site_number', 'Motif_ID', 'Starting_Point'):
    site_df[column] = pd.to_numeric(site_df[column])
site_df.head()

Unnamed: 0,Site_ID,Site_number,Starting_Point,Motif_Sequence,Motif_ID,File_Name
0,209398-209425-forward,1,151,AAGCGCCGCTGGCGG,1,Ada
1,209398-209425-forward,2,258,CGCCATCGCTTCCGG,1,Ada
2,209398-209425-forward,3,166,CTGAAGCGATGGGTA,1,Ada
3,209398-209425-forward,4,229,CGGAACCACTGGGTG,1,Ada
4,209398-209425-forward,5,259,CGGAAGCGATGGCGG,1,Ada


In [17]:
# The regex captures are strings; turn the numeric motif columns into numbers.
for column in ('Motif_ID', 'Sites', 'Score', 'Width'):
    motif_df[column] = pd.to_numeric(motif_df[column])

motif_df.head()

Unnamed: 0,File_name,Motif_ID,Width,Score,Sites,con,rcon
0,Ada,1,15,1.795,14,CGGAACCGCTGGCGG,CCGCCAGCGGTTCCG
1,Ada,2,15,1.777,16,CCGGAAACGATGGCG,CGCCATCGTTTCCGG
2,Ada,3,15,1.764,16,GGAAGCGCTGGCGGC,GCCGCCAGCGCTTCC
3,Ada,4,15,1.754,19,CGCCGCTGGCGGCTG,CAGCCGCCAGCGGCG
4,Ada,5,15,1.749,15,TATCCGTGACGGTGA,TCACCGTCACGGATA


# Regex for MEME

In [None]:
# Load one MEME text report (the path is hard-coded to a single run).
with open('Results/MEME/Ada_meme0.txt', 'r') as file:
    meme_output = file.read()


# Motif header line: captures the motif word, its MEME index, the width, the
# number of sites and the E-value found later on the same line.
# The index is matched with \d+ so that motifs numbered 10 and above are not
# silently skipped (a single \d cannot be followed by the required whitespace).
meme_motif_pattern = r'MOTIF\s+(?P<motif>\w+)\s+MEME-(?P<index>\d+)\s+width\s+=\s+(?P<width>\d+)\s+sites\s+=\s+(?P<sites>\d+).+E-value\s*=\s*(?P<evalue>\d+(?:\.\d+)?(?:e[+-]?\d+)?)'
# re.finditer returns a one-shot iterator; keeping the matches in a list lets
# later cells iterate over them again when they are re-run.
meme_motif_info = list(re.finditer(meme_motif_pattern, meme_output))

# Site table of each motif: matches the "MEME-<n> sites sorted by position
# p-value" heading, skips the three lines that follow it, then captures the
# consecutive lines that do not start with '-'.
sites_pattern = r'MEME-(\d+) sites sorted by position p-value\n(?:.*\n){3}((?:(?!-+).*\n)*)'
meme_sites = re.findall(sites_pattern, meme_output)

In [None]:
# Gather the parsed header fields of every motif under the 'Ada_meme0' entry,
# keyed 'motif0', 'motif1', ... in match order.
meme_dict = {'Ada_meme0': {}}
for i, match in enumerate(meme_motif_info):
    motif = match['motif']
    temp_dict = {
        'motif': motif,
        'index': int(match['index']),
        'width': int(match['width']),
        'sites': int(match['sites']),
        'evalue': float(match['evalue']),
    }
    print(temp_dict)
    meme_dict['Ada_meme0'][f'motif{i}'] = temp_dict
meme_dict

{'motif': 'TCTSGCSG', 'index': 1, 'width': 8, 'sites': 15, 'evalue': 1.3}
{'motif': 'TGAAARMG', 'index': 2, 'width': 8, 'sites': 11, 'evalue': 920.0}
{'motif': 'AGMTTTAAAA', 'index': 3, 'width': 10, 'sites': 2, 'evalue': 10000.0}
{'motif': 'TACGGTTA', 'index': 4, 'width': 8, 'sites': 2, 'evalue': 18000.0}
{'motif': 'GTGAHGGTG', 'index': 5, 'width': 9, 'sites': 3, 'evalue': 13000.0}


{'Ada_meme0': {'motif0': {'motif': 'TCTSGCSG',
   'index': 1,
   'width': 8,
   'sites': 15,
   'evalue': 1.3},
  'motif1': {'motif': 'TGAAARMG',
   'index': 2,
   'width': 8,
   'sites': 11,
   'evalue': 920.0},
  'motif2': {'motif': 'AGMTTTAAAA',
   'index': 3,
   'width': 10,
   'sites': 2,
   'evalue': 10000.0},
  'motif3': {'motif': 'TACGGTTA',
   'index': 4,
   'width': 8,
   'sites': 2,
   'evalue': 18000.0},
  'motif4': {'motif': 'GTGAHGGTG',
   'index': 5,
   'width': 9,
   'sites': 3,
   'evalue': 13000.0}}}

In [None]:
# Within a motif's site table: captures a sequence identifier
# (<digits>-<digits>-forward|reverse) and the number that follows it.
meme_site_pattern = r'(\d+-\d+-(?:forward|reverse))\s*(\d+)'
for i, site in enumerate(meme_sites):
    # Each entry of meme_sites is (motif index, text of that motif's site table).
    motif_number = site[0]
    site_table = site[1]
    print(motif_number)
    sites_per_motif = re.findall(meme_site_pattern, site_table)
    print(sites_per_motif)
    # TODO: store each (seq_id, position) pair of sites_per_motif in temp_dict.
    temp_dict = {}

1
[('209398-209425-forward', '302'), ('2145603-2145630-reverse', '322'), ('209398-209425-forward', '106'), ('2145603-2145630-reverse', '382'), ('209398-209425-forward', '36'), ('2308475-2308502-reverse', '398'), ('2308475-2308502-reverse', '361'), ('2308475-2308502-reverse', '111'), ('209398-209425-forward', '293'), ('209398-209425-forward', '158'), ('2308475-2308502-reverse', '386'), ('2308475-2308502-reverse', '97'), ('2308475-2308502-reverse', '71'), ('209398-209425-forward', '311'), ('2145603-2145630-reverse', '398')]
2
[('2145603-2145630-reverse', '243'), ('209398-209425-forward', '280'), ('209398-209425-forward', '397'), ('209398-209425-forward', '196'), ('2145603-2145630-reverse', '207'), ('2308475-2308502-reverse', '279'), ('209398-209425-forward', '135'), ('2308475-2308502-reverse', '201'), ('2145603-2145630-reverse', '348'), ('2145603-2145630-reverse', '253'), ('2308475-2308502-reverse', '86')]
3
[('209398-209425-forward', '332'), ('2308475-2308502-reverse', '145')]
4
[('2093