In [1]:
import os
import re
from collections import Counter
import pandas as pd

In [66]:
# Read the MDscan output file
with open('Results/MDScan/Ada_1.txt', 'r') as file:
    mdscan_output = file.read()

# Use regex patterns to extract the motif information
motif_pattern = r"Motif (\d+):\s+Wid (\d+);\s+Score ([\d.]+);\s+Sites (\d+);\s+Con (\w+);\s+RCon (\w+)"
motif_info = re.findall(motif_pattern, mdscan_output)

# # Use regex patterns to extract the motif alignment matrix
# alignment_pattern = r"\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)"
# alignment_matrix = re.findall(alignment_pattern, mdscan_output)

# Use regex patterns to extract the site information
site_pattern = r">(\d+-\d+-(?:forward))\s+Len\s+\d+\s+Site\s+#(\d+)\s+(f|r)\s+(\d+)\n([ACGT]+)"
site_info = re.findall(site_pattern, mdscan_output)

Store the Folders of the results

In [67]:
direcrory_of_folders = 'Results/'
folders = [folder for folder in os.listdir(direcrory_of_folders) if os.path.isdir(os.path.join(direcrory_of_folders, folder))]
folders

['Bioprosector', 'MDScan', 'MotifSampler', 'Streme', 'MEME']

In [68]:
motif_info

[('1', '15', '1.795', '14', 'CGGAACCGCTGGCGG', 'CCGCCAGCGGTTCCG'),
 ('2', '15', '1.777', '16', 'CCGGAAACGATGGCG', 'CGCCATCGTTTCCGG'),
 ('3', '15', '1.764', '16', 'GGAAGCGCTGGCGGC', 'GCCGCCAGCGCTTCC'),
 ('4', '15', '1.754', '19', 'CGCCGCTGGCGGCTG', 'CAGCCGCCAGCGGCG'),
 ('5', '15', '1.749', '15', 'TATCCGTGACGGTGA', 'TCACCGTCACGGATA')]

In [69]:
site_info

[('209398-209425-forward', '1', 'f', '151', 'AAGCGCCGCTGGCGG'),
 ('209398-209425-forward', '2', 'r', '258', 'CGCCATCGCTTCCGG'),
 ('209398-209425-forward', '3', 'f', '166', 'CTGAAGCGATGGGTA'),
 ('209398-209425-forward', '4', 'f', '229', 'CGGAACCACTGGGTG'),
 ('209398-209425-forward', '5', 'f', '259', 'CGGAAGCGATGGCGG'),
 ('209398-209425-forward', '6', 'f', '286', 'CGCAACTGCTGGCGG'),
 ('209398-209425-forward', '1', 'r', '286', 'CCGCCAGCAGTTGCG'),
 ('209398-209425-forward', '2', 'r', '271', 'CTTTCAACGATGCCG'),
 ('209398-209425-forward', '3', 'r', '259', 'CCGCCATCGCTTCCG'),
 ('209398-209425-forward', '4', 'f', '165', 'GCTGAAGCGATGGGT'),
 ('209398-209425-forward', '5', 'f', '228', 'CCGGAACCACTGGGT'),
 ('209398-209425-forward', '6', 'f', '258', 'CCGGAAGCGATGGCG'),
 ('209398-209425-forward', '7', 'f', '285', 'GCGCAACTGCTGGCG'),
 ('209398-209425-forward', '1', 'f', '41', 'CGGTGCGCTGGCGAT'),
 ('209398-209425-forward', '2', 'r', '269', 'TTCAACGATGCCGCC'),
 ('209398-209425-forward', '3', 'f', '149

Store the names of the outputted files

In [9]:
file_path = 'Results/MDScan/'
def return_the_files(path):
    txt_files = [file for file in os.listdir(path) if file.endswith(".txt")]
    only_results = []
    for file in txt_files:
        if '_' in file:
            name_parts = file.split("_")
            if len(name_parts) > 1:
                name = name_parts[0]
                only_results.append(name)
    name_counts = Counter(only_results)
    unique_names = [name for name, count in name_counts.items() if count > 1] 
    return  unique_names 
 
names = return_the_files(file_path)


In [55]:
def get_the_motifs(motif_info, file_name):
    motif_dict = {}
    current_motif = None

    motifs = {}  # Use a dictionary instead of a list

    for motif_match in motif_info:
        motif = {
        'file_name': file_name,
        'motif_id': motif_match[0],
        'width': motif_match[1],
        'score': motif_match[2],
        'sites': motif_match[3],
        'con': motif_match[4],
        'rcon': motif_match[5]
        }
        motif_id = motif_match[0]

        if motif_id in motifs:
            motifs[motif_id].update(motif)  # Merge with existing motif dictionary
        else:
            motifs[motif_id] = motif 
            

    return motifs, len(motifs)

In [74]:
motif_dict, num_of_motifs = get_the_motifs(motif_info,'Ada')
motif_dict

{'1': {'file_name': 'Ada',
  'motif_id': '1',
  'width': '15',
  'score': '1.795',
  'sites': '14',
  'con': 'CGGAACCGCTGGCGG',
  'rcon': 'CCGCCAGCGGTTCCG'},
 '2': {'file_name': 'Ada',
  'motif_id': '2',
  'width': '15',
  'score': '1.777',
  'sites': '16',
  'con': 'CCGGAAACGATGGCG',
  'rcon': 'CGCCATCGTTTCCGG'},
 '3': {'file_name': 'Ada',
  'motif_id': '3',
  'width': '15',
  'score': '1.764',
  'sites': '16',
  'con': 'GGAAGCGCTGGCGGC',
  'rcon': 'GCCGCCAGCGCTTCC'},
 '4': {'file_name': 'Ada',
  'motif_id': '4',
  'width': '15',
  'score': '1.754',
  'sites': '19',
  'con': 'CGCCGCTGGCGGCTG',
  'rcon': 'CAGCCGCCAGCGGCG'},
 '5': {'file_name': 'Ada',
  'motif_id': '5',
  'width': '15',
  'score': '1.749',
  'sites': '15',
  'con': 'TATCCGTGACGGTGA',
  'rcon': 'TCACCGTCACGGATA'}}

# Regex for MEME

In [54]:
with open('Results/MEME/Ada_meme0.txt', 'r') as file:
    meme_output = file.read()


meme_motif_pattern = r'MOTIF\s+(?P<motif>\w+)\s+MEME-(?P<index>\d)\s+width\s+=\s+(?P<width>\d+)\s+sites\s+=\s+(?P<sites>\d+).+E-value\s*=\s*(?P<evalue>\d+(?:\.\d+)?(?:e[+-]?\d+)?)'
meme_motif_info = re.finditer(meme_motif_pattern, meme_output)

sites_pattern = r'MEME-(\d+) sites sorted by position p-value\n(?:.*\n){3}((?:(?!-+).*\n)*)'
meme_sites = re.findall(sites_pattern, meme_output)

In [15]:
meme_dict = {'Ada_meme0': {}}
for i, match in enumerate(meme_motif_info):
    temp_dict = {'motif': match.group('motif'),
                                 'index': int(match.group('index')),
                                 'width': int(match.group('width')),
                                 'sites': int(match.group('sites')),
                                 'evalue': float(match.group('evalue'))
                                 }

    motif = match.group('motif')
    print(temp_dict)
    meme_dict['Ada_meme0']['motif'+str(i)] = temp_dict
meme_dict

{'motif': 'TCTSGCSG', 'index': 1, 'width': 8, 'sites': 15, 'evalue': 1.3}
{'motif': 'TGAAARMG', 'index': 2, 'width': 8, 'sites': 11, 'evalue': 920.0}
{'motif': 'AGMTTTAAAA', 'index': 3, 'width': 10, 'sites': 2, 'evalue': 10000.0}
{'motif': 'TACGGTTA', 'index': 4, 'width': 8, 'sites': 2, 'evalue': 18000.0}
{'motif': 'GTGAHGGTG', 'index': 5, 'width': 9, 'sites': 3, 'evalue': 13000.0}


{'Ada_meme0': {'motif0': {'motif': 'TCTSGCSG',
   'index': 1,
   'width': 8,
   'sites': 15,
   'evalue': 1.3},
  'motif1': {'motif': 'TGAAARMG',
   'index': 2,
   'width': 8,
   'sites': 11,
   'evalue': 920.0},
  'motif2': {'motif': 'AGMTTTAAAA',
   'index': 3,
   'width': 10,
   'sites': 2,
   'evalue': 10000.0},
  'motif3': {'motif': 'TACGGTTA',
   'index': 4,
   'width': 8,
   'sites': 2,
   'evalue': 18000.0},
  'motif4': {'motif': 'GTGAHGGTG',
   'index': 5,
   'width': 9,
   'sites': 3,
   'evalue': 13000.0}}}

In [16]:
meme_site_pattern = r'(\d+-\d+-(?:forward|reverse))\s*(\d+)'
for i, site in enumerate(meme_sites):
    print(site[0])
    sites_per_motif = re.findall(meme_site_pattern, site[1])
    print(sites_per_motif)
    temp_dict = {}
    # for j, s in enumerate(sites_per_motif):
    #     print(j)
    #     print(s)
    #     temp_dict[j] = {'seq_id': s[0], 'position': s[1]}
        # print(temp_dict)

1
[('209398-209425-forward', '302'), ('2145603-2145630-reverse', '322'), ('209398-209425-forward', '106'), ('2145603-2145630-reverse', '382'), ('209398-209425-forward', '36'), ('2308475-2308502-reverse', '398'), ('2308475-2308502-reverse', '361'), ('2308475-2308502-reverse', '111'), ('209398-209425-forward', '293'), ('209398-209425-forward', '158'), ('2308475-2308502-reverse', '386'), ('2308475-2308502-reverse', '97'), ('2308475-2308502-reverse', '71'), ('209398-209425-forward', '311'), ('2145603-2145630-reverse', '398')]
2
[('2145603-2145630-reverse', '243'), ('209398-209425-forward', '280'), ('209398-209425-forward', '397'), ('209398-209425-forward', '196'), ('2145603-2145630-reverse', '207'), ('2308475-2308502-reverse', '279'), ('209398-209425-forward', '135'), ('2308475-2308502-reverse', '201'), ('2145603-2145630-reverse', '348'), ('2145603-2145630-reverse', '253'), ('2308475-2308502-reverse', '86')]
3
[('209398-209425-forward', '332'), ('2308475-2308502-reverse', '145')]
4
[('2093