In [149]:
import os
import re
from collections import Counter
import pandas as pd
import itertools


# **All Folders Directories**

In [2]:
direcrory_of_folders = 'Results/'
folders = [folder for folder in os.listdir(direcrory_of_folders) if os.path.isdir(os.path.join(direcrory_of_folders, folder))]
folders

['Bioprospector', 'MDScan', 'MotifSampler', 'Streme', 'MEME']

In [3]:
def return_the_files(path):
    txt_files = [file for file in os.listdir(path) if file.endswith(".txt")]
    only_results = []
    for file in txt_files:
        if '_' in file:
            name_parts = file.split("_")
            if len(name_parts) > 1:
                name = name_parts[0]
                only_results.append(name)
    name_counts = Counter(only_results)
    unique_names = [name for name, count in name_counts.items() if count > 1] 
    return  unique_names 

## **For MDScan**

In [4]:
# # Read the MDscan output file
# with open('Results/MDScan/Ada_1.txt', 'r') as file:
#     mdscan_output = file.read()
directory_of_files = 'Results/MDScan/'

# Extract the motifs using regex
motif_pattern = r"Motif\s+(\d+):\s+Wid\s+(\d+);\s+Score\s+([\d.]+);\s+Sites\s+(\d+);\s+Con\s+([ACGT]+);\s+RCon\s+([ACGT]+)"
# motif_info = re.findall(motif_pattern, mdscan_output)

# Extract the site information using regex
site_pattern = r">(\d+-\d+-(?:forward|reverse))\s+Len\s+\d+\s+Site\s+#(\d+)\s+([fr])\s+(\d+)\n([ACGT]+)"
# site_info = re.findall(site_pattern, mdscan_output)

The return_the_files function stores all the unique names of the .txt files except the background.txt file.

The get_the_motifs and get_site_info functions creates a dataframes with vital informations of the motif_info's and the site_info's.

In [5]:
def get_the_motifs_MD(motif_info, file_name):
    motifs = {}

    for motif_match in motif_info:
        motif = {
            'File_name': file_name,
            'Motif_ID': motif_match[0],
            'Width': motif_match[1],
            'Score': motif_match[2],
            'Sites': motif_match[3],
            'con': motif_match[4],
            'rcon': motif_match[5]
        }
        motif_id = motif_match[0]

        if motif_id in motifs:
            motifs[motif_id].update(motif)  # Merge with existing motif dictionary
        else:
            motifs[motif_id] = motif

    # Convert the motifs dictionary into a DataFrame
    df = pd.DataFrame.from_dict(motifs, orient='index')
    return df

In [6]:
def get_site_info(site_info, motif_info, file_name):
    data = []
    motif_id = 0  # Initial motif ID
    initial_position = site_info[0][0]  # Initial position

    for site in site_info:
        site_id, site_number, _, starting_point, motif_sequence = site

        if site_number == '1' and site_id == initial_position:
            motif_id += 1
            # initial_position = site_id

        data.append([site_id, site_number, starting_point, motif_sequence, motif_id, file_name])

    columns = ['Site_ID', 'Site_number', 'Starting_Point', 'Motif_Sequence', 'Motif_ID', 'File_Name']
    df_info = pd.DataFrame(data, columns=columns)
    return df_info

In [7]:
def process_mdscan_output(directory_of_files):
    names = return_the_files(directory_of_files)
    motif_dfs = []
    site_dfs = []

    for name in names:
        file_paths = [file for file in os.listdir(directory_of_files) if file.startswith(f"{name}_")]

        for file_path in file_paths:
            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                mdscan_output = file.read()
            
            motif_info = re.findall(motif_pattern, mdscan_output)
            df_motif = get_the_motifs_MD(motif_info, name)
            motif_dfs.append(df_motif)
            
            site_info = re.findall(site_pattern, mdscan_output)
            df_site = get_site_info(site_info, motif_info, name)
            site_dfs.append(df_site)

    motif_df = pd.concat(motif_dfs, ignore_index=True)
    site_df = pd.concat(site_dfs, ignore_index=True)
    
    site_df['Site_number'] = pd.to_numeric(site_df['Site_number'])
    site_df['Motif_ID'] = pd.to_numeric(site_df['Motif_ID'])
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])
    motif_df['Motif_ID'] = pd.to_numeric(motif_df['Motif_ID'])
    motif_df['Sites'] = pd.to_numeric(motif_df['Sites'])
    motif_df['Score'] = pd.to_numeric(motif_df['Score'])
    motif_df['Width'] = pd.to_numeric(motif_df['Width'])
    
    return motif_df, site_df

# Usage example
motif_df_MD, site_df_MD = process_mdscan_output(directory_of_files)

In [8]:
motif_df_MD.head()

Unnamed: 0,File_name,Motif_ID,Width,Score,Sites,con,rcon
0,Ada,1,15,1.795,14,CGGAACCGCTGGCGG,CCGCCAGCGGTTCCG
1,Ada,2,15,1.777,16,CCGGAAACGATGGCG,CGCCATCGTTTCCGG
2,Ada,3,15,1.764,16,GGAAGCGCTGGCGGC,GCCGCCAGCGCTTCC
3,Ada,4,15,1.754,19,CGCCGCTGGCGGCTG,CAGCCGCCAGCGGCG
4,Ada,5,15,1.749,15,TATCCGTGACGGTGA,TCACCGTCACGGATA


In [9]:
site_df_MD.head()

Unnamed: 0,Site_ID,Site_number,Starting_Point,Motif_Sequence,Motif_ID,File_Name
0,209398-209425-forward,1,151,AAGCGCCGCTGGCGG,1,Ada
1,209398-209425-forward,2,258,CGCCATCGCTTCCGG,1,Ada
2,209398-209425-forward,3,166,CTGAAGCGATGGGTA,1,Ada
3,209398-209425-forward,4,229,CGGAACCACTGGGTG,1,Ada
4,209398-209425-forward,5,259,CGGAAGCGATGGCGG,1,Ada


## **For Bioprospector**

In [10]:
Bioproepector_path = 'Results/Bioprospector/'  # Replace with the actual directory path
motif_pattern = r'Motif\s+#(\d+):\s+\((\w+/\w+)\)\n\*+\nWidth \((\d+), \d+\);\s+Gap \[\d+, \d+\];\s+MotifScore (\d+\.\d+);\s+Sites (\d+)'
site_pattern = r'>(\d+-\d+-\w+)\s+len\s\d+\s+site\s+#(\d+)\s+(\w+)\s+(\d+)\n(\w+)'

In [11]:
def get_the_motifs_BP(motif_info, file_name):
    motifs = {}

    for motif_match in motif_info:
        motif_id = motif_match[0]
        con, rcon = motif_match[1].split('/')
        width = motif_match[2]
        score = motif_match[3]
        sites = motif_match[4]

        motif = {
            'File_name': file_name,
            'Motif_ID': motif_id,
            'Width': width,
            'Score': score,
            'Sites': sites,
            'con': con,
            'rcon': rcon
        }

        if motif_id in motifs:
            motifs[motif_id].update(motif)  # Merge with existing motif dictionary
        else:
            motifs[motif_id] = motif

    # Convert the motifs dictionary into a DataFrame
    df = pd.DataFrame.from_dict(motifs, orient='index')
    return df

In [12]:
def process_Bioprospector_output(directory_of_files):
    names = return_the_files(directory_of_files)
    motif_dfs = []
    site_dfs = []

    for name in names:
        file_paths = [file for file in os.listdir(directory_of_files) if file.startswith(f"{name}_")]

        for file_path in file_paths:
            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                mdscan_output = file.read()
            
            motif_info = re.findall(motif_pattern, mdscan_output)
            df_motif = get_the_motifs_BP(motif_info, name)
            motif_dfs.append(df_motif)
            
            site_info = re.findall(site_pattern, mdscan_output)
            df_site = get_site_info(site_info, motif_info, name)
            site_dfs.append(df_site)

    motif_df = pd.concat(motif_dfs, ignore_index=True)
    site_df = pd.concat(site_dfs, ignore_index=True)
    
    site_df['Site_number'] = pd.to_numeric(site_df['Site_number'])
    site_df['Motif_ID'] = pd.to_numeric(site_df['Motif_ID'])
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])
    motif_df['Motif_ID'] = pd.to_numeric(motif_df['Motif_ID'])
    motif_df['Sites'] = pd.to_numeric(motif_df['Sites'])
    motif_df['Score'] = pd.to_numeric(motif_df['Score'])
    motif_df['Width'] = pd.to_numeric(motif_df['Width'])
    
    return motif_df, site_df


In [13]:
motif_df_BP, site_df_BP = process_Bioprospector_output(Bioproepector_path)

In [153]:
site_df_BP.head()

Unnamed: 0,Site_ID,Site_number,Starting_Point,Motif_Sequence,Motif_ID,File_Name
0,1031186-1031204-forward,1,193,TGTGCAAAAGTTTCA,1,NarP
1,2301629-2301647-reverse,1,232,TGTGTCAAAGATGCA,1,NarP
2,2301629-2301647-reverse,2,134,TTTGAAATGTGAGCA,1,NarP
3,1031186-1031204-forward,1,193,TGTGCAAAAGTTTCA,2,NarP
4,2301629-2301647-reverse,1,232,TGTGTCAAAGATGCA,2,NarP


## **For MotifSampler**

## **For MEME**

In [14]:
with open('Results/MEME/Ada_meme0.txt', 'r') as file:
    meme_output = file.read()


meme_motif_pattern = r'MOTIF\s+(?P<motif>\w+)\s+MEME-(?P<index>\d)\s+width\s+=\s+(?P<width>\d+)\s+sites\s+=\s+(?P<sites>\d+).+E-value\s*=\s*(?P<evalue>\d+(?:\.\d+)?(?:e[+-]?\d+)?)'
meme_motif_info = re.finditer(meme_motif_pattern, meme_output)

sites_pattern = r'MEME-(\d+) sites sorted by position p-value\n(?:.*\n){3}((?:(?!-+).*\n)*)'
meme_sites = re.findall(sites_pattern, meme_output)

In [15]:
meme_dict = {'File_name': [],
             'Motif_ID': [],
             'Width': [],
             'Score': [],
             'Sites': [],
             'con': []
             }
for i, match in enumerate(meme_motif_info):
    # print(match)
    meme_dict['File_name'].append('Ada')
    meme_dict['Motif_ID'].append(int(match.group('index')))
    meme_dict['Width'].append(int(match.group('width')))
    meme_dict['Score'].append(float(match.group('evalue')))
    meme_dict['Sites'].append(int(match.group('sites')))
    meme_dict['con'].append(match.group('motif'))
    # temp_dict = {'motif': match.group('motif'),
    #                              'index': int(match.group('index')),
    #                              'width': int(match.group('width')),
    #                              'sites': int(match.group('sites')),
    #                              'evalue': float(match.group('evalue'))
    #                              }
    #
    # motif = match.group('motif')
    # print(meme_dict)
    # meme_dict['Ada_meme0']['motif'+str(i)] = temp_dict
# print(meme_dict)

{'File_name': ['Ada', 'Ada', 'Ada', 'Ada', 'Ada'], 'Motif_ID': [1, 2, 3, 4, 5], 'Width': [8, 8, 10, 8, 9], 'Score': [1.3, 920.0, 10000.0, 18000.0, 13000.0], 'Sites': [15, 11, 2, 2, 3], 'con': ['TCTSGCSG', 'TGAAARMG', 'AGMTTTAAAA', 'TACGGTTA', 'GTGAHGGTG']}


In [16]:
meme_motif_df = pd.DataFrame(meme_dict)
meme_motif_df

Unnamed: 0,File_name,Motif_ID,Width,Score,Sites,con
0,Ada,1,8,1.3,15,TCTSGCSG
1,Ada,2,8,920.0,11,TGAAARMG
2,Ada,3,10,10000.0,2,AGMTTTAAAA
3,Ada,4,8,18000.0,2,TACGGTTA
4,Ada,5,9,13000.0,3,GTGAHGGTG


In [17]:
meme_site_pattern = r'(?P<site_id>\d+-\d+-(?:forward|reverse))\s*(?P<start_number>\d+)\s+\S+\s+(?P<motif_sequence>\S+)\s+\S+\s+\S+'
# meme_site_pattern = r'(?P<site_id>\S+)\s+(?P<start_number>\d+)\s+\S+\s+(?P<motif_sequence>\S+)\s+\S+\s+\S+'
meme_site_dict = {'Site_ID': [],
                  'Site_number': [],
                  'Starting_Point': [],
                  'Motif_Sequence': [],
                  'Motif_ID': [],
                  'File_Name': []
                  }
for i, site in enumerate(meme_sites):
    print(site[0])
    print(site[1])
    # sites_per_motif = re.findall(meme_site_pattern, site[1])
    sites_per_motif = re.finditer(meme_site_pattern, site[1])
    print(sites_per_motif)
    for j, match in enumerate(sites_per_motif):
        meme_site_dict['Site_ID'].append(match.group('site_id'))
        meme_site_dict['Site_number'].append(j)
        meme_site_dict['Starting_Point'].append(int(match.group('start_number')))
        meme_site_dict['Motif_Sequence'].append(match.group('motif_sequence'))
        meme_site_dict['Motif_ID'].append(i)
        meme_site_dict['File_Name'].append('Ada')


1
209398-209425-forward       302  1.61e-05 TGCTGGCGGA TCTGGCCG ATCTCGACGT
2145603-2145630-reverse     322  3.22e-05 TGTTGGGATT TCTCGCCG CCCGTGCGGT
209398-209425-forward       106  3.22e-05 TATTCCGTTA TCTCGCCG GAAGGTTGTG
2145603-2145630-reverse     382  4.82e-05 ATGCCCGTAG TCTGGCGG TGGGCGAATA
209398-209425-forward        36  4.82e-05 TGAAGGTGGT TCTGGCGG TGCGCTGGCG
2308475-2308502-reverse     398  1.29e-04 TTGCCGTCCG TCTTGCCG CGCCAGACAT
2308475-2308502-reverse     361  1.44e-04 GCGAATTCGT TTTCGCCG TGCGTACCAC
2308475-2308502-reverse     111  1.61e-04 GCCGGGAAGG GCTGGCGG TTTATATGAT
209398-209425-forward       293  1.61e-04 AAGCGCAACT GCTGGCGG ATCTGGCCGA
209398-209425-forward       158  1.61e-04 ACAAAGCGCC GCTGGCGG CTGAAGCGAT
2308475-2308502-reverse     386  3.02e-04 CACAGGCATC TTTTGCCG TCCGTCTTGC
2308475-2308502-reverse      97  3.02e-04 TAAAGAGGTT GTTCGCCG GGAAGGGCTG
2308475-2308502-reverse      71  3.82e-04 TTGATGGTAC TCGGGCCG GAGAAAGCTA
209398-209425-forward       311  4.13e-04 ATCTGGC

In [21]:
site_df_meme = pd.DataFrame(meme_site_dict)
site_df_meme.head()

Unnamed: 0,Site_ID,Site_number,Starting_Point,Motif_Sequence,Motif_ID,File_Name
0,209398-209425-forward,0,302,TGCTGGCGGA,0,Ada
1,2145603-2145630-reverse,1,322,TGTTGGGATT,0,Ada
2,209398-209425-forward,2,106,TATTCCGTTA,0,Ada
3,2145603-2145630-reverse,3,382,ATGCCCGTAG,0,Ada
4,209398-209425-forward,4,36,TGAAGGTGGT,0,Ada


## **For STREME**

In [182]:
file_path = 'Results/Streme/Ada_streme0.txt'
# with open(path, 'r') as file:
#     output_streme = file.read()

# # Extract motifs using regex
# motif_pattern = r"MOTIF (\d+)-([A-Z]+) STREME-\d+"
# motifs = re.findall(motif_pattern, output_streme)
def get_best_score(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('letter-probability matrix'):
                parts = line.split()
                for i, part in enumerate(parts):
                    if 'S=' in part:
                        try:
                            e_value = float(parts[i+1])
                            return e_value
                        except ValueError:
                            print(f"Cannot convert '{parts[i+1]}' to float")
                            return None
    return None


In [183]:
# Extract the file name from the file path
file_name = file_path.split('/')[-1]

# Get the best score from the file
best_score = get_best_score(file_path)

# Read the text file
with open(file_path, 'r') as file:
    output_streme = file.read()

# Extract motifs using regex
motif_pattern = r"MOTIF (\d+)-([A-Z]+) STREME-\d+"
motifs = re.findall(motif_pattern, output_streme)

temp_motifs = pd.DataFrame(motifs, columns=['Motif_ID', 'Con'])
temp_motifs['Motif_ID'] = temp_motifs['Motif_ID'].astype(int)

# Add the file name and best score as additional columns
temp_motifs['File_Name'] = file_name
temp_motifs['Best_Score'] = best_score

temp_motifs


Unnamed: 0,Motif_ID,Con,File_Name,Best_Score
0,1,ACCGTCACGGATACC,Ada_streme0.txt,0.05
1,2,CAGCMTAAASGYTAT,Ada_streme0.txt,0.05
2,3,WGAAAKCSCAACWKC,Ada_streme0.txt,0.05
3,4,ATGCTTTGCGCGCCA,Ada_streme0.txt,0.05
4,5,CCATCGCTTCAGCCG,Ada_streme0.txt,0.05


In [85]:
# def parse_site_info(site_info):
#     # Define the column names
#     columns = ['A', 'C', 'G', 'T']
#     # Create an empty DataFrame
#     dfs = []
#     # Iterate over the matches and populate the DataFrame
#     for i, match in enumerate(site_info):
#         # Remove leading and trailing whitespace
#         match = match.strip()
#         # Split the match into rows
#         rows = match.split('\n')
#         # Create a nested list to hold the motif values
#         motif_values = []
#         # Iterate over the rows and split into individual values
#         for row in rows:
#             values = row.split()
#             motif_values.append(values)
#         # Create a DataFrame from the motif values
#         motif_df = pd.DataFrame(motif_values, columns=columns)
#         # Append the motif DataFrame to the list
#         dfs.append(motif_df)
#     # Concatenate all the DataFrames in the list
#     df = pd.concat(dfs, ignore_index=True)
#     # Set the index of the DataFrame based on groupings of 15 rows
#     df['Motif'] = df.index // 15 + 1
#     # Rearrange the columns to have the 'Motif' column as the first column
#     df = df[['Motif'] + columns]
#     # Convert the values to numeric data type
#     df = df.astype({col: float for col in columns})
#     # Reset the index and add 1
#     df.index = (df.index % 15) + 1

#     return df


In [86]:
# probs_df = parse_site_info(site_info)
# probs_df.head()

In [146]:
# Dictionary to map IUPAC nucleotide codes to the possible bases they represent
IUPAC_dict = {
    "A": ["A"],
    "C": ["C"],
    "G": ["G"],
    "T": ["T"],
    "R": ["A", "G"],
    "Y": ["C", "T"],
    "S": ["G", "C"],
    "W": ["A", "T"],
    "K": ["G", "T"],
    "M": ["A", "C"],
    "B": ["C", "G", "T"],
    "D": ["A", "G", "T"],
    "H": ["A", "C", "T"],
    "V": ["A", "C", "G"],
    "N": ["A", "C", "G", "T"],
}

def generate_sequences(motif):
    # Generate a list of lists where each inner list contains the possible bases for each position in the motif
    bases = [IUPAC_dict[char] for char in motif]
    # Generate all combinations of the possible bases
    combinations = list(itertools.product(*bases))
    # Join each combination into a string and return a list of all possible sequences
    sequences = [''.join(combination) for combination in combinations]
    return sequences



In [150]:
# Let's assume that streme_motifs is a DataFrame with 'con' and 'Motif_ID' columns
results = []

for _, row in streme_motifs.iterrows():
    motif_id = row['Motif_ID']
    sequences = generate_sequences(row['Con'])
    for sequence in sequences:
        results.append({'Motif_ID': motif_id, 'Sequence': sequence})

streme_motifs = pd.DataFrame(results)
streme_motifs.head()

Unnamed: 0,Motif_ID,Sequence
0,1,ACCGTCACGGATACC
1,2,CAGCATAAAGGCTAT
2,2,CAGCATAAAGGTTAT
3,2,CAGCATAAACGCTAT
4,2,CAGCATAAACGTTAT
