In [1]:
import os
import re
from collections import Counter
import pandas as pd
import itertools


# **All Folders Directories**

In [39]:
# Root directory that holds one sub-folder of results per motif-finding tool.
direcrory_of_folders = 'Results/'
# Keep only the entries of the root directory that are themselves directories.
folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(direcrory_of_folders, entry)),
        os.listdir(direcrory_of_folders),
    )
)
folders

['Bioprospector', 'MDScan', 'MotifSampler', 'Streme', 'MEME']

In [40]:
def return_the_files(path):
    """Return the name prefixes shared by more than one ``.txt`` file in ``path``.

    The prefix of a file is everything before its first underscore. Files that
    do not end in ``.txt`` or that contain no underscore are ignored.

    Parameters
    ----------
    path : str
        Directory to scan (non-recursively).

    Returns
    -------
    list of str
        Prefixes that occur at least twice, in the order they were first seen
        in ``os.listdir(path)``.
    """
    prefix_counts = Counter(
        entry.split("_")[0]
        for entry in os.listdir(path)
        if entry.endswith(".txt") and "_" in entry
    )
    # A prefix seen only once is dropped.
    return [prefix for prefix, seen in prefix_counts.items() if seen > 1]

## **For MDScan**

In [41]:
# Directory that holds the MDScan result files (e.g. 'Results/MDScan/Ada_1.txt').
directory_of_files = 'Results/MDScan/'

# Regex for one MDScan motif header line. Capture groups, in order:
# motif number, Wid, Score, Sites, Con and RCon
# (Con/RCon are presumably the consensus and its reverse complement - TODO confirm).
# Applied with re.findall to the full text of a result file.
motif_pattern = r"Motif\s+(\d+):\s+Wid\s+(\d+);\s+Score\s+([\d.]+);\s+Sites\s+(\d+);\s+Con\s+([ACGT]+);\s+RCon\s+([ACGT]+)"

# Regex for one MDScan site record (a '>' header line followed by the site
# sequence on the next line). Capture groups, in order:
# sequence ID ('<digits>-<digits>-forward|reverse'), site number,
# an 'f'/'r' flag, an integer that downstream code stores as the starting
# point, and the ACGT site sequence.
site_pattern = r">(\d+-\d+-(?:forward|reverse))\s+Len\s+\d+\s+Site\s+#(\d+)\s+([fr])\s+(\d+)\n([ACGT]+)"
# NOTE: `motif_pattern` and `site_pattern` are rebound to Bioprospector regexes
# in the Bioprospector section further down.

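The `return_the_files` function returns the distinct name prefixes (the part before the first underscore) of the `.txt` files in a folder. Only prefixes shared by more than one file are kept, and files without an underscore in their name, such as `background.txt`, are ignored.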

The `get_the_motifs` and `get_site_info` functions create DataFrames holding the key information extracted from the `motif_info` and `site_info` matches.

In [42]:
# def get_the_motifs_MD(motif_info, file_name):
#     motifs = {}

#     for motif_match in motif_info:
#         motif = {
#             'File_name': file_name,
#             'Motif_ID': motif_match[0],
#             'Width': motif_match[1],
#             'Score': motif_match[2],
#             'Sites': motif_match[3],
#             'con': motif_match[4],
#             'rcon': motif_match[5]
#         }
#         motif_id = motif_match[0]

#         if motif_id in motifs:
#             motifs[motif_id].update(motif)  # Merge with existing motif dictionary
#         else:
#             motifs[motif_id] = motif

#     # Convert the motifs dictionary into a DataFrame
#     df = pd.DataFrame.from_dict(motifs, orient='index')
#     return df

In [43]:
def get_site_info_MD(site_info, motif_info, file_name):
    """Build a per-site DataFrame from the regex matches of one MDScan file.

    Parameters
    ----------
    site_info : list of tuple
        5-tuples ``(sequence_id, site_number, flag, starting_point, sequence)``
        as produced by ``re.findall`` with the MDScan site pattern. The third
        field is ignored.
    motif_info : list of tuple
        Motif matches; element 0 is the motif number and element 2 its score.
    file_name : str
        Label stored in the ``File_Name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sequence_ID, Site_number, Starting_Point, Motif_Sequence,
        Score, File_Name``. Values are passed through as given (regex matches
        are strings). ``Score`` is ``None`` when no motif with the inferred
        number exists. An empty ``site_info`` yields an empty frame with the
        same columns instead of raising ``IndexError``.
    """
    columns = ['Sequence_ID', 'Site_number', 'Starting_Point', 'Motif_Sequence', 'Score', 'File_Name']

    # Nothing matched in this file: there is no first site to anchor on.
    if not site_info:
        return pd.DataFrame(columns=columns)

    first_sequence_id = site_info[0][0]
    score_by_motif = {motif[0]: motif[2] for motif in motif_info}  # motif number -> score

    rows = []
    motif_id = 0
    for site_id, site_number, _flag, starting_point, motif_sequence in site_info:
        # A new motif is assumed to start whenever site #1 of the first
        # sequence shows up again; sites carry no explicit motif number.
        if site_number == '1' and site_id == first_sequence_id:
            motif_id += 1

        rows.append([
            site_id,
            site_number,
            starting_point,
            motif_sequence,
            score_by_motif.get(str(motif_id)),
            file_name,
        ])

    return pd.DataFrame(rows, columns=columns)


In [44]:
# MDScan-specific regexes, pinned here so the parser does not depend on the
# module-level `motif_pattern` / `site_pattern` names, which the Bioprospector
# section rebinds to different patterns.
_MDSCAN_MOTIF_PATTERN = r"Motif\s+(\d+):\s+Wid\s+(\d+);\s+Score\s+([\d.]+);\s+Sites\s+(\d+);\s+Con\s+([ACGT]+);\s+RCon\s+([ACGT]+)"
_MDSCAN_SITE_PATTERN = r">(\d+-\d+-(?:forward|reverse))\s+Len\s+\d+\s+Site\s+#(\d+)\s+([fr])\s+(\d+)\n([ACGT]+)"


def process_MDScan_output(directory_of_files):
    """Parse every MDScan result file in a directory into one table of sites.

    For each name prefix returned by ``return_the_files``, every ``.txt`` file
    named ``<prefix>_*`` is read, its motif and site records are extracted
    with the MDScan regexes, and the sites are collected via
    ``get_site_info_MD``. Files are visited in sorted order so the row order
    is reproducible. Files in which no site record matches are skipped.

    Parameters
    ----------
    directory_of_files : str
        Directory containing the MDScan ``.txt`` outputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``File_name, Sequence_ID, Site, Score, Starting_position``.
        ``Starting_position`` is numeric; ``Score`` is left as extracted.
        An empty frame with these columns is returned when nothing is found.
    """
    result_columns = ['File_name', 'Sequence_ID', 'Site', 'Score', 'Starting_position']
    # List the directory once instead of once per prefix.
    directory_entries = sorted(os.listdir(directory_of_files))
    site_dfs = []

    for name in sorted(return_the_files(directory_of_files)):
        prefix = f"{name}_"
        for file_path in directory_entries:
            if not (file_path.startswith(prefix) and file_path.endswith(".txt")):
                continue

            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                mdscan_output = file.read()

            motif_info = re.findall(_MDSCAN_MOTIF_PATTERN, mdscan_output)
            site_info = re.findall(_MDSCAN_SITE_PATTERN, mdscan_output)
            if not site_info:
                # No sites in this file; nothing to add.
                continue
            site_dfs.append(get_site_info_MD(site_info, motif_info, name))

    # pd.concat raises ValueError on an empty list.
    if not site_dfs:
        return pd.DataFrame(columns=result_columns)

    site_df = pd.concat(site_dfs, ignore_index=True)
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])

    # Final result - selected columns, renamed to the shared output schema.
    result_df = site_df[['File_Name', 'Sequence_ID', 'Motif_Sequence', 'Score', 'Starting_Point']].rename(
        columns={
            'File_Name': 'File_name',
            'Motif_Sequence': 'Site',
            'Starting_Point': 'Starting_position',
        }
    )

    return result_df

# Usage example: parse all MDScan result files under `directory_of_files`
# into a single DataFrame of sites.
final_df_MD = process_MDScan_output(directory_of_files)



In [45]:
# Preview the first rows of the parsed MDScan sites.
final_df_MD.head()

Unnamed: 0,File_name,Sequence_ID,Site,Score,Starting_position
0,Ada,209398-209425-forward,AAGCGCCGCTGGCGG,1.795,151
1,Ada,209398-209425-forward,CGCCATCGCTTCCGG,1.795,258
2,Ada,209398-209425-forward,CTGAAGCGATGGGTA,1.795,166
3,Ada,209398-209425-forward,CGGAACCACTGGGTG,1.795,229
4,Ada,209398-209425-forward,CGGAAGCGATGGCGG,1.795,259


## **For Bioprospector**

In [46]:
# Directory that holds the Bioprospector result files.
Bioproepector_path = 'Results/Bioprospector/'  # Replace with the actual directory path
# NOTE: the two assignments below rebind `motif_pattern` and `site_pattern`,
# the same names used for the MDScan regexes earlier in the notebook.
# Motif header regex. Capture groups, in order: motif number, a
# '<word>/<word>' pair in parentheses, the first number inside 'Width (...)',
# the MotifScore (decimal) and the Sites count.
motif_pattern = r'Motif\s+#(\d+):\s+\((\w+/\w+)\)\n\*+\nWidth \((\d+), \d+\);\s+Gap \[\d+, \d+\];\s+MotifScore (\d+\.\d+);\s+Sites (\d+)'
# Site record regex ('>' header line plus the next line). Capture groups, in
# order: sequence ID ('<digits>-<digits>-<word>'), site number, a word
# (presumably the strand flag - TODO confirm), an integer that downstream code
# stores as the starting point, and the site sequence.
site_pattern = r'>(\d+-\d+-\w+)\s+len\s\d+\s+site\s+#(\d+)\s+(\w+)\s+(\d+)\n(\w+)'

In [47]:
# def get_the_motifs_BP(motif_info, file_name):
#     motifs = {}

#     for motif_match in motif_info:
#         motif_id = motif_match[0]
#         con, rcon = motif_match[1].split('/')
#         width = motif_match[2]
#         score = motif_match[3]
#         sites = motif_match[4]

#         motif = {
#             'File_name': file_name,
#             'Motif_ID': motif_id,
#             'Width': width,
#             'Score': score,
#             'Sites': sites,
#             'con': con,
#             'rcon': rcon
#         }

#         if motif_id in motifs:
#             motifs[motif_id].update(motif)  # Merge with existing motif dictionary
#         else:
#             motifs[motif_id] = motif

#     # Convert the motifs dictionary into a DataFrame
#     df = pd.DataFrame.from_dict(motifs, orient='index')
#     return df

In [48]:
def get_site_info_BP(site_info, motif_info, file_name):
    """Build a per-site DataFrame from the regex matches of one Bioprospector file.

    Parameters
    ----------
    site_info : list of tuple
        5-tuples ``(sequence_id, site_number, flag, starting_point, sequence)``
        as produced by ``re.findall`` with the Bioprospector site pattern. The
        third field is ignored.
    motif_info : list of tuple
        Motif matches; element 0 is the motif number and element 3 its score.
    file_name : str
        Label stored in the ``File_Name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sequence_ID, Site_number, Starting_Point, Motif_Sequence,
        Score, File_Name``. Values are passed through as given (regex matches
        are strings). ``Score`` is ``None`` when no motif with the inferred
        number exists. An empty ``site_info`` yields an empty frame with the
        same columns instead of raising ``IndexError``.
    """
    columns = ['Sequence_ID', 'Site_number', 'Starting_Point', 'Motif_Sequence', 'Score', 'File_Name']

    # Nothing matched in this file: there is no first site to anchor on.
    if not site_info:
        return pd.DataFrame(columns=columns)

    first_sequence_id = site_info[0][0]
    score_by_motif = {motif[0]: motif[3] for motif in motif_info}  # motif number -> score

    rows = []
    motif_id = 0
    for site_id, site_number, _flag, starting_point, motif_sequence in site_info:
        # A new motif is assumed to start whenever site #1 of the first
        # sequence shows up again; sites carry no explicit motif number.
        if site_number == '1' and site_id == first_sequence_id:
            motif_id += 1

        rows.append([
            site_id,
            site_number,
            starting_point,
            motif_sequence,
            score_by_motif.get(str(motif_id)),
            file_name,
        ])

    return pd.DataFrame(rows, columns=columns)

In [49]:
# Bioprospector-specific regexes, pinned here so the parser does not depend on
# whichever tool last assigned the module-level `motif_pattern` /
# `site_pattern` names.
_BIOPROSPECTOR_MOTIF_PATTERN = r'Motif\s+#(\d+):\s+\((\w+/\w+)\)\n\*+\nWidth \((\d+), \d+\);\s+Gap \[\d+, \d+\];\s+MotifScore (\d+\.\d+);\s+Sites (\d+)'
_BIOPROSPECTOR_SITE_PATTERN = r'>(\d+-\d+-\w+)\s+len\s\d+\s+site\s+#(\d+)\s+(\w+)\s+(\d+)\n(\w+)'


def process_Bioprospector_output(directory_of_files):
    """Parse every Bioprospector result file in a directory into one table of sites.

    For each name prefix returned by ``return_the_files``, every ``.txt`` file
    named ``<prefix>_*`` is read, its motif and site records are extracted
    with the Bioprospector regexes, and the sites are collected via
    ``get_site_info_BP``. Files are visited in sorted order so the row order
    is reproducible. Files in which no site record matches are skipped.

    Parameters
    ----------
    directory_of_files : str
        Directory containing the Bioprospector ``.txt`` outputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``File_name, Sequence_ID, Site, Score, Starting_position,
        Width``. ``Starting_position`` is numeric, ``Width`` is the length of
        the site sequence, and ``Score`` is left as extracted. An empty frame
        with these columns is returned when nothing is found.
    """
    result_columns = ['File_name', 'Sequence_ID', 'Site', 'Score', 'Starting_position', 'Width']
    # List the directory once instead of once per prefix.
    directory_entries = sorted(os.listdir(directory_of_files))
    site_dfs = []

    for name in sorted(return_the_files(directory_of_files)):
        prefix = f"{name}_"
        for file_path in directory_entries:
            if not (file_path.startswith(prefix) and file_path.endswith(".txt")):
                continue

            with open(os.path.join(directory_of_files, file_path), 'r') as file:
                Bioprospector_output = file.read()

            motif_info = re.findall(_BIOPROSPECTOR_MOTIF_PATTERN, Bioprospector_output)
            site_info = re.findall(_BIOPROSPECTOR_SITE_PATTERN, Bioprospector_output)
            if not site_info:
                # No sites in this file; nothing to add.
                continue
            site_dfs.append(get_site_info_BP(site_info, motif_info, name))

    # pd.concat raises ValueError on an empty list.
    if not site_dfs:
        return pd.DataFrame(columns=result_columns)

    site_df = pd.concat(site_dfs, ignore_index=True)
    site_df['Starting_Point'] = pd.to_numeric(site_df['Starting_Point'])

    # The motif width is taken as the length of each site sequence.
    site_df['Width'] = site_df['Motif_Sequence'].str.len()

    # Final result - selected columns, renamed to the shared output schema.
    result_df = site_df[['File_Name', 'Sequence_ID', 'Motif_Sequence', 'Score', 'Starting_Point', 'Width']].rename(
        columns={
            'File_Name': 'File_name',
            'Motif_Sequence': 'Site',
            'Starting_Point': 'Starting_position',
        }
    )

    return result_df

# Usage example: parse all Bioprospector result files under
# `Bioproepector_path` into a single DataFrame of sites.
final_df_BP = process_Bioprospector_output(Bioproepector_path)


In [50]:
# Preview the first rows of the parsed Bioprospector sites.
final_df_BP.head()

Unnamed: 0,File_name,Sequence_ID,Site,Score,Starting_position,Width
0,Ada,209398-209425-forward,GCCGCCATCGCTTCC,1.817,274,15
1,Ada,209398-209425-forward,ACCGCCAGAACCACC,1.817,44,15
2,Ada,209398-209425-forward,ACGGTGAGCACCACC,1.817,254,15
3,Ada,209398-209425-forward,ACCGTACAAACTACC,1.817,19,15
4,Ada,2145603-2145630-reverse,ACCGTAATCAAAACC,1.817,112,15


In [51]:
# Display the Bioprospector table itself (the rendered output below shows the
# first and last rows with the middle elided).
final_df_BP

Unnamed: 0,File_name,Sequence_ID,Site,Score,Starting_position,Width
0,Ada,209398-209425-forward,GCCGCCATCGCTTCC,1.817,274,15
1,Ada,209398-209425-forward,ACCGCCAGAACCACC,1.817,44,15
2,Ada,209398-209425-forward,ACGGTGAGCACCACC,1.817,254,15
3,Ada,209398-209425-forward,ACCGTACAAACTACC,1.817,19,15
4,Ada,2145603-2145630-reverse,ACCGTAATCAAAACC,1.817,112,15
...,...,...,...,...,...,...
56052,XylR,3728472-3728487-reverse,GGGTGATGGATGATG,2.109,265,15
56053,XylR,3728472-3728487-reverse,AATGGAATGATGAAA,2.109,156,15
56054,XylR,3728472-3728487-reverse,AATCGAAAGATAAAA,2.109,47,15
56055,XylR,3728472-3728487-reverse,AAGAAATAAACCAAA,2.109,68,15


## **For MotifSampler**

## **For MEME**

In [2]:
# `parse_meme_files` comes from the project-local module `parse_meme_output`,
# which is not part of this notebook.
from parse_meme_output import parse_meme_files

# Called with no arguments; the result is used as a pandas DataFrame in the
# next cell (presumably one row per MEME site - TODO confirm against the module).
meme_df = parse_meme_files()

610


In [5]:
# Sort the MEME table in place by file name, then sequence ID, then score -
# all three keys in descending order (a single ascending=False applies to every key).
meme_df.sort_values(by=['File_name', 'Sequence_ID', 'Score'], inplace=True, ascending=False)
# Replace the shuffled index with a fresh 0..n-1 range, again in place.
meme_df.reset_index(drop=True, inplace=True)
# Persist the sorted table; the index is written as the first (unnamed) CSV column.
meme_df.to_csv('meme_sites.csv', encoding='utf-8')
meme_df

Unnamed: 0,Sequence_ID,Site,Starting_position,Score,Width,File_name
0,3728622-3728637-forward,AGAAAAATGC,154,17000.0,8,XylR
1,3728622-3728637-forward,AGAAAAATGC,154,17000.0,8,XylR
2,3728622-3728637-forward,AGAAAAATGC,154,17000.0,8,XylR
3,3728622-3728637-forward,AGAAAAATGC,154,17000.0,8,XylR
4,3728622-3728637-forward,AGAAAAATGC,154,17000.0,8,XylR
...,...,...,...,...,...,...
28685,209398-209425-forward,TATTCCGTTA,106,1.3,8,Ada
28686,209398-209425-forward,TGAAGGTGGT,36,1.3,8,Ada
28687,209398-209425-forward,AAGCGCAACT,293,1.3,8,Ada
28688,209398-209425-forward,ACAAAGCGCC,158,1.3,8,Ada


## **For STREME**

In [70]:
# STREME_path = 'Results/Streme/Ada_streme0.txt'
# with open(path, 'r') as file:
#     output_streme = file.read()

# # Extract motifs using regex
# motif_pattern = r"MOTIF (\d+)-([A-Z]+) STREME-\d+"
# motifs = re.findall(motif_pattern, output_streme)
def get_best_score_streme(file_path):
    """Return the score from the first scored matrix header in a STREME file.

    The file is scanned line by line. On the first line that starts with
    ``'letter-probability matrix'`` and has a whitespace-separated token
    containing ``'S='``, the token that follows it is parsed as a float and
    returned.

    Parameters
    ----------
    file_path : str
        Path to a STREME text output file.

    Returns
    -------
    float or None
        The parsed value. ``None`` is returned when no such line exists, when
        the value cannot be converted to float, or when nothing follows the
        ``'S='`` token (previously an uncaught ``IndexError``). The last two
        cases also print a diagnostic.
    """
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('letter-probability matrix'):
                continue

            parts = line.split()
            for i, part in enumerate(parts):
                if 'S=' not in part:
                    continue

                # The value is expected in the token after 'S='; a truncated
                # line has no such token.
                if i + 1 >= len(parts):
                    print(f"No value follows '{part}' in line: {line.strip()!r}")
                    return None

                try:
                    return float(parts[i + 1])
                except ValueError:
                    print(f"Cannot convert '{parts[i+1]}' to float")
                    return None
    return None


In [60]:
# # Extract the file name from the file path
# file_name = STREME_path.split('/')[-1]

# # Get the best score from the file
# best_score = get_best_score(STREME_path)

# # Read the text file
# with open(STREME_path, 'r') as file:
#     output_streme = file.read()

# # Extract motifs using regex
# motif_pattern = r"MOTIF (\d+)-([A-Z]+) STREME-\d+"
# motifs = re.findall(motif_pattern, output_streme)

# temp_motifs = pd.DataFrame(motifs, columns=['Motif_ID', 'Con'])
# temp_motifs['Motif_ID'] = temp_motifs['Motif_ID'].astype(int)

# # Add the file name and best score as additional columns
# temp_motifs['File_Name'] = file_name
# temp_motifs['Best_Score'] = best_score

# temp_motifs


In [61]:
# def parse_site_info(site_info):
#     # Define the column names
#     columns = ['A', 'C', 'G', 'T']
#     # Create an empty DataFrame
#     dfs = []
#     # Iterate over the matches and populate the DataFrame
#     for i, match in enumerate(site_info):
#         # Remove leading and trailing whitespace
#         match = match.strip()
#         # Split the match into rows
#         rows = match.split('\n')
#         # Create a nested list to hold the motif values
#         motif_values = []
#         # Iterate over the rows and split into individual values
#         for row in rows:
#             values = row.split()
#             motif_values.append(values)
#         # Create a DataFrame from the motif values
#         motif_df = pd.DataFrame(motif_values, columns=columns)
#         # Append the motif DataFrame to the list
#         dfs.append(motif_df)
#     # Concatenate all the DataFrames in the list
#     df = pd.concat(dfs, ignore_index=True)
#     # Set the index of the DataFrame based on groupings of 15 rows
#     df['Motif'] = df.index // 15 + 1
#     # Rearrange the columns to have the 'Motif' column as the first column
#     df = df[['Motif'] + columns]
#     # Convert the values to numeric data type
#     df = df.astype({col: float for col in columns})
#     # Reset the index and add 1
#     df.index = (df.index % 15) + 1

#     return df


In [62]:
# probs_df = parse_site_info(site_info)
# probs_df.head()

In [71]:
# IUPAC nucleotide code -> list of the concrete bases it stands for.
# Built from compact (code, bases) pairs; the order of the bases within each
# entry determines the order in which expanded sequences are generated.
IUPAC_dict = {
    code: list(bases)
    for code, bases in (
        ("A", "A"),
        ("C", "C"),
        ("G", "G"),
        ("T", "T"),
        ("R", "AG"),
        ("Y", "CT"),
        ("S", "GC"),
        ("W", "AT"),
        ("K", "GT"),
        ("M", "AC"),
        ("B", "CGT"),
        ("D", "AGT"),
        ("H", "ACT"),
        ("V", "ACG"),
        ("N", "ACGT"),
    )
}

def generate_sequences_streme(motif):
    """Expand an IUPAC motif string into every concrete sequence it encodes.

    Each character of ``motif`` is looked up in ``IUPAC_dict`` (a character
    missing from the table raises ``KeyError``) and the Cartesian product of
    the per-position choices is returned as a list of strings, with the last
    position varying fastest.
    """
    choices_per_position = (IUPAC_dict[symbol] for symbol in motif)
    return [''.join(picked) for picked in itertools.product(*choices_per_position)]



In [64]:
# # Let's assume that streme_motifs is a DataFrame with 'con' and 'Motif_ID' columns
# results = []

# for _, row in temp_motifs.iterrows():
#     motif_id = row['Motif_ID']
#     sequences = generate_sequences(row['Con'])
#     for sequence in sequences:
#         results.append({'Motif_ID': motif_id, 'Sequence': sequence})

# streme_motifs = pd.DataFrame(results)
# streme_motifs.head()

In [72]:
# Directory that holds the STREME result files.
Streme_path = 'Results/Streme/'


def generate_motif_sequences(file_dir):
    """Expand every STREME motif found in a directory into concrete sequences.

    For each name prefix returned by ``return_the_files``, every ``.txt`` file
    named ``<prefix>_*`` is read. The score of the file is taken from
    ``get_best_score_streme`` and applied to all motifs of that file. Each
    ``MOTIF <n>-<CONSENSUS> STREME-<k>`` line contributes one row per sequence
    produced by ``generate_sequences_streme`` for its consensus.

    Files are matched on ``<prefix>_`` (including the underscore) so that a
    prefix which is itself the beginning of another prefix does not pull in
    the other prefix's files. Files are visited in sorted order.

    Parameters
    ----------
    file_dir : str
        Directory containing the STREME ``.txt`` outputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``File_Name, Site, Score, Width``; empty (with the same
        columns) when no motif is found.
    """
    columns = ['File_Name', 'Site', 'Score', 'Width']
    # Compile once; the pattern captures the motif number and its consensus.
    motif_regex = re.compile(r"MOTIF (\d+)-([A-Z]+) STREME-\d+")
    # List the directory once instead of once per prefix.
    directory_entries = sorted(os.listdir(file_dir))
    all_results = []

    for file_name in sorted(return_the_files(file_dir)):
        prefix = f"{file_name}_"
        for file in directory_entries:
            if not (file.startswith(prefix) and file.endswith(".txt")):
                continue

            file_path = os.path.join(file_dir, file)
            # One score per file, shared by every motif in it.
            best_score = get_best_score_streme(file_path)
            with open(file_path, 'r') as open_file:
                output_streme = open_file.read()

            for _motif_id, consensus in motif_regex.findall(output_streme):
                for sequence in generate_sequences_streme(consensus):
                    all_results.append({
                        'File_Name': file_name,
                        'Site': sequence,
                        'Score': best_score,
                        'Width': len(sequence),
                    })

    streme_motifs = pd.DataFrame(all_results, columns=columns)
    return streme_motifs


# Expand all STREME motifs under `Streme_path` into concrete sequences and
# preview the first rows.
streme_motifs = generate_motif_sequences(Streme_path)
streme_motifs.head()

Unnamed: 0,File_Name,Site,Score,Width
0,Ada,ACCGTCACGGATACC,0.05,15
1,Ada,CAGCATAAAGGCTAT,0.05,15
2,Ada,CAGCATAAAGGTTAT,0.05,15
3,Ada,CAGCATAAACGCTAT,0.05,15
4,Ada,CAGCATAAACGTTAT,0.05,15


In [69]:
streme_motifs.shape

(53812, 4)