# Ensemble Motif Discovery (EMD) Algorithm

## Libraries

In [6]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from parse_mdscam_output import process_MDScan_output
from parse_meme_output import parse_meme_files
from parse_motifSampler_output import parse_motifSampler_files
from parse_bioprospector_output import process_Bioprospector_output


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## The input Datasets

A collection of motif-group files comprising multiple sequence sets from the *E. coli* genome.

## Component Algorithms

1. MDScan
2. BioProspector
3. MEME
4. Motif Sampler

## A. Collecting Phase
1. Run available component algorithms multiple times on an input dataset
2. Parse the text result files using regex patterns to create useful motif data structures
3. Store results from each algorithm to separate CSVs

### Parse MDScan motif discovery output files

In [3]:
# Directory that holds the .txt output of every MDScan run.
mdscan_path = 'Results/MDScan/'

# Parse the MDScan results found under that directory.
final_df_MD = process_MDScan_output(mdscan_path)

# Preview the first five parsed rows.
final_df_MD.head(n=5)

Unnamed: 0,File_name,Sequence_ID,Site,Score,Starting_position,Width
0,Ada,209398-209425-forward,AAGCGCCGCTGGCGG,1.795,151,15
1,Ada,209398-209425-forward,CGCCATCGCTTCCGG,1.795,258,15
2,Ada,209398-209425-forward,CTGAAGCGATGGGTA,1.795,166,15
3,Ada,209398-209425-forward,CGGAACCACTGGGTG,1.795,229,15
4,Ada,209398-209425-forward,CGGAAGCGATGGCGG,1.795,259,15


In [4]:
# Order the sites by file, sequence and score (a single `ascending=False`
# applies to all three keys) and renumber the rows from 0.
# Chaining replaces `inplace=True`, so no frame is mutated behind the reader.
final_df_MD = (
    final_df_MD
    .sort_values(by=['File_name', 'Sequence_ID', 'Score'], ascending=False)
    .reset_index(drop=True)
)

# `to_csv` does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs('CSVs', exist_ok=True)
final_df_MD.to_csv('CSVs/mdscan_sites.csv', encoding='utf-8')

In [5]:
# Sanity check: (rows, columns) of the sorted MDScan site table.
print(final_df_MD.shape)

(119190, 6)


### Parse BioProspector motif discovery output files

In [7]:
# Directory with the BioProspector result files to parse.
bioprospector_path = 'Results/Bioprospector/'

# Parse the BioProspector results found under that directory.
final_df_BP = process_Bioprospector_output(bioprospector_path)

# Preview the first five parsed rows.
final_df_BP.head(n=5)

Unnamed: 0,File_name,Sequence_ID,Site,Score,Starting_position,Width
0,Ada,209398-209425-forward,ACGGTGAGCACCACC,1.844,254,15
1,Ada,209398-209425-forward,GCCGCCATCGCTTCC,1.844,274,15
2,Ada,209398-209425-forward,ACCGCCAGAACCACC,1.844,44,15
3,Ada,209398-209425-forward,ACCGTACAAACTACC,1.844,19,15
4,Ada,2145603-2145630-reverse,ACCGTAATCAAAACC,1.844,112,15


In [8]:
# Order the sites by file, sequence and score (a single `ascending=False`
# applies to all three keys) and renumber the rows from 0.
# Chaining replaces `inplace=True`, so no frame is mutated behind the reader.
final_df_BP = (
    final_df_BP
    .sort_values(by=['File_name', 'Sequence_ID', 'Score'], ascending=False)
    .reset_index(drop=True)
)

# `to_csv` does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs('CSVs', exist_ok=True)
final_df_BP.to_csv('CSVs/bioprospector_sites.csv', encoding='utf-8')

In [9]:
# Sanity check: (rows, columns) of the sorted BioProspector site table.
print(final_df_BP.shape)

(56057, 6)


### Parse MotifSampler motif discovery output files


In [11]:
# MotifSampler results directory, resolved against the current working directory.
ms_dir = os.path.join(os.getcwd(), 'Results/MotifSampler')

# Parse the MotifSampler result files found there.
final_df_MS = parse_motifSampler_files(ms_dir)

# Preview the first five parsed rows.
final_df_MS.head(n=5)

Unnamed: 0,Sequence_ID,Site,Starting_position,Score,Width,File_name
0,2716618-2716662-forward,TACATCACTTTGACC,85.0,1.21,15.0,OxyR
1,4131318-4131362-forward,TACATCTCTTTAACC,178.0,1.21,15.0,OxyR
2,4131318-4131362-forward,AACTTCTCTCTAACG,242.0,1.21,15.0,OxyR
3,2716618-2716662-forward,CTGGAACAATGTCCC,249.0,1.21,15.0,OxyR
4,710081-710125-reverse,CCTGTACAATGTCCC,259.0,1.21,15.0,OxyR


In [12]:
# Order the sites by file, sequence and score (a single `ascending=False`
# applies to all three keys) and renumber the rows from 0.
# Chaining replaces `inplace=True`, so no frame is mutated behind the reader.
final_df_MS = (
    final_df_MS
    .sort_values(by=['File_name', 'Sequence_ID', 'Score'], ascending=False)
    .reset_index(drop=True)
)

# Store the CSV under CSVs/ like the MDScan, BioProspector and MEME outputs
# (it was previously written to the working directory).
# NOTE(review): any later cell that reads 'motifSampler_sites.csv' from the
# working directory must be pointed at this path instead.
# `to_csv` does not create missing directories, so create the target first.
os.makedirs('CSVs', exist_ok=True)
final_df_MS.to_csv('CSVs/motifSampler_sites.csv', encoding='utf-8')

In [13]:
# Sanity check: (rows, columns) of the sorted MotifSampler site table.
print(final_df_MS.shape)

(24015, 6)


### Parse MEME motif discovery output files

In [14]:
# MEME results directory, resolved against the current working directory.
meme_dir = os.path.join(os.getcwd(), 'Results/MEME')

# Parse the MEME result files found there.
final_df_ME = parse_meme_files(meme_dir)

# Preview the first five parsed rows.
final_df_ME.head(n=5)

Unnamed: 0,Sequence_ID,Site,Starting_position,Score,Width,File_name
0,4464895-4464912-reverse,AAGCGCCGCA,140,600.0,8,PhoP
1,4464895-4464912-reverse,GTTAGGCTCA,261,600.0,8,PhoP
2,4464895-4464912-reverse,AGGAGAATCC,157,600.0,8,PhoP
3,1189730-1189747-reverse,ACACTATTTT,252,600.0,8,PhoP
4,1906840-1906857-reverse,ATATCCGCTG,51,600.0,8,PhoP


In [15]:
# Order the sites by file, sequence and score (a single `ascending=False`
# applies to all three keys) and renumber the rows from 0.
# Chaining replaces `inplace=True`, so no frame is mutated behind the reader.
final_df_ME = (
    final_df_ME
    .sort_values(by=['File_name', 'Sequence_ID', 'Score'], ascending=False)
    .reset_index(drop=True)
)

# `to_csv` does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs('CSVs', exist_ok=True)
final_df_ME.to_csv('CSVs/meme_sites.csv', encoding='utf-8')

In [16]:
# Sanity check: (rows, columns) of the sorted MEME site table.
print(final_df_ME.shape)

(28690, 6)


## B. Grouping Phase

