### Perform motif scanning on the generated datasets

DRESS v0.1.0 natively embeds motif disruption counts (motif gains/losses of a synthetic sequence with respect to the original), as presented in Figure 7.
Because all analyses in the paper were performed with DRESS v0.0.1, we converted the datasets into a format compatible with DRESS v0.1.0. The main difference is the inclusion of the original sequence (wt) in the first line of the final dataset.

In [None]:
import glob
import shutil
import pandas as pd
from dress.datasetevaluation.representation.motifs.search import FimoSearch
from dress.datasetgeneration.dataset import Dataset

In [None]:
def load_datasets_generalization(strategy: str, evolutions_path: str) -> None:
    """Run FIMO motif scanning on the datasets found in ``evolutions_path``.

    Files matching ``*_dataset.csv.gz`` are grouped by the part of their
    file name that precedes ``_seed``. For each selected group, the matching
    files are loaded into a single ``Dataset``, scanned with ``FimoSearch``
    and the motif occurrences are tabulated and written under
    ``3_motifs/{strategy}_fimo_oRNAment/{basename}``.

    Args:
        strategy: Label used to build the output directory name.
        evolutions_path: Directory containing the ``*_dataset.csv.gz`` files.

    Returns:
        None. Results are written to disk by the motif search object.
    """
    # Local import: ``os`` is not among the notebook's top-level imports.
    import os

    print("Reading directory: ", evolutions_path)
    files = glob.glob(evolutions_path + "/*_dataset.csv.gz")

    # os.path.basename copes with whichever separator glob returns on the
    # current platform, unlike splitting on a hard-coded '/'.
    uniq_basenames = sorted(
        {os.path.basename(f).split('_seed')[0] for f in files}
    )

    # NOTE(review): only the first basename is processed because of the
    # ``[:1]`` slice. Kept as in the original; confirm whether this is a
    # leftover debugging limit before removing it.
    for bn in uniq_basenames[:1]:
        outdir = f"3_motifs/{strategy}_fimo_oRNAment/{bn}"

        dataset = Dataset(glob.glob(f"{evolutions_path}/{bn}*_dataset.csv.gz"))

        try:
            motif_search = FimoSearch(
                dataset,
                subset_rbps='encode',
                motif_search='fimo',
                motif_db='oRNAment',
                skip_location_mapping=True,
                outdir=outdir,
            )
            # Drop the motifs/fimo directory under the output directory
            # before tabulating (presumably intermediate FIMO output that is
            # no longer needed -- TODO confirm).
            shutil.rmtree(f"{outdir}/motifs/fimo")
            motif_search.tabulate_occurrences(write_output=True)
        except ValueError as err:
            # Best-effort: move on to the next dataset, but say which one was
            # skipped instead of dropping it silently.
            print(f"Skipping {bn}: {err}")
            continue


In [None]:
# Maps each search-strategy label to the directory holding its generated
# datasets. The label is later used to name the motif output directory.
DATASETS_PATH = dict(
    GGGP="RBFOX2_knockdown",
    RS="RBFOX2_knockdown_randomSearch",
)

In [None]:
# Run motif scanning once per search strategy, using the directory registered
# for that strategy in DATASETS_PATH.
# NOTE(review): load_datasets_generalization has no return statement, so `df`
# is always None here; the useful output is what gets written to disk.
for strategy, directory in DATASETS_PATH.items():
    df = load_datasets_generalization(strategy, directory)