In [None]:
import pandas as pd 
import seaborn as sns
sns.set(font_scale=1)
from plotnine import *
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd
from pandarallel import pandarallel

import warnings
# Hide FutureWarning messages for the remainder of the session.
warnings.simplefilter("ignore", FutureWarning)

# Display settings: never truncate columns, and allow 1000-character-wide output.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Hook tqdm into pandas (the notebook flavour of tqdm is the one imported above).
tqdm.pandas()

In [None]:
# Tab-separated table of paired exon datasets. The cells below filter it on
# `exon_group` ("KD" / "CTRL") and group it by `rbp_name`.
PAIRED_DATASETS = pd.read_csv(
    "/home/pbarbosa/git_repos/mutsplice/data/2_paired_datasets/ALL_data.tsv.gz",
    sep="\t",
)

In [None]:
from gtfhandle.utils import file_to_bed_df
from explainer.datasets.preprocessing import Preprocessing
import os

def runMutSplice(
    df,
    rbp_name: str,
    exon_group: str,
    motif_source: str,
    motif_search: str,
    gtf_cache: str = "/home/pbarbosa/data/genomes/hg38/gtf_cache_gencode/",
    fasta: str = "/home/pbarbosa/data/genomes/hg38/GRCh38.primary_assembly.genome.fa",
    out_root: str = "/big_data/pbarbosa/mutsplice/",
):
    """Run the `Preprocessing` pipeline for a single RBP on one set of exons.

    `df` is converted with `file_to_bed_df` (1-based input, header on the first
    row, index in the first column) and handed to `Preprocessing` with GTF
    queries, motif scanning, mutation generation and SpliceAI all switched on.

    Args:
        df: Exon table for one RBP; passed as-is to `file_to_bed_df`.
        rbp_name: RBP name. Used as the output basename, as the single entry of
            `subset_rbps`, and in the output directory name.
        exon_group: Exon-set label (the notebook uses "KD" and "CTRL"); only
            used to build the output directory name.
        motif_source: Forwarded to `Preprocessing` and used in the output path.
        motif_search: Forwarded to `Preprocessing` and used in the output path.
        gtf_cache: Location of the GTF cache forwarded as `gtf_cache`.
        fasta: Genome FASTA forwarded as `fasta`.
        out_root: Root under which results are written, as
            `<out_root>/<motif_source>_<motif_search>/<rbp_name>_<exon_group>`.

    Returns:
        None. The `Preprocessing` object is constructed for its effects and
        discarded.
    """
    # Accept the root with or without a trailing slash; an empty root yields a
    # path relative to the current working directory.
    if out_root and not out_root.endswith("/"):
        out_root = f"{out_root}/"
    out_dir = f"{out_root}{motif_source}_{motif_search}/{rbp_name}_{exon_group}"

    kwargs = {
        "gtf_cache": gtf_cache,
        "fasta": fasta,
        "out_dir": out_dir,
        "outbasename": f"{rbp_name}",
        "subset_rbps": [rbp_name],
        "motif_source": motif_source,
        "motif_search": motif_search,
        "qvalue_threshold": 0.05,
        "spliceai_save_raw": True,
        "spliceai_raw_results": None,
        "spliceai_final_results": None,
        "batch_size": 64,
        "use_full_sequence": False,
    }
    bed_df = file_to_bed_df(df, is_0_based=False, header=0, col_index=0)

    Preprocessing(
        bed_df,
        do_gtf_queries=True,
        do_motif_scanning=True,
        do_mutations=True,
        run_spliceai=True,
        **kwargs,
    )


def process_single_RBP(group, exon_group: str, motif_source: str, motif_search: str):
    """Run `runMutSplice` on one per-RBP group, logging `ValueError` instead of raising.

    Intended as the callable for a `groupby("rbp_name")` apply: the RBP name is
    read from the `rbp_name` field of the group's first row.

    Args:
        group: Rows belonging to a single RBP; must expose `.iloc[0].rbp_name`.
        exon_group: Exon-set label forwarded to `runMutSplice`.
        motif_source: Forwarded to `runMutSplice`.
        motif_search: Forwarded to `runMutSplice`.

    Returns:
        None. A `ValueError` raised while processing is printed and swallowed so
        the remaining groups still run; any other exception propagates.
    """
    # Bind a placeholder first: the handler below formats `rbp_name`, and it
    # would otherwise be unbound if the lookup itself raised ValueError.
    rbp_name = "<unknown>"
    try:
        rbp_name = group.iloc[0].rbp_name
        runMutSplice(
            group,
            rbp_name=rbp_name,
            exon_group=exon_group,
            motif_source=motif_source,
            motif_search=motif_search,
        )

    except ValueError as ve:
        print(f"Caught ValueError for {rbp_name}: {ve}")

#### Knockdown exons

In [None]:
# One pandarallel worker with a progress bar, and only CUDA device "0" exposed.
pandarallel.initialize(nb_workers=1, progress_bar=True)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# RBPs excluded from this run (presumably finished in an earlier session — verify).
ALREADY_DONE = [
    "ADAR", "AKAP8L", "BUD13", "CCAR2", "CDC40", "CELF1",
    "DAZAP1", "DDX20", "DDX5", "EFTUD2", "EWSR1", "FMR1",
    "FUBP1", "FUS", "GEMIN5", "GPKOW", "HNRNPA1", "HNRNPA2B1",
    "HNRNPC", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPK",
]

# Knockdown exons only, minus the RBPs listed above.
KD_data = PAIRED_DATASETS.loc[PAIRED_DATASETS["exon_group"] == "KD"]
KD_data = KD_data.loc[~KD_data["rbp_name"].isin(ALREADY_DONE)]

# One pipeline run per RBP.
KD_data.groupby("rbp_name").parallel_apply(
    process_single_RBP,
    exon_group="KD",
    motif_source="rosina2017",
    motif_search="plain",
)

#### Control exons 

In [None]:
# One pandarallel worker with a progress bar, and only CUDA device "0" exposed.
pandarallel.initialize(nb_workers=1, progress_bar=True)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Control exons for every RBP; one pipeline run per RBP.
Ctrl_data = PAIRED_DATASETS.loc[PAIRED_DATASETS["exon_group"] == "CTRL"]
Ctrl_data.groupby("rbp_name").parallel_apply(
    process_single_RBP, exon_group="CTRL", motif_source="rosina2017", motif_search="plain"
)