### Count motif occurrences across datasets and generate tabular data

Unlike the MutSplice pipeline, which targeted the motifs of the RBP of interest, we now scan all motifs over all paired datasets to generate motif occurrences that can be used for machine learning. In addition, we generate tabular datasets of k-mer-based occurrences, which serve as an unbiased approach to distinguish exon groups based on patterns found in the sequences.

In [1]:
import pandas as pd 
import seaborn as sns
import numpy as np
sns.set(font_scale=1)
from plotnine import *
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd
import os
import warnings
import itertools
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrame display: never truncate columns, allow up to 1000 characters per row.
pd.set_option("display.max_columns", None)
pd.set_option('display.width', 1000)
# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()
from mutsplice.datasets.tabular_dataset import TabularDataset

In [2]:
# Tab-separated table of all paired datasets; later cells filter it by its
# `exon_group` ("KD" / "CTRL") and `rbp_name` columns.
PAIRED_DATASETS = pd.read_csv("/home/pbarbosa/git_repos/mutsplice/data/2_paired_datasets/ALL_data.tsv.gz", sep="\t")

In [3]:
# SpliceAI output table for the ENCODE sequences.
_spliceai_tsv = "/home/pbarbosa/git_repos/mutsplice/notebooks/1_SpliceAI/4_datasets/encode_sequences_fixed_at_5000bp_output.tsv.gz"
SPLICEAI_PREDS = pd.read_csv(_spliceai_tsv, sep="\t")

# `pred` is the row-wise mean of the two reference cassette columns.
_cassette_cols = ['ref_donor_cassette', 'ref_acceptor_cassette']
SPLICEAI_PREDS['pred'] = SPLICEAI_PREDS[_cassette_cols].mean(axis=1)

# All original columns are kept (no subsetting); the transcript ID is taken as
# the second "_"-separated field of `seq_id`.
SPLICEAI_PREDS['transcript_id'] = SPLICEAI_PREDS['seq_id'].str.split("_").str[1]

### Scanning motifs

In [4]:
from gtfhandle.utils import file_to_bed_df
from mutsplice.datasets.mutsplice_pipeline import MutSplicePipeline

def runMutSplice_justMotifScan(df, rbp_name: str, exon_group: str, motif_source: str, motif_search: str):
    """Run ``MutSplicePipeline`` on ``df`` with only GTF queries and motif scanning enabled.

    The mutation and SpliceAI steps are switched off. The output directory
    handed to the pipeline is ``.../notebooks/4_all_motifScan/<rbp_name>/<exon_group>``
    and the output basename is ``rbp_name``.

    Args:
        df: Input table, converted with ``file_to_bed_df`` (called with
            ``is_0_based=False, header=0, col_index=0``).
        rbp_name: RBP name; used for the output directory and basename.
        exon_group: Exon group label; used as the output sub-directory.
        motif_source: Forwarded to the pipeline as ``motif_source``.
        motif_search: Forwarded to the pipeline as ``motif_search``.

    Returns:
        None. The pipeline object is constructed and then discarded.
    """
    scan_root = f"/home/pbarbosa/git_repos/mutsplice/notebooks/4_all_motifScan/{rbp_name}/"

    # Keyword options forwarded verbatim to the pipeline (same order as before).
    pipeline_options = dict(
        gtf_cache="/home/pbarbosa/data/genomes/hg38/gtf_cache_gencode/",
        fasta="/home/pbarbosa/data/genomes/hg38/GRCh38.primary_assembly.genome.fa",
        out_dir=f"{scan_root}{exon_group}",
        outbasename=f"{rbp_name}",
        subset_rbps="encode",
        motif_source=motif_source,
        motif_search=motif_search,
        pvalue_threshold=0.00002,
        min_nuc_probability=0.15,
        use_full_sequence=False,
        spliceai_final_results=None,
    )

    intervals = file_to_bed_df(df, is_0_based=False, header=0, col_index=0)

    MutSplicePipeline(
        intervals,
        do_gtf_queries=True,
        do_motif_scanning=True,
        do_mutations=False,
        run_spliceai=False,
        **pipeline_options,
    )


def process_single_RBP(group, exon_group: str, motif_source: str, motif_search: str):
    """Run the motif-scan-only pipeline for one RBP's rows.

    Intended as the callable for ``DataFrame.groupby("rbp_name").apply``.

    Args:
        group: DataFrame holding the rows of a single RBP; the RBP name is
            read from the ``rbp_name`` value of its first row.
        exon_group: Exon group label forwarded to the pipeline runner.
        motif_source: Motif database name forwarded to the pipeline runner.
        motif_search: Motif search method forwarded to the pipeline runner.

    Returns:
        None. A ``ValueError`` raised by the pipeline is reported on stdout and
        swallowed so that the remaining groups are still processed; any other
        exception propagates.
    """
    # Resolved outside the try block so the handler below can always name the
    # RBP, and so a failure in this lookup is never mistaken for a pipeline error.
    rbp_name = group.iloc[0].rbp_name

    try:
        runMutSplice_justMotifScan(
            group,
            rbp_name=rbp_name,
            exon_group=exon_group,
            motif_source=motif_source,
            motif_search=motif_search,
        )
    except ValueError as ve:
        print(f"Caught ValueError for {rbp_name}: {ve}")

2024-04-20 18:15:13.866790: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-20 18:15:14.513067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-20 18:15:14.513148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


#### Knockdown exons

In [5]:
# Knockdown exons: one motif-scan pipeline run per RBP.
_is_kd = PAIRED_DATASETS.exon_group == "KD"
KD_data = PAIRED_DATASETS[_is_kd]
_kd_scan_args = dict(exon_group="KD", motif_source="ATtRACT", motif_search="fimo")
KD_data.groupby("rbp_name").apply(process_single_RBP, **_kd_scan_args)

#### Control exons 

In [6]:
# Control exons: one motif-scan pipeline run per RBP.
_is_ctrl = PAIRED_DATASETS.exon_group == "CTRL"
Ctrl_data = PAIRED_DATASETS[_is_ctrl]
_ctrl_scan_args = dict(exon_group="CTRL", motif_source="ATtRACT", motif_search="fimo")
Ctrl_data.groupby("rbp_name").apply(process_single_RBP, **_ctrl_scan_args)

### Tabular datasets

#### Based on motif occurrences (total and per location)

In [7]:
# Attach the SpliceAI columns to every paired-dataset row. The join is an inner
# join (the pandas default), so rows without a matching prediction are dropped.
# NOTE: this rebinds PAIRED_DATASETS; re-running the cell merges a second time.
merge_on = ['target_coordinates', 'transcript_id']
PAIRED_DATASETS = PAIRED_DATASETS.merge(
    SPLICEAI_PREDS,
    how="inner",
    on=merge_on,
)

In [None]:
# Build, for every RBP directory, one motif-occurrence table per granularity
# ("motif" and "per_location"), stacking the KD and CTRL exons of that RBP.
kwargs = {'subset_rbps': 'encode_in_attract',
          'motif_source': 'attract'}
rbp_dirs = os.listdir("/home/pbarbosa/git_repos/mutsplice/notebooks/4_all_motifScan/")

# Columns removed from the TabularDataset output: exact names, plus any column
# with one of these suffixes / prefixes.
cols_to_drop = ["pred", "transcript_id", "Strand", "gene_name", "paired_with"]
to_drop_ending_with = ["downstream_2", "upstream_2"]
to_drop_starting_with = ["ref_acceptor", "ref_donor"]
_bad_suffixes = tuple(to_drop_ending_with)
_bad_prefixes = tuple(to_drop_starting_with)

# Columns placed last in the written tables.
cols_to_move = ['exon_group', 'dPSI', 'spliceai_pred']

for rbp in rbp_dirs:
    # NOTE(review): relative path, while rbp_dirs is listed from an absolute one;
    # presumably the notebook is run from the notebooks directory. Confirm.
    _dir = f"4_all_motifScan/{rbp}"

    _is_rbp = PAIRED_DATASETS.rbp_name == rbp
    data_kd = PAIRED_DATASETS[_is_rbp & (PAIRED_DATASETS.exon_group == "KD")]
    data_ctrl = PAIRED_DATASETS[_is_rbp & (PAIRED_DATASETS.exon_group == "CTRL")]

    for granularity, tag in (('motif', 'motifs'), ('per_location', 'motifs_per_loc')):
        out = []

        for g_name, _data in (("KD", data_kd), ("CTRL", data_ctrl)):
            td = TabularDataset(
                _data,
                outdir=f"{_dir}/{g_name}",
                granularity=granularity,
                normalize_by_length=True,
                **kwargs,
            )

            td.data = td.data.drop(columns=cols_to_drop)
            td.data = td.data[
                [
                    col
                    for col in td.data.columns
                    if not (col.endswith(_bad_suffixes) or col.startswith(_bad_prefixes))
                ]
            ]
            out.append(td.data)

        df = pd.concat(out).rename(columns={'average_cassette_strength': 'spliceai_pred'})

        # Reorder so that the cols_to_move columns come last.
        _leading = [col for col in df.columns if col not in cols_to_move]
        df = df[_leading + cols_to_move]
        df.to_csv(f"../data/3_paired_datasets_tabular_to_ML/{rbp}_{tag}.tsv.gz", sep="\t", compression='gzip', index=False)

#### Based on k-mer occurrences 

In [9]:
from gtfhandle.utils import fasta_to_dict

In [10]:
def _trim_seq(row, ss_df):
    ss_idx = ss_df[ss_df.seq_id == row.seq_id].iloc[0]
    start = ss_idx.acceptor_idx.split(";")[0]
    if start == "<NA>":
        start = 0
    else:
        start = int(start)
    end = ss_idx.donor_idx.split(";")[2]
    if end == "<NA>":
        end = len(row.sequence)
    else:
        end = int(end)
    
    row['sequence'] = row['sequence'][start:end]
    return row

In [14]:
def _count_kmers(row, possible_kmers):
    n_kmers = len(possible_kmers)
    counts = np.zeros((1, n_kmers))
    for j, kmer in enumerate(possible_kmers):
        counts[0, j] = f"{row.sequence.count(kmer) / len(row.sequence):.5f}"
    return pd.concat([row, pd.Series(counts[0], index=possible_kmers)])

In [None]:
# Build, for every RBP directory, a 5-mer frequency table and a trimmed-sequence
# table covering the KD and CTRL exons of that RBP.
rbp_dirs = os.listdir("/home/pbarbosa/git_repos/mutsplice/notebooks/4_all_motifScan/")
possible_kmers = ["".join(x) for x in itertools.product("ATCG", repeat=5)]

# Columns placed last in the written tables.
cols_to_move = ['exon_group', 'dPSI', 'spliceai_pred']

for rbp in tqdm(rbp_dirs):
    # NOTE(review): relative path, while rbp_dirs is listed from an absolute one;
    # presumably the notebook is run from the notebooks directory. Confirm.
    _dir = f"4_all_motifScan/{rbp}"

    # Fresh accumulators for each RBP, so that `{rbp}_kmers` / `{rbp}_full_seqs`
    # hold only this RBP's rows and not those of RBPs processed earlier.
    out_kmers, out_lsgkm = [], []

    fasta_ctrl = f"{_dir}/CTRL/1_seq_extraction/{rbp}_sequences_fixed_at_5000bp.fa"
    ctrl_df_ss = pd.read_csv(f"{_dir}/CTRL/1_seq_extraction/{rbp}_sequences_ss_idx_fixed_at_5000bp.tsv", sep="\t")
    ctrl_df_ss['seq_id'] = ctrl_df_ss.header + "_" + ctrl_df_ss.tx_id

    fasta_kd = f"{_dir}/KD/1_seq_extraction/{rbp}_sequences_fixed_at_5000bp.fa"
    kd_df_ss = pd.read_csv(f"{_dir}/KD/1_seq_extraction/{rbp}_sequences_ss_idx_fixed_at_5000bp.tsv", sep="\t")
    kd_df_ss['seq_id'] = kd_df_ss.header + "_" + kd_df_ss.tx_id

    # KD first, then CTRL: each FASTA is paired with its splice-site index table.
    for data, ss in ((fasta_kd, kd_df_ss), (fasta_ctrl, ctrl_df_ss)):
        df = pd.DataFrame.from_dict(fasta_to_dict(data), orient='index').reset_index().rename(columns={'index': 'seq_id', 0: 'sequence'})
        df = df.merge(PAIRED_DATASETS[['seq_id', 'target_coordinates', 'rbp_name', 'exon_group', 'dPSI', 'pred']], on=['seq_id']).rename(columns={'pred': 'spliceai_pred'})
        df = df[df.rbp_name == rbp]

        # Trim each sequence to its splice-site window, then add one frequency
        # column per possible 5-mer.
        df = df.apply(_trim_seq, ss_df=ss, axis=1)
        df = df.apply(_count_kmers, possible_kmers=possible_kmers, axis=1, result_type='expand')

        # Reorder so that the cols_to_move columns come last.
        cols = [col for col in df.columns if col not in cols_to_move] + cols_to_move
        df = df[cols]

        out_kmers.append(df.drop(columns=['sequence']))
        out_lsgkm.append(df.drop(columns=possible_kmers))

    pd.concat(out_kmers).to_csv(f"../data/3_paired_datasets_tabular_to_ML/{rbp}_kmers.tsv.gz", sep="\t", compression='gzip', index=False)
    pd.concat(out_lsgkm).to_csv(f"../data/3_paired_datasets_tabular_to_ML/{rbp}_full_seqs.tsv.gz", sep="\t", compression='gzip', index=False)