# How much gets filtered when removing rRNA, tRNA, and mtDNA/RNA?
# Notebook 1: Actual tests

I'm making this notebook as an intermission from preparing the rolypoly external database. So far I've used a combination of NCBI + SILVA rRNAs to remove contaminating rRNA reads from RNA-seq data. Subsequently, the organisms whose rRNAs were most matched are then also fetched (or their transcriptomes, if available) to remove any remaining reads that may have come from those organisms.  
This is messy, and requires NCBI taxdump, taxonkit, and ncbi-datasets. So I started removing this dependency by using a set of rRNAs for which I can generate a prebuilt table containing the FTP addresses of the hosts' genomes/transcriptomes.  
While doing so, I realised the step above could be split: a quick rRNA mapping to get a rough taxonomic breakdown of the sample, and then a more thorough removal of rRNA, tRNA, and mtDNA/RNA using a more comprehensive database. The question is then how much these different combinations will filter, how much more time they take, and whether masking the fasta for subsequences shared with RNA viruses would change the results significantly.

To test these, below are the actual mapping/search/etc filtering of these sequence types, and some graphs and so on.

*Note*: Parts of this script are from [build_data.py](../src/rolypoly/commands/misc/build_data.py) and [filter_reads.py](../src/rolypoly/commands/reads/filter_reads.py) scripts in the rolypoly repository.

Loading libraries and defining paths to sets already created/downloaded:

In [None]:
import json
import logging
import shutil
import subprocess
import aria2p
import tempfile
import os
import time
from ftplib import FTP
import glob

import polars as pl
from tqdm.notebook import tqdm
from bbmapy import bbduk, bbmask, kcompress

from rolypoly.utils.bio.sequences import (
    filter_fasta_by_headers,
    write_fasta_file,
    remove_duplicates
)

from rolypoly.utils.bio.polars_fastx import from_fastx_eager

from rolypoly.utils.logging.loggit import  setup_logging
from rolypoly.utils.various import fetch_and_extract, run_command_comp, extract

### DEBUG ARGS (for manually building, not entering via CLI):
threads = 6
log_file = "notebooks/Exprimental/trrna.log"
data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data"

# Some of the functions borrowed from build_data.py require (or are simply
# easier to use with) a logger, so one is created before anything else.
logger = setup_logging(log_file)

# --- Contaminant sets: everything lives under <data_dir>/contam ---
contam_dir = os.path.join(data_dir, "contam")
rrna_dir, trna_dir, masking_dir = (
    os.path.join(contam_dir, subdir) for subdir in ("rrna", "trna", "masking")
)
trna_seqs = os.path.join(trna_dir, "tRNA_sequences_deduplicated_filtered.fasta")

# --- Taxonomy dump and reference sequence collections ---
taxonomy_dir = os.path.join(data_dir, "taxdump")
reference_seqs = os.path.join(data_dir, "reference_seqs")
mmseqs_ref_dir, rvmt_dir, ncbi_ribovirus_dir = (
    os.path.join(reference_seqs, subdir)
    for subdir in ("mmseqs", "RVMT", "ncbi_ribovirus")
)

# --- Masking sequences ---
# RNA virus references, plus the combined entropy-masked FASTA stored in
# masking_dir.
rvmt_fasta_path = os.path.join(rvmt_dir, "RVMT_cleaned_contigs.fasta")
ncbi_ribovirus_fasta_path = os.path.join(
    ncbi_ribovirus_dir, "refseq_ribovirus_genomes.fasta"
)
rna_viruses_entropy_masked_path = os.path.join(
    masking_dir, "combined_entropy_masked.fasta"
)



## Reading fetched/processed data (SILVA, NCBI rRNAs, tRNAs, mtDNAs):

In [5]:
# --- SILVA: NR99 SSU/LSU FASTA files plus the pre-built parquet tables ---
silva_release = "138.2"
silva_ssu_path, silva_lsu_path = (
    os.path.join(rrna_dir, f"SILVA_{silva_release}_{subunit}Ref_NR99_tax_silva.fasta")
    for subunit in ("SSU", "LSU")
)
silva_fasta_df = pl.read_parquet(os.path.join(rrna_dir, "silva99_fasta.parquet"))
silva_df = pl.read_parquet(os.path.join(rrna_dir, "silva_rrna_sequences.parquet"))

# --- Organelles: combined RefSeq mitochondrion / plastid FASTA files ---
mito_fasta = os.path.join(
    data_dir, "reference_seqs", "mito_refseq", "combined_mito_refseq.fasta"
)
plastid_fasta = os.path.join(
    data_dir, "reference_seqs", "plastid_refseq", "combined_plastid_refseq.fasta"
)

# --- rRNA (ITS, 16S, 18S, 23S, LSU, SSU from blastdbs) ---
# Every *.tab file in rrna_dir is read as a headerless, "@"-separated table
# with "N/A" treated as null. `include_file_paths` records the source file of
# each row in a "type" column, from which the file stem (the part before
# ".tab") is extracted as the rRNA type; the unused "name" column and the raw
# path column are dropped.
rrna_df = (
    pl.scan_csv(
        os.path.join(rrna_dir, "*.tab"),
        separator="@",
        has_header=False,
        null_values=["N/A"],
        new_columns=["taxid", "header", "name", "sequence"],
        include_file_paths="type",
    )
    .collect()
    .drop("name")
    .with_columns(rRNA_type=pl.col("type").str.extract(r"([^/]+)\.tab$", 1))
    .drop("type")
)

# Write one FASTA per rRNA type: <rrna_dir>/<type>_ncbi.fasta
for rna_type in rrna_df["rRNA_type"].unique():
    per_type_df = rrna_df.filter(pl.col("rRNA_type") == rna_type)
    write_fasta_file(
        seqs=per_type_df["sequence"].to_list(),
        headers=per_type_df["header"].to_list(),
        output_file=os.path.join(rrna_dir, f"{rna_type}_ncbi.fasta"),
    )

# --- tRNAs ---
# NOTE(review): unlike the mito/plastid sets this resolves directly under
# data_dir (not data_dir/reference_seqs) — confirm this is the intended path.
trna_fasta = os.path.join(data_dir, "trna_refseq", "combined_trna_refseq.fasta")

### Create entropy-masked, RNA-virus-masked, and dust-masked variants of each sequence set to test as filters
Also need to remember to create a kcompressed version (i.e. a reduced, compacted set of k-mers of some fixed length, still to be decided).

In [12]:
# For simplicity, keep one dataframe listing every sequence set to be tested:
# a short track name and the path to its raw (unmasked) FASTA file.

# NCBI rRNA sets share one filename pattern inside rrna_dir.
# NOTE(review): the per-type FASTA files written earlier in this notebook are
# named "<type>_ncbi.fasta" — confirm the "_ncbi_rRNA_all_sequences" files
# referenced here actually exist.
ncbi_rrna_types = ["ITS", "16S", "18S", "23S", "LSU", "SSU"]

# Remaining tracks point at paths defined in the earlier cells.
other_tracks = {
    "tRNA": trna_fasta,
    "mito": mito_fasta,
    "plastid": plastid_fasta,
    "silva_SSU": silva_ssu_path,
    "silva_LSU": silva_lsu_path,
    "RefSeq_RNA_viruses": ncbi_ribovirus_fasta_path,
    "RVMT": rvmt_fasta_path,
}

test_track_df = pl.DataFrame(
    {
        "name": ncbi_rrna_types + list(other_tracks.keys()),
        "raw_fasta_file": [
            os.path.join(rrna_dir, f"{rrna_type}_ncbi_rRNA_all_sequences.fasta")
            for rrna_type in ncbi_rrna_types
        ]
        + list(other_tracks.values()),
    }
)
test_track_df

name,raw_fasta_file
str,str
"""ITS""","""/clusterfs/jgi/scratch/science…"
"""16S""","""/clusterfs/jgi/scratch/science…"
"""18S""","""/clusterfs/jgi/scratch/science…"
"""23S""","""/clusterfs/jgi/scratch/science…"
"""LSU""","""/clusterfs/jgi/scratch/science…"
…,…
"""plastid""","""/clusterfs/jgi/scratch/science…"
"""silva_SSU""","""/clusterfs/jgi/scratch/science…"
"""silva_LSU""","""/clusterfs/jgi/scratch/science…"
"""RefSeq_RNA_viruses""","""/clusterfs/jgi/scratch/science…"


In [None]:
# Merge SILVA sequences and apply entropy masking
# NOTE(review): as written, this cell only prints two status messages; the
# merge/mask/cleanup steps below are all commented out, so nothing is created.
print("Merging and masking SILVA sequences")

# # Apply entropy masking
# NOTE(review): `silva_merged` and `silva_masked` are not defined in the cells
# visible above — they must be set before this call is re-enabled.
# bbduk(
#     in1=silva_merged,
#     out=silva_masked,
#     entropy=0.6,
#     entropyk=4,
#     entropywindow=24,
#     maskentropy=True,
#     ziplevel=9,
# )

# print(f"Created masked SILVA rRNA database: {silva_masked}")

# # clean up
# NOTE(review): `deduplicated_fasta` and `compressed_path` are likewise not
# defined in the cells visible above.
# try:
#     os.remove(deduplicated_fasta)
#     os.remove(compressed_path)
# except Exception as e:
#     logger.warning(f"Could not remove intermediate files: {e}")

print(f"Masking sequences prepared in {masking_dir}")

# Shell commands kept for reference (equivalent bbduk.sh invocations with the
# same entropy settings as the Python call above).
# NOTE(review): the `gzip` lines are applied to files already ending in .gz and
# are followed by `cat *fasta`, so `gunzip` was presumably intended; the 138.1
# file names also differ from the 138.2 release used elsewhere in this
# notebook — confirm before reuse.
# ##### Create rRNA DB #####
# cd $rolypoly/data/
# mkdir rRNA
# cd rRNA
# gzip SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz
# gzip SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz
# cat *fasta > merged.fas

# bbduk.sh -Xmx1g in=merged.fas out=merged_masked.fa zl=9 entropy=0.6 entropyk=4 entropywindow=24 maskentropy
# bbduk.sh -Xmx1g in=rmdup_rRNA_ncbi.fasta  out=rmdup_rRNA_ncbi_masked.fa zl=9 entropy=0.6 entropyk=4 entropywindow=24 maskentropy


Wrote combined rRNA fasta to /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/ncbi_rRNA_all_sequences.fasta
