# How much gets filtered when removing rRNA, tRNA, and mtDNA/RNA? 
I'm making this notebook as a n intermission from preparing rolypoly external database. So far I've used a combination of NCBI +SILVA rRNAs to remove contaminating rRNA reads from RNA-seq data. Subsequently, the organisms whose rRNAs were most matched, are then also fetched (or their transcriptomes, if available) to remove any remaining reads that may have come from those organisms.  
THis is messy, requires NCBI taxdump, taxonkit, and ncbi-datasets. So I started removing this dependency by using a set of rRNAs for which I can generate a prebuilt table containing the FTP addresses of the hosts' genomes/transcriptomes.  
While doing so, I realised the step above could be split - quick rRNA mapping to get rough taxonomic breakdown of the sample, and then a more thorough removal of rRNA, tRNA, and mtDNA/RNA using a more comprehensive database. The question is then how much will these diffrent combinations filter, how much more time, and would masking the fasta for subsequences shared with RNA viruses change the results significantly?

To test these, below are how I got the data, how I named the sets, filtering of them, graphs and so on. 

*Note*: Parts of this script are from [build_data.py](../src/rolypoly/commands/misc/build_data.py) and [filter_reads.py](../src/rolypoly/commands/reads/filter_reads.py) scripts in the rolypoly repository.

Loading libraries and defining paths to sets already created/downloaded:

In [None]:
import datetime
import json
import logging
import os
import shutil
import subprocess
import tarfile
from pathlib import Path as pt

import polars as pl
import requests
from bbmapy import bbduk, bbmask, kcompress
from rich.console import Console
from rich_click import command, option

from rolypoly.utils.bio.alignments import (
    hmmdb_from_directory,
    mmseqs_profile_db_from_directory,
)
from rolypoly.utils.bio.sequences import (
    filter_fasta_by_headers,
    remove_duplicates,
)

from rolypoly.utils.bio.polars_fastx import from_fastx_eager

from rolypoly.utils.logging.citation_reminder import remind_citations
from rolypoly.utils.logging.loggit import get_version_info, setup_logging
from rolypoly.utils.various import fetch_and_extract, run_command_comp

### DEBUG ARGS (for manually building, not entering via CLI):
threads = 6
log_file = "notebooks/Exprimental/trrna.log"
data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data"

global rrna_dir
global contam_dir

logger = setup_logging(log_file)
print(f"Starting data preparation to : {data_dir}")

contam_dir = os.path.join(data_dir, "contam")
os.makedirs(contam_dir, exist_ok=True)

rrna_dir = os.path.join(contam_dir, "rrna")
os.makedirs(rrna_dir, exist_ok=True)

masking_dir = os.path.join(contam_dir, "masking")
os.makedirs(masking_dir, exist_ok=True)

# taxonomy_dir = os.path.join(data_dir, "taxdump")
# os.makedirs(taxonomy_dir, exist_ok=True)

reference_seqs = os.path.join(data_dir, "reference_seqs")
os.makedirs(reference_seqs, exist_ok=True)

mmseqs_ref_dir = os.path.join(reference_seqs, "mmseqs")
os.makedirs(mmseqs_ref_dir, exist_ok=True)

rvmt_dir = os.path.join(reference_seqs, "RVMT")
os.makedirs(rvmt_dir, exist_ok=True)

ncbi_ribovirus_dir = os.path.join(reference_seqs, "ncbi_ribovirus")
os.makedirs(ncbi_ribovirus_dir, exist_ok=True)

# Masking sequences preparation
rvmt_fasta_path = os.path.join(
    data_dir, "reference_seqs", "RVMT", "RVMT_cleaned_contigs.fasta"
)
ncbi_ribovirus_fasta_path = os.path.join(
    data_dir,
    "reference_seqs",
    "ncbi_ribovirus",
    "refseq_ribovirus_genomes.fasta",
)

rna_viruses_entropy_masked_path = os.path.join(
    masking_dir, "combined_entropy_masked.fasta"
)

## Fetching data (SILVA, NCBI rRNAs, tRNAs, mtDNAs):

In [None]:
silva_release = "138.2"

# Download SILVA rRNA sequences (SSU and LSU)
silva_ssu_path = os.path.join(
    rrna_dir, f"SILVA_{silva_release}_SSURef_NR99_tax_silva.fasta"
)
silva_lsu_path = os.path.join(
    rrna_dir, f"SILVA_{silva_release}_LSURef_NR99_tax_silva.fasta"
)
# skipping downloading (already done)
# fetch_and_extract(
#     f"https://www.arb-silva.de/fileadmin/silva_databases/release_{silva_release.replace('.', '_')}/Exports/SILVA_{silva_release}_SSURef_NR99_tax_silva.fasta.gz",
#     fetched_to=os.path.join(rrna_dir, "tmp_ssu.fasta.gz"),
#     extract_to=rrna_dir,
#     rename_extracted=silva_ssu_path,
#     logger=logger,
# )
# fetch_and_extract(
#     f"https://www.arb-silva.de/fileadmin/silva_databases/release_{silva_release.replace('.', '_')}/Exports/SILVA_{silva_release}_LSURef_NR99_tax_silva.fasta.gz",
#     fetched_to=os.path.join(rrna_dir, "tmp_lsu.fasta.gz"),
#     extract_to=rrna_dir,
#     rename_extracted=silva_lsu_path,
#     logger=logger,
# )

# Download SILVA taxonomy mappings (maps accessions to NCBI taxids)
silva_ssu_taxmap = pl.read_csv(
    "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/ncbi/taxmap_embl-ebi_ena_ssu_ref_nr99_138.2.txt.gz",
    truncate_ragged_lines=True,
    separator="\t",
    infer_schema_length=123123,
)
silva_lsu_taxmap = pl.read_csv(
    "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/ncbi/taxmap_embl-ebi_ena_lsu_ref_nr99_138.2.txt.gz",
    truncate_ragged_lines=True,
    separator="\t",
    infer_schema_length=123123,
)
silva_taxmap = pl.concat([silva_lsu_taxmap, silva_ssu_taxmap])
silva_taxmap.write_parquet(os.path.join(rrna_dir, "silva_taxmap_embl-ebi_ena.parquet"))

# Parse SILVA headers and extract accessions
silva_fasta_df = pl.concat(
    [
        from_fastx_eager(silva_ssu_path).with_columns(
            pl.lit("SSU").alias("rRNA_type")
        ),
        from_fastx_eager(silva_lsu_path).with_columns(
            pl.lit("LSU").alias("rRNA_type")
        ),
    ]
)
silva_fasta_df.write_parquet(os.path.join(rrna_dir, "silva99_fasta.parquet"))
print(f"total SILVA99 sequences {silva_fasta_df.height}")
silva_fasta_df.head(4)


header,sequence,rRNA_type
str,str,str
"""AY846379.1.1791 Eukaryota;Arch…","""AACCUGGUUGAUCCUGCCAGUAGUCAUAUG…","""SSU"""
"""AY846382.1.1778 Eukaryota;Arch…","""GUUGAUCCUGCCAGUAGUCAUAUGCUUGUC…","""SSU"""
"""AB000393.1.1510 Bacteria;Pseud…","""UGGCUCAGAUUGAACGCUGGCGGCAGGCCU…","""SSU"""
"""AY909590.1.2352 Eukaryota;Arch…","""GACUAAGCCAUGCAUGUCUAAGUAUAAACG…","""SSU"""


In [None]:
# Extract accession from header (format: >accession.version rest_of_header)
silva_fasta_df = silva_fasta_df.with_columns(
    primaryAccession=pl.col("header").str.extract(
        r"^([A-Za-z0-9_]+)(?:\.\d+)*", 1
    ),  # DQ150555.1.2478 -> DQ150555
    accession=pl.col("header").str.extract(
        r"^([A-Za-z0-9_]+(?:\.\d+)?)", 1
    ),  # AY846379 or DQ150555.1
    taxonomy_raw=pl.col("header").str.replace(r"^\S+\s+", ""),
)
    # silva_fasta_df = silva_fasta_df.with_columns(
    #     pl.col("sequence").str.len_chars().alias("seq_length")
    # )
    # silva_taxmap = silva_taxmap.with_columns(
    #     (pl.col("stop") - pl.col("start")).alias("seq_length")
    # )

silva_df = silva_fasta_df.join(
    silva_taxmap.select(
        ["primaryAccession", "ncbi_taxonid", "submitted_path"]
    ).unique(),  # seq_length
    on=["primaryAccession"],
    how="inner",
)
silva_df.write_parquet(os.path.join(rrna_dir, "silva_rrna_sequences.parquet"))
# silva_df.height
silva_df["ncbi_taxonid"].null_count()

# Load SILVA taxonomy mappings
print(
    f"Merged taxonomy for {silva_df.filter(pl.col('ncbi_taxonid').is_not_null()).height} SILVA sequences"
)

unique_taxids = (
    silva_df.filter(pl.col("ncbi_taxonid").is_not_null())
    .select("ncbi_taxonid")
    .unique()["ncbi_taxonid"]
    .to_list()
)
print(
    f"Total of {len(unique_taxids)} unique NCBI taxids found in SILVA sequences"
)


In [3]:
# silva_df = pl.read_parquet(os.path.join(rrna_dir, "silva_rrna_sequences.parquet"))

In [None]:
# Generate FTP download URLs for host genomes/transcriptomes
fetch_and_extract(
    url="https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt",
    fetched_to=os.path.join(rrna_dir, "assembly_summary_genbank.txt.gz"),
    extract=False,
)
print("Loading NCBI GenBank assembly summary")
# genbank_summary = pl.read_csv(os.path.join(rrna_dir, "assembly_summary_genbank.txt.gz",),
# infer_schema_length=100020, separator="\t", skip_rows=1,
# null_values=["na","NA","-"],ignore_errors=True,
# has_header=True)
# polars failed me, so using line by line iterator
from gzip import open as gz_open
with gz_open(
    os.path.join(rrna_dir, "assembly_summary_genbank.txt.gz"), "r"
) as f:
    header = None
    records = []
    i = 0
    for line in f:
        if i == 0:
            i += 1
            continue
        line = line.rstrip(b"\n")
        if i == 1:
            header = line.decode()[1:].strip().split("\t")
            i += 1
            continue
        fields = line.decode().strip().split("\t")
        record = dict(zip(header, fields))
        records.append(record)
genbank_summary = pl.from_records(records).rename({"taxid": "ncbi_taxonid"})
genbank_summary.collect_schema()
    # Schema([('assembly_accession', String),
#         ('bioproject', String),
#         ('biosample', String),
#         ('wgs_master', String),
#         ('refseq_category', String),
#         ('ncbi_taxonid', String),
#         ('species_taxid', String),
#         ('organism_name', String),
#         ('infraspecific_name', String),
#         ('isolate', String),
#         ('version_status', String),
#         ('assembly_level', String),
#         ('release_type', String),
#         ('genome_rep', String),
#         ('seq_rel_date', String),
#         ('asm_name', String),
#         ('asm_submitter', String),
#         ('gbrs_paired_asm', String),
#         ('paired_asm_comp', String),
#         ('ftp_path', String),
#         ('excluded_from_refseq', String),
#         ('relation_to_type_material', String),
#         ('asm_not_live_date', String),
#         ('assembly_type', String),
#         ('group', String),
#         ('genome_size', String),
#         ('genome_size_ungapped', String),
#         ('gc_percent', String),
#         ('replicon_count', String),
#         ('scaffold_count', String),
#         ('contig_count', String),
#         ('annotation_provider', String),
#         ('annotation_name', String),
#         ('annotation_date', String),
#         ('total_gene_count', String),
#         ('protein_coding_gene_count', String),
#         ('non_coding_gene_count', String),
#         ('pubmed_id', String)])

genbank_summary.write_csv(
    os.path.join(rrna_dir, "genbank_assembly_summary.tsv"), separator="\t"
)
genbank_summary = pl.read_csv(
    os.path.join(rrna_dir, "genbank_assembly_summary.tsv"),
    infer_schema_length=100020,
    separator="\t",
    null_values=["na", "NA", "-"],
    ignore_errors=True,
    has_header=True,
)

# In [91]: genbank_summary.collect_schema()
# Out[91]: 
# Schema([('assembly_accession', String),
#         ('bioproject', String),
#         ('biosample', String),
#         ('wgs_master', String),
#         ('refseq_category', String),
#         ('ncbi_taxonid', Int64),
#         ('species_taxid', Int64),
#         ('organism_name', String),
#         ('infraspecific_name', String),
#         ('isolate', String),
#         ('version_status', String),
#         ('assembly_level', String),
#         ('release_type', String),
#         ('genome_rep', String),
#         ('seq_rel_date', String),
#         ('asm_name', String),
#         ('asm_submitter', String),
#         ('gbrs_paired_asm', String),
#         ('paired_asm_comp', String),
#         ('ftp_path', String),
#         ('excluded_from_refseq', String),
#         ('relation_to_type_material', String),
#         ('asm_not_live_date', String),
#         ('assembly_type', String),
#         ('group', String),
#         ('genome_size', Int64),
#         ('genome_size_ungapped', Int64),
#         ('gc_percent', Float64),
#         ('replicon_count', Int64),
#         ('scaffold_count', Int64),
#         ('contig_count', Int64),
#         ('annotation_provider', String),
#         ('annotation_name', String),
#         ('annotation_date', String),
#         ('total_gene_count', Int64),
#         ('protein_coding_gene_count', Int64),
#         ('non_coding_gene_count', Int64),
#         ('pubmed_id', String)])

genbank_summary.write_parquet(
    os.path.join(rrna_dir, "genbank_assembly_summary.parquet")
)
genbank_summary.write_csv(
    os.path.join(rrna_dir, "genbank_assembly_summary.tsv"), separator="\t"
)
print(genbank_summary.head(4))

2025-12-01 14:15:15,584 - rolypoly.utils.logging.loggit - INFO - Downloading https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt to /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/assembly_summary_genbank.txt.gz
2025-12-01 14:15:45,465 - rolypoly.utils.logging.loggit - INFO - Successfully downloaded to /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/assembly_summary_genbank.txt.gz


In [None]:
# genbank_summary = pl.read_parquet(
#     os.path.join(rrna_dir, "genbank_assembly_summary.parquet")
# )
print(genbank_summary.head(4))

In [None]:
# next, for every unique ncbi_taxonid, we select the one that has the most protein_coding_gene_count, then refseq_category, then tie breaking with non_coding_gene_count, tie breaking by latest assembly (by seq_rel_date).
temp_genbank = genbank_summary.sort(
    by=[
        pl.col("protein_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("refseq_category").reverse(),
        pl.col("non_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("seq_rel_date").reverse(),
    ]
).unique(subset=["ncbi_taxonid"], keep="first")
print(
    f"Filtered GenBank summary to {temp_genbank.height} unique taxid entries for SILVA sequences"
)
temp_genbank = temp_genbank.filter(pl.col("ncbi_taxonid").is_in(silva_taxmap["ncbi_taxonid"])).unique()
temp_genbank.height
# only 30503 out of ~200k?


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  temp_genbank = temp_genbank.filter(pl.col("ncbi_taxonid").is_in(silva_taxmap["ncbi_taxonid"])).unique()


30503

In [None]:
# Merge SILVA sequences and apply entropy masking
print("Merging and masking SILVA sequences")
silva_merged = os.path.join(rrna_dir, "SILVA_merged.fasta")
silva_masked = os.path.join(rrna_dir, "SILVA_merged_masked.fasta")

# Concatenate SILVA files
run_command_comp(
    base_cmd="cat",
    positional_args=[silva_ssu_path, silva_lsu_path],
    positional_args_location="end",
    params={},
    output_file=silva_merged,
    logger=logger,
)

# Apply entropy masking
bbduk(
    in1=silva_merged,
    out=silva_masked,
    entropy=0.6,
    entropyk=4,
    entropywindow=24,
    maskentropy=True,
    ziplevel=9,
)

print(f"Created masked SILVA rRNA database: {silva_masked}")

# clean up
try:
    os.remove(deduplicated_fasta)
    os.remove(compressed_path)
except Exception as e:
    logger.warning(f"Could not remove intermediate files: {e}")

print(f"Masking sequences prepared in {masking_dir}")

# ##### Create rRNA DB #####
# cd $rolypoly/data/
# mkdir rRNA
# cd rRNA
# gzip SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz
# gzip SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz
# cat *fasta > merged.fas

# bbduk.sh -Xmx1g in=merged.fas out=merged_masked.fa zl=9 entropy=0.6 entropyk=4 entropywindow=24 maskentropy
# bbduk.sh -Xmx1g in=rmdup_rRNA_ncbi.fasta  out=rmdup_rRNA_ncbi_masked.fa zl=9 entropy=0.6 entropyk=4 entropywindow=24 maskentropy


## rRNAs, tRNA, mtDNA, and plastid-DNA from NCBI

rRNA from NCBI nuc X refseq

In [14]:
# %%bash
# search_term="((ribosomal RNA[Title]) OR rRNA[Title]) AND 150:7000[Sequence Length] AND txid131567[Organism:exp] "
# # Perform the search and download the sequences
# esearch -db nuccore -query "$search_term" | efetch -format fasta > "rrna_genes_refseq.fasta"

curl: (22) The requested URL returned error: 400 Bad Request
 ERROR:  curl command failed ( Mon Dec  1 15:02:00 PST 2025 ) with: 22
-X POST https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi -d query_key=1&WebEnv=MCID_692e1ba5bfc86b4c4b0614df&retstart=18200&retmax=50&db=nuccore&rettype=fasta&retmode=text&tool=edirect&edirect=24.0&edirect_os=Linux&email=uneri%40ln008.jgi
nquire -url https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ efetch.fcgi -query_key 1 -WebEnv MCID_692e1ba5bfc86b4c4b0614df -retstart 18200 -retmax 50 -db nuccore -rettype fasta -retmode text -tool edirect -edirect 24.0 -edirect_os Linux -email uneri@ln008.jgi
EMPTY RESULT
SECOND ATTEMPT
curl: (22) The requested URL returned error: 400 Bad Request
 ERROR:  curl command failed ( Mon Dec  1 15:03:03 PST 2025 ) with: 22
-X POST https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi -d query_key=1&WebEnv=MCID_692e1ba5bfc86b4c4b0614df&retstart=19700&retmax=50&db=nuccore&rettype=fasta&retmode=text&tool=edirect&edirect

tRNA from NCBI nuc

In [45]:
# first, getting the contents of https://ftp.ncbi.nlm.nih.gov/blast/db/ and filtering to files containing "16S", "18S", "28S", "ITS" 
# LSU, SSU
import ftplib
from ftplib import FTP

terms = ["16S", "18S", "28S", "ITS", "LSU", "SSU", "ribosomal"]
files_2_download = []
FTP_HOST = "ftp.ncbi.nlm.nih.gov"
FTP_DIR = "/blast/db/"

# List to store the file and directory names
file_list = []

# Connect to the FTP host
with FTP(FTP_HOST) as ftp:
    #  Log in anonymously (default behavior for .login())
    ftp.login()
    print(f"Connected to {FTP_HOST}")
    ftp.cwd(FTP_DIR)
    logger.debug(f"Changed directory to {FTP_DIR}")

    # nlst() returns only names, not detailed information
    file_list = ftp.nlst()
    
    print("\n--- Directory Contents ---")
    for item in file_list:
        if any(term in item for term in terms):
            files_2_download.append(item)
            logger.debug(item)
            
            # Download file directly using FTP
            local_path = os.path.join(rrna_dir, item)
            print(f"Downloading {item}")
            with open(local_path, 'wb') as local_file:
                ftp.retrbinary(f'RETR {item}', local_file.write)
            logger.debug(f"Successfully downloaded {item}")

print(f"Downloaded {len(files_2_download)} files matching terms: {terms}")

Connected to ftp.ncbi.nlm.nih.gov

--- Directory Contents ---
Downloading 18S_fungal_sequences.tar.gz

--- Directory Contents ---
Downloading 18S_fungal_sequences.tar.gz
Downloading 18S_fungal_sequences.tar.gz.md5
Downloading 18S_fungal_sequences.tar.gz.md5
Downloading 28S_fungal_sequences.tar.gz
Downloading 28S_fungal_sequences.tar.gz
Downloading 28S_fungal_sequences.tar.gz.md5
Downloading 28S_fungal_sequences.tar.gz.md5
Downloading ITS_RefSeq_Fungi.tar.gz
Downloading ITS_RefSeq_Fungi.tar.gz
Downloading ITS_RefSeq_Fungi.tar.gz.md5
Downloading ITS_RefSeq_Fungi.tar.gz.md5
Downloading ITS_eukaryote_sequences.tar.gz
Downloading ITS_eukaryote_sequences.tar.gz
Downloading ITS_eukaryote_sequences.tar.gz.md5
Downloading ITS_eukaryote_sequences.tar.gz.md5
Downloading LSU_eukaryote_rRNA.tar.gz
Downloading LSU_eukaryote_rRNA.tar.gz
Downloading LSU_eukaryote_rRNA.tar.gz.md5
Downloading LSU_eukaryote_rRNA.tar.gz.md5
Downloading LSU_prokaryote_rRNA.tar.gz
Downloading LSU_prokaryote_rRNA.tar.gz
Down

In [None]:
# for each of these, will run blastdbcmd to extract sequences into fasta 
from rolypoly.utils.various import extract
import suprocess as sp
for item in files_2_download:
    os.makedirs(os.path.join("temp"), exist_ok=True)
    extract(archive_path=os.path.join(rrna_dir, item),
        extract_to=rrna_dir,
        logger=logger,
    )
    sp.run(
        f"blastdbcmd -entry all -db {os.path.join(rrna_dir, item.replace('.tar.gz',''))} -out {os.path.join(rrna_dir, item.replace('.tar.gz','').replace('.tar','')+'.fasta')} -outfmt '%f'",

    # blastdbcmd -entry all -db LSU_prokaryote_rRNA  -out reference.fasta -outfmt "%T;%t;%s" # taxid;header;sequence
    run_command_comp(
        base_cmd="blastdbcmd",
        positional_args=[],
        positional_args_location="end",
        params={
            "db": os.path.join(rrna_dir, item.replace(".tar.gz", "")),
            "out": os.path.join(
                rrna_dir, item.replace(".tar.gz", "").replace(".tar", "") + ".fasta"
            ),
            "outfmt": "%f",
        },
        output_file=None,

        logger=logger,
    )

TypeError: extract() got an unexpected keyword argument 'file_path'

In [None]:
# %%bash
# search_term="(transfer RNA[title] OR tRNA[title]) AND srcdb_refseq[PROP] AND 50:450[SLEN]"
# # Perform the search and download the sequences
# esearch -db nuccore -query "$search_term" | efetch -format fasta > "trna_genes_refseq.fasta"


In [None]:
def prepare_plastid_data(data_dir, logger):
    """Prepare plastid sequence data for contamination filtering.

    Downloads NCBI RefSeq plastid sequences, combines them, and removes duplicates.

    Args:
        data_dir (str): Base directory for data storage
        logger: Logger object for recording progress and errors
    returns:
        None
    """
    plastid_dir = os.path.join(data_dir, "reference_seqs", "plastid_refseq")
    os.makedirs(plastid_dir, exist_ok=True)
    
    print("Downloading NCBI RefSeq plastid sequences")
    
    base_url = "https://ftp.ncbi.nlm.nih.gov/refseq/release/plastid/plastid."
    suffix = ".genomic.fna.gz"
    files_to_get = ["1.1", "1.2", "2.1", "2.2", "3.1"]
    
    all_files =[]
    downloaded_files = []
    
    for version in files_to_get:
        file_url = f"{base_url}{version}{suffix}"
        gz_filename = f"plastid.{version}.genomic.fna.gz"
        fasta_filename = f"plastid.{version}.genomic.fna"
        
        print(f"Downloading plastid version {version}")
        
        # Download and extract the file
        try:
            extracted_path = fetch_and_extract(
                url=file_url,
                fetched_to=os.path.join(plastid_dir, gz_filename),
                extract_to=plastid_dir,
                expected_file=fasta_filename,
                logger=logger,
            )
            downloaded_files.append(extracted_path)
            all_files.append(extracted_path)
            all_files.append(os.path.join(plastid_dir, gz_filename))
            print(f"Successfully downloaded and extracted {fasta_filename}")
        except Exception as e:
            logger.error(f"Failed to download plastid version {version}: {e}")
            continue
    
    if not downloaded_files:
        logger.error("No plastid files were successfully downloaded")
        return
    
    # Combine and deduplicate the sequences
    combined_fasta = os.path.join(plastid_dir, "combined_plastid_refseq.fasta")
    print(f"Combining and deduplicating {len(downloaded_files)} plastid files")
    
    stats = remove_duplicates(
        input_file=downloaded_files,
        output_file=combined_fasta,
        by="seq",
        revcomp_as_distinct=False,  # Treat reverse complement as duplicate
        return_stats=True,
        logger=logger,
    )
    
    if stats:
        print(
            f"Plastid deduplication stats: {stats['unique_records']} unique sequences from {stats['total_records']} total, {stats['duplicates_removed']} duplicates removed"
        )
    
    # Clean up individual files to save space (optional)
    try:
        for file_path in all_files:
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.debug(f"Removed intermediate file: {os.path.basename(file_path)}")
    except Exception as e:
        logger.warning(f"Could not remove intermediate plastid files: {e}")
    
    print(f"Plastid sequences prepared in {plastid_dir}")


def prepare_mito_data(data_dir, logger):
    """Prepare mito sequence data for contamination filtering.

    Downloads NCBI RefSeq mito sequences, combines them, and removes duplicates.

    Args:
        data_dir (str): Base directory for data storage
        logger: Logger object for recording progress and errors
    returns:
        None
    """
    mito_dir = os.path.join(data_dir, "reference_seqs", "mito_refseq")
    os.makedirs(mito_dir, exist_ok=True)
    
    print("Downloading NCBI RefSeq mito sequences")
    
    base_url = "https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion/mitochondrion."
    suffix = ".genomic.fna.gz"
    files_to_get = ["1.1"] #
    
    all_files =[]
    downloaded_files = []
    
    for version in files_to_get:
        file_url = f"{base_url}{version}{suffix}"
        gz_filename = f"mito.{version}.genomic.fna.gz"
        fasta_filename = f"mito.{version}.genomic.fna"
        
        print(f"Downloading mito version {version}")
        
        # Download and extract the file
        try:
            extracted_path = fetch_and_extract(
                url=file_url,
                fetched_to=os.path.join(mito_dir, gz_filename),
                extract_to=mito_dir,
                expected_file=fasta_filename,
                logger=logger,
            )
            downloaded_files.append(extracted_path)
            all_files.append(extracted_path)
            all_files.append(os.path.join(mito_dir, gz_filename))
            print(f"Successfully downloaded and extracted {fasta_filename}")
        except Exception as e:
            logger.error(f"Failed to download mito version {version}: {e}")
            continue
    
    if not downloaded_files:
        logger.error("No mito files were successfully downloaded")
        return
    
    # Combine and deduplicate the sequences
    combined_fasta = os.path.join(mito_dir, "combined_mito_refseq.fasta")
    print(f"Combining and deduplicating {len(downloaded_files)} mito files")
    
    stats = remove_duplicates(
        input_file=downloaded_files,
        output_file=combined_fasta,
        by="seq",
        revcomp_as_distinct=False,  # Treat reverse complement as duplicate
        return_stats=True,
        logger=logger,
    )
    
    if stats:
        print(
            f"mito deduplication stats: {stats['unique_records']} unique sequences from {stats['total_records']} total, {stats['duplicates_removed']} duplicates removed"
        )
    
    # Clean up individual files to save space (optional)
    try:
        for file_path in all_files:
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.debug(f"Removed intermediate file: {os.path.basename(file_path)}")
    except Exception as e:
        logger.warning(f"Could not remove intermediate mito files: {e}")
    
    print(f"mito sequences prepared in {mito_dir}")

temp_data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/notebooks/Exprimental"
prepare_plastid_data(temp_data_dir, logger)
prepare_mito_data(temp_data_dir, logger)

## gene2accession (NCBI)

In [None]:
# fetch_and_extract( url="http://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz",
#     fetched_to=os.path.join(rrna_dir, "gene2accession.gz"),
#     extract=False,
# )

silva_df = silva_df.with_columns(
    ncbi_taxonid=pl.col("ncbi_taxonid").cast(pl.String)
)

silva_df1 = silva_df.join(
    genbank_summary.select(["ncbi_taxonid", "ftp_path"]),
    on=["ncbi_taxonid"],
    how="left",
)
silva_df1

silva_df = silva_df.with_columns(
    genome_ftp_url=pl.when(pl.col("ncbi_taxonid").is_not_null())
    .then(
        pl.format(
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/refseq/taxid_{}/",
            pl.col("ncbi_taxonid"),
        )
    )
    .otherwise(None),
    datasets_api_url=pl.when(pl.col("ncbi_taxonid").is_not_null())
    .then(
        pl.format(
            "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/taxon/{}/download?include_annotation_type=GENOME_FASTA,RNA_FASTA",
            pl.col("ncbi_taxonid"),
        )
    )
    .otherwise(None),
)

# Save metadata table
metadata_output = os.path.join(rrna_dir, "rrna_metadata.tsv")
silva_df.write_csv(metadata_output, separator="\t")
print(
    f"Saved rRNA metadata table with {len(silva_df)} entries to {metadata_output}"
)

gene2accession = pl.read_csv(
    os.path.join(rrna_dir, "gene2accession.gz"),
    separator="\t",
    # skip_rows=1,
    # infer_schema_length=100020,
    null_values=["na", "NA", "-"],
    ignore_errors=True,
    has_header=True,
    # n_rows=100
)
gene2accession.write_parquet(os.path.join(rrna_dir, "gene2accession.parquet"))
# gene2accession = pl.read_parquet(os.path.join(rrna_dir, "gene2accession.parquet"))
# gene2accession.collect_schema()
# Schema([('#tax_id', Int64),
#     ('GeneID', Int64),
#     ('status', String),
#     ('RNA_nucleotide_accession.version', String),
#     ('RNA_nucleotide_gi', String),
#     ('protein_accession.version', String),
#     ('protein_gi', Int64),
#     ('genomic_nucleotide_accession.version', String),
#     ('genomic_nucleotide_gi', Int64),
#     ('start_position_on_the_genomic_accession', Int64),
#     ('end_position_on_the_genomic_accession', Int64),
#     ('orientation', String),
#     ('assembly', String),
#     ('mature_peptide_accession.version', String),
#     ('mature_peptide_gi', String),
#     ('Symbol', String)])
gene2accession = gene2accession.rename({"#tax_id": "ncbi_taxonid"})
test_df = gene2accession.filter(pl.col("ncbi_taxonid").is_in(unique_taxids))
test_df.height # 148449745
test_df2 = gene2accession.select(["ncbi_taxonid","assembly"]).unique()
test_df2.height # 52548


