## Setup and Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
from pathlib import Path
import polars as pl
from tqdm.notebook import tqdm

from rolypoly.utils.bio.trees import TaxonomyTree
from rolypoly.utils.logging.loggit import setup_logging

# Configuration
threads = 6
data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data"
log_file = "notebooks/Exprimental/rrna_genome_mapping.log"

logger = setup_logging(log_file)
print(f"Working directory: {data_dir}")

# Directory structure
rrna_dir = os.path.join(data_dir, "contam", "rrna")
taxonomy_dir = os.path.join(data_dir, "taxdump")
os.makedirs(rrna_dir, exist_ok=True)
os.makedirs(taxonomy_dir, exist_ok=True)

Working directory: /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data


## Load Input Data

In [2]:
# Load NCBI taxonomy data
print("Loading NCBI taxonomy...")
nodes_path = os.path.join(taxonomy_dir, "nodes.dmp")
names_path = os.path.join(taxonomy_dir, "names.dmp")

# Parse nodes.dmp
records = []
with open(nodes_path, "r") as f:
    for line in tqdm(f, desc="Reading nodes"):
        fields = line.rstrip("\n").rstrip("\t|").split("\t|\t")
        records.append({
            "tax_id": int(fields[0]),
            "parent_tax_id": int(fields[1]),
            "rank": fields[2]
        })
nodes_df = pl.from_records(records)
print(f"Loaded {nodes_df.height:,} taxonomy nodes")

# Parse names.dmp (scientific names only)
records = []
with open(names_path, "r") as f:
    for line in tqdm(f, desc="Reading names"):
        fields = line.rstrip("\n").rstrip("\t|").split("\t|\t")
        if fields[3] == "scientific name":
            records.append({
                "tax_id": int(fields[0]),
                "scientific_name": fields[1]
            })
sci_name_df = pl.from_records(records)
print(f"Loaded {sci_name_df.height:,} scientific names")

Loading NCBI taxonomy...


Reading nodes: 0it [00:00, ?it/s]

Loaded 2,708,826 taxonomy nodes


Reading names: 0it [00:00, ?it/s]

Loaded 2,708,826 scientific names


## Load NCBI + SILVA rRNA sequences
reminder -
`taxid@Source@type@seq_md5_hash`
Using @ as separator as it's not common in fasta headers, and just going to assume the original header doesn't contain @, and that mappers who drop everything after the first white space will just ignore the really long headers
```fasta
>2888294@NCBI@ITS_RefSeq_Fungi@55c18f2cfc8490f5d5cb9b34162a44f5
atgtgatgaga...
```

In [3]:
# Load rRNA sequences
from rolypoly.utils.bio.polars_fastx import from_fastx_eager

print("\nLoading rRNA sequences...")
rrna_fas1 = Path(rrna_dir) / "ncbi_rRNA_all_sequences_masked_entropy.fasta" 
rrna_fas2 = Path(rrna_dir) / "silva_rRNA_all_sequences_masked_entropy.fasta"  
rrna_df = pl.concat(
    [
        from_fastx_eager(rrna_fas1),
        from_fastx_eager(rrna_fas2)
    ]
)
rrna_df = rrna_df.with_columns(
    pl.col("header").str.split("@").list.to_struct(fields=["taxid", "source", "type", "seq_hash"]).alias("header_struct")
).unnest("header_struct")
rrna_df


Loading rRNA sequences...


header,sequence,taxid,source,type,seq_hash
str,str,str,str,str,str
"""2888294@NCBI@ITS_RefSeq_Fungi@…","""CAAGGTTTCCGTAGGTGAACCTGCGGAAGG…","""2888294""","""NCBI""","""ITS_RefSeq_Fungi""","""55c18f2cfc8490f5d5cb9b34162a44…"
"""46767@NCBI@ITS_eukaryote_seque…","""GAGTTTCTAGTCTGTAAAAGGAATAGAGAT…","""46767""","""NCBI""","""ITS_eukaryote_sequences""","""1a46fd8eddab80f42a516695114fae…"
"""114883@NCBI@16S_ribosomal_RNA@…","""CTGGCTGTGTGCCTAATACATGCATGTCGA…","""114883""","""NCBI""","""16S_ribosomal_RNA""","""11454c6190b13c7d374db466485ae0…"
"""44386@NCBI@ITS_eukaryote_seque…","""GTAGGTGAACCTGCGGAAGGATCATTACCG…","""44386""","""NCBI""","""ITS_eukaryote_sequences""","""b86fa7d27eeae7bfc4cda72eb6d41d…"
"""7107@NCBI@ITS_eukaryote_sequen…","""CCCCATAAACGAGGAATTCCTAGTAAGCGC…","""7107""","""NCBI""","""ITS_eukaryote_sequences""","""568fc18e099967dea2d9f9070a248d…"
…,…,…,…,…,…
"""28031@silva@SSU@0663d00aed8d6d…","""CCCUAAUACAUGCAAGUCGAGCGAACAGAG…","""28031""","""silva""","""SSU""","""0663d00aed8d6d56790b7717932896…"
"""214473@silva@SSU@aceef2efde177…","""GGGGGGGGGGGGGGCUAUACAUGCAAGUCG…","""214473""","""silva""","""SSU""","""aceef2efde177d51b49b7e5dca236d…"
"""1729607@silva@SSU@2e04e968777a…","""GACAGAUCCCUUCGGGGAGACGUGGUGUGG…","""1729607""","""silva""","""SSU""","""2e04e968777a4ca756a38de76b650b…"
"""1399115@silva@SSU@853c6dae780c…","""UUCAUGGAGAGUUUGAUCCUGGCUCAGGAC…","""1399115""","""silva""","""SSU""","""853c6dae780c287eec1ce2361f3653…"


In [4]:
rrna_df = rrna_df.with_columns(
    pl.col("taxid").cast(pl.Int64)
)

print(f"Loaded {rrna_df.height:,} rRNA sequences")
print(f"Unique tax_ids in rRNA data: {rrna_df['taxid'].n_unique():,}")

Loaded 371,978 rRNA sequences
Unique tax_ids in rRNA data: 172,219


In [5]:
# Load GenBank assembly summary
print("\nLoading NCBI assembly data...")
ncbi_summary = pl.read_parquet(
    os.path.join(rrna_dir, "ncbi_assembly_summary.parquet")
)

# Select best assembly per tax_id
mini_ncbi = ncbi_summary.sort(
    by=[
        pl.col("refseq_category").reverse(),
        pl.col("protein_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("non_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("seq_rel_date").reverse(),
        pl.col("genome_size").reverse(),
    ]
).unique(subset=["ncbi_taxonid"], keep="first")

print(f"Selected {mini_ncbi.height:,} best assemblies")


Loading NCBI assembly data...
Selected 244,185 best assemblies


## Build Taxonomy Tree

In [6]:
# Prepare nodes with names
print("\nBuilding taxonomy tree...")
nodes_with_names = nodes_df.join(
    sci_name_df,
    on="tax_id",
    how="left"
).select(["tax_id", "parent_tax_id", "rank", "scientific_name"])

# Prepare data availability with metadata
data_available = mini_ncbi.filter(
    pl.col("total_gene_count") > 0
).select([
    pl.col("ncbi_taxonid").alias("tax_id"),
    "ftp_path",
    "protein_coding_gene_count",
    "total_gene_count",
    "genome_size",
    "assembly_level",
    "organism_name"
])

# Create tree
tax_tree = TaxonomyTree(
    nodes_with_names,
    data_availability_df=data_available,
    priority_columns=['protein_coding_gene_count', 'genome_size']
)

# Propagate data to ancestors
print("\nPropagating data to ancestor nodes...")
nodes_updated = tax_tree.propagate_data_to_ancestors(aggregation_method='any')
print(f"Updated {nodes_updated:,} ancestor nodes with descendant data")

# Print statistics
tax_tree.print_stats()


Building taxonomy tree...
Built tree with 2708826 nodes, 2457509 leaf nodes
Set data availability for 179968 tax_ids
Stored metadata for 179968 tax_ids

Propagating data to ancestor nodes...

Propagating data to ancestor nodes using method: any
Propagation complete:
  Initial nodes with data: 179,968
  Nodes updated: 33,191
  Final nodes with data: 213,159
Updated 33,191 ancestor nodes with descendant data
TAXONOMY TREE STATISTICS

Tree Structure:
  Total nodes:          2,708,826
  Leaf nodes:           2,457,509
  Internal nodes:       251,317

Data Availability:
  Nodes with data:      213,159
  Leaves with data:     174,662
  Leaves without data:  2,282,847
  Internal nodes w/data:38,497

Coverage:
  All nodes:            7.87%
  Leaf nodes only:      7.11%

Cache:
  Cached queries:       0

Priority columns: protein_coding_gene_count, genome_size

Top ranks by count:
  species             : 2,223,790 total (104,684 with data,   4.7%)
  no rank             :  253,736 total (67,306

## Map rRNA Sequences to Genome References (Genus-Constrained)

In [7]:
# Get all rRNA tax_ids
all_rrna_tax_ids = rrna_df["taxid"].unique().to_list()
print(f"\nMapping {len(all_rrna_tax_ids):,} rRNA tax_ids to genome references...")

# Run simplified search with max_rank constraint only
results, stats = tax_tree.find_best_reference_batch(
    tax_ids=all_rrna_tax_ids,
    max_rank='genus',
    return_stats=True
)

# Print statistics
print("\nResults:")
print(f"  Total queried:         {stats['total_queried']:,}")
print(f"  Self has data:         {stats['self_found']:,} ({stats['self_found']/stats['total_queried']*100:.1f}%)")
print(f"  Ancestor found:        {stats['ancestor_found']:,} ({stats['ancestor_found']/stats['total_queried']*100:.1f}%)")
print(f"  Relative found:        {stats['relative_found']:,} ({stats['relative_found']/stats['total_queried']*100:.1f}%)")
print(f"  No match:              {stats['no_match_found']:,} ({stats['no_match_found']/stats['total_queried']*100:.1f}%)")
print(f"  Limited by genus rank: {stats['max_rank_limited']:,} ({stats['max_rank_limited']/stats['total_queried']*100:.1f}%)")

total_with_ref = stats['self_found'] + stats['ancestor_found'] + stats['relative_found']
print(f"\nOverall coverage: {total_with_ref:,} / {stats['total_queried']:,} ({total_with_ref/stats['total_queried']*100:.1f}%)")



Mapping 172,219 rRNA tax_ids to genome references...
Searching 172,219 tax_ids with max_rank='genus'

Results:
  Total queried:         172,219
  Self has data:         26,937 (15.6%)
  Ancestor found:        1,058 (0.6%)
  Relative found:        50,358 (29.2%)
  No match:              93,866 (54.5%)
  Limited by genus rank: 134,832 (78.3%)

Overall coverage: 78,353 / 172,219 (45.5%)


## Extract and Save Mapping Results

In [None]:
# Build mapping table - even simpler now!
print("\nBuilding mapping table...")
mapping_df = tax_tree.results_to_dataframe(results)
print(f"Created mapping table with {mapping_df.height:,} entries")


Building mapping table...


Created mapping table with 172,219 entries


In [11]:
mapping_df

query_tax_id,query_name,query_rank,reference_tax_id,reference_name,reference_rank,relationship,max_rank_reached,ftp_path,protein_coding_genes,genome_size,assembly_level
i64,str,str,i64,str,str,str,bool,str,i64,i64,str
0,,,,,,,false,,,,
7,"""Azorhizobium caulinodans""","""species""",980631,"""Azorhizobium oxalatiphilum""","""species""","""relative""",true,"""https://ftp.ncbi.nlm.nih.gov/g…",5742,6396426,"""Scaffold"""
9,"""Buchnera aphidicola""","""species""",9,"""Buchnera aphidicola""","""species""","""self""",false,"""https://ftp.ncbi.nlm.nih.gov/g…",1050,657623,"""Contig"""
14,"""Dictyoglomus thermophilum""","""species""",14,"""Dictyoglomus thermophilum""","""species""","""self""",false,"""https://ftp.ncbi.nlm.nih.gov/g…",1778,1963348,"""Contig"""
17,"""Methylophilus methylotrophus""","""species""",17,"""Methylophilus methylotrophus""","""species""","""self""",false,"""https://ftp.ncbi.nlm.nih.gov/g…",1125,1188656,"""Contig"""
…,…,…,…,…,…,…,…,…,…,…,…
3463822,"""Acrohelicosporium aquaticum""","""species""",,,,,true,,,,
3463823,"""Acrohelicosporium guizhouense""","""species""",,,,,true,,,,
3463824,"""Acrohelicosporium viridisporum""","""species""",,,,,true,,,,
3463825,"""Helicoma astrictum""","""species""",,,,,true,,,,


In [12]:
# Save results
output_path = os.path.join(rrna_dir, "rrna_to_genome_mapping.parquet")
mapping_df.write_parquet(output_path)
print(f"\nSaved mapping table to: {output_path}")

# Also save as TSV for easy viewing
mapping_df.write_csv(
    os.path.join(rrna_dir, "rrna_to_genome_mapping.tsv"),
    separator="\t"
)

# Show sample
print("\nSample mappings:")
mapping_df.sample(min(10, mapping_df.height))



Saved mapping table to: /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/rrna_to_genome_mapping.parquet

Sample mappings:


query_tax_id,query_name,query_rank,reference_tax_id,reference_name,reference_rank,relationship,max_rank_reached,ftp_path,protein_coding_genes,genome_size,assembly_level
i64,str,str,i64,str,str,str,bool,str,i64,i64,str
424539,"""Ruttnera lamellosa""","""species""",,,,,True,,,,
1739746,"""Arthrobacter sp. IN209""","""species""",1588023.0,"""Arthrobacter sp. Hiyo8""","""species""","""relative""",True,"""https://ftp.ncbi.nlm.nih.gov/g…",7038.0,5026722.0,"""Complete Genome"""
284853,"""mucus bacterium 31""","""species""",,,,,False,,,,
317713,"""Pittosporum venulosum""","""species""",,,,,True,,,,
80292,"""Bursera coyucensis""","""species""",,,,,True,,,,
409508,"""Spiraea decumbens""","""species""",,,,,True,,,,
948534,"""Schistidium tenuinerve""","""species""",,,,,True,,,,
246516,"""Cordia oncocalyx""","""species""",,,,,True,,,,
543519,"""Myxobolus leptobarbi""","""species""",59785.0,"""Myxobolus squamalis""","""species""","""relative""",True,"""https://ftp.ncbi.nlm.nih.gov/g…",5712.0,43671844.0,"""Contig"""
862512,"""Corynebacterium accolens ATCC …","""strain""",862512.0,"""Corynebacterium accolens ATCC …","""strain""","""self""",False,"""https://ftp.ncbi.nlm.nih.gov/g…",2360.0,2465636.0,"""Scaffold"""


## Create Additional Lookup Tables for Easy Access

In [13]:
# Create a simplified taxid -> ftp_path lookup table
# This is now trivial since our new API returns clean results
print("\nCreating simplified lookup table...")

lookup_df = mapping_df.select([
    'query_tax_id',
    'query_name',
    'query_rank',
    'reference_tax_id',
    'ftp_path',
    'relationship'
]).rename({
    'query_tax_id': 'tax_id',
    'query_name': 'name',
    'query_rank': 'rank'
})

# Save the simplified lookup
lookup_path = os.path.join(rrna_dir, "taxid_to_ftp_lookup.parquet")
lookup_df.write_parquet(lookup_path)
print(f"Saved simplified lookup to: {lookup_path}")

# Also save as TSV
lookup_df.write_csv(
    os.path.join(rrna_dir, "taxid_to_ftp_lookup.tsv"),
    separator="\t"
)

# Show statistics
with_ftp = lookup_df.filter(pl.col("ftp_path").is_not_null()).height
print("\nLookup table statistics:")
print(f"  Total entries: {lookup_df.height:,}")
print(f"  With FTP paths: {with_ftp:,} ({with_ftp/lookup_df.height*100:.1f}%)")
print("\nBy relationship type:")
print(lookup_df.group_by("relationship").agg(pl.len().alias("count")).sort("count", descending=True))



Creating simplified lookup table...
Saved simplified lookup to: /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/taxid_to_ftp_lookup.parquet

Lookup table statistics:
  Total entries: 172,219
  With FTP paths: 78,353 (45.5%)

By relationship type:
shape: (4, 2)
┌──────────────┬───────┐
│ relationship ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ null         ┆ 93866 │
│ relative     ┆ 50358 │
│ self         ┆ 26937 │
│ ancestor     ┆ 1058  │
└──────────────┴───────┘


## Summary Statistics

In [14]:
print("=" * 70)
print("FINAL SUMMARY")
print("=" * 70)

print("\nInput data:")
print(f"  rRNA sequences:           {rrna_df.height:,}")
print(f"  Unique rRNA tax_ids:      {len(all_rrna_tax_ids):,}")
print(f"  Taxonomy nodes:           {nodes_df.height:,}")
print(f"  GenBank assemblies:       {mini_ncbi.height:,}")

print("\nTree after propagation:")
print(f"  Total nodes with data:    {len(tax_tree.data_available):,}")
print(f"  Leaf nodes with data:     {len(tax_tree.leaf_nodes & tax_tree.data_available):,}")
print(f"  Internal nodes with data: {len(tax_tree.data_available - tax_tree.leaf_nodes):,}")

print("\nMapping results (genus-constrained):")
by_relationship = mapping_df.group_by("relationship").agg(pl.len().alias("count")).sort("count", descending=True)
for row in by_relationship.iter_rows(named=True):
    rel = row['relationship'] or 'no_match'
    count = row['count']
    pct = count / len(all_rrna_tax_ids) * 100
    print(f"  {rel:15s}: {count:8,} ({pct:5.1f}%)")

total_mapped = mapping_df.filter(pl.col("relationship").is_not_null()).height
print(f"\nTotal with genome reference: {total_mapped:,} / {len(all_rrna_tax_ids):,} ({total_mapped/len(all_rrna_tax_ids)*100:.1f}%)")

# Show rank-limited cases
rank_limited = mapping_df.filter(pl.col("max_rank_reached") == True).height
print(f"Cases limited by genus rank: {rank_limited:,} ({rank_limited/len(all_rrna_tax_ids)*100:.1f}%)")
print("=" * 70)


FINAL SUMMARY

Input data:
  rRNA sequences:           371,978
  Unique rRNA tax_ids:      172,219
  Taxonomy nodes:           2,708,826
  GenBank assemblies:       244,185

Tree after propagation:
  Total nodes with data:    213,159
  Leaf nodes with data:     174,662
  Internal nodes with data: 38,497

Mapping results (genus-constrained):
  no_match       :   93,866 ( 54.5%)
  relative       :   50,358 ( 29.2%)
  self           :   26,937 ( 15.6%)
  ancestor       :    1,058 (  0.6%)

Total with genome reference: 78,353 / 172,219 (45.5%)
Cases limited by genus rank: 134,832 (78.3%)
