## Setup and Imports

In [1]:
import os
from pathlib import Path
import polars as pl
from tqdm.notebook import tqdm

from rolypoly.utils.bio.trees import TaxonomyTree
from rolypoly.utils.bio.sequences import write_fasta_file
from rolypoly.utils.logging.loggit import setup_logging

# Configuration
threads = 6
data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data"
log_file = "notebooks/Exprimental/rrna_genome_mapping.log"

logger = setup_logging(log_file)
print(f"Working directory: {data_dir}")

# Directory structure
rrna_dir = os.path.join(data_dir, "contam", "rrna")
taxonomy_dir = os.path.join(data_dir, "taxdump")
os.makedirs(rrna_dir, exist_ok=True)
os.makedirs(taxonomy_dir, exist_ok=True)

Working directory: /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data


## Load Input Data

In [2]:
# Load NCBI taxonomy data
print("Loading NCBI taxonomy...")
nodes_path = os.path.join(taxonomy_dir, "nodes.dmp")
names_path = os.path.join(taxonomy_dir, "names.dmp")

# Parse nodes.dmp
records = []
with open(nodes_path, "r") as f:
    for line in tqdm(f, desc="Reading nodes"):
        fields = line.rstrip("\n").rstrip("\t|").split("\t|\t")
        records.append({
            "tax_id": int(fields[0]),
            "parent_tax_id": int(fields[1]),
            "rank": fields[2]
        })
nodes_df = pl.from_records(records)
print(f"Loaded {nodes_df.height:,} taxonomy nodes")

# Parse names.dmp (scientific names only)
records = []
with open(names_path, "r") as f:
    for line in tqdm(f, desc="Reading names"):
        fields = line.rstrip("\n").rstrip("\t|").split("\t|\t")
        if fields[3] == "scientific name":
            records.append({
                "tax_id": int(fields[0]),
                "scientific_name": fields[1]
            })
sci_name_df = pl.from_records(records)
print(f"Loaded {sci_name_df.height:,} scientific names")

Loading NCBI taxonomy...


Reading nodes: 0it [00:00, ?it/s]

Loaded 2,708,826 taxonomy nodes


Reading names: 0it [00:00, ?it/s]

Loaded 2,708,826 scientific names


In [3]:
# Load rRNA sequences
print("\nLoading rRNA sequences...")
rrna_fasta_path = os.path.join(rrna_dir, "ncbi_rRNA_all_sequences.fasta")

# Load rRNA data table
rrna_df = pl.scan_csv(
    os.path.join(rrna_dir, "*.tab"),
    separator="@",
    has_header=False,
    null_values=["N/A"],
    new_columns=["taxid", "header", "name", "sequence"],
    include_file_paths="type"
).collect()

rrna_df = rrna_df.with_columns(
    pl.col("type").str.extract(r"([^/]+)\.tab$", 1).alias("rRNA_type"),
    pl.col("taxid").cast(pl.Int64)
).drop("type", "name")

print(f"Loaded {rrna_df.height:,} rRNA sequences")
print(f"Unique tax_ids in rRNA data: {rrna_df['taxid'].n_unique():,}")


Loading rRNA sequences...
Loaded 159,996 rRNA sequences
Unique tax_ids in rRNA data: 105,793
Loaded 159,996 rRNA sequences
Unique tax_ids in rRNA data: 105,793


In [4]:
# Load GenBank assembly summary
print("\nLoading GenBank assembly data...")
genbank_summary = pl.read_parquet(
    os.path.join(rrna_dir, "genbank_assembly_summary.parquet")
)

# Select best assembly per tax_id
mini_genebank = genbank_summary.sort(
    by=[
        pl.col("refseq_category").reverse(),
        pl.col("protein_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("non_coding_gene_count").cast(pl.Int64).reverse(),
        pl.col("seq_rel_date").reverse(),
        pl.col("genome_size").reverse(),
    ]
).unique(subset=["ncbi_taxonid"], keep="first")

print(f"Selected {mini_genebank.height:,} best assemblies")


Loading GenBank assembly data...
Selected 242,839 best assemblies
Selected 242,839 best assemblies


## Build Taxonomy Tree

In [5]:
# Prepare nodes with names
print("\nBuilding taxonomy tree...")
nodes_with_names = nodes_df.join(
    sci_name_df,
    on="tax_id",
    how="left"
).select(["tax_id", "parent_tax_id", "rank", "scientific_name"])

# Prepare data availability with metadata
data_available = mini_genebank.filter(
    pl.col("total_gene_count") > 0
).select([
    pl.col("ncbi_taxonid").alias("tax_id"),
    "ftp_path",
    "protein_coding_gene_count",
    "total_gene_count",
    "genome_size",
    "assembly_level",
    "organism_name"
])

# Create tree
tax_tree = TaxonomyTree(
    nodes_with_names,
    data_availability_df=data_available,
    priority_columns=['protein_coding_gene_count', 'genome_size']
)

# Propagate data to ancestors
print("\nPropagating data to ancestor nodes...")
nodes_updated = tax_tree.propagate_data_to_ancestors(aggregation_method='any')
print(f"Updated {nodes_updated:,} ancestor nodes with descendant data")

# Print statistics
tax_tree.print_stats()


Building taxonomy tree...
Built tree with 2708826 nodes, 2457509 leaf nodes
Built tree with 2708826 nodes, 2457509 leaf nodes
Set data availability for 165650 tax_ids
Stored metadata for 165650 tax_ids

Propagating data to ancestor nodes...

Propagating data to ancestor nodes using method: any
Set data availability for 165650 tax_ids
Stored metadata for 165650 tax_ids

Propagating data to ancestor nodes...

Propagating data to ancestor nodes using method: any
Propagation complete:
  Initial nodes with data: 165,650
  Nodes updated: 29,323
  Final nodes with data: 194,973
Updated 29,323 ancestor nodes with descendant data
Propagation complete:
  Initial nodes with data: 165,650
  Nodes updated: 29,323
  Final nodes with data: 194,973
Updated 29,323 ancestor nodes with descendant data
TAXONOMY TREE STATISTICS

Tree Structure:
  Total nodes:          2,708,826
  Leaf nodes:           2,457,509
  Internal nodes:       251,317

Data Availability:
  Nodes with data:      194,973
  Leaves wi

## Map rRNA Sequences to Genome References (Genus-Constrained)

In [6]:
# Get all rRNA tax_ids
all_rrna_tax_ids = rrna_df["taxid"].unique().to_list()
print(f"\nMapping {len(all_rrna_tax_ids):,} rRNA tax_ids to genome references...")
print("Search constraints: max_distance=10, max_rank='genus'\n")

# Run unified search
results, stats = tax_tree.find_nearest_with_data_unified_batch(
    tax_ids=all_rrna_tax_ids,
    max_distance=10,
    max_rank='genus',
    include_leaves=True,
    include_ancestors=True,
    return_stats=True
)

# Print statistics
print("Results:")
print(f"  Total queried:         {stats['total_queried']:,}")
print(f"  Self has data:         {stats['self_has_data']:,} ({stats['self_has_data']/stats['total_queried']*100:.1f}%)")
print(f"  Ancestor found:        {stats['ancestor_found']:,} ({stats['ancestor_found']/stats['total_queried']*100:.1f}%)")
print(f"  Leaf relatives found:  {stats['leaves_found']:,} ({stats['leaves_found']/stats['total_queried']*100:.1f}%)")
print(f"  No match:              {stats['no_match_found']:,} ({stats['no_match_found']/stats['total_queried']*100:.1f}%)")
print(f"  Limited by genus rank: {stats['max_rank_limited']:,} ({stats['max_rank_limited']/stats['total_queried']*100:.1f}%)")

total_with_ref = stats['self_has_data'] + stats['ancestor_found'] + stats['leaves_found']
print(f"\nOverall coverage: {total_with_ref:,} / {stats['total_queried']:,} ({total_with_ref/stats['total_queried']*100:.1f}%)")


Mapping 105,793 rRNA tax_ids to genome references...
Search constraints: max_distance=10, max_rank='genus'

Results:
  Total queried:         105,793
  Self has data:         17,540 (16.6%)
  Ancestor found:        21,747 (20.6%)
  Leaf relatives found:  21,994 (20.8%)
  No match:              66,215 (62.6%)
  Limited by genus rank: 87,250 (82.5%)

Overall coverage: 61,281 / 105,793 (57.9%)
Results:
  Total queried:         105,793
  Self has data:         17,540 (16.6%)
  Ancestor found:        21,747 (20.6%)
  Leaf relatives found:  21,994 (20.8%)
  No match:              66,215 (62.6%)
  Limited by genus rank: 87,250 (82.5%)

Overall coverage: 61,281 / 105,793 (57.9%)


## Extract and Save Mapping Results

In [7]:
# Build mapping table
print("\nBuilding mapping table...")
mapping_records = []

for tax_id, result in tqdm(results.items(), desc="Processing results"):
    query_info = result['query_info']
    
    # Base record
    base_record = {
        'query_tax_id': tax_id,
        'query_name': query_info['name'],
        'query_rank': query_info['rank'],
        'self_has_data': result['self_has_data'],
        'max_rank_reached': result['max_rank_reached']
    }
    
    # Ancestor data
    if result['ancestor']:
        anc = result['ancestor']
        meta = anc.get('metadata', {})
        base_record.update({
            'ancestor_tax_id': anc['tax_id'],
            'ancestor_name': anc['name'],
            'ancestor_rank': anc['rank'],
            'ancestor_distance': anc['distance'],
            'ancestor_ftp_path': meta.get('ftp_path'),
            'ancestor_data_source': meta.get('data_source', 'direct'),
            'ancestor_descendant_count': meta.get('descendant_count_with_data')
        })
    
    # Leaf relatives (take closest)
    if result['leaves']:
        leaf = result['leaves'][0]  # First leaf (closest/highest priority)
        meta = leaf.get('metadata', {})
        base_record.update({
            'leaf_tax_id': leaf['tax_id'],
            'leaf_name': leaf['name'],
            'leaf_distance': leaf['distance'],
            'leaf_ftp_path': meta.get('ftp_path'),
            'leaf_protein_genes': meta.get('protein_coding_gene_count'),
            'leaf_genome_size': meta.get('genome_size'),
            'total_leaf_relatives': len(result['leaves'])
        })
    
    mapping_records.append(base_record)

# Create mapping DataFrame
mapping_df = pl.from_records(mapping_records)
print(f"Created mapping table with {mapping_df.height:,} entries")

# Save results
output_path = os.path.join(rrna_dir, "rrna_to_genome_mapping.parquet")
mapping_df.write_parquet(output_path)
print(f"\nSaved mapping table to: {output_path}")

# Also save as TSV for easy viewing
mapping_df.write_csv(
    os.path.join(rrna_dir, "rrna_to_genome_mapping.tsv"),
    separator="\t"
)

# Show sample
print("\nSample mappings:")
mapping_df.head(10)


Building mapping table...


Processing results:   0%|          | 0/105793 [00:00<?, ?it/s]

Created mapping table with 105,793 entries

Saved mapping table to: /clusterfs/jgi/scratch/science/metagen/neri/code/rolypoly/data/contam/rrna/rrna_to_genome_mapping.parquet

Sample mappings:


query_tax_id,query_name,query_rank,self_has_data,max_rank_reached,ancestor_tax_id,ancestor_name,ancestor_rank,ancestor_distance,ancestor_ftp_path,ancestor_data_source,ancestor_descendant_count,leaf_tax_id,leaf_name,leaf_distance,leaf_ftp_path,leaf_protein_genes,leaf_genome_size,total_leaf_relatives
i64,str,str,bool,bool,i64,str,str,i64,str,str,i64,i64,str,i64,str,i64,i64,i64
0,,,False,False,,,,,,,,,,,,,,
7,"""Azorhizobium caulinodans""","""species""",True,False,7.0,"""Azorhizobium caulinodans""","""species""",0.0,,"""propagated""",1.0,,,,,,,
9,"""Buchnera aphidicola""","""species""",True,False,9.0,"""Buchnera aphidicola""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,
17,"""Methylophilus methylotrophus""","""species""",True,False,17.0,"""Methylophilus methylotrophus""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,
21,"""Phenylobacterium immobile""","""species""",False,True,20.0,"""Phenylobacterium""","""genus""",1.0,,"""propagated""",32.0,2823693.0,"""Phenylobacterium montanum""",2.0,"""https://ftp.ncbi.nlm.nih.gov/g…",5079.0,5649484.0,12.0
23,"""Shewanella colwelliana""","""species""",True,False,23.0,"""Shewanella colwelliana""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,
24,"""Shewanella putrefaciens""","""species""",True,False,24.0,"""Shewanella putrefaciens""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,
25,"""Shewanella hanedai""","""species""",True,False,25.0,"""Shewanella hanedai""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,25.0,"""Shewanella hanedai""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…",4887.0,5939027.0,1.0
33,"""Myxococcus fulvus""","""species""",True,False,33.0,"""Myxococcus fulvus""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,
34,"""Myxococcus xanthus""","""species""",True,False,34.0,"""Myxococcus xanthus""","""species""",0.0,"""https://ftp.ncbi.nlm.nih.gov/g…","""direct""",,,,,,,,


## Summary Statistics

In [None]:
print(f"\nInput data:")
print(f"  rRNA sequences:           {rrna_df.height:,}")
print(f"  Unique rRNA tax_ids:      {len(all_rrna_tax_ids):,}")
print(f"  Taxonomy nodes:           {nodes_df.height:,}")
print(f"  GenBank assemblies:       {mini_genebank.height:,}")

print(f"\nTree after propagation:")
print(f"  Total nodes with data:    {len(tax_tree.data_available):,}")
print(f"  Leaf nodes with data:     {len(tax_tree.leaf_nodes & tax_tree.data_available):,}")
print(f"  Internal nodes with data: {len(tax_tree.data_available - tax_tree.leaf_nodes):,}")

print(f"\nMapping results (genus-constrained):")
with_ancestor = mapping_df.filter(pl.col("ancestor_tax_id").is_not_null()).height
with_leaf = mapping_df.filter(pl.col("leaf_tax_id").is_not_null()).height
with_self = mapping_df.filter(pl.col("self_has_data") == True).height
no_match = mapping_df.filter(
    (pl.col("ancestor_tax_id").is_null()) & 
    (pl.col("leaf_tax_id").is_null()) & 
    (pl.col("self_has_data") == False)
).height

print(f"  Self has data:            {with_self:,} ({with_self/len(all_rrna_tax_ids)*100:.1f}%)")
print(f"  With ancestor reference:  {with_ancestor:,} ({with_ancestor/len(all_rrna_tax_ids)*100:.1f}%)")
print(f"  With leaf reference:      {with_leaf:,} ({with_leaf/len(all_rrna_tax_ids)*100:.1f}%)")
print(f"  No reference found:       {no_match:,} ({no_match/len(all_rrna_tax_ids)*100:.1f}%)")

total_mapped = with_self + with_ancestor + with_leaf
print(f"\nTotal with genome reference: {total_mapped:,} / {len(all_rrna_tax_ids):,} ({total_mapped/len(all_rrna_tax_ids)*100:.1f}%)")
print("=" * 70)

FINAL SUMMARY

Input data:
  rRNA sequences:           159,996
  Unique rRNA tax_ids:      105,793
  Taxonomy nodes:           2,708,826
  GenBank assemblies:       242,839

Tree after propagation:
  Total nodes with data:    194,973
  Leaf nodes with data:     161,573
  Internal nodes with data: 33,400

Mapping results (genus-constrained):
  Self has data:            17,540 (16.6%)
  With ancestor reference:  39,287 (37.1%)
  With leaf reference:      35,768 (33.8%)
  No reference found:       66,215 (62.6%)

Total with genome reference: 92,595 / 105,793 (87.5%)
