# Collecting all data for the rRNA decontamination step
The goal is the create a table with the fasta headers, NCBI/SILVA taxonomy (lineage with ;), taxid (if possible), and direct links to fetch the transcriptome (and if missing, the genome).  
Ideally, this could replace requiring NCBI taxdump files as a data dependency, and taxonkit and ncbi-datasets tools as software dependencies.


### First collecting all headers from the rRNA fasta files in the data directory

In [9]:
import polars as pl
import os 
os.chdir("../")
# data_dir = os.environ.get("ROLYPOLY_DATA", "")
# data_dir
data_dir = "/clusterfs/jgi/scratch/science/metagen/neri/projects/data2/data/"

In [10]:
from rolypoly.utils.bio.sequences import find_fasta_files
fastas = find_fasta_files(data_dir)
fastas

[]

In [3]:
from rolypoly.utils.bio.polars_fastx import from_fastx_eager as read_fasta_df
from pathlib import Path
silva_138_rRNAs = Path("/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/SILVA_138_merged_masked.fa")
ncbi_rRNAs = Path("/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/rmdup_rRNA_ncbi_masked.fa")

silva_df = read_fasta_df(silva_138_rRNAs).select("header")
print("SILVA_138")
print(silva_df.shape)
print(silva_df["header"].head().to_list())
print(silva_df["header"].tail().to_list())

ncbi_df = read_fasta_df(ncbi_rRNAs).select("header")
print("NCBI")
print(ncbi_df.shape)
print(ncbi_df["header"].head().to_list())
print(ncbi_df["header"].tail().to_list())


fasta_df = ncbi_df.vstack(silva_df)
print(fasta_df.shape)
fasta_df


SILVA_138
(605794, 1)
['GBGR01001525.53.2586 Eukaryota;Archaeplastida;Chloroplastida;Charophyta;Phragmoplastophyta;Punica granatum (pomegranate)', 'FM179380.569157.572757 Bacteria;Proteobacteria;Alphaproteobacteria;Rickettsiales;Mitochondria;Vitis vinifera (wine grape)', 'HAES01022518.1.2143 Eukaryota;Amorphea;Obazoa;Opisthokonta;Holozoa;Choanozoa;Metazoa;Animalia;BCP clade;Bilateria;Protostomia;Ecdysozoa;Arthropoda;Crustacea;Malacostraca;Eumalacostraca;Peracarida;Proasellus cavaticus', 'GALN01470925.16.3648 Eukaryota;Amorphea;Obazoa;Opisthokonta;Nucletmycea;Fungi;Mucoromycota;Glomeromycotina;Glomeromycetes;Glomerales;Persicaria minor', 'GCIB01000043.146.3312 Bacteria;Proteobacteria;Alphaproteobacteria;Rickettsiales;Mitochondria;Colobanthus quitensis', 'GEHI01004799.1016.4640 Eukaryota;Amorphea;Obazoa;Opisthokonta;Nucletmycea;Fungi;Dikarya;Basidiomycota;Agaricomycotina;Agaricomycetes;Agaricales;Agaricaceae;Leucoagaricus;Leucocoprinus sp. HH-2015b', 'GFGL01112324.103.3394 Eukaryota;Arch

header
str
"""XR_010511770.1 PREDICTED: Musa…"
"""XR_010511769.1 PREDICTED: Musa…"
"""XR_010511768.1 PREDICTED: Musa…"
"""XR_010511767.1 PREDICTED: Musa…"
"""XR_010511766.1 PREDICTED: Musa…"
…
"""JXAL01000015.3677.5237 Bacteri…"
"""JXPC01135172.144.1415 Eukaryot…"
"""JRRC01069566.1254.3048 Eukaryo…"
"""JREP01018504.7711.8917 Bacteri…"


In [None]:
ncbi_df = ncbi_df.with_columns(accession = ncbi_df["header"].str.split_exact(" ",1).struct[0])
ncbi_df

header,accession
str,str
"""XR_010511770.1 PREDICTED: Musa…","""XR_010511770.1"""
"""XR_010511769.1 PREDICTED: Musa…","""XR_010511769.1"""
"""XR_010511768.1 PREDICTED: Musa…","""XR_010511768.1"""
"""XR_010511767.1 PREDICTED: Musa…","""XR_010511767.1"""
"""XR_010511766.1 PREDICTED: Musa…","""XR_010511766.1"""
…,…
"""NR_076132.1 Xylanimonas cellul…","""NR_076132.1"""
"""NR_076131.1 Stackebrandtia nas…","""NR_076131.1"""
"""NR_076128.1 Jonesia denitrific…","""NR_076128.1"""
"""NR_076127.1 Micrococcus luteus…","""NR_076127.1"""


In [26]:
ncbi_df["accession"].n_unique()

72352

In [None]:
ncbi_df["header"].str.slice(0,4).value_counts()

header,count
str,u32
"""XR_9""",2
"""XR_8""",122
"""XR_6""",13
"""NR_0""",8815
"""NR_1""",19444
"""XR_7""",7
"""XR_0""",43935
"""XR_5""",12
"""XR_4""",2


In [None]:
# %%bash
# # silva --> NCBI mapping
# wget https://ftp.arb-silva.de/current/Exports/taxonomy/taxmap_slv_ssu_ref_138.2.txt.gz --quiet \
#     --output-document=/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/taxmap_slv_ssu_ref_138.2.txt.gz
# wget https://ftp.arb-silva.de/current/Exports/taxonomy/taxmap_slv_lsu_ref_138.2.txt.gz --quiet \
#     --output-document=/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/taxmap_slv_lsu_ref_138.2.txt.gz

# # NCBI
# wget https://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/release231.accession2geneid.gz --quiet \
#     --output-document=/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/ncbi_release231.accession2geneid.gz


In [17]:
silva_taxmap = pl.scan_csv("/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/*txt.gz", has_header=True, separator="\t")

In [41]:
silva_taxmap = silva_taxmap.collect()
silva_taxmap = silva_taxmap.with_columns(pl.col("taxid").cast(pl.String))


In [None]:
ncbi_refseq_accession2geneid_lf = pl.scan_csv(
    "/home/neri/Documents/GitHub/rolypoly/src/rolypoly/data/rRNA/ncbi_release231.accession2geneid.gz",
      has_header=False,
      schema={"taxid":pl.String,"entrez_geneid":pl.String,"transcript_accession.version":pl.String,"protein_accession.version":pl.String},
        separator="\t",
        null_values=["na"])
set_1 = (pl.col("transcript_accession.version").is_in(ncbi_df["accession"]))
set_2 = (pl.col("protein_accession.version").is_in(ncbi_df["accession"]))

In [None]:
ncbi_refseq_accession2geneid_1 = ncbi_refseq_accession2geneid_lf.filter(set_1).collect()
ncbi_refseq_accession2geneid_2 = ncbi_refseq_accession2geneid_lf.filter(set_2).collect()

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  ncbi_refseq_accession2geneid_1 = ncbi_refseq_accession2geneid_lf.filter(set_1).collect()
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  ncbi_refseq_accession2geneid_2 = ncbi_refseq_accession2geneid_lf.filter(set_2).collect()


In [35]:
print(ncbi_refseq_accession2geneid_2.shape)
print(ncbi_refseq_accession2geneid_1.shape)

(0, 4)
(43300, 4)


In [40]:
ncbi_refseq_accession2geneid_1

taxid,entrez_geneid,transcript_accession.version,protein_accession.version
str,str,str,str
"""1002788""","""128980604""","""XR_008489486.1""","""na"""
"""10029""","""100689259""","""NR_045212.1""","""na"""
"""10029""","""100689260""","""NR_045132.1""","""na"""
"""10029""","""113838061""","""XR_003488590.1""","""na"""
"""10029""","""113838089""","""XR_003488597.1""","""na"""
…,…,…,…
"""9983""","""133755685""","""XR_009865461.1""","""na"""
"""9986""","""100328976""","""NR_033238.1""","""na"""
"""9995""","""124081442""","""XR_006848041.1""","""na"""
"""9995""","""124081443""","""XR_006848042.1""","""na"""


In [37]:
ncbi_df.filter(~pl.col("accession").is_in(ncbi_refseq_accession2geneid_1["transcript_accession.version"]))

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  ncbi_df.filter(~pl.col("accession").is_in(ncbi_refseq_accession2geneid_1["transcript_accession.version"]))


header,accession
str,str
"""XR_010541776.1 PREDICTED: Para…","""XR_010541776.1"""
"""XR_010541775.1 PREDICTED: Para…","""XR_010541775.1"""
"""XR_010541768.1 PREDICTED: Para…","""XR_010541768.1"""
"""XR_010541740.1 PREDICTED: Para…","""XR_010541740.1"""
"""XR_010541728.1 PREDICTED: Para…","""XR_010541728.1"""
…,…
"""NR_076132.1 Xylanimonas cellul…","""NR_076132.1"""
"""NR_076131.1 Stackebrandtia nas…","""NR_076131.1"""
"""NR_076128.1 Jonesia denitrific…","""NR_076128.1"""
"""NR_076127.1 Micrococcus luteus…","""NR_076127.1"""


some of these are obsolete on ncbi

In [43]:
silva_taxmap.filter(pl.col("taxid").is_in(ncbi_refseq_accession2geneid_1["taxid"]))

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  silva_taxmap.filter(pl.col("taxid").is_in(ncbi_refseq_accession2geneid_1["taxid"]))


primaryAccession,start,stop,path,organism_name,taxid
str,i64,i64,str,str,str
"""EF990231""",1,2684,"""Eukaryota;SAR;Stramenopiles;Oc…","""Cutleria multifida""","""30331"""
"""GAJI01010608""",1,3352,"""Eukaryota;Amorphea;Obazoa;Opis…","""Pyrenochaeta lycopersici CRA-P…","""29053"""
"""GEBJ01024281""",117,3397,"""Eukaryota;Archaeplastida;Chlor…","""Casuarina equisetifolia""","""29729"""
"""GEZV01040304""",3076,6431,"""Eukaryota;Archaeplastida;Chlor…","""Dendrobium catenatum""","""29780"""
"""GCJU01003965""",19,3653,"""Eukaryota;Amorphea;Obazoa;Opis…","""Zymoseptoria tritici""","""29031"""
…,…,…,…,…,…
"""ASJV01119400""",4195,5992,"""Eukaryota;Archaeplastida;Chlor…","""Capsicum annuum var. glabriusc…","""10129"""
"""ASJV01119702""",4103,5877,"""Eukaryota;Archaeplastida;Chlor…","""Capsicum annuum var. glabriusc…","""10129"""
"""MHAK01000049""",11645,13192,"""Bacteria;Candidatus Kryptonia;…","""Ignavibacteria bacterium RIFCS…","""59916"""
"""CXWL01090914""",4018,5799,"""Eukaryota;SAR;Stramenopiles;Oc…","""groundwater metagenome""","""9733"""


In [None]:
# ncbi_refseq_accession2geneid["protein_accession.version"]

protein_accession.version
str
"""na"""
"""na"""
"""na"""
"""na"""
"""na"""
…
"""na"""
"""na"""
"""na"""
"""na"""
