# Vector Retrieval Experiment

In [1]:
import pandas as pd
from data_gatherer.data_gatherer import DataGatherer
from data_gatherer.parser.xml_parser import XMLParser
from data_gatherer.parser.html_parser import HTMLParser
from data_gatherer.logger_setup import setup_logging
from data_gatherer.retriever.embeddings_retriever import EmbeddingsRetriever
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from lxml import etree
import dspy
import logging
import re
import os
import time
from collections import defaultdict

In [2]:
# Create the notebook-wide logger through the project's setup_logging helper.
# NOTE(review): the second argument looks like the log-file path — confirm against setup_logging.
logger = setup_logging(
    'vector_retrieval_experiment',
    './logs/vector_retrieval_experiment.log',
    level=logging.INFO,
)

In [3]:
# Mark the beginning of the run in the log so later records can be attributed to it.
logger.info("Starting vector retrieval experiment")

[97m1301467205.py - line 1 - INFO - Starting vector retrieval experiment[0m


## 1. Load corpus and ground truth

In [4]:
# Publication corpus (alternatively: load HTML and extract the text).
input_corpus = pd.read_parquet(
    'scripts/exp_input/Local_fetched_data_1.parquet'
)
# Gold dataset-citation records; adjust the path as needed.
ground_truth = pd.read_parquet(
    'scripts/output/gold/dataset_citation_records_Table.parquet'
)

# Record the dimensions of both inputs in the log.
logger.info(f"Corpus shape: {input_corpus.shape}")
logger.info(f"Ground truth shape: {ground_truth.shape}")

[97m2826339252.py - line 5 - INFO - Corpus shape: (2190, 7)[0m
[97m2826339252.py - line 6 - INFO - Ground truth shape: (401327, 7)[0m


In [5]:
# Derive a `pmc_id` column from the first "PMC<digits>" token (any letter case)
# found in the citing-publication link; links without such a token yield NaN.
ground_truth['pmc_id'] = ground_truth['citing_publication_link'].str.extract(
    r'(PMC\d+)',
    flags=re.IGNORECASE,
    expand=False,
)

In [6]:
input_corpus.head()  # Preview the first rows to check the corpus columns

Unnamed: 0,pub_title,file_name,raw_cont,format,length,path,publication
0,Evolution of regulatory signatures in primate ...,PMC7668098__Evolution of regulatory signatures...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,167515,scripts/tmp/raw_files/PMC/PMC7668098__Evolutio...,pmc7668098
1,Functional annotation of noncoding mutations i...,PMC8321657__Functional annotation of noncoding...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,206647,scripts/tmp/raw_files/PMC/PMC8321657__Function...,pmc8321657
2,Lung adenocarcinomas without driver genes conv...,PMC11070398__Lung adenocarcinomas without driv...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,170204,scripts/tmp/raw_files/PMC/PMC11070398__Lung ad...,pmc11070398
3,SPNeoDeath_ A demographic and epidemiological ...,PMC7419335__SPNeoDeath_ A demographic and epid...,"<html lang=""en"" class=""""><head>\n\n <me...",html,183668,scripts/tmp/raw_files/PMC/PMC7419335__SPNeoDea...,pmc7419335
4,"O-linked α2,3 sialylation defines stem cell po...","PMC8741191__O-linked α2,3 sialylation defines ...","\n<!DOCTYPE html>\n<html lang=""en"" >\n <hea...",html,230136,scripts/tmp/raw_files/PMC/PMC8741191__O-linked...,pmc8741191


In [7]:
ground_truth.head()  # Preview the first rows to check the ground-truth columns (including the derived pmc_id)

Unnamed: 0,identifier,repository,citing_publication_link,citation_record_source,citation_record_from_doi,doi,pmcid,pmc_id
0,PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1,proteomexchange_search.tsv,1,10.1038/S41467-025-56720-1,,
1,PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312,proteomexchange_search.tsv,1,10.6019/PXD051312,,
2,PXD051312,PRIDE,https://dx.doi.org/10.1002/prca.202400095,proteomexchange_search.tsv,1,10.1002/prca.202400095,,
3,PXD051312,PRIDE,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,proteomexchange_search.tsv,0,,PMC11895760,PMC11895760
4,PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571,proteomexchange_search.tsv,1,10.17159/SAJS.2025/18571,,


In [8]:
# Candidate retrieval queries, from most to least detailed.
# Enhanced query using hackathon context trigger keywords: names repositories,
# identifier prefixes, and typical deposition phrases explicitly.
query_ontology_aware = """Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
"""
# Phrase-focused variant: deposition/access wording without repository names.
query_augmented = """Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data archive, data portal, accession code"""
# Minimal keyword baseline.
query_base = "Available data, accession code, data repository, deposited data"

# Baseline reworded around section-style headings.
query_base_adj = "Data Availability Statement, Methods with dataset mention(s), Deposited data, Data Accession, Data Provenance, Downloaded data."

# Active query: the variant that the following cell passes to embed_query.
query = query_ontology_aware

In [9]:
# Build one parser per supported publication format (XML and HTML) and have each
# parser's embeddings retriever embed the active `query` right after construction.
# NOTE(review): the evaluation cell calls search(query=None), which presumably reuses
# the query embedding computed here — confirm against EmbeddingsRetriever.
xml_parser = XMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True)
xml_parser.embeddings_retriever.embed_query(query)
html_parser = HTMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True)
html_parser.embeddings_retriever.embed_query(query)

[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 19 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mxml_parser.py - line 25 - INFO - Initializing xmlRetriever[0m
[97membeddings_retriever.py - line 36 - INFO - Metal Performance Shaders available - using Apple Silicon acceleration[0m
[97membeddings_retriever.py - line 48 - INFO - Initialized model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)[0m
[97membeddings_retriever.py - line 54 - INFO - Using model's actual max sequence length: 256[0m
[97mbase_parser.py - line 45 - INFO - L

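Note: because the ground truth is incomplete, some publications are skipped during evaluation — specifically, those for which none of the ground-truth identifiers appear in the publication's raw content.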

In [10]:
# Systematic evaluation of different models and top-k values (OPTIMIZED)
#
# For every (query, model) pair this cell walks the first `max_publications`
# rows of `input_corpus`, embeds each publication's sections, retrieves the
# `max_k` closest passages once, and scores recall@k for every k in
# `topk_values` by slicing that single ranked list.
#
# Per-publication recall@k = (# distinct ground-truth identifiers that occur in
# the raw content AND in the top-k retrieved text) / (# distinct ground-truth
# identifiers that occur in the raw content). Publications with no identifier
# in their raw content are skipped. The reported recall is the mean over the
# publications that were not skipped.

# Define models to test
# NOTE(review): within this cell `model_name` is only used for log messages and
# as part of the result key; nothing here hands it to the parsers' embeddings
# retrievers, so every entry is scored with whatever model the parsers were
# constructed with. Wire up model switching before enabling more than one entry.
models_to_test = [
    # Base models
    'sentence-transformers/all-MiniLM-L6-v2',
    #'sentence-transformers/all-mpnet-base-v2',
    ###'sentence-transformers/all-MiniLM-L12-v2',
    #'sentence-transformers/sentence-t5-base',

    # BioMed
    #"neuml/pubmedbert-base-embeddings",
    #'sentence-transformers/allenai-specter',
    #'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext',

    # MSMARCO passage ranking models
    ###'sentence-transformers/msmarco-distilbert-base-v4',
    #'sentence-transformers/msmarco-bert-base-dot-v5',
    #'sentence-transformers/msmarco-distilbert-dot-v5',
    #'sentence-transformers/msmarco-distilbert-base-tas-b',

    # Paraphrase models
    ###'sentence-transformers/paraphrase-MiniLM-L3-v2',

    # Semantic Search
    ###'sentence-transformers/multi-qa-mpnet-base-cos-v1',
    #'sentence-transformers/multi-qa-distilbert-cos-v1',
    ###'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'

    # https://huggingface.co/Snowflake/snowflake-arctic-embed-m
]

# Define top-k values to test
topk_values = [1, 3, 5, 9]
max_k = max(topk_values)  # We'll retrieve this many and slice for smaller k values

# Only the first `max_publications` rows of the corpus are evaluated, for faster testing.
max_publications = 10

# Store results - results[f"{model}_{query_name}"][k] -> metrics dict
results = defaultdict(lambda: defaultdict(dict))

logger.info("Starting OPTIMIZED systematic evaluation with enhanced query")
logger.info(f"Testing {len(models_to_test)} models with top-k values {topk_values}")
logger.info("Optimization: Single model load per model, reuse for all publications")

queries = ['query_ontology_aware', 'query_augmented', 'query_base', 'query_base_adj']

# Lower-cased PMC ids, computed once instead of once per publication.
gt_pmc_ids_lower = ground_truth['pmc_id'].str.lower()

for q_i,query in enumerate([query_ontology_aware, query_augmented, query_base, query_base_adj]):
    # Only the first query variant is evaluated for now.
    if q_i != 0:
        continue
    q_name = queries[q_i]
    logger.info(f"Query {q_i} {q_name}: {query}")

    # search(query=None) below presumably falls back to the query embedding held
    # by the retriever, so embed the query under evaluation explicitly here
    # rather than relying on whichever query an earlier cell happened to embed.
    xml_parser.embeddings_retriever.embed_query(query)
    html_parser.embeddings_retriever.embed_query(query)

    for model_name in models_to_test:
        logger.info(f"\n{'='*60}")
        logger.info(f"Testing model: {model_name}")
        logger.info(f"{'='*60}")

        # Running sum of per-publication recall for each k, and the number of
        # publications that contributed (i.e. were not skipped).
        recalls = {k: 0 for k in topk_values}
        cnt = 0
        start_time = time.time()

        # Clean up previous embeddings
        if os.path.exists("corpus_embeddings.npy"):
            os.remove("corpus_embeddings.npy")

        logger.info(f"Model {model_name} loaded successfully!")

        for i, publication in input_corpus.head(max_publications).iterrows():
            logger.info(f"Publication: {publication['publication']}")

            gt = ground_truth[gt_pmc_ids_lower == publication['publication'].lower()]
            idnts = gt['identifier'].tolist()

            logger.info(f"Identifiers in ground truth: {idnts}")

            # Pick the parser for this format before doing any expensive work.
            if publication['format'] == 'xml':
                parser = xml_parser
            elif publication['format'] == 'html':
                parser = html_parser
            else:
                logger.warning(f"Unsupported format {publication['format']} for publication {publication['publication']}. Skipping.")
                continue

            raw_cont = publication['raw_cont']

            # Distinct ground-truth identifiers that literally occur in the raw
            # content. dict.fromkeys de-duplicates while keeping first-seen
            # order; non-string values (e.g. NaN) cannot occur in text and
            # would make the `in` test raise, so they are dropped.
            idnts_in_cont = [
                idnt for idnt in dict.fromkeys(idnts)
                if isinstance(idnt, str) and idnt in raw_cont
            ]

            logger.info(f"Identifiers in content: {idnts_in_cont}")

            # Nothing to recall for this publication: skip it before parsing.
            if not idnts_in_cont:
                continue

            if parser is xml_parser:
                sections = xml_parser.extract_sections_from_xml(etree.fromstring(raw_cont.encode('utf-8')))
            else:
                clean_html = html_parser.normalize_HTML(raw_cont)
                sections = html_parser.extract_sections_from_html(clean_html)
            sections = parser.from_sections_to_corpus(sections)

            # Counted before retrieval on purpose: a publication whose
            # embedding/search fails below still counts, with zero recall.
            cnt += 1

            # Prepare corpus: prefix each chunk with its section title.
            corpus = [
                {'sec_txt': 'Section Title: ' + section['section_title'] +
                            '. Content: ' + section['sec_txt']}
                for section in sections
            ]

            # Joined outside the f-string: a backslash inside an f-string
            # replacement field is a SyntaxError before Python 3.12.
            corpus_text = '\n'.join(item['sec_txt'] for item in corpus)
            logger.info(f"Corpus:\n{corpus_text}")

            # OPTIMIZATION: Only embed corpus (model already loaded)
            try:
                parser.embeddings_retriever.embed_corpus(corpus, batch_size=128)

                # OPTIMIZATION: Single retrieval with max_k, then slice for different k values
                full_result = parser.embeddings_retriever.search(query=None, k=max_k)

                for full_result_item in full_result:
                    logger.info(f"L2 Norm {full_result_item['L2_distance']} --> {full_result_item['text'][:150]}")

                # Evaluate for all top-k values using the same retrieval result
                for topk_docs_to_retrieve in topk_values:
                    logger.info(f"Evaluating with top-k = {topk_docs_to_retrieve}")

                    # Slice results for current k value
                    result = full_result[:topk_docs_to_retrieve]

                    # Combine all retrieved text; lower-case once for matching.
                    iterres = '. '.join([r['text'] for r in result])
                    retrieved_lower = iterres.lower()

                    # Score against the same identifier set that forms the
                    # denominator, so each identifier counts at most once and
                    # per-publication recall stays within [0, 1].
                    matches = {
                        idnt for idnt in idnts_in_cont
                        if idnt.lower() in retrieved_lower
                    }
                    not_matched = set(idnts_in_cont) - matches
                    recalls[topk_docs_to_retrieve] += len(matches) / len(idnts_in_cont)

                    logger.info(f"Publication {publication['publication']}, Top-k {topk_docs_to_retrieve}: Found {len(matches)} matches out of {len(idnts_in_cont)} ground truth")

                    logger.info(f"Missed citations: {not_matched}")

            except Exception as e:
                # Best-effort: log the failure and move on to the next publication.
                logger.error(f"Error processing publication {i+1} with model {model_name}: {e}")
                continue

        # Calculate final recalls and store results for all top-k values
        elapsed_time = time.time() - start_time

        for topk_docs_to_retrieve in topk_values:
            final_recall = recalls[topk_docs_to_retrieve]/cnt if cnt > 0 else 0

            # Store results - Create a unique key combining model and query
            result_key = f"{model_name}_{q_name}"
            results[result_key][topk_docs_to_retrieve] = {
                'recall': final_recall,
                'processed_docs': cnt,
                'time_seconds': elapsed_time,
                'query': q_name,
                'model': model_name
            }

            logger.info(f"Model: {model_name}, Top-k: {topk_docs_to_retrieve}, Recall: {final_recall:.4f}")

        logger.info(f"Total time for model {model_name}: {elapsed_time:.2f}s")

logger.info(f"\n{'='*60}")
logger.info("OPTIMIZED evaluation completed!")
logger.info(f"{'='*60}")

[97m4152910639.py - line 40 - INFO - Starting OPTIMIZED systematic evaluation with enhanced query[0m
[97m4152910639.py - line 41 - INFO - Testing 1 models with top-k values [1, 3, 5, 9][0m
[97m4152910639.py - line 42 - INFO - Optimization: Single model load per model, reuse for all publications[0m
[97m4152910639.py - line 50 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like d

Embedding corpus of 66 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9043202996253967 --> Section Title: Data Availability.. Content: Data Availability.
The data reported in this paper have been deposited in the Gene Expression Omnibus data[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.2487478256225586 --> Section Title: Results. Content: Results
Next, in each neuronal subtype, we identified DA GREs and differentially expressed (DE) genes that were enric[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.275949478149414 --> Section Title: Results. Content: Results
We produced high-quality H3K27ac ChIP-seq datasets ( Dataset S2 ), with biological replicates showing strong [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.3142105340957642 --> Section Title: RNA-Seq.. Content: RNA-Seq.
RNA wa

Embedding time: 0.46s (0.007s per chunk)
Corpus embedding completed. Shape: (66, 384)
Chunking results: 66 original documents → 66 embedded chunks


[97mxml_parser.py - line 312 - INFO - Section 'Introduction' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Results' split into 54 chunks from 54 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Functional annotations to prioritize noncoding mutations' split into 13 chunks from 13 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Mutational signatures in TFBSs' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Mutational patterns in TFBSs' split into 11 chunks from 11 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Mutated regulatory elements in cancer' split into 20 chunks from 20 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Enrichment of cancer genes and pathways in mutated regulatory elements' split into 6 chunks from 6 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Discussion' split into 4 chunks from 4 paragraphs[0m
[97mx

Embedding corpus of 163 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.8121072053909302 --> Section Title: Data Availability. Content: Data Availability
The PCAWG datasets used in the study are available at Synapse ( https://www.synapse.org/ [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0451792478561401 --> Section Title: Materials and Methods. Content: Materials and Methods
Next, genomic datasets were collected from various assays and cell lines. For eac[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0615922212600708 --> Section Title: Motif identification and annotation. Content: Motif identification and annotation
Next, genomic datasets were collected from various as[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1407973766326904 --> Section Title: Materials and Methods. Content: M

Embedding time: 1.20s (0.007s per chunk)
Corpus embedding completed. Shape: (163, 384)
Chunking results: 163 original documents → 163 embedded chunks


[97mxml_parser.py - line 312 - INFO - Section 'Introduction' split into 6 chunks from 6 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Methods' split into 11 chunks from 11 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Gene list acquisition' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Mutational frequency' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Identifying conserved pathways and functions' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Results' split into 31 chunks from 31 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Mutational and expression features of NCD lung adenocarcinoma' split into 9 chunks from 9 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Convergent adaptive strategies: loss of differentiated tissue functions' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.p

Embedding corpus of 127 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9825218915939331 --> Section Title: Data availability. Content: Data availability
All data used in this analysis are included in the Supplemental Material.
[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1075328588485718 --> Section Title: Gene list acquisition. Content: Gene list acquisition
We divided the TCGA lung adenocarcinoma cohort based on known driver mutations in[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.162867784500122 --> Section Title: Gene list acquisition. Content: Gene list acquisition
TCGA data is whole exome sequencing with paired tumor/normal analysis to exclude [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1660921573638916 --> Section Title: Declarations. Content: Declarations
JKT has been 

Embedding time: 0.91s (0.007s per chunk)
Corpus embedding completed. Shape: (127, 384)
Chunking results: 127 original documents → 127 embedded chunks


[97mhtml_parser.py - line 821 - INFO - Pre-deduplication: 136 corpus documents[0m
[97mhtml_parser.py - line 852 - INFO - HTML sections converted: 32 sections → 71 unique corpus documents (processed 65 duplicates with title merging)[0m
[97m4152910639.py - line 99 - INFO - Identifiers in content: ['syn22240254'][0m
[97m4152910639.py - line 114 - INFO - Corpus:
Section Title: No Title. Content: <p class="usa-banner__header-text"> An official website of the United States government </p> <p> <strong> Official websites use .gov </strong> <br/> A <strong> .gov </strong> website belongs to an official government organization in the United States. </p> <p> <strong> Secure .gov websites use HTTPS </strong> <br/> A <strong> lock </strong> ( <span class="icon-lock"> </span> ) or <strong> https:// </strong> means you've safely connected to the .gov website. Share sensitive information only on official, secure websites. </p>
Section Title: SPNeoDeath: A demographic and epidemiological dataset

Embedding corpus of 71 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.13904869556427 --> Section Title: Experimental design, materials and methods. Content: <p id="para0011"> The raw data from SINASC and SIM can be obtained directly from D[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1442917585372925 --> Section Title: Table 1.. Content: <p> SPNeoDeath dataset data dictionary.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1508917808532715 --> Section Title: Abstract. Content: In order to build the dataset, DBF files were downloaded from DATASUS ftp repository and converted to CSV format, th[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1747729778289795 --> Section Title: Abstract. Content: In order to build the dataset, DBF files were downloaded from DATASUS ftp repository and conv

Embedding time: 0.89s (0.013s per chunk)
Corpus embedding completed. Shape: (71, 384)
Chunking results: 71 original documents → 71 embedded chunks


[97mhtml_parser.py - line 325 - INFO - Extracted 51 sections from HTML.[0m
[97mhtml_parser.py - line 725 - INFO - Converting 51 HTML sections to embeddings corpus[0m
[97mhtml_parser.py - line 821 - INFO - Pre-deduplication: 303 corpus documents[0m
[97mhtml_parser.py - line 852 - INFO - HTML sections converted: 51 sections → 170 unique corpus documents (processed 133 duplicates with title merging)[0m
[97m4152910639.py - line 99 - INFO - Identifiers in content: ['GSE115058', 'syn26451619'][0m
[97m4152910639.py - line 114 - INFO - Corpus:
Section Title: No Title. Content: <p class="usa-banner__header-text"> An official website of the United States government </p> <p> <strong> Official websites use .gov </strong> <br/> A <strong> .gov </strong> website belongs to an official government organization in the United States. </p> <p> <strong> Secure .gov websites use HTTPS </strong> <br/> A <strong> lock </strong> ( <span class="icon-lock"> </span> ) or <strong> https:// </strong> me

Embedding corpus of 170 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.92525315284729 --> Section Title: Abstract. Content: S1 to S5 </p> <p> Tables S1 to S3 </p> <p> <a class="usa-link usa-link--external" href="https://en.bio-protocol.org/[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9859894514083862 --> Section Title: Abstract. Content: S1 to S5 </p> <p> Tables S1 to S3 </p> <p> <a class="usa-link usa-link--external" href="https://en.bio-protocol.org/[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0210323333740234 --> Section Title: Abstract | RESULTS. Content: We then performed SLBR-N affinity capture on CSC lysates and subjected the captured glycoproteins to mass [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0240824222564697 --> Section Title: No Title. Content: <p> <strong> Dat

Embedding time: 1.21s (0.007s per chunk)
Corpus embedding completed. Shape: (170, 384)
Chunking results: 170 original documents → 170 embedded chunks


[97mhtml_parser.py - line 821 - INFO - Pre-deduplication: 271 corpus documents[0m
[97mhtml_parser.py - line 852 - INFO - HTML sections converted: 58 sections → 151 unique corpus documents (processed 120 duplicates with title merging)[0m
[97m4152910639.py - line 99 - INFO - Identifiers in content: ['syn1729383', 'syn1734155'][0m
[97m4152910639.py - line 114 - INFO - Corpus:
Section Title: No Title. Content: <p class="usa-banner__header-text"> An official website of the United States government </p> <p> <strong> Official websites use .gov </strong> <br/> A <strong> .gov </strong> website belongs to an official government organization in the United States. </p> <p> <strong> Secure .gov websites use HTTPS </strong> <br/> A <strong> lock </strong> ( <span class="icon-lock"> </span> ) or <strong> https:// </strong> means you've safely connected to the .gov website. Share sensitive information only on official, secure websites. </p>
Section Title: Mutational landscape and significance 

Embedding corpus of 151 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0377492904663086 --> Section Title: Discussion. Content: Ultimately, these data and their associations with different clinical features and subtypes should contribute to t[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0587420463562012 --> Section Title: Abstract. Content: Cross-cancer survival analysis was based on the Cox proportional hazards model, as implemented in the R package ‘sur[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.062429428100586 --> Section Title: Abstract | Online Methods. Content: Per sample, per gene coverage values were obtained using WIG-formatted reference coverage files ass[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0666110515594482 --> Section Title: Standardization and tracking of mu

Embedding time: 0.98s (0.006s per chunk)
Corpus embedding completed. Shape: (151, 384)
Chunking results: 151 original documents → 151 embedded chunks


[97mxml_parser.py - line 312 - INFO - Section 'RESULTS' split into 20 chunks from 20 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'A systems biology framework prioritizes associations between gut microbial metabolites and the GPCRome' split into 1 chunks from 1 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Multi-omics analysis reveals potential GPCR targets for Alzheimer’s disease (AD)' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'MR analysis reveals AD-relevant gut microbial metabolites' split into 1 chunks from 1 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Machine-learning-based discovery of the gut metabolite-GPCRome interactome' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Human gut microbial metabolite-GPCRome interactome in AD' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Human gut metabolite-G

Embedding corpus of 175 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.7516184449195862 --> Section Title: Data and code availability. Content: Data and code availability
All predicted GPCR-metabolite pairs are freely available at https://Gut[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.8358136415481567 --> Section Title: RESOURCE AVAILABILITY. Content: RESOURCE AVAILABILITY
All predicted GPCR-metabolite pairs are freely available at https://Gut-GPCRome.l[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.8697059154510498 --> Section Title: Data and code availability. Content: Data and code availability
Codes for machine learning frameworks and other data analyses are avail[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9579589366912842 --> Section Title: RESOURCE AVAILABILITY. Content: R

Embedding time: 1.70s (0.010s per chunk)
Corpus embedding completed. Shape: (175, 384)
Chunking results: 175 original documents → 175 embedded chunks


[97mxml_parser.py - line 312 - INFO - Section 'Main' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'BTSP can be induced in single spines' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Dendritic, delayed, stochastic CaMKII activity' split into 6 chunks from 6 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'DDSC coincides with Ca' split into 4 chunks from 4 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'DDSC as an instructive signal for BTSP' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Intracellular calcium release underlies DDSC' split into 3 chunks from 3 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Discussion' split into 2 chunks from 2 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Methods' split into 18 chunks from 18 paragraphs[0m
[97mxml_parser.py - line 312 - INFO - Section 'Animals

Embedding corpus of 103 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.1080962419509888 --> Section Title: Data availability. Content: Data availability
Source data associated with the figures presented in this paper are available at Synapse [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.206272840499878 --> Section Title: Online content. Content: Online content
Any methods, additional references, Nature Portfolio reporting summaries, source data, extended[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.2627151012420654 --> Section Title: Extended data. Content: Extended data
is available for this paper at 10.1038/s41586-024-08021-8.
[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.3470213413238525 --> Section Title: Plasmid constructs. Content: Plasmid constructs
We fused two monomeric d

Embedding time: 0.73s (0.007s per chunk)
Corpus embedding completed. Shape: (103, 384)
Chunking results: 103 original documents → 103 embedded chunks


[97mhtml_parser.py - line 325 - INFO - Extracted 54 sections from HTML.[0m
[97mhtml_parser.py - line 725 - INFO - Converting 54 HTML sections to embeddings corpus[0m
[97mhtml_parser.py - line 821 - INFO - Pre-deduplication: 440 corpus documents[0m
[97mhtml_parser.py - line 852 - INFO - HTML sections converted: 54 sections → 180 unique corpus documents (processed 260 duplicates with title merging)[0m
[97m4152910639.py - line 99 - INFO - Identifiers in content: ['syn3388564', 'syn3157275', 'syn4896408'][0m
[97m4152910639.py - line 114 - INFO - Corpus:
Section Title: No Title. Content: <p class="usa-banner__header-text"> An official website of the United States government </p> <p> <strong> Official websites use .gov </strong> <br/> A <strong> .gov </strong> website belongs to an official government organization in the United States. </p> <p> <strong> Secure .gov websites use HTTPS </strong> <br/> A <strong> lock </strong> ( <span class="icon-lock"> </span> ) or <strong> https:/

Embedding corpus of 180 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9844000339508057 --> Section Title: Data availability | Associated Data | Data Availability Statement. Content: DNA methylation data that support the findings of this stud[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.986541211605072 --> Section Title: Methods. Content: The Bowtie 1 software package <a aria-describedby="b54" class="usa-link" href="#b54"> <sup> 54 </sup> </a> was used t[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9901485443115234 --> Section Title: Data availability | Data Availability Statement. Content: <p> RNA-seq data that support the findings of this study have been deposited [0m
[97m4152910639.py - line 124 - INFO - L2 Norm 1.0204366445541382 --> Section Title: Associated Data. Content: Acrophas

Embedding time: 1.69s (0.009s per chunk)
Corpus embedding completed. Shape: (180, 384)
Chunking results: 180 original documents → 180 embedded chunks


[97mhtml_parser.py - line 148 - INFO - Length of normalized HTML: 455255[0m
[97mhtml_parser.py - line 245 - INFO - Function_call: extract_sections_from_html(html_content) with content length 455254[0m
[97mhtml_parser.py - line 325 - INFO - Extracted 60 sections from HTML.[0m
[97mhtml_parser.py - line 725 - INFO - Converting 60 HTML sections to embeddings corpus[0m
[97mhtml_parser.py - line 821 - INFO - Pre-deduplication: 448 corpus documents[0m
[97mhtml_parser.py - line 852 - INFO - HTML sections converted: 60 sections → 187 unique corpus documents (processed 261 duplicates with title merging)[0m
[97m4152910639.py - line 99 - INFO - Identifiers in content: ['syn6156761'][0m
[97m4152910639.py - line 114 - INFO - Corpus:
Section Title: No Title. Content: <p class="usa-banner__header-text"> An official website of the United States government </p> <p> <strong> Official websites use .gov </strong> <br/> A <strong> .gov </strong> website belongs to an official government organ

Embedding corpus of 187 documents using sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 169 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 151 - INFO - Computing L2 distances using numpy.[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.8318021297454834 --> Section Title: Abstract | Results | False positive analysis. Content: In this work, we analyze the reported false positives by looking into curated bi[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9690244793891907 --> Section Title: Abstract | Introduction. Content: Currently, CYC2008 [ <a aria-describedby="pone.0183460.ref027" class="usa-link" href="#pone.0183460.r[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.9828751087188721 --> Section Title: Abstract. Content: It is important to note that it might be a variation in the number of subunits and repeats with respect to the infor[0m
[97m4152910639.py - line 124 - INFO - L2 Norm 0.986499011516571 --> Section Title: Biological measures. Content: We u

Embedding time: 2.19s (0.012s per chunk)
Corpus embedding completed. Shape: (187, 384)
Chunking results: 187 original documents → 187 embedded chunks


In [11]:
# Summarise the retrieval evaluation: regroup `results` by query type and model,
# print one recall table per query type and report the best (model, top-k) pair.
#
# `results` comes from an earlier cell. As used below it maps a combined key
# "<model>_query_<query_name>" to {topk: metrics}, where each metrics dict is read
# for 'recall', 'processed_docs', 'time_seconds', 'model' and 'query'.
print("="*80)
print("VECTOR RETRIEVAL EVALUATION RESULTS")
print("="*80)

# Group results by query type - Updated logic for new structure.
# The three expected query types are pre-seeded so that missing ones are reported
# explicitly ("Skipping empty query") instead of silently not appearing.
query_results = {'ontology_aware': {}, 'augmented': {}, 'base': {}}

for result_key, model_results in results.items():
    if not model_results:
        # No top-k entries were recorded for this key; the table code below reads the
        # first entry of every model and would fail with IndexError on an empty dict.
        print(f"Skipping result key without metrics: {result_key}")
        continue

    # Extract model name and query name from the combined key
    if '_query_' in result_key:
        model_name, query_name = result_key.split('_query_', 1)
    else:
        # Fallback: extract from stored data
        first_result = next(iter(model_results.values()))
        model_name = first_result['model']
        query_name = first_result['query']
        # The stored query name carries a 'query_' prefix (e.g. 'query_ontology_aware'),
        # whereas the buckets above use the bare name; normalise so both paths agree.
        if query_name.startswith('query_'):
            query_name = query_name[len('query_'):]

    # Merge the per-top-k metrics into query_results[query_name][model_name].
    query_results.setdefault(query_name, {}).setdefault(model_name, {}).update(model_results)

print(f"Found query types: {list(query_results.keys())}")
print("Results structure check:")
for q_type, q_data in query_results.items():
    print(f"  {q_type}: {len(q_data)} models")

# Build one table per query type; tables are kept in `all_tables`, keyed by query name.
all_tables = {}

for query_name, query_data in query_results.items():
    if not query_data:  # Skip empty query results
        print(f"Skipping empty query: {query_name}")
        continue

    print(f"\n{'='*80}")
    print(f"RESULTS FOR {query_name.upper().replace('_', ' ')}")
    print(f"{'='*80}")

    # Create table data: one row per model, one recall column per top-k value
    table_data = []
    for model_name, model_results in query_data.items():
        row = {'Model': model_name.split('/')[-1]}  # Just the model name without org

        # Add recall for each top-k value
        for topk in sorted(model_results):
            row[f'Top-K {topk}'] = f"{model_results[topk]['recall']:.4f}"

        # Add processing info, read from the first top-k entry of this model
        first_result = next(iter(model_results.values()))
        row['Processed Docs'] = first_result['processed_docs']
        row['Time (s)'] = f"{first_result['time_seconds']:.2f}"

        table_data.append(row)

    # Convert to DataFrame, keep it for later cells, and display
    df = pd.DataFrame(table_data)
    all_tables[query_name] = df
    print(df.to_string(index=False))

    # Find best model for this query. The comparison is strict, so ties keep the first
    # configuration seen and nothing is reported when every recall is 0.
    best_recall = 0
    best_config = None
    for model_name, model_results in query_data.items():
        for topk, metrics in model_results.items():
            if metrics['recall'] > best_recall:
                best_recall = metrics['recall']
                best_config = (model_name.split('/')[-1], topk)

    if best_config:
        print(f"\n🏆 Best for {query_name}: {best_recall:.4f} - {best_config[0]} (Top-K: {best_config[1]})")

print(f"\n{'='*80}")
print("ALL QUERIES EVALUATION COMPLETE!")
# This cell does not write any files, so report where the tables actually live.
print(f"{len(all_tables)} table(s) available in `all_tables` (not written to disk)")
print(f"{'='*80}")

# Debug: Show actual results structure
print("\nDEBUG - Raw results structure:")
for result_key, model_results in results.items():
    print(f"Result Key: {result_key}")
    for topk, metrics in model_results.items():
        print(f"  Top-K {topk}: query={metrics['query']}, model={metrics['model']}, recall={metrics['recall']:.4f}")

VECTOR RETRIEVAL EVALUATION RESULTS
Found query types: ['ontology_aware', 'augmented', 'base']
Results structure check:
  ontology_aware: 1 models
  augmented: 0 models
  base: 0 models

RESULTS FOR ONTOLOGY AWARE
           Model Top-K 1 Top-K 3 Top-K 5 Top-K 9  Processed Docs Time (s)
all-MiniLM-L6-v2  0.3333  0.5500  0.7000  0.8000              10    45.06

🏆 Best for ontology_aware: 0.8000 - all-MiniLM-L6-v2 (Top-K: 9)
Skipping empty query: augmented
Skipping empty query: base

ALL QUERIES EVALUATION COMPLETE!
Tables saved to scripts/output/

DEBUG - Raw results structure:
Result Key: sentence-transformers/all-MiniLM-L6-v2_query_ontology_aware
  Top-K 1: query=query_ontology_aware, model=sentence-transformers/all-MiniLM-L6-v2, recall=0.3333
  Top-K 3: query=query_ontology_aware, model=sentence-transformers/all-MiniLM-L6-v2, recall=0.5500
  Top-K 5: query=query_ontology_aware, model=sentence-transformers/all-MiniLM-L6-v2, recall=0.7000
  Top-K 9: query=query_ontology_aware, model=se

In [12]:
# 7. Define your trainset (queries and expected retrievals)
# Each entry is a dspy.Example with two fields:
#   - question:   the retrieval query. All six examples share the same placeholder
#                 sentence ("... accession code ABC0123 in Repository XYZ").
#   - references: a one-element list holding the passage expected to be retrieved.
# Every reference passage cites at least one dataset accession (GSE..., PXD..., HRA...).
# The first four are short data-availability style statements; the last two are long
# Methods sections in which the accession statements are surrounded by unrelated text.
# NOTE(review): no `.with_inputs(...)` call is made on these examples in this cell —
# confirm that whatever consumes `trainset` marks `question` as the input field.
trainset = [
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Data Availability Statement\nThe datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: NCBI GEO repository,\nGSE123128\n."]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data Availability Statement\nRaw sequencing data from this study have been deposited in the GEO database with the accession number\nGSE171155\n. The mass spectrometry proteomics data have been deposited to the ProteomeXchange Consortium via the PRIDE [1] partner repository with the data set identifier PXD024161 and 10.6019/PXD024161.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Associated Data\nThis section collects any data citations, data availability statements, or supplementary materials included in this article.\nSupplementary Materials\nDocument S1. Figures\xa0S1–S6 and Tables\xa0S1–S6\nmmc1.pdf\n(2.5MB, pdf)\nDocument S2. Article plus supplemental information\nmmc2.pdf\n(9.1MB, pdf)\nData Availability Statement\n•\nThe next-generation DNA sequencing dataset generated during this study is available at the National Genomics Data Center: HRA003231 (URL:\nhttps://ngdc.cncb.ac.cn\n). The mass spectrometry proteomics data reported in this paper have been deposited to the ProteomeXchange Consortium: PXD037076(\nhttp://proteomecentral.proteomexchange.org\n) via iProx partner repository\n61\n.\n•\nThis paper does not report the original code.\n•\nAny additional information required to reanalyze the data reported in this work paper is available from the\nlead contact\nupon request.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data and Code Availability\nRNA-seq data generated in this study are available at NCBI GEO database with the accession number\nGSE151029\n. The 53BP1 mass spectrometry data have been deposited to the ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD020090. The accession number for the FOXK1 and FOXK2 MS data reported in this paper is PRIDE: PXD001383']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["METHODS\nConstruction of Plasmids\nThe protein-coding regions of the NST3 gene were amplified from the Arabidopsis thaliana cDNA library with appropriate primers (see Supplemental Table 2 online). The 5′ upstream region of 3027 bp, which extended from the site of initiation of translation of the NST3 gene, was used for preparation of the ProNST3:GUS, ProNST3:NST3, and ProNST3:NST3SRDX gene constructs. These genes and 35S:NST3 were constructed from modified vectors derived from pGreenII0029 (Hellens et al., 2000) and p35SSRDXG (Mitsuda et al., 2006). For complementation analysis, we used genomic fragments including NST1 (9580 bp) and NST3 (5199 bp), which contained 6523 and 3069 bp of the respective promoter regions. The region corresponding to the transgene of each vector, with the exception of the pGreen-based vectors, was transferred to the pBCKH plant expression vector (Mitsuda et al., 2006) using the Gateway system (Invitrogen).\n\nConditions for Plant Growth and Transformation\nArabidopsis plants were grown in soil at 22°C with 16 h (long-day condition) or 8 h (short-day condition) of light daily. Unless otherwise stated, plants were grown under the long-day condition. For transformation, a T-DNA vector carrying the appropriate construct was introduced into Agrobacterium tumefaciens strain GV3101 by electroporation, and the resultant Agrobacterium was infiltrated into Arabidopsis using the floral dip method (Clough and Bent, 1998).\n\n\nAssessment of the Mechanical Strength of Inflorescence Stems\nWe used the bottom 5 cm of inflorescence stems taller than 25 cm for measurement of Young's modulus according to a previously described method (Kojima and Yamamoto, 2004).\n\nExamination of the Crystal State of Cellulose Microfibrils of Inflorescence Stems\nThe bottom region of the inflorescence stems, as described above, was used for x-ray diffraction analysis according to a previously described method (Abe and Yamamoto, 2005). 
Nickel-filtered Cu Kα radiation (wavelength, 0.154 nm) at 30 kV and 35 mA was used with the reflection technique.\n\nIsolation of RNA, Microarray Experiments, and Analysis\nTotal RNA was isolated with Trizol as described previously (Fukuda et al., 1991) from the bottom 4 cm of the inflorescence stems of three independent plants grown under the short-day condition and with a height of between 13 and 17 cm. Microarray analyses were performed with the Arabidopsis 2 Oligo Microarray (Agilent Technologies). All microarray experiments and the analysis of data were performed as described previously (Mitsuda et al., 2005) with the exceptions summarized below. P values for differences between nst1-1 nst3-1 and wild-type plants were calculated by Welch's t test, based on a two-tailed distribution (n = 3). To minimize type-I family-wise errors in multiple and simultaneous statistical tests, we adopted a strategy for suppression of false positives. We calculated a Q-value to estimate the false discovery rate from the P value described above using QVALUE software (Storey and Tibshirani, 2003) with the default setting. We considered genes with a Q-value of <0.1 to be genes expressed at different levels in nst1-1 nst3-1 and wild-type plants. Comprehensive gene group analysis using Fisher's exact test was performed with the R program package (http://www.r-project.org/). Quantitative RT-PCR was performed as described previously (Mitsuda et al., 2005). For the analysis of NST transcripts in the mutant lines, RT-PCR was performed with appropriate primers (see Supplemental Table 2 online).\n\nLight and Fluorescence Microscopy\nFor observations of lignin autofluorescence, we used a filter with the following specifications: glass, 365; dichroic mirror, 395; long-pass, 400. To observe ectopic secondary wall thickening, we cleared tissues by incubating them overnight in 70% lactic acid at 50°C. 
To prepare 70- to 150-μm sections of inflorescence stems and hypocotyls, we embedded the tissues in 3% agar then sectioned them on a vibrating microtome (HM-650V; Microm). Assays of GUS activity were performed with T1 or T2 transgenic plants. Plant tissues were fixed briefly, in some cases, in solution containing 0.3% formalin, 0.2% MES, pH 5.8, and 0.3 M mannitol before incubation in 100 mM sodium phosphate buffer, pH 7.0, containing 0.1% Triton X-100, 1 mM 5-bromo-4-chloro-3-indolyl-β-d-glucuronide, and 0.5 mM potassium ferricyanide at 37°C for up to 12 h. Stained stems and hypocotyls were embedded in 3% agar and sectioned. All observations by light and fluorescence microscopy were made with the Axioskop2 plus system (Carl Zeiss).\n\nUltrastructural Observation by Transmission Electron Microscopy\nShort pieces of inflorescence stems were fixed in 30 mM HEPES buffer containing 2% paraformaldehyde and 2% glutaraldehyde then fixed in HEPES buffer containing 2% osmium tetroxide. Fixed tissues were embedded in Q651 resin (Nissin EM). Sections of 80 to 90 nm thick were post-stained with uranyl acetate and lead citrate and observed by a JEM1200EX transmission electron microscope (JEOL) at an accelerating voltage of 80 kV.\n\nIdentification of NST Homologs in Poplar\nPoplar NAC genes resembling the Arabidopsis NST genes were collected using the Advanced Search tool of the Joint Genome Initiative poplar database (http://genome.jgi-psf.org/Poptr1/Poptr1.home.html) with the command, “find by homology to related protein with E-value <1.0e-20”; the database for Populus trichocarpa; and the query “At2g46770.” The 62 extracted sequences and amino acid sequences of subfamily IIb of NAC transcription factors of Arabidopsis, as defined in a previous study (Mitsuda et al., 2005), were aligned using the ClustalW program with default settings (Chenna et al., 2003). The amino acid sequences corresponding to conserved NAC domains were extracted and realigned. 
A phylogenetic tree was built by neighboring-joining method using ClustalW with default settings (an alignment and the sequences are shown in Supplemental Table 3 online). Bootstrap values were calculated from 100 trials. The subtree including the NST and VND genes is shown in Figure 7.\n\nAccession Numbers and Data Deposition\nNST1 and NST3 reported in this study correspond to the Arabidopsis Genome Initiative locus identifiers At2g46770 and At1g32770, respectively. Microarray data performed in this study can be found in the National Center for Biotechnology Information Gene Expression Omnibus data library under accession number GSE5187.\n"]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Materials and methods\nMaterials\nDilution series\nIllumina HumanCNV370-Duo BeadChip Infinium SNP data for dilution series of 12 mixtures of cancer cell line (HCC1395) mixed with its paired normal cell line (HCC1395BL) were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE11976]. We excluded chromosome 6 and 16 from analysis due to copy genomic aberrations present in the normal cell line HCC1395BL.\n\nCancer cell lines\nIllumina HumanHap300 data for the promyelocytic leukemia cancer cell HL-60 and colon cancer cell line HT-29 were obtained from Illumina, and Human-610 Quad SNP genotyping data for the colon cancer cell lines SW403, SW480, SW620, SW837, SW1417 and LIM1863 were generated at the Ludwig Institute of Cancer Research using standard processing protocols. The genotyping data for breast cancer cell lines MDA-175 and MDA-468 were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE18799] [23].\n\nPrimary breast tumors\nThree breast tumors (cases 114, 601 and 3,364) that had not received non-neoadjuvant therapy were analyzed in detail using material derived from microdissection. For each case, material containing pure tumor and pure stroma cells respectively was microdissected and compared to data obtained from surgically obtained material from the same tumors. Case 114 was of Luminal B type (23 mm tumor, moderately differentiated infiltrating ductal carcinoma with an extensive in-situ component. Node +ve, ER +ve (6.8 fm/mg protein), EGFR -ve (7.8 fm/mg protein)). Case 601 (20 mm 30 mm tumor, grade 3 with intraductal in-situ ca. 
and in filtrating ductal carcinoma, node +ve, ER -ve (1.5 fm/mg protein), Her2 +ve (histoscore of 3), EGFR +ve (histoscore of 208)) was classified as ERBB2 positive based on expression microarray data with a fractional rank of 0.982, Case 3,364 was 25 mm grade 3 infiltrating ductal carcinoma, ER positive (8 fm/mg protein), PR positive (histoscore 8/8), Her2 positive (histoscore 3+, one of ten axillary nodes +ve). For each case, DNA was extracted from microdissected stroma and tumor, as well as the original non-dissected sample and analyzed using Illumina Human-610 Quad SNP arrays applying standard protocols.\n\nData processing\nGenome Alteration Print was downloaded [43] and used to analyze all datasets using default settings and the highest ranked copy number and LOH predictions used for comparisons. However, for the cancer cell line dilution series, we re-used the results that had previously been generated by [23] and made available on the aforementioned website.\n\nGenoCN v1.06 was downloaded [44] and used with default settings and stromal contamination settings on for all datasets generated using Illumina Infonaut II SNP arrays. Adjusted GenoCN parameters for the Log R Ratio levels were used for Infonaut HD SNP array processing and in these instances we used the same levels that we specified for OncoSNP. The copy number and LOH predictions from the Viterbi sequence were used for comparisons.\n\nOncoSNP was run on all datasets using 15 EM iterations and with both stromal and intra-tumor heterogeneity options. In all cases, the ploidy prediction with the highest maximum likelihood was chosen and the Viterbi sequence of tumor states used for comparisons. 
We filtered detected aberrations using a Log Bayes Factor of 30.\n\nStatistical model\nA complete description of our statistical model is provided in Supplementary Information in Additional file 1.\n\nLet xi denote the tumor state at the i-th probe location and (xi, n, xi, t) denote the associated normal and tumor copy numbers. Furthermore, let zi = (zi, n,zi, t) denote the B allele count for the normal and tumor genotype respectively. The combinations (zi, n, (xi, n) and (zi, t, xi, t) fully define the normal and tumor genotypes respectively. The tumor state at each probe denotes the allowable combinations of normal-tumor genotypes at that location as shown in Table 1.\n\nLet π0 denote the normal DNA fraction of the tumor sample due to stromal contamination and 𝜋={𝜋𝑖}𝑛\n𝑖=1 denote the proportion of tumor cells having the normal genotype at each probe. The data 𝑦={𝑦𝑖}𝑛\n𝑖=1 consists of a set of two-dimensional vectors yi = [ri, bi]' whose elements correspond to the Log R Ratio and B allele frequency respectively.\n\nGiven (x, z, π, π0) the data is assumed to be distributed according to a (K + 1)-component mixture of Student t-distributions, where ki indicates the mixture component assignment of the i-th data point,\n\n𝑦𝑖|𝑥𝑖,𝑧𝑖,𝑘𝑖,𝑚,𝛿, 𝛴={ \n𝑆⁢𝑡(𝑚⁡(𝑥𝑖,𝑧𝑖)+𝛿(𝑙𝑙)\n𝑘𝑙,∑(𝑙𝑖)\n𝑘𝑖,𝜈),	𝑘≠0,\n𝑈𝑟⁡(𝑟min,𝑟max)×U𝑏⁢(0,1),	𝑘=0,\n \n(1)\nwhere 𝑆⁢𝑡⁡(𝛿(𝑙)\n𝑘,𝛴(𝑙)\n𝑘,𝑣) is the probability density function of the Student t-distribution with mean 𝛿(𝑙)\n𝑘 and covariance matrix 𝛴(𝑙)\n𝑘 associated with the k-th mixture component and the l-th genotype class and v degrees of freedom. 
The 0-th component is an outlier class which assumes uniformly distributed data over a specified range.\n\nThe elements of the mean vectors m(xi, zi) = [mr(xi), mb(zi, xi)]' are given by the following:\n\n𝑚𝑟⁡(𝑥𝑖)=(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢\n̅\n𝑟\n𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢\n̅\n𝑟\n𝑥𝑥𝑖,⁢𝑡+𝛽0+𝛽1⁢𝑔𝑖,\n(2)\nwhere gi is the local GC content at the i-th probe location and\n\n𝑚𝑏⁡(𝑧𝑖,𝑥𝑖)=\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑧𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑧𝑖,𝑡\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑥𝑖,𝑡\n \n.\n(3)\nPrior distributions\nThe prior distribution on the mixture weights is given by a Dirichlet distribution:\n\n𝑤(𝑙)|𝛼~𝐷⁢𝑖⁢𝑟⁡(𝛼),\n(4)\nwhere α is a concentration parameter which in the numerical results we used α = 1 to give a at prior on the mixture weights.\n\nThe prior distributions on the mixture centers and covariance matrices are given by standard conjugate Normal-Inverse Wishart distributions:\n\n𝛿(𝑙)\n𝑘|𝜏, 𝛴(𝑙)\n𝑘~𝑁⁡(0,𝜏 𝛴(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(5)\n𝛴(𝑙)\n𝑘|𝛾, 𝑆(𝑙)\n𝑘~𝐼⁢𝑊⁡(𝛾,𝑆(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(6)\nwhere τ is a hyperparameter that controls the strength of the prior and IW(γ, Λ) denotes the Inverse-Wishart distribution with parameter γ and scale matrix Λ.\n\nA beta prior is assumed for the outlier rate,\n\n𝜂|𝛼𝜂, 𝛽𝜂~𝐵⁢𝑒⁡(𝛼𝜂,𝛽𝜂),\n(7)\nwhere (αn, βn) are hyperparameters associated with the Beta prior. For the numerical results we set these as (1,1) to give a uniform distribution. 
\n\nA normal prior is assumed for the local GC content regression parameters,\n\n𝛽|𝜆𝛽~𝑁⁡(0,𝜆𝛽⁢𝐼2),\n(8)\nwhere Ip is a p × p identity matrix.\n\nA discrete prior is assumed for the stromal contamination content and intra-tumour heterogeneity levels,\n\n𝑝⁡(𝜋0)={ \n𝛼𝜋0,𝜋0=0,\n𝛽𝜋0,𝜋0>0,\n \n(9)\nand\n\n𝑝⁡(𝜋𝑖)={ \n𝛼𝜋,𝜋𝑖=0,\n𝛽𝜋,𝜋𝑖>0,\n  𝑖=1,…,𝑛,\n(10)\nwhere in the numerical results we have used απ0 = βπ0 = 1 and απ = 1, βπ = 2.\n\nThe tumor states are assumed to form an inhomogeneous Markov Chain with transition matrix,\n\n𝑝⁡(𝑥𝑖|𝑥𝑖−1)={ \n1−𝜌,𝑥𝑖=𝑥𝑖−1,\n𝜌,𝑥𝑖≠𝑥𝑖−1,\n \n(11)\nwhere ρ = (1/2) (1-exp(-(1/2L) (si-si-1) and si is the physical coordinate of the i-th probe and L is a characteristic length which we set as L = 2,000,000 for the numerical results.\n\nPosterior inference\nWe estimated the unknown model parameters using an expectation-maximization algorithm. Multiple restarts were used to explore different baseline of the Log R Ratio and the baseline with the greatest likelihood was chosen for the calculation of summary statistics.\n\nSummary statistics\nWe used the Viterbi algorithm to extract the most likely sequence of tumors states and for each aberrant segment in the Viterbi sequence we calculated an approximate Bayes Factor (score) of that segment belonging to each of the tumor states. In addition we also recorded the maximum a posteriori estimates of the Log R Ratio baseline adjustment β0 and the stromal contamination π0.\n\nAvailability\nA MATLAB based implementation (for 64 bit Linux systems) of our software is available for academic and non-commercial use from the associated website [45]. In addition, SNP data analyzed in this paper are also available from this website and from the Gene Expression Omnibus Database under Accession No.[GEO:GSE23785]."]
        )
]