# Vector Retrieval Experiment

In [1]:
import pandas as pd
from data_gatherer.data_gatherer import DataGatherer
from data_gatherer.parser.xml_parser import XMLParser
from data_gatherer.parser.html_parser import HTMLParser
from data_gatherer.logger_setup import setup_logging
from data_gatherer.retriever.embeddings_retriever import EmbeddingsRetriever
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from lxml import etree
import dspy
import logging
import re
import os
import time
from collections import defaultdict

In [2]:
# Notebook-wide logger at INFO level. The second argument is presumably the
# log-file path -- confirm against setup_logging.
logger = setup_logging(
    'vector_retrieval_experiment',
    './logs/vector_retrieval_experiment.log',
    level=logging.INFO,
)

In [3]:
# Marker entry so the beginning of each run is easy to locate in the log output.
logger.info("Starting vector retrieval experiment")

[97m1301467205.py - line 1 - INFO - Starting vector retrieval experiment[0m


## 1. Load corpus and ground truth

In [4]:
# Inputs for the experiment (adjust the paths as needed):
#   * input_corpus -- locally fetched publications; the raw XML/HTML is in `raw_cont`
#   * ground_truth -- gold dataset-citation records (identifier <-> citing publication)
input_corpus = pd.read_parquet(
    'scripts/exp_input/Local_fetched_data_1.parquet'
)
ground_truth = pd.read_parquet(
    'scripts/output/gold/dataset_citation_records_Table.parquet'
)

# Log the input sizes so a run can be matched to the data it was executed on.
logger.info(f"Corpus shape: {input_corpus.shape}")
logger.info(f"Ground truth shape: {ground_truth.shape}")

[97m2826339252.py - line 5 - INFO - Corpus shape: (2190, 7)[0m
[97m2826339252.py - line 6 - INFO - Ground truth shape: (401327, 7)[0m
[97m2826339252.py - line 6 - INFO - Ground truth shape: (401327, 7)[0m


In [5]:
# Derive the PMC accession (e.g. "PMC11895760") from the citing-publication URL.
# Links without a PMC id (such as DOI links) yield NaN. `expand=False` makes
# str.extract return a Series for the single capture group.
ground_truth['pmc_id'] = (
    ground_truth['citing_publication_link']
    .str.extract(r'(PMC\d+)', flags=re.IGNORECASE, expand=False)
)

In [6]:
# Peek at the first rows to check the structure of the corpus.
input_corpus.head(n=5)

Unnamed: 0,pub_title,file_name,raw_cont,format,length,path,publication
0,Evolution of regulatory signatures in primate ...,PMC7668098__Evolution of regulatory signatures...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,167515,scripts/tmp/raw_files/PMC/PMC7668098__Evolutio...,pmc7668098
1,Functional annotation of noncoding mutations i...,PMC8321657__Functional annotation of noncoding...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,206647,scripts/tmp/raw_files/PMC/PMC8321657__Function...,pmc8321657
2,Lung adenocarcinomas without driver genes conv...,PMC11070398__Lung adenocarcinomas without driv...,<?xml version='1.0' encoding='UTF-8'?>\n<!DOCT...,xml,170204,scripts/tmp/raw_files/PMC/PMC11070398__Lung ad...,pmc11070398
3,SPNeoDeath_ A demographic and epidemiological ...,PMC7419335__SPNeoDeath_ A demographic and epid...,"<html lang=""en"" class=""""><head>\n\n <me...",html,183668,scripts/tmp/raw_files/PMC/PMC7419335__SPNeoDea...,pmc7419335
4,"O-linked α2,3 sialylation defines stem cell po...","PMC8741191__O-linked α2,3 sialylation defines ...","\n<!DOCTYPE html>\n<html lang=""en"" >\n <hea...",html,230136,scripts/tmp/raw_files/PMC/PMC8741191__O-linked...,pmc8741191


In [7]:
# Peek at the first rows to check the structure of the ground truth.
ground_truth.head(n=5)

Unnamed: 0,identifier,repository,citing_publication_link,citation_record_source,citation_record_from_doi,doi,pmcid,pmc_id
0,PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1,proteomexchange_search.tsv,1,10.1038/S41467-025-56720-1,,
1,PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312,proteomexchange_search.tsv,1,10.6019/PXD051312,,
2,PXD051312,PRIDE,https://dx.doi.org/10.1002/prca.202400095,proteomexchange_search.tsv,1,10.1002/prca.202400095,,
3,PXD051312,PRIDE,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,proteomexchange_search.tsv,0,,PMC11895760,PMC11895760
4,PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571,proteomexchange_search.tsv,1,10.17159/SAJS.2025/18571,,


In [8]:
# The XML and HTML parsers are built with identical arguments; in the evaluation
# loop below they are used to split each publication into sections.
_parser_args = ('open_bio_data_repos.json', logger)
_parser_kwargs = {'llm_name': 'gemini-2.0-flash', 'use_portkey': True}

xml_parser = XMLParser(*_parser_args, **_parser_kwargs)
html_parser = HTMLParser(*_parser_args, **_parser_kwargs)

[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 19 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mllm_client.py - line 19 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mxml_parser.py - line 24 - INFO - Initializing xmlRetriever[0m
[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 19 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mxml_parser.py - line 24 - INFO - Initializing xmlRetriever[0m
[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 19 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mhtml_parser.py - line 82 - INFO - Initializing htmlRetriever[0m
[97mhtml_parser.py - line 82 - INFO - Initializing htmlRetriever[0m


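Note: in the evaluation below, a publication is skipped when none of its ground-truth identifiers can be found in the fetched content; this includes publications for which the (incomplete) ground truth has no records.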

In [9]:
# Systematic evaluation of different models and top-k values (OPTIMIZED)

# Embedding models to benchmark, in evaluation order. Commented-out entries are
# alternatives that are currently disabled; uncomment a line to include it.
models_to_test = [
    # --- Base models ---
    "sentence-transformers/all-MiniLM-L6-v2",
    # "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L12-v2",
    # "sentence-transformers/sentence-t5-base",

    # --- BioMed ---
    # "neuml/pubmedbert-base-embeddings",
    # "sentence-transformers/allenai-specter",
    # "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",

    # --- MSMARCO passage ranking models ---
    # "sentence-transformers/msmarco-distilbert-base-v4",
    # "sentence-transformers/msmarco-bert-base-dot-v5",
    # "sentence-transformers/msmarco-distilbert-dot-v5",
    # "sentence-transformers/msmarco-distilbert-base-tas-b",

    # --- Paraphrase models ---
    "sentence-transformers/paraphrase-MiniLM-L3-v2",

    # --- Semantic search ---
    "sentence-transformers/multi-qa-mpnet-base-cos-v1",
    # "sentence-transformers/multi-qa-distilbert-cos-v1",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",

    # Link kept for reference:
    # https://huggingface.co/Snowflake/snowflake-arctic-embed-m
]

# Cut-offs at which recall is reported. Retrieval is done once per publication
# at the largest cut-off, and the ranked list is sliced for the smaller ones.
topk_values = [1, 3, 5]
max_k = max(topk_values)

# results[model_name][k] -> metrics for that model at cut-off k.
results = defaultdict(dict)

# Candidate retrieval queries. The literals are spelled out piece by piece so the
# trailing space before each newline of the ontology-aware query stays visible
# (and cannot be stripped silently by an editor).

# Enhanced query using hackathon context trigger keywords
query_ontology_aware = (
    "Mentions of dataset repositories, identifiers, or accession codes, including "
    "PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), "
    "Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), "
    "Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, "
    "Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, "
    "European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. \n"
    "Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, "
    "phs, syn, PDC, PRJNA, DOI, or accession code. \n"
    "Look for phrases like deposited in, available at, submitted to, uploaded to, "
    "archived in, hosted by, retrieved from, accessible via, or publicly available. \n"
    "Capture statements indicating datasets, repositories, or data access locations.\n"
)

# Keyword-augmented variant (single line).
query_augmented = (
    "Dataset or data repository information including: deposited in, uploaded to, "
    "archived at, available at, stored on, hosted by, accessible via, retrieved from, "
    "provided by, experimental data, raw data, public repository, data archive, "
    "data portal, accession code"
)

# Minimal baseline query.
query_base = "Available data, accession code, data repository, deposited data"

# The query actually used by the evaluation.
query = query_base

# Announce the run configuration before the (long) evaluation loop starts.
n_models = len(models_to_test)
logger.info("Starting OPTIMIZED systematic evaluation with enhanced query")
logger.info(f"Query: {query}")
logger.info(f"Testing {n_models} models with top-k values {topk_values}")
logger.info("Optimization: Single model load per model, reuse for all publications")

# Evaluation loop.
#
# For every model in `models_to_test`:
#   1. load the model once and embed the query once;
#   2. for each of the first `max_publications` publications, split the document
#      into sections, embed them, and retrieve the `max_k` closest sections;
#   3. for every k in `topk_values`, compute recall@k = fraction of the
#      publication's ground-truth identifiers (those that occur in its content)
#      that also occur in the text of the top-k retrieved sections;
#   4. store the recall averaged over the evaluated publications in
#      results[model_name][k].
#
# Identifier matching is case-insensitive everywhere, and each identifier is
# counted once per publication even if the ground truth lists it in several
# citation records, so per-publication recall always lies in [0, 1].

# Limit to the first N publications for faster testing.
max_publications = 10

# Loop-invariant values: computed once instead of once per model / publication.
eval_device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
# The ground truth is large, so lower-case its PMC ids a single time.
gt_pmc_lower = ground_truth['pmc_id'].str.lower()

for model_name in models_to_test:
    logger.info(f"\n{'='*60}")
    logger.info(f"Testing model: {model_name}")
    logger.info(f"{'='*60}")

    # recalls[k] accumulates per-publication recall@k; cnt counts the
    # publications that entered the evaluation for this model.
    recalls = {k: 0 for k in topk_values}
    cnt = 0
    start_time = time.time()

    # Clean up previous embeddings
    if os.path.exists("corpus_embeddings.npy"):
        os.remove("corpus_embeddings.npy")

    # OPTIMIZATION: Create model ONCE and reuse for all publications
    logger.info(f"Loading model {model_name}...")
    retriever = EmbeddingsRetriever.create_model_only(
        model_name=model_name,
        device=eval_device,
        logger=logger
    )

    # Embed the query up front. search(query=None) below presumably reuses this
    # embedding -- confirm against EmbeddingsRetriever.
    retriever.embed_query(query)

    logger.info(f"Model {model_name} loaded successfully!")

    # head() bounds the run by position, so the limit holds for any index.
    # pub_no is the 1-based position, used only in the error message.
    for pub_no, (_, publication) in enumerate(
        input_corpus.head(max_publications).iterrows(), start=1
    ):
        logger.info(f"Publication: {publication['publication']}")

        gt = ground_truth[gt_pmc_lower == publication['publication'].lower()]
        # Drop missing identifiers and de-duplicate (order-preserving): several
        # citation records may point at the same identifier.
        idnts = list(dict.fromkeys(gt['identifier'].dropna().astype(str)))

        logger.info(f"Identifiers in ground truth: {idnts}")

        if publication['format'] == 'xml':
            sections = xml_parser.extract_sections_from_xml(etree.fromstring(publication['raw_cont'].encode('utf-8')))
        elif publication['format'] == 'html':
            clean_html = html_parser.normalize_HTML(publication['raw_cont'])
            sections = html_parser.extract_sections_from_html(clean_html)
        else:
            logger.warning(f"Unsupported format {publication['format']} for publication {publication['publication']}. Skipping.")
            continue

        # Keep only identifiers that actually occur in the fetched content.
        # Case-insensitive, to agree with the retrieval match further down.
        raw_cont_lower = publication['raw_cont'].lower()
        idnts_in_cont = [idnt for idnt in idnts if idnt.lower() in raw_cont_lower]

        logger.info(f"Identifiers in content: {idnts_in_cont}")

        # Nothing to find in this publication: leave it out of the average.
        if not idnts_in_cont:
            continue

        # Counted before retrieval, so a publication whose retrieval fails
        # (see the except branch) stays in the average as a complete miss.
        cnt += 1

        # Prepare corpus: one entry per section, with title and type prepended
        # to the section text.
        corpus = [
            {
                'sec_txt': 'section_title: ' + section['section_title'] +
                           '.\nsection_type: ' + section['sec_type'] +
                           '.\ncontent: ' + section['sec_txt']
            }
            for section in sections
        ]

        # OPTIMIZATION: Only embed corpus (model already loaded)
        try:
            retriever.embed_corpus(corpus)

            # OPTIMIZATION: Single retrieval with max_k, then slice for different k values
            full_result = retriever.search(query=None, k=max_k)

            for full_result_item in full_result:
                logger.info(f"L2 Norm {full_result_item['L2_distance']} --> {full_result_item['text'][:150]}")

            # Evaluate for all top-k values using the same retrieval result
            for topk_docs_to_retrieve in topk_values:
                logger.info(f"Evaluating with top-k = {topk_docs_to_retrieve}")

                # Slice results for current k value
                result = full_result[:topk_docs_to_retrieve]

                # Combine all retrieved text (lower-cased once for matching)
                iterres = '. '.join([r['text'] for r in result])
                iterres_lower = iterres.lower()

                # Numerator and denominator use the same identifier set, so
                # the per-publication recall cannot exceed 1.
                matches = {idnt for idnt in idnts_in_cont if idnt.lower() in iterres_lower}
                not_matched = set(idnts_in_cont) - matches
                recalls[topk_docs_to_retrieve] += len(matches) / len(idnts_in_cont)

                logger.info(f"Publication {publication['publication']}, Top-k {topk_docs_to_retrieve}: Found {len(matches)} matches out of {len(idnts_in_cont)} ground truth")

                logger.info(f"Missed citations: {not_matched}")

        except Exception as e:
            # Best effort: report the failure and move on to the next publication.
            logger.error(f"Error processing publication {pub_no} with model {model_name}: {e}")
            continue

    # Calculate final recalls and store results for all top-k values
    elapsed_time = time.time() - start_time

    for topk_docs_to_retrieve in topk_values:
        # Mean recall over evaluated publications; 0 when none were evaluated.
        final_recall = recalls[topk_docs_to_retrieve]/cnt if cnt > 0 else 0

        # Store results
        results[model_name][topk_docs_to_retrieve] = {
            'recall': final_recall,
            'processed_docs': cnt,
            'time_seconds': elapsed_time
        }

        logger.info(f"Model: {model_name}, Top-k: {topk_docs_to_retrieve}, Recall: {final_recall:.4f}")

    logger.info(f"Total time for model {model_name}: {elapsed_time:.2f}s")

logger.info(f"\n{'='*60}")
logger.info("OPTIMIZED evaluation completed!")
logger.info(f"{'='*60}")

[97m1577195191.py - line 51 - INFO - Starting OPTIMIZED systematic evaluation with enhanced query[0m
[97m1577195191.py - line 52 - INFO - Query: Available data, accession code, data repository, deposited data[0m
[97m1577195191.py - line 53 - INFO - Testing 5 models with top-k values [1, 3, 5][0m
[97m1577195191.py - line 54 - INFO - Optimization: Single model load per model, reuse for all publications[0m
[97m1577195191.py - line 57 - INFO - 
[97m1577195191.py - line 58 - INFO - Testing model: sentence-transformers/all-MiniLM-L6-v2[0m
[97m1577195191.py - line 71 - INFO - Loading model sentence-transformers/all-MiniLM-L6-v2...[0m
[97m1577195191.py - line 52 - INFO - Query: Available data, accession code, data repository, deposited data[0m
[97m1577195191.py - line 53 - INFO - Testing 5 models with top-k values [1, 3, 5][0m
[97m1577195191.py - line 54 - INFO - Optimization: Single model load per model, reuse for all publications[0m
[97m1577195191.py - line 57 - INFO - 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 1.82s (0.114s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (16, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2743942737579346 --> section_title: Data Availability..
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5614886283874512 --> section_title: Supplementary Material.
section_type: supplementary-material.
content: [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5950610637664795 --> section_title: Specimens..
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.47s (0.095s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2430741786956787 --> section_title: Data Availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2430741786956787 --> sectio

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.02s (0.096s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5123822689056396 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5744290351867676 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.583330512046814 --> section_title: No Title.
section_type: supplementary-material.
content: 
<p xmlns:mml=

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.53s (0.106s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2155413627624512 --> section_title: Experimental design, materials and methods.
section_type: unknown.
content: Experimental design, materials and methods
The raw data fro[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.369705319404602 --> section_title: On this page.
section_type: usa-in-page-nav.
content: On this page
Abstract
Data description
Experimental design, materials and methods[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.402523159980774 --> section_title: No Title.
section_type: tw.
content: Subject
Public Health and Health Po

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 3.25s (0.088s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4579782485961914 --> section_title: No Title.
section_type: unknown.
content: View/request a protocol for this paper from
Bio-protocol
.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5115501880645752 --> section_title: No Title.
section_type: unknown.
content: Data and materials availability:
The lectin microarray data are available through Synapse.org[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5208659172058105 --> section_title: RNA-seq data analysis.
section_type: unknown.
content: RNA-seq data analysis
For all analyses, default pa

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 3.06s (0.073s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2209899425506592 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2209899425506592 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2582637071609497 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(32.1MB, zip)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 3.85s (0.101s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1664334535598755 --> section_title: Data and code availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="htt[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3653374910354614 --> section_title: RESOURCE AVAILABILITY.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.or[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.439039945602417 --> section_title: STAR★METHODS.
section_type: unknown.
content: 
<p xmlns:mml="http://www

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.77s (0.095s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4719922542572021 --> section_title: Online content.
section_type: materials|methods.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4719922542572021 --> sectio

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.82s (0.067s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1235625743865967 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
RNA-seq data that support [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2039618492126465 --> section_title: Data availability.
section_type: unknown.
content: Data availability
RNA-seq data that support the findings of this study have been dep[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3543484210968018 --> section_title: Footnotes.
section_type: fn-group.
content: Footnotes
The authors decl

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.99s (0.069s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 0.9085713624954224 --> section_title: Data Availability.
section_type: unknown.
content: Data Availability
All relevant data are within the paper, its Supporting Information[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 0.958905816078186 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
All relevant data are with[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3720241785049438 --> section_title: No Title.
section_type: sm.
content: S1 File
Table A1: Mining algorithm

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 1.34s (0.084s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (16, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.185430645942688 --> section_title: Data Availability..
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5417289733886719 --> section_title: RNA-Seq..
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/ali[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.565658688545227 --> section_title: Supplementary Material.
section_type: supplementary-material.
content: 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.21s (0.085s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2384380102157593 --> section_title: Data Availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4921808242797852 --> section_title: Pathway enrichment analysis.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4923585653305054 --> section_title: Materials and Methods.
section_type: materials|methods.
content: 
<p x

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 1.76s (0.084s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2287788391113281 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2343218326568604 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3973374366760254 --> section_title: No Title.
section_type: supplementary-material.
content: 
<p xmlns:mml

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 1.82s (0.076s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0496220588684082 --> section_title: Experimental design, materials and methods.
section_type: unknown.
content: Experimental design, materials and methods
The raw data fro[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3254826068878174 --> section_title: No Title.
section_type: history.
content: Received 2020 Jul 15; Revised 2020 Jul 22; Accepted 2020 Jul 23; Collection date 2020 Oct.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3614237308502197 --> section_title: On this page.
section_type: usa-in-page-nav.
content: On this page
Abstra

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.64s (0.071s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2780802249908447 --> section_title: No Title.
section_type: sm.
content: Figs. S1 to S5
Tables S1 to S3
Click here for additional data file.
(1MB, pdf)[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2780802249908447 --> section_title: No Title.
section_type: sm.
content: Figs. S1 to S5
Tables S1 to S3
Click here for additional data file.
(1MB, pdf)[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3224411010742188 --> section_title: No Title.
section_type: history.
content: Received 2021 Jun 15; Accepted 2021 Nov 11; Collection date 2022 Jan

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 3.14s (0.075s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.021242618560791 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.021242618560791 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97membeddings_retriever.py - line 120 - INFO - Searching f

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 4.27s (0.112s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3424755334854126 --> section_title: RESOURCE AVAILABILITY.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.or[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3466167449951172 --> section_title: Data and code availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="htt[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3742893934249878 --> section_title: Materials availability.
section_type: unknown.
content: 
<p xmlns:mml=

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.36s (0.082s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2421495914459229 --> section_title: Supplementary information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2512493133544922 --> section_title: Online content.
section_type: materials|methods.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3536858558654785 --> section_title: Reporting summary.
section_type: unknown.
content: 
<p xmlns:mml="http

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 2.93s (0.070s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.187523603439331 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
RNA-seq data that support [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.254080057144165 --> section_title: Data availability.
section_type: unknown.
content: Data availability
RNA-seq data that support the findings of this study have been dep[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching f

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 3.08s (0.072s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 0.876539945602417 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
All relevant data are with[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 0.9184410572052002 --> section_title: Data Availability.
section_type: unknown.
content: Data Availability
All relevant data are within the paper, its Supporting Information[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2098966836929321 --> section_title: No Title.
section_type: sm.
content: S1 File
Table A1: Mining algorithm

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.60s (0.038s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (16, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.2950439453125 --> section_title: Data Availability..
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 22.27967071533203 --> section_title: Discussion.
section_type: discussion.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schema[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 22.569995880126953 --> section_title: Results.
section_type: results.
content: 
<p xmlns:mml="http://www.w3.org

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.98s (0.038s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.223514556884766 --> section_title: Data Availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.355762481689453 --> section_title: Identification of mutated elements.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http:[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.696041107177734 --> section_title: Author Contributions.
section_type: unknown.
content: 
<p xmlns:mml="h

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.65s (0.031s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 22.47622299194336 --> section_title: Mutational frequency.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 23.14689826965332 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 23.208763122558594 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.56s (0.023s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 18.32941246032715 --> section_title: Experimental design, materials and methods.
section_type: unknown.
content: Experimental design, materials and methods
The raw data fro[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 19.647945404052734 --> section_title: No Title.
section_type: tw.
content: Subject
Public Health and Health Policy
Specific subject area
Demographic and epidemiological data[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.320613861083984 --> section_title: No Title.
section_type: unknown.
content: 1.
Oliveira M.M.d., de Araújo

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.82s (0.022s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 17.78496551513672 --> section_title: No Title.
section_type: unknown.
content: Data and materials availability:
The lectin microarray data are available through Synapse.org[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 19.164363861083984 --> section_title: RNA-seq data analysis.
section_type: unknown.
content: RNA-seq data analysis
For all analyses, default parameters were used unless othe[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 19.36008644104004 --> section_title: CCLE dataset analysis.
section_type: unknown.
content: CCLE dataset anal

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.80s (0.019s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.41464614868164 --> section_title: Mutation frequency and spectrum analysis.
section_type: unknown.
content: Mutation frequency and spectrum analysis
We calculate mutatio[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.61330795288086 --> section_title: Standardization and tracking of mutation data from 12 cancer types.
section_type: unknown.
content: Standardization and tracking of mut[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.725711822509766 --> section_title: Clonality and mutation VAF analysis.
section_type: unknown.
content: Clo

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.91s (0.024s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.815021514892578 --> section_title: Data and code availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="htt[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.075258255004883 --> section_title: Differential expression analyses of brain single-cell/nucleus RNA-sequencing dat

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.68s (0.023s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.745962142944336 --> section_title: Online content.
section_type: materials|methods.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.57119369506836 --> section_title: Dendritic, delayed, stochastic CaMKII activity.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmln[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 21.933435440063477 --> section_title: Fluorescence correlation spectroscopy.
section_type: unknown.
content: 

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.95s (0.023s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 16.937339782714844 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
RNA-seq data that support [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 18.508827209472656 --> section_title: Data availability.
section_type: unknown.
content: Data availability
RNA-seq data that support the findings of this study have been dep[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 20.657875061035156 --> section_title: Evaluation of transcript expression.
section_type: unknown.
content: E

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 0.86s (0.020s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 384)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 16.336959838867188 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
All relevant data are with[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 17.275056838989258 --> sectio

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 22.10s (1.381s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (16, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0208756923675537 --> section_title: Data Availability..
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.466174840927124 --> section_title: Specimens..
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/a[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.511732816696167 --> section_title: Methods.
section_type: methods.
content: 
<p xmlns:mml="http://www.w3.o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 52.36s (2.014s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 768)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0364954471588135 --> section_title: Data Availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.406472086906433 --> sectio

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 44.13s (2.101s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3583515882492065 --> section_title: Supplementary Information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.428483486175537 --> section_title: Identifying conserved pathways and functions.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4332164525985718 --> section_title: Gene list acquisition.
section_type: unknown.
content: 
<p xmlns:mml="

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 41.92s (1.747s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.231968879699707 --> section_title: Experimental design, materials and methods.
section_type: unknown.
content: Experimental design, materials and methods
The raw data fro[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3888587951660156 --> section_title: No Title.
section_type: tw.
content: Subject
Public Health and Health Policy
Specific subject area
Demographic and epidemiological data[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 47.65s (1.288s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3436335325241089 --> section_title: No Title.
section_type: unknown.
content: Data and materials availability:
The lectin microarray data are available through Synapse.org[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3436335325241089 --> secti

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 53.91s (1.284s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3896219730377197 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3896219730377197 --> section_title: No Title.
section_type: sm.
content: Supplementary Data
(22.3MB, zip)
This zipped file contains Supplementary Tables 1 and 3-14. (ZIP 2[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3985424041748047 --> section_title: Extended data figures and tables.
section_type: unknown.
content: Ext

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 55.74s (1.467s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0393216609954834 --> section_title: Data and code availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="htt[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2196063995361328 --> secti

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 47.92s (1.652s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3502777814865112 --> section_title: Extended data.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schema[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3589370250701904 --> section_title: Supplementary information.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.nis[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4132907390594482 --> section_title: Online content.
section_type: materials|methods.
content: 
<p xmlns:m

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 57.24s (1.363s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 0.9904440641403198 --> section_title: Data availability.
section_type: unknown.
content: Data availability
RNA-seq data that support the findings of this study have been dep[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0218825340270996 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
RNA-seq data that support [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4626680612564087 --> section_title: Study participants.
section_type: unknown.
content: Study participant

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 47.46s (1.104s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 768)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0529696941375732 --> section_title: Data Availability.
section_type: unknown.
content: Data Availability
All relevant data are within the paper, its Supporting Information[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1275951862335205 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
All relevant data are with[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.239210844039917 --> section_title: No Title.
section_type: sm.
content: S1 File
Table A1: Mining algorith

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 6.37s (0.398s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (16, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3736333847045898 --> section_title: Data Availability..
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.6809864044189453 --> section_title: Specimens..
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/a[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.6881980895996094 --> section_title: RNA-Seq..
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 19.93s (0.767s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (26, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1783170700073242 --> section_title: Data Availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.n[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.6096922159194946 --> section_title: Author Contributions.
section_type: unknown.
content: 
<p xmlns:mml="http://www

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 10.46s (0.498s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (21, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5500264167785645 --> section_title: Gene list acquisition.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.or[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.6044206619262695 --> section_title: Methods.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 9.14s (0.381s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (24, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.2953803539276123 --> section_title: No Title.
section_type: tw.
content: Subject
Public Health and Health Policy
Specific subject area
Demographic and epidemiological data[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.3133571147918701 --> section_title: Experimental design, materials and methods.
section_type: unknown.
content: Experimental design, materials and methods
The raw data fro[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5175786018371582 --> section_title: Table 1..
section_type: tw.
content: Table 1.
SPNeoDeath dataset data 

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 13.33s (0.360s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (37, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.355638861656189 --> section_title: No Title.
section_type: unknown.
content: Data and materials availability:
The lectin microarray data are available through Synapse.org[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5880420207977295 --> section_title: Patient-derived organoids.
section_type: unknown.
content: Patient-derived organoids
Tumor tissues of freshly resected biopsies from pa[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.6006584167480469 --> section_title: Sample processing for lectin Array.
section_type: unknown.
content: Sa

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 12.49s (0.297s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4246423244476318 --> section_title: Supplementary information.
section_type: unknown.
content: Supplementary information
The online version of this article (doi:10.1038/na[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4246423244476318 --> section_title: Supplementary information.
section_type: unknown.
content: Supplementary information
The online version of this article (doi:10.1038/na[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4246423244476318 --> section_title: Supplementary information.
section_type: unknown.
content: Supplement

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 13.95s (0.367s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (38, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4909651279449463 --> section_title: RESOURCE AVAILABILITY.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.or[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.518831729888916 --> section_title: Data and code availability.
section_type: data-availability.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="htt[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5648542642593384 --> section_title: Multi-omics analysis.
section_type: unknown.
content: 
<p xmlns:mml="h

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 14.63s (0.504s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (29, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.4470504522323608 --> section_title: Online content.
section_type: materials|methods.
content: 
<p xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.475421667098999 --> section_title: Extended data.
section_type: unknown.
content: 
<p xmlns:mml="http://www.w3.org/

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 13.10s (0.312s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (42, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0041468143463135 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
RNA-seq data that support [0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.0057947635650635 --> section_title: Data availability.
section_type: unknown.
content: Data availability
RNA-seq data that support the findings of this study have been dep[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5522273778915405 --> section_title: Evaluation of transcript expression.
section_type: unknown.
content: 

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97membeddings_retriever.py - line 55 - INFO - Embedding time: 12.61s (0.293s per doc)[0m
[97membeddings_retriever.py - line 56 - INFO - Corpus embedding completed. Shape: (43, 384)[0m
[97membeddings_retriever.py - line 120 - INFO - Searching for top-5 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 102 - INFO - Computing L2 distances using numpy.[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1236889362335205 --> section_title: Data Availability Statement.
section_type: data-availability-statement.
content: Data Availability Statement
All relevant data are with[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.1332188844680786 --> section_title: Data Availability.
section_type: unknown.
content: Data Availability
All relevant data are within the paper, its Supporting Information[0m
[97m1577195191.py - line 132 - INFO - L2 Norm 1.5177226066589355 --> section_title: Funding Statement.
section_type: unknown.
content: Funding Statement


In [10]:

# Report cell: tabulate recall per (model, top-k) and rank the configurations.
# Reads `results` ({model_name: {topk: metrics}}, where each metrics dict has
# 'recall', 'processed_docs' and 'time_seconds') and `topk_values` from earlier cells.
print("="*80)
print("VECTOR RETRIEVAL EVALUATION RESULTS")
print("="*80)

# Create a summary table.
# `summary_data` holds display-ready rows (recall/time pre-formatted as strings);
# `scored_rows` pairs each row with its unrounded recall so the rankings below
# are not distorted by the 4-decimal display formatting.
summary_data = []
scored_rows = []
for model_name, model_results in results.items():
    for topk, metrics in model_results.items():
        row = {
            'Model': model_name.split('/')[-1],  # Just the model name without org
            'Top-K': topk,
            'Recall': f"{metrics['recall']:.4f}",
            'Processed Docs': metrics['processed_docs'],
            'Time (s)': f"{metrics['time_seconds']:.2f}"
        }
        summary_data.append(row)
        scored_rows.append((metrics['recall'], row))

# Convert to DataFrame for nice display
summary_df = pd.DataFrame(summary_data)

print("\nSUMMARY TABLE:")
print("-" * 80)
print(summary_df.to_string(index=False))

# Find best performing configurations
print("\n" + "="*80)
print("BEST PERFORMING CONFIGURATIONS:")
print("="*80)

if scored_rows:
    # Best overall recall (max() keeps the first configuration on exact ties)
    best_overall = max(scored_rows, key=lambda pair: pair[0])[1]
    print(f"🏆 Best Overall Recall: {best_overall['Recall']} - {best_overall['Model']} (Top-K: {best_overall['Top-K']})")

    # Best for each top-k value
    for k in topk_values:
        k_results = [pair for pair in scored_rows if pair[1]['Top-K'] == k]
        if k_results:
            best_k = max(k_results, key=lambda pair: pair[0])[1]
            print(f"🎯 Best for Top-K={k}: {best_k['Recall']} - {best_k['Model']}")
else:
    # max() raises ValueError on an empty sequence; report the situation instead.
    best_overall = None
    print("No results to rank.")

# Performance comparison by model
print("\n" + "="*80)
print("PERFORMANCE BY MODEL (Average Recall):")
print("="*80)

model_averages = {}
for model_name, model_results in results.items():
    if not model_results:
        # No top-k runs recorded for this model; skip it rather than divide by zero.
        continue
    avg_recall = sum(metrics['recall'] for metrics in model_results.values()) / len(model_results)
    model_averages[model_name.split('/')[-1]] = avg_recall

# Highest average recall first
for model, avg_recall in sorted(model_averages.items(), key=lambda x: x[1], reverse=True):
    print(f"📊 {model}: {avg_recall:.4f}")

print("\n" + "="*80)
print("EVALUATION COMPLETE!")
print("="*80)

VECTOR RETRIEVAL EVALUATION RESULTS

SUMMARY TABLE:
--------------------------------------------------------------------------------
                     Model  Top-K Recall  Processed Docs Time (s)
          all-MiniLM-L6-v2      1 0.3000              10    32.61
          all-MiniLM-L6-v2      3 0.6000              10    32.61
          all-MiniLM-L6-v2      5 0.6000              10    32.61
         all-MiniLM-L12-v2      1 0.3000              10    30.04
         all-MiniLM-L12-v2      3 0.3000              10    30.04
         all-MiniLM-L12-v2      5 0.3000              10    30.04
   paraphrase-MiniLM-L3-v2      1 0.4000              10    12.19
   paraphrase-MiniLM-L3-v2      3 0.5500              10    12.19
   paraphrase-MiniLM-L3-v2      5 0.5500              10    12.19
multi-qa-mpnet-base-cos-v1      1 0.4000              10   476.43
multi-qa-mpnet-base-cos-v1      3 0.7000              10   476.43
multi-qa-mpnet-base-cos-v1      5 0.7000              10   476.43
 multi-qa

In [11]:
# 7. Define your trainset (queries and expected retrievals)
# Each dspy.Example below pairs a `question` (the retrieval query) with
# `references`, a one-element list holding the passage text expected to be retrieved.
# Every example reuses the same placeholder query ("Data is available with accession
# code ABC0123 in Repository XYZ"); only the reference passage differs.
# The first four references are short data-availability style sections; the last two
# are whole methods sections whose closing subsection carries the accession statement.
# NOTE(review): no `.with_inputs(...)` is called on these examples here — confirm
# whether the downstream DSPy consumer needs the input keys marked.
trainset = [
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Data Availability Statement\nThe datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: NCBI GEO repository,\nGSE123128\n."]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data Availability Statement\nRaw sequencing data from this study have been deposited in the GEO database with the accession number\nGSE171155\n. The mass spectrometry proteomics data have been deposited to the ProteomeXchange Consortium via the PRIDE [1] partner repository with the data set identifier PXD024161 and 10.6019/PXD024161.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Associated Data\nThis section collects any data citations, data availability statements, or supplementary materials included in this article.\nSupplementary Materials\nDocument S1. Figures\xa0S1–S6 and Tables\xa0S1–S6\nmmc1.pdf\n(2.5MB, pdf)\nDocument S2. Article plus supplemental information\nmmc2.pdf\n(9.1MB, pdf)\nData Availability Statement\n•\nThe next-generation DNA sequencing dataset generated during this study is available at the National Genomics Data Center: HRA003231 (URL:\nhttps://ngdc.cncb.ac.cn\n). The mass spectrometry proteomics data reported in this paper have been deposited to the ProteomeXchange Consortium: PXD037076(\nhttp://proteomecentral.proteomexchange.org\n) via iProx partner repository\n61\n.\n•\nThis paper does not report the original code.\n•\nAny additional information required to reanalyze the data reported in this work paper is available from the\nlead contact\nupon request.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data and Code Availability\nRNA-seq data generated in this study are available at NCBI GEO database with the accession number\nGSE151029\n. The 53BP1 mass spectrometry data have been deposited to the ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD020090. The accession number for the FOXK1 and FOXK2 MS data reported in this paper is PRIDE: PXD001383']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["METHODS\nConstruction of Plasmids\nThe protein-coding regions of the NST3 gene were amplified from the Arabidopsis thaliana cDNA library with appropriate primers (see Supplemental Table 2 online). The 5′ upstream region of 3027 bp, which extended from the site of initiation of translation of the NST3 gene, was used for preparation of the ProNST3:GUS, ProNST3:NST3, and ProNST3:NST3SRDX gene constructs. These genes and 35S:NST3 were constructed from modified vectors derived from pGreenII0029 (Hellens et al., 2000) and p35SSRDXG (Mitsuda et al., 2006). For complementation analysis, we used genomic fragments including NST1 (9580 bp) and NST3 (5199 bp), which contained 6523 and 3069 bp of the respective promoter regions. The region corresponding to the transgene of each vector, with the exception of the pGreen-based vectors, was transferred to the pBCKH plant expression vector (Mitsuda et al., 2006) using the Gateway system (Invitrogen).\n\nConditions for Plant Growth and Transformation\nArabidopsis plants were grown in soil at 22°C with 16 h (long-day condition) or 8 h (short-day condition) of light daily. Unless otherwise stated, plants were grown under the long-day condition. For transformation, a T-DNA vector carrying the appropriate construct was introduced into Agrobacterium tumefaciens strain GV3101 by electroporation, and the resultant Agrobacterium was infiltrated into Arabidopsis using the floral dip method (Clough and Bent, 1998).\n\n\nAssessment of the Mechanical Strength of Inflorescence Stems\nWe used the bottom 5 cm of inflorescence stems taller than 25 cm for measurement of Young's modulus according to a previously described method (Kojima and Yamamoto, 2004).\n\nExamination of the Crystal State of Cellulose Microfibrils of Inflorescence Stems\nThe bottom region of the inflorescence stems, as described above, was used for x-ray diffraction analysis according to a previously described method (Abe and Yamamoto, 2005). 
Nickel-filtered Cu Kα radiation (wavelength, 0.154 nm) at 30 kV and 35 mA was used with the reflection technique.\n\nIsolation of RNA, Microarray Experiments, and Analysis\nTotal RNA was isolated with Trizol as described previously (Fukuda et al., 1991) from the bottom 4 cm of the inflorescence stems of three independent plants grown under the short-day condition and with a height of between 13 and 17 cm. Microarray analyses were performed with the Arabidopsis 2 Oligo Microarray (Agilent Technologies). All microarray experiments and the analysis of data were performed as described previously (Mitsuda et al., 2005) with the exceptions summarized below. P values for differences between nst1-1 nst3-1 and wild-type plants were calculated by Welch's t test, based on a two-tailed distribution (n = 3). To minimize type-I family-wise errors in multiple and simultaneous statistical tests, we adopted a strategy for suppression of false positives. We calculated a Q-value to estimate the false discovery rate from the P value described above using QVALUE software (Storey and Tibshirani, 2003) with the default setting. We considered genes with a Q-value of <0.1 to be genes expressed at different levels in nst1-1 nst3-1 and wild-type plants. Comprehensive gene group analysis using Fisher's exact test was performed with the R program package (http://www.r-project.org/). Quantitative RT-PCR was performed as described previously (Mitsuda et al., 2005). For the analysis of NST transcripts in the mutant lines, RT-PCR was performed with appropriate primers (see Supplemental Table 2 online).\n\nLight and Fluorescence Microscopy\nFor observations of lignin autofluorescence, we used a filter with the following specifications: glass, 365; dichroic mirror, 395; long-pass, 400. To observe ectopic secondary wall thickening, we cleared tissues by incubating them overnight in 70% lactic acid at 50°C. 
To prepare 70- to 150-μm sections of inflorescence stems and hypocotyls, we embedded the tissues in 3% agar then sectioned them on a vibrating microtome (HM-650V; Microm). Assays of GUS activity were performed with T1 or T2 transgenic plants. Plant tissues were fixed briefly, in some cases, in solution containing 0.3% formalin, 0.2% MES, pH 5.8, and 0.3 M mannitol before incubation in 100 mM sodium phosphate buffer, pH 7.0, containing 0.1% Triton X-100, 1 mM 5-bromo-4-chloro-3-indolyl-β-d-glucuronide, and 0.5 mM potassium ferricyanide at 37°C for up to 12 h. Stained stems and hypocotyls were embedded in 3% agar and sectioned. All observations by light and fluorescence microscopy were made with the Axioskop2 plus system (Carl Zeiss).\n\nUltrastructural Observation by Transmission Electron Microscopy\nShort pieces of inflorescence stems were fixed in 30 mM HEPES buffer containing 2% paraformaldehyde and 2% glutaraldehyde then fixed in HEPES buffer containing 2% osmium tetroxide. Fixed tissues were embedded in Q651 resin (Nissin EM). Sections of 80 to 90 nm thick were post-stained with uranyl acetate and lead citrate and observed by a JEM1200EX transmission electron microscope (JEOL) at an accelerating voltage of 80 kV.\n\nIdentification of NST Homologs in Poplar\nPoplar NAC genes resembling the Arabidopsis NST genes were collected using the Advanced Search tool of the Joint Genome Initiative poplar database (http://genome.jgi-psf.org/Poptr1/Poptr1.home.html) with the command, “find by homology to related protein with E-value <1.0e-20”; the database for Populus trichocarpa; and the query “At2g46770.” The 62 extracted sequences and amino acid sequences of subfamily IIb of NAC transcription factors of Arabidopsis, as defined in a previous study (Mitsuda et al., 2005), were aligned using the ClustalW program with default settings (Chenna et al., 2003). The amino acid sequences corresponding to conserved NAC domains were extracted and realigned. 
A phylogenetic tree was built by neighboring-joining method using ClustalW with default settings (an alignment and the sequences are shown in Supplemental Table 3 online). Bootstrap values were calculated from 100 trials. The subtree including the NST and VND genes is shown in Figure 7.\n\nAccession Numbers and Data Deposition\nNST1 and NST3 reported in this study correspond to the Arabidopsis Genome Initiative locus identifiers At2g46770 and At1g32770, respectively. Microarray data performed in this study can be found in the National Center for Biotechnology Information Gene Expression Omnibus data library under accession number GSE5187.\n"]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Materials and methods\nMaterials\nDilution series\nIllumina HumanCNV370-Duo BeadChip Infinium SNP data for dilution series of 12 mixtures of cancer cell line (HCC1395) mixed with its paired normal cell line (HCC1395BL) were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE11976]. We excluded chromosome 6 and 16 from analysis due to copy genomic aberrations present in the normal cell line HCC1395BL.\n\nCancer cell lines\nIllumina HumanHap300 data for the promyelocytic leukemia cancer cell HL-60 and colon cancer cell line HT-29 were obtained from Illumina, and Human-610 Quad SNP genotyping data for the colon cancer cell lines SW403, SW480, SW620, SW837, SW1417 and LIM1863 were generated at the Ludwig Institute of Cancer Research using standard processing protocols. The genotyping data for breast cancer cell lines MDA-175 and MDA-468 were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE18799] [23].\n\nPrimary breast tumors\nThree breast tumors (cases 114, 601 and 3,364) that had not received non-neoadjuvant therapy were analyzed in detail using material derived from microdissection. For each case, material containing pure tumor and pure stroma cells respectively was microdissected and compared to data obtained from surgically obtained material from the same tumors. Case 114 was of Luminal B type (23 mm tumor, moderately differentiated infiltrating ductal carcinoma with an extensive in-situ component. Node +ve, ER +ve (6.8 fm/mg protein), EGFR -ve (7.8 fm/mg protein)). Case 601 (20 mm 30 mm tumor, grade 3 with intraductal in-situ ca. 
and in filtrating ductal carcinoma, node +ve, ER -ve (1.5 fm/mg protein), Her2 +ve (histoscore of 3), EGFR +ve (histoscore of 208)) was classified as ERBB2 positive based on expression microarray data with a fractional rank of 0.982, Case 3,364 was 25 mm grade 3 infiltrating ductal carcinoma, ER positive (8 fm/mg protein), PR positive (histoscore 8/8), Her2 positive (histoscore 3+, one of ten axillary nodes +ve). For each case, DNA was extracted from microdissected stroma and tumor, as well as the original non-dissected sample and analyzed using Illumina Human-610 Quad SNP arrays applying standard protocols.\n\nData processing\nGenome Alteration Print was downloaded [43] and used to analyze all datasets using default settings and the highest ranked copy number and LOH predictions used for comparisons. However, for the cancer cell line dilution series, we re-used the results that had previously been generated by [23] and made available on the aforementioned website.\n\nGenoCN v1.06 was downloaded [44] and used with default settings and stromal contamination settings on for all datasets generated using Illumina Infonaut II SNP arrays. Adjusted GenoCN parameters for the Log R Ratio levels were used for Infonaut HD SNP array processing and in these instances we used the same levels that we specified for OncoSNP. The copy number and LOH predictions from the Viterbi sequence were used for comparisons.\n\nOncoSNP was run on all datasets using 15 EM iterations and with both stromal and intra-tumor heterogeneity options. In all cases, the ploidy prediction with the highest maximum likelihood was chosen and the Viterbi sequence of tumor states used for comparisons. 
We filtered detected aberrations using a Log Bayes Factor of 30.\n\nStatistical model\nA complete description of our statistical model is provided in Supplementary Information in Additional file 1.\n\nLet xi denote the tumor state at the i-th probe location and (xi, n, xi, t) denote the associated normal and tumor copy numbers. Furthermore, let zi = (zi, n,zi, t) denote the B allele count for the normal and tumor genotype respectively. The combinations (zi, n, (xi, n) and (zi, t, xi, t) fully define the normal and tumor genotypes respectively. The tumor state at each probe denotes the allowable combinations of normal-tumor genotypes at that location as shown in Table 1.\n\nLet π0 denote the normal DNA fraction of the tumor sample due to stromal contamination and 𝜋={𝜋𝑖}𝑛\n𝑖=1 denote the proportion of tumor cells having the normal genotype at each probe. The data 𝑦={𝑦𝑖}𝑛\n𝑖=1 consists of a set of two-dimensional vectors yi = [ri, bi]' whose elements correspond to the Log R Ratio and B allele frequency respectively.\n\nGiven (x, z, π, π0) the data is assumed to be distributed according to a (K + 1)-component mixture of Student t-distributions, where ki indicates the mixture component assignment of the i-th data point,\n\n𝑦𝑖|𝑥𝑖,𝑧𝑖,𝑘𝑖,𝑚,𝛿, 𝛴={ \n𝑆⁢𝑡(𝑚⁡(𝑥𝑖,𝑧𝑖)+𝛿(𝑙𝑙)\n𝑘𝑙,∑(𝑙𝑖)\n𝑘𝑖,𝜈),	𝑘≠0,\n𝑈𝑟⁡(𝑟min,𝑟max)×U𝑏⁢(0,1),	𝑘=0,\n \n(1)\nwhere 𝑆⁢𝑡⁡(𝛿(𝑙)\n𝑘,𝛴(𝑙)\n𝑘,𝑣) is the probability density function of the Student t-distribution with mean 𝛿(𝑙)\n𝑘 and covariance matrix 𝛴(𝑙)\n𝑘 associated with the k-th mixture component and the l-th genotype class and v degrees of freedom. 
The 0-th component is an outlier class which assumes uniformly distributed data over a specified range.\n\nThe elements of the mean vectors m(xi, zi) = [mr(xi), mb(zi, xi)]' are given by the following:\n\n𝑚𝑟⁡(𝑥𝑖)=(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢\n̅\n𝑟\n𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢\n̅\n𝑟\n𝑥𝑥𝑖,⁢𝑡+𝛽0+𝛽1⁢𝑔𝑖,\n(2)\nwhere gi is the local GC content at the i-th probe location and\n\n𝑚𝑏⁡(𝑧𝑖,𝑥𝑖)=\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑧𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑧𝑖,𝑡\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑥𝑖,𝑡\n \n.\n(3)\nPrior distributions\nThe prior distribution on the mixture weights is given by a Dirichlet distribution:\n\n𝑤(𝑙)|𝛼~𝐷⁢𝑖⁢𝑟⁡(𝛼),\n(4)\nwhere α is a concentration parameter which in the numerical results we used α = 1 to give a at prior on the mixture weights.\n\nThe prior distributions on the mixture centers and covariance matrices are given by standard conjugate Normal-Inverse Wishart distributions:\n\n𝛿(𝑙)\n𝑘|𝜏, 𝛴(𝑙)\n𝑘~𝑁⁡(0,𝜏 𝛴(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(5)\n𝛴(𝑙)\n𝑘|𝛾, 𝑆(𝑙)\n𝑘~𝐼⁢𝑊⁡(𝛾,𝑆(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(6)\nwhere τ is a hyperparameter that controls the strength of the prior and IW(γ, Λ) denotes the Inverse-Wishart distribution with parameter γ and scale matrix Λ.\n\nA beta prior is assumed for the outlier rate,\n\n𝜂|𝛼𝜂, 𝛽𝜂~𝐵⁢𝑒⁡(𝛼𝜂,𝛽𝜂),\n(7)\nwhere (αn, βn) are hyperparameters associated with the Beta prior. For the numerical results we set these as (1,1) to give a uniform distribution. 
\n\nA normal prior is assumed for the local GC content regression parameters,\n\n𝛽|𝜆𝛽~𝑁⁡(0,𝜆𝛽⁢𝐼2),\n(8)\nwhere Ip is a p × p identity matrix.\n\nA discrete prior is assumed for the stromal contamination content and intra-tumour heterogeneity levels,\n\n𝑝⁡(𝜋0)={ \n𝛼𝜋0,𝜋0=0,\n𝛽𝜋0,𝜋0>0,\n \n(9)\nand\n\n𝑝⁡(𝜋𝑖)={ \n𝛼𝜋,𝜋𝑖=0,\n𝛽𝜋,𝜋𝑖>0,\n  𝑖=1,…,𝑛,\n(10)\nwhere in the numerical results we have used απ0 = βπ0 = 1 and απ = 1, βπ = 2.\n\nThe tumor states are assumed to form an inhomogeneous Markov Chain with transition matrix,\n\n𝑝⁡(𝑥𝑖|𝑥𝑖−1)={ \n1−𝜌,𝑥𝑖=𝑥𝑖−1,\n𝜌,𝑥𝑖≠𝑥𝑖−1,\n \n(11)\nwhere ρ = (1/2) (1-exp(-(1/2L) (si-si-1) and si is the physical coordinate of the i-th probe and L is a characteristic length which we set as L = 2,000,000 for the numerical results.\n\nPosterior inference\nWe estimated the unknown model parameters using an expectation-maximization algorithm. Multiple restarts were used to explore different baseline of the Log R Ratio and the baseline with the greatest likelihood was chosen for the calculation of summary statistics.\n\nSummary statistics\nWe used the Viterbi algorithm to extract the most likely sequence of tumors states and for each aberrant segment in the Viterbi sequence we calculated an approximate Bayes Factor (score) of that segment belonging to each of the tumor states. In addition we also recorded the maximum a posteriori estimates of the Log R Ratio baseline adjustment β0 and the stromal contamination π0.\n\nAvailability\nA MATLAB based implementation (for 64 bit Linux systems) of our software is available for academic and non-commercial use from the associated website [45]. In addition, SNP data analyzed in this paper are also available from this website and from the Gene Expression Omnibus Database under Accession No.[GEO:GSE23785]."]
        )
]