# Vector Retrieval Experiment

In [1]:
import pandas as pd
from data_gatherer.data_gatherer import DataGatherer
from data_gatherer.parser.xml_parser import XMLParser
from data_gatherer.parser.html_parser import HTMLParser
from data_gatherer.logger_setup import setup_logging
from data_gatherer.retriever.embeddings_retriever import EmbeddingsRetriever
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from lxml import etree
import dspy
import logging
import re
import os
import time
from collections import defaultdict

In [2]:
logger = setup_logging('vector_retrieval_experiment', './logs/vector_retrieval_experiment.log', level=logging.INFO)

In [3]:
logger.info("Starting vector retrieval experiment")

[97m1301467205.py - line 1 - INFO - Starting vector retrieval experiment[0m


## 1. Load corpus and ground truth

In [4]:
input_corpus = pd.read_parquet('scripts/exp_input/Local_fetched_data.parquet')  # or load HTML and extract text
ground_truth = pd.read_parquet('scripts/output/gold/dataset_citation_records_Table.parquet')  # adjust as needed

# Add a warning about input data:
logger.info(f"Corpus shape: {str(input_corpus.shape)}")
logger.info(f"Ground truth shape: {str(ground_truth.shape)}")

[97m3230141443.py - line 5 - INFO - Corpus shape: (3503, 7)[0m
[97m3230141443.py - line 6 - INFO - Ground truth shape: (401327, 7)[0m


In [5]:
ground_truth['pmc_id'] = ground_truth['citing_publication_link'].str.extract(r'(PMC\d+)', flags=re.IGNORECASE)

In [6]:
input_corpus.head()  # Check the structure of the corpus

Unnamed: 0,file_name,raw_cont,format,length,path,publication,pub_title
0,miR-33b-3p Acts as a Tumor Suppressor by Targe...,"<html lang=""en"" class=""""><head>\n\n <me...",html,205313,../html_xml_samples/PMC/miR-33b-3p Acts as a T...,pmc8595470,
1,Murine neuronatin deficiency is associated wit...,"<html lang=""en"" class=""""><head>\n\n <me...",html,238825,../html_xml_samples/PMC/Murine neuronatin defi...,pmc8413370,
2,Using patient-derived organoids to predict loc...,"<html lang=""en"" class=""""><head>\n\n <me...",html,302206,../html_xml_samples/PMC/Using patient-derived ...,pmc9975107,
3,FOXK1 Participates in DNA Damage Response by C...,"<html lang=""en"" class=""""><head>\n\n <me...",html,249460,../html_xml_samples/PMC/FOXK1 Participates in ...,pmc7458625,
4,JAK-STAT Pathway Inhibition Partially Restores...,"<html lang=""en"" class=""""><head>\n\n <me...",html,231479,../html_xml_samples/PMC/JAK-STAT Pathway Inhib...,pmc7911100,


In [7]:
ground_truth.head()  # Check the structure of the ground truth

Unnamed: 0,identifier,repository,citing_publication_link,citation_record_source,citation_record_from_doi,doi,pmcid,pmc_id
0,PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1,proteomexchange_search.tsv,1,10.1038/S41467-025-56720-1,,
1,PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312,proteomexchange_search.tsv,1,10.6019/PXD051312,,
2,PXD051312,PRIDE,https://dx.doi.org/10.1002/prca.202400095,proteomexchange_search.tsv,1,10.1002/prca.202400095,,
3,PXD051312,PRIDE,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,proteomexchange_search.tsv,0,,PMC11895760,PMC11895760
4,PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571,proteomexchange_search.tsv,1,10.17159/SAJS.2025/18571,,


In [8]:
# Enhanced query using hackathon context trigger keywords
query_ontology_aware = """Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
"""
query_augmented = """Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data archive, data portal, accession code"""
query_base = "Available data, accession code, data repository, deposited data"

query_base_adj = "Data Availability Statement, Methods with dataset mention(s), Deposited data, Data Accession, Data Provenance, Downloaded data."

query = query_ontology_aware

In [9]:
#xml_parser = XMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True)
#html_parser = HTMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True)

Note: some files are being skipped because of ground truth incompleteness.

In [None]:
# Systematic evaluation of different models and top-k values (OPTIMIZED)

# Define models to test
models_to_test = [
    # Base models
    ###'sentence-transformers/all-MiniLM-L6-v2', 
    #'sentence-transformers/all-mpnet-base-v2',
    #'sentence-transformers/all-MiniLM-L12-v2',
    #'sentence-transformers/sentence-t5-base',

    # BioMed
    #"sentence-transformers/embeddinggemma-300m-medical",
    #"neuml/pubmedbert-base-embeddings",
    #'sentence-transformers/allenai-specter',
    ###'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext',

    # MSMARCO passage ranking models
    ###'sentence-transformers/msmarco-distilbert-base-v4',
    #'sentence-transformers/msmarco-bert-base-dot-v5',
    #'sentence-transformers/msmarco-distilbert-dot-v5',
    #'sentence-transformers/msmarco-distilbert-base-tas-b',

    # Paraphrase models
    ###'sentence-transformers/paraphrase-MiniLM-L3-v2',

    # Semantic Search / QA
    #'sentence-transformers/multi-qa-mpnet-base-cos-v1',
    #'sentence-transformers/multi-qa-distilbert-cos-v1',
    ###'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
    'deepset/roberta-base-squad2'
]

# Define top-k values to test
topk_values = [1, 3, 5, 9]
max_k = max(topk_values)  # We'll retrieve this many and slice for smaller k values

# Store results - Initialize properly for all queries
results = defaultdict(lambda: defaultdict(dict))

logger.info(f"Starting OPTIMIZED systematic evaluation with enhanced query")
logger.info(f"Testing {len(models_to_test)} models with top-k values {topk_values}")
logger.info(f"Optimization: Single model load per model, reuse for all publications")

queries = ['query_ontology_aware', 'query_augmented', 'query_base', 'query_base_adj']


for model_name in models_to_test:
    # Clean up previous embeddings
    if os.path.exists("corpus_embeddings.npy"):
        os.remove("corpus_embeddings.npy")

    start_time = time.time()  
    cnt = 0

    #html_parser.embeddings_retriever.embed_query(query)
    logger.info(f"\n{'='*60}")
    logger.info(f"Testing model: {model_name}")
    logger.info(f"{'='*60}")
    html_parser = HTMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True, embeddings_model_name=model_name)
    xml_parser = XMLParser('open_bio_data_repos.json', logger, llm_name='gemini-2.0-flash', use_portkey=True, embeddings_model_name=model_name)

    queries_recall = {(q_name, topk): 0 for q_name in queries for topk in topk_values}
    
    for i, publication in input_corpus.sample(n=100).iterrows():
        if 1 == 100:  # Limit to first 100 publications for faster testing
            break
        logger.info(f"Publication: {publication['publication']}")
            
        gt = ground_truth[ground_truth['pmc_id'].str.lower() == publication['publication'].lower()]
        idnts = gt['identifier'].tolist()

        logger.info(f"Identifiers in ground truth: {idnts}")
            
        if publication['format'] == 'xml':
            sections = xml_parser.extract_sections_from_xml(etree.fromstring(publication['raw_cont'].encode('utf-8')))
            sections = xml_parser.from_sections_to_corpus(sections)
            parser = xml_parser
        elif publication['format'] == 'html':
            clean_html = html_parser.normalize_HTML(publication['raw_cont'])
            sections = html_parser.extract_sections_from_html(clean_html)
            sections = html_parser.from_sections_to_corpus(sections)
            parser = html_parser

        else:
            logger.warning(f"Unsupported format {publication['format']} for publication {publication['publication']}. Skipping.")
            continue
        
         # Check if identifiers are in content
        idnts_in_cont = []
        for idnt in idnts:
            if idnt in publication['raw_cont']:
                 idnts_in_cont.append(idnt)
            
        logger.info(f"Identifiers in content: {idnts_in_cont}")
            
        if not idnts_in_cont:
            continue

        cnt += 1

        # Prepare corpus
        corpus = []
        for section in sections:
            corpus.append({
                'sec_txt': 'Section Title: ' + section['section_title'] + 
                        '. Content: ' + section['sec_txt']
            })
            
        logger.info(f"Corpus:\n{str.join('\n',[item['sec_txt'] for item in corpus])}")

        try:
            parser.embeddings_retriever.embed_corpus(corpus, batch_size=128)
        except Exception as e:
            logger.error(f"Error embedding corpus for publication {publication['publication']}: {e}")
    
        for q_i,query in enumerate([query_ontology_aware, query_augmented, query_base, query_base_adj]):
            q_name = queries[q_i]
            logger.info(f"Query {q_i} {q_name}: {query}")
            
            html_parser.embeddings_retriever.embed_query(query)
            xml_parser.embeddings_retriever.embed_query(query)
            
            # OPTIMIZATION: Only embed corpus (model already loaded)
            try:
                #parser.embeddings_retriever.embed_corpus(corpus, batch_size=128)
                
                # OPTIMIZATION: Single retrieval with max_k, then slice for different k values
                full_result = parser.embeddings_retriever.search(query=None, k=max_k)

                for full_result_item in full_result:
                    logger.info(f"L2 Norm {full_result_item['L2_distance']} --> {full_result_item['text'][:150]}")

                # Evaluate for all top-k values using the same retrieval result
                for topk_docs_to_retrieve in topk_values:
                    logger.info(f"Evaluating with top-k = {topk_docs_to_retrieve}")
                    
                    # Slice results for current k value
                    result = full_result[:topk_docs_to_retrieve]
                    
                    # Combine all retrieved text
                    iterres = '. '.join([r['text'] for r in result])
                    
                    # Check matches
                    matches = set()
                    not_matched = set()
                    for j, row in gt.iterrows():
                        if row['identifier'].lower() in iterres.lower():
                            queries_recall[q_name,topk_docs_to_retrieve] += 1/len(idnts_in_cont)
                            matches.add(row['identifier'])
                    
                    not_matched = set(idnts_in_cont) - matches
                    
                    logger.info(f"Publication {publication['publication']}, Top-k {topk_docs_to_retrieve}: Found {len(matches)} matches out of {len(idnts_in_cont)} ground truth")
                    
                    logger.info(f"Missed citations: {not_matched}")

            except Exception as e:
                logger.error(f"Error processing publication {i+1} with model {model_name}: {e}")
                continue
        
            # Calculate final recalls and store results for all top-k values
            
    for topk_docs_to_retrieve in topk_values:
        for q_i, q_name in enumerate(queries):
            final_recall = queries_recall[q_name,topk_docs_to_retrieve]/cnt if cnt > 0 else 0

            # Store results - Create a unique key combining model and query
            result_key = f"{model_name}_{q_name}"
            results[result_key][topk_docs_to_retrieve] = {
                'recall': final_recall,
                'processed_docs': cnt,
                'query': q_name,
                'model': model_name
            }
                
            logger.info(f"Model: {model_name}, Top-k: {topk_docs_to_retrieve}, Recall: {final_recall:.4f}")

    elapsed_time = time.time() - start_time    
    logger.info(f"Total time for model {model_name}: {elapsed_time:.2f}s")

logger.info(f"\n{'='*60}")
logger.info("OPTIMIZED evaluation completed!")
logger.info(f"{'='*60}")

[97m3210742098.py - line 40 - INFO - Starting OPTIMIZED systematic evaluation with enhanced query[0m
[97m3210742098.py - line 41 - INFO - Testing 1 models with top-k values [1, 3, 5, 9][0m
[97m3210742098.py - line 42 - INFO - Optimization: Single model load per model, reuse for all publications[0m
[97m3210742098.py - line 56 - INFO - 
[97m3210742098.py - line 57 - INFO - Testing model: deepset/roberta-base-squad2[0m
[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 22 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mhtml_parser.py - line 83 - INFO - Initializing htmlRetriever with model: deepset/roberta-base-squad2[0m
[97membeddings_retriever.py - line 39 - INFO - Metal Performance Shaders available - using Apple Silicon acceleration[0m


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

[97membeddings_retriever.py - line 52 - INFO - Initialized model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)[0m
[97membeddings_retriever.py - line 56 - INFO - Using model's actual max sequence length: 512[0m
[97mbase_parser.py - line 45 - INFO - LLMParser initialized.[0m
[97mllm_client.py - line 22 - INFO - Initializing LLMClient with model: gemini-2.0-flash[0m
[97mxml_parser.py - line 26 - INFO - Initializing xmlRetriever with model: deepset/roberta-base-squad2[0m
[97membeddings_retriever.py - line 39 - INFO - Metal Performance Shaders available - using Apple Silicon acceleration[0m




Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[97membeddings_retriever.py - line 52 - INFO - Initialized model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)[0m
[97membeddings_retriever.py - line 56 - INFO - Using model's actual max sequence length: 512[0m
[97m3210742098.py - line 67 - INFO - Publication: pmc8379159[0m
[97m3210742098.py - line 72 - INFO - Identifiers i

Embedding corpus of 71 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 14.69s (0.207s per chunk)
Corpus embedding completed. Shape: (71, 768)
Chunking results: 71 original documents → 71 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.8282270431518555 --> Section Title: Results > Integration of GWAS and eQTL/ mQTL data from fetal and adult brain. Content: Results > Integration of GWAS and eQTL/ mQTL dat[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.347771644592285 --> Section Title: Samples > Alcohol use disorder. Content: Samples > Alcohol use disorder
We meta-analyzed three published GWAS: the Million Veteran Prog[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.472995758056641 --> Section Title: Results > LDSC analysis using tissue specific epigenetic annotations. Content: Results > LDSC analysis using tissue specific epigenetic[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.207089900970459 --> Section Title: Results > AUD meta-analysis. Content

Embedding corpus of 71 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 12.30s (0.173s per chunk)
Corpus embedding completed. Shape: (71, 768)
Chunking results: 71 original documents → 71 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.43112325668335 --> Section Title: No Title. Content: Table AF3 Proteins from N. caninum Quantitative Approach. A. Proteins identified and quantified in the total extract[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.845775604248047 --> Section Title: No Title. Content: Table AF4 Up- and down-regulated proteins in the discharged N. caninum tachyzoite. A. Proteins in the UP-REGULATED g[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.189298152923584 --> Section Title: No Title. Content: Table AF9 N. caninum interaction network. A. Predicted protein interactions among the 2,011 quantified N. caninum pr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.729009628295898 --> Section Title: Associated Data. Content: This section

Embedding corpus of 23 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 3.56s (0.155s per chunk)
Corpus embedding completed. Shape: (23, 768)
Chunking results: 23 original documents → 23 embedded chunks


[97m3210742098.py - line 153 - INFO - Publication pmc4281950, Top-k 3: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2450512'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc4281950, Top-k 5: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2450512'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc4281950, Top-k 9: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2450512'}[0m
[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data arch

Embedding corpus of 55 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 10.42s (0.189s per chunk)
Corpus embedding completed. Shape: (55, 768)
Chunking results: 55 original documents → 55 embedded chunks


[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 1[0m
[97m3210742098.py - line 153 - INFO - Publication pmc6776068, Top-k 1: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn3159438'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 3[0m
[97m3210742098.py - line 153 - INFO - Publication pmc6776068, Top-k 3: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn3159438'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc6776068, Top-k 5: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn3159438'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc6776068, Top-k 9: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - M

Embedding corpus of 43 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 6.28s (0.146s per chunk)
Corpus embedding completed. Shape: (43, 768)
Chunking results: 43 original documents → 43 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.704492092132568 --> Section Title: Ortholog search. Content: Orthologs for Zucchini/MitoPLD, PARN, PNLDC1, and Nibbler/Mut7 were first searched using orthoDB v9 against m[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.780992031097412 --> Section Title: 2S rRNA depletion from total RNA. Content: For the depletion of 2S rRNA from 10 μg of total ovarian RNA, 100 μl slurry of Myone Strepta[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.979450225830078 --> Section Title: Extended Data. Content: d) Histogram showing the frequencies of a cloned ping-pong piRNA 5' end downstream of a responder piRNA 5' end [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.128995895385742 --> Section Title: Fly husbandry and strains. Content: F

Embedding corpus of 59 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 10.20s (0.173s per chunk)
Corpus embedding completed. Shape: (59, 768)
Chunking results: 59 original documents → 59 embedded chunks


[97m3210742098.py - line 153 - INFO - Publication pmc9090919, Top-k 1: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn1688369'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 3[0m
[97m3210742098.py - line 153 - INFO - Publication pmc9090919, Top-k 3: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn1688369'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc9090919, Top-k 5: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn1688369'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc9090919, Top-k 9: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn1688369'}[0m
[97m3210742098.py - line 118 - I

Embedding corpus of 34 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 5.49s (0.161s per chunk)
Corpus embedding completed. Shape: (34, 768)
Chunking results: 34 original documents → 34 embedded chunks


[97m3210742098.py - line 153 - INFO - Publication pmc7270112, Top-k 5: Found 0 matches out of 2 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'GSE142668', 'syn12299750'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc7270112, Top-k 9: Found 1 matches out of 2 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'GSE142668'}[0m
[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data archive, data portal, accession code[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[

Embedding corpus of 62 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 13.10s (0.211s per chunk)
Corpus embedding completed. Shape: (62, 768)
Chunking results: 62 original documents → 62 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.47910213470459 --> Section Title: Chromatin immunoprecipitation sequencing analysis. Content: FastQC ( https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ ), was [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.67603874206543 --> Section Title: Protein pulldown, immunoprecipitation and mass spectrometry. Content: To assess covariance among all proteins identified by mass spectr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.036128997802734 --> Section Title: Protein pulldown, immunoprecipitation and mass spectrometry. Content: Schizonts, following 8 h in vitro culture, and male gametocytes 1[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.158417701721191 --> Section Title: NDC80–GFP shows unusual dynamics throug

Embedding corpus of 43 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 8.87s (0.206s per chunk)
Corpus embedding completed. Shape: (43, 768)
Chunking results: 43 original documents → 43 embedded chunks


[97m3210742098.py - line 153 - INFO - Publication pmc8120261, Top-k 3: Found 0 matches out of 2 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2580853', 'syn21069604'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8120261, Top-k 5: Found 0 matches out of 2 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2580853', 'syn21069604'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8120261, Top-k 9: Found 0 matches out of 2 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn2580853', 'syn21069604'}[0m
[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental

Embedding corpus of 40 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 8.31s (0.208s per chunk)
Corpus embedding completed. Shape: (40, 768)
Chunking results: 40 original documents → 40 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.223512172698975 --> Section Title: Authors’ contributions. Content: WXW carried out the copy number experiment, performed the statistical and bioinformatics analyses, con[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.306694984436035 --> Section Title: Loss ofAPOBEC3Bdeletion is associated with tumour-infiltrating immune cells. Content: Using CIBERSORT, a bioinformatics tool used to in[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.673503875732422 --> Section Title: Statistical analyses. Content: The association between odds for breast cancer and APOBEC3B copy number was modelled using logistic regr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.054405212402344 --> Section Title: Acknowledgements. Content: We thank p

Embedding corpus of 47 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 7.98s (0.170s per chunk)
Corpus embedding completed. Shape: (47, 768)
Chunking results: 47 original documents → 47 embedded chunks


[97m3210742098.py - line 155 - INFO - Missed citations: {'syn22418021'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8239004, Top-k 5: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn22418021'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8239004, Top-k 9: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn22418021'}[0m
[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data archive, data portal, accession code[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passa

Embedding corpus of 56 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 13.62s (0.243s per chunk)
Corpus embedding completed. Shape: (56, 768)
Chunking results: 56 original documents → 56 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.924792766571045 --> Section Title: BM-EVs as Drug Delivery Vesicles. Content: Loading of nucleic acid cargo into BM-EVs by Exo-Fect. (A) siRNA (Cy3-labeled transfection c[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.214044570922852 --> Section Title: Packaging siRNA Into BM-EVs. Content: For the knockdown of gene expression of glyceraldehyde 3-phosphate dehydrogenase (GAPDH), HUVECs [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.801082134246826 --> Section Title: Isolation of BM-EVs. Content: Isolation of BM-EVs via salting-out. (A) The precipitation of whey proteins was achieved with different s[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.878612518310547 --> Section Title: Proteomics Profiling. Content: Accord

Embedding corpus of 111 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 28.97s (0.261s per chunk)
Corpus embedding completed. Shape: (111, 768)
Chunking results: 111 original documents → 111 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.951988220214844 --> Section Title: Methods > Evaluation of protein-based prediction models. Content: Methods > Evaluation of protein-based prediction models
In addition, [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.330361366271973 --> Section Title: Methods > Construction of protein-based prediction models. Content: Methods > Construction of protein-based prediction models
The study[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.4148454666137695 --> Section Title: Methods > Protein measurements and proteomic data processing. Content: Methods > Protein measurements and proteomic data processing
Blo[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.581899642944336 --> Section Title: Methods > Construction of protein-ba

Embedding corpus of 120 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 24.55s (0.205s per chunk)
Corpus embedding completed. Shape: (120, 768)
Chunking results: 120 original documents → 120 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.657562255859375 --> Section Title: Materials and Methods > Plasmids, siRNA, and transfection. Content: Materials and Methods > Plasmids, siRNA, and transfection
For gene [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.803577423095703 --> Section Title: Materials and Methods > Plasmids, siRNA, and transfection. Content: Materials and Methods > Plasmids, siRNA, and transfection
For gene [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.520106315612793 --> Section Title: Results > LGP2 is required for growth retardation of tumor cells and cell‐intrinsic inflammation upon loss of ADAR1, which is potentiat[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.717828750610352 --> Section Title: Results > LGP2 is required for growth

Embedding corpus of 56 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 11.09s (0.198s per chunk)
Corpus embedding completed. Shape: (56, 768)
Chunking results: 56 original documents → 56 embedded chunks


[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archived at, available at, stored on, hosted by, accessible via, retrieved from, provided by, experimental data, raw data, public repository, data archive, data portal, accession code[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 16.994041442871094 --> Section Title: Protein Analysis. Content: Leaves from 5-week-old plants grown under climate-controlled chamber conditions were harvested 4 h after the[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.324390411376953 --> Section Title: Footnotes. Content: This work was supported by the Deutsche Forschungsgemeinschaft (project no. 197471519, FOR2092 project no. 23948485[0m
[97m321074209

Embedding corpus of 40 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 8.25s (0.206s per chunk)
Corpus embedding completed. Shape: (40, 768)
Chunking results: 40 original documents → 40 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.421195030212402 --> Section Title: CRediT authorship contribution statement. Content: Kevin Cummins: Writing – review & editing, Supervision, Methodology, Formal analysis[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.455294609069824 --> Section Title: 3.3. Neurocognition. Content: Attention. For the attention domain, there was one task (Continuous Performance Task – Number Letter Vers[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.115358352661133 --> Section Title: 1. Introduction. Content: TBI exposure has been extensively and independently linked to subsequent mental health problems ( Alway et al[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.214828491210938 --> Section Title: 1. Introduction. Content: Traumatic b

Embedding corpus of 73 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 24.04s (0.329s per chunk)
Corpus embedding completed. Shape: (73, 768)
Chunking results: 73 original documents → 73 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.30702018737793 --> Section Title: Results and Discussion > AXIN1 frameshift deletions confer acquired resistance to WNT pathway inhibition in RSPO3‐addicted cells. Conte[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.74324893951416 --> Section Title: Materials and Methods > Detection of RSPO3 fusion transcripts. Content: Materials and Methods > Detection of RSPO3 fusion transcripts
T[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.862617015838623 --> Section Title: Materials and Methods > RNA interference. Content: Materials and Methods > RNA interference
The siRNA‐targeting reagents were purchased[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.913545608520508 --> Section Title: Materials and Methods > Immunofluoresce

Embedding corpus of 64 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 15.57s (0.243s per chunk)
Corpus embedding completed. Shape: (64, 768)
Chunking results: 64 original documents → 64 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.743081092834473 --> Section Title: Bioinformatics analysis. Content: SRRT mRNA expression and patient survival plots, grouped by SRRT levels, were derived from the REMBRA[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.829158782958984 --> Section Title: Chromatin immunoprecipitation. Content: For each ChIP reaction, ~1 × 10 6 × 01 cells were crosslinked with 1% formaldehyde for 10 min a[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.100564002990723 --> Section Title: RNA-sequencing data processing. Content: RNA-Seq libraries were prepared using the TruSeq RNA Library Prep kit (Illumina) and were sent[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.131523132324219 --> Section Title: MAGL modulates self-renewal through P

Embedding corpus of 30 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 7.67s (0.256s per chunk)
Corpus embedding completed. Shape: (30, 768)
Chunking results: 30 original documents → 30 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.212774276733398 --> Section Title: Co-expression network construction. Content: The R package WGCNA (v1.66) [ 37 ] was used to construct circRNA-mRNA co-expression networ[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.515786170959473 --> Section Title: Brain region-specific circRNA profiling and changes in Alzheimer’s disease. Content: One hundred and forty-seven circRNAs were signific[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.609654426574707 --> Section Title: ABSTRACT. Content: Alzheimer’s disease (AD) has devastating consequences for patients during its slow, progressive course. It is import[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.258163452148438 --> Section Title: Introduction. Content: The global inc

Embedding corpus of 65 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 20.67s (0.318s per chunk)
Corpus embedding completed. Shape: (65, 768)
Chunking results: 65 original documents → 65 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.255078315734863 --> Section Title: 2. Materials and Methods | 2.14. Survival Analysis for GREM1, BAG2, TRIP6, OLFM4 and MAGE-A9. Content: We wished to study the impact of[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.1584978103637695 --> Section Title: 3.4. Increased Expression of GREM1, BAG2, OLFM4, TRIP6 in the Diffuse Subtype and MAGE-A9 in the Intestinal Subtype. Content: We select[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.563459396362305 --> Section Title: 2. Materials and Methods | 2.7. Basic pH Reversed-Phase Liquid Chromatography (bRPLC). Content: Pooled TMT-labeled samples were fractio[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.9786577224731445 --> Section Title: 3.1. Proteomic Analysis of Diffuse 

Embedding corpus of 37 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 23.01s (0.622s per chunk)
Corpus embedding completed. Shape: (37, 768)
Chunking results: 37 original documents → 37 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.622011184692383 --> Section Title: Cisregulatory landscape analysis. Content: Cell-type specific covariates corrected expression matrices from four datasets (i.e. two cel[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.250950813293457 --> Section Title: Sex-specific CRDs and TRDs associated with schizophrenia. Content: a) Schematic of a CRD obtained from the pairwise correlation of five[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.381206512451172 --> Section Title: Sex-specific enhancer-promoter regulatory landscape. Content: Moreover, our analysis revealed sex-specific distal regulatory landscapes[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.753456115722656 --> Section Title: Sex-specific enhancer-promoter regula

Embedding corpus of 37 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 9.03s (0.244s per chunk)
Corpus embedding completed. Shape: (37, 768)
Chunking results: 37 original documents → 37 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.915311336517334 --> Section Title: Enzyme‑Linked Immunosorbent Assay (ELISA). Content: Serum kallikrein (KLKB1) concentrations in 32 healthy subjects, 35 cirrhosis patien[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.9869771003723145 --> Section Title: Introduction. Content: In this study, we implemented DIA for discovery and PRM for confirmation of HCC biomarkers in non-fractionated s[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.184040069580078 --> Section Title: TCGA Data Analysis. Content: KLKB1 expression data and clinicopathological features of TCGA-HCC patients were obtained from the Genomic[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.633502960205078 --> Section Title: Introduction. Content: Therefore, fi

Embedding corpus of 175 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 53.82s (0.308s per chunk)
Corpus embedding completed. Shape: (175, 768)
Chunking results: 175 original documents → 175 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.804335594177246 --> Section Title: Caussim: extensive simulation settings > Family of candidate estimators. Content: Caussim: extensive simulation settings > Family of ca[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.836731910705566 --> Section Title: Caussim: extensive simulation settings > Data generation. Content: Caussim: extensive simulation settings > Data generation
We generate[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.719315528869629 --> Section Title: Introduction > Extending prediction to prescription needs causality. Content: Introduction > Extending prediction to prescription needs[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.537914276123047 --> Section Title: Results: Factors Driving Good Model S

Embedding corpus of 82 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 33.16s (0.404s per chunk)
Corpus embedding completed. Shape: (82, 768)
Chunking results: 82 original documents → 82 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.554939270019531 --> Section Title: Materials and methods > Statistical analysis. Content: Materials and methods > Statistical analysis
The Wilcoxon Rank Sum Test (two-sid[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.993268013000488 --> Section Title: Materials and methods > Data retrieval. Content: Materials and methods > Data retrieval
For human lung atlas, data generated by Travagl[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.396733283996582 --> Section Title: Results > Complement components. Content: Results > Complement components
a Dot plot depicting expression of genes encoding complement [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.562769889831543 --> Section Title: Materials and methods > Data analysis

Embedding corpus of 84 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 41.66s (0.496s per chunk)
Corpus embedding completed. Shape: (84, 768)
Chunking results: 84 original documents → 84 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.485081195831299 --> Section Title: Construction of ADAT luciferase reporters. Content: SDC3-RLuc and SDC3(G-end)-RLuc plasmids were generated using the backbone vector ps[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.563314437866211 --> Section Title: Depletion of I34-tRNAs impairs cell adhesion and sensitises cells to translation inhibitors. Content: We then tested whether translatio[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.780686378479004 --> Section Title: Depletion of I34-tRNAs impairs translation of MLD-containing proteins in different cell lines. Content: To rule out cell-specific effec[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.9370269775390625 --> Section Title: Construction of ADAT eGFP reporters.

Embedding corpus of 111 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 40.39s (0.364s per chunk)
Corpus embedding completed. Shape: (111, 768)
Chunking results: 111 original documents → 111 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.6365461349487305 --> Section Title: RESULTS > Prioritization of rare likely‐deleterious variants. Content: RESULTS > Prioritization of rare likely‐deleterious variants
The[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.597729682922363 --> Section Title: METHODS > Discovery of novel AD‐associated genes by SKAT‐O. Content: METHODS > Discovery of novel AD‐associated genes by SKAT‐O
We aggr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.402661323547363 --> Section Title: METHODS > Endophenotype analysis. Content: METHODS > Endophenotype analysis
We conducted the gene‐based SKAT‐O with cognitive and Magne[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.418891906738281 --> Section Title: METHODS > Discovery of novel AD‐asso

Embedding corpus of 92 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 28.02s (0.305s per chunk)
Corpus embedding completed. Shape: (92, 768)
Chunking results: 92 original documents → 92 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.63339900970459 --> Section Title: Whole-exome sequencing > Alignment of WES reads and mutation calling.. Content: Whole-exome sequencing > Alignment of WES reads and mut[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.190401554107666 --> Section Title: Results > Episodic A3A expression alters the EMT trajectory of HGSOC.. Content: Results > Episodic A3A expression alters the EMT trajec[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.387443542480469 --> Section Title: Methods > Whole-exome sequencing. Content: Methods > Whole-exome sequencing
All samples were analyzed using a DRAGEN BioIT processor ru[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.4720563888549805 --> Section Title: Methods > Mutational signature extrac

Embedding corpus of 60 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 18.95s (0.316s per chunk)
Corpus embedding completed. Shape: (60, 768)
Chunking results: 60 original documents → 60 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.289216995239258 --> Section Title: 3. Results | 3.3. Bulk proteins: amino acid composition. Content: The P. pollicipes bulk proteins separate into two major (PpLrCP and P[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.446202754974365 --> Section Title: 3. Results | 3.6. Enzymes and protease inhibitors. Content: Many of the identified adhesive proteins likely function as enzymes or prot[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.878657817840576 --> Section Title: 4. Discussion. Content: Beyond proteins that either function in immunity or sclerotization, several identified proteins suggest the pot[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.881485462188721 --> Section Title: 3. Results | 3.5. Pheromones. Content

Embedding corpus of 34 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 9.74s (0.286s per chunk)
Corpus embedding completed. Shape: (34, 768)
Chunking results: 34 original documents → 34 embedded chunks


[97m3210742098.py - line 131 - INFO - L2 Norm 8.03571891784668 --> Section Title: 2.2. Data Preparation and Extraction. Content: Data was prepared from the primary national database using Paxata in the DataRobot platf[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.19581413269043 --> Section Title: 1. Introduction. Content: Despite the focus on deep learning algorithms or classically inspired approaches such as SARIMA or NARNN, oth[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.611541748046875 --> Section Title: 2.4. Model Selection. Content: In order to assess any model’s performance, out-of-time validation (OTV) is employed, which allows the s[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.064520835876465 --> Section Title: 2.4. Model Selection. Content: After the data had been examined by the platform, it began the modeling process. A wide variety of model[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.112390518188477 --> Section Title: 3.3. Time Series Forec

Embedding corpus of 45 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 13.75s (0.306s per chunk)
Corpus embedding completed. Shape: (45, 768)
Chunking results: 45 original documents → 45 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.5792975425720215 --> Section Title: Simultaneous Extraction of Proteins and Metabolites | HPLC nESI-MS/MS measurement and data analysis. Content: For shotgun proteomics me[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.089319229125977 --> Section Title: Simultaneous Extraction of Proteins and Metabolites | Derivatization and Analysis of Metabolites With GC-BT-TOF-MS. Content: The untarg[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.663653373718262 --> Section Title: Shotgun Proteomics Analysis. Content: Proteomics analysis of the vacuum exposed and control cells after 6 h of recovery in a complex me[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.031167984008789 --> Section Title: Discussion | Regulation of the Vacuu

Embedding corpus of 58 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 11.02s (0.190s per chunk)
Corpus embedding completed. Shape: (58, 768)
Chunking results: 58 original documents → 58 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.820657730102539 --> Section Title: Bioinformatic analysis with MaxQuant. Content: Peptide identification from the raw files are searched using MaxQuant (version 1.6.2.10)[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.940332412719727 --> Section Title: SUMO1/2/3 knockout in striatal cells. Content: Striatal STHdh Q7/Q7 cells deleted for SUMO1, 2 and 3 using CRISPR/Cas9 SUMO gRNAs as de[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.850801467895508 --> Section Title: Author contributions. Content: S.S conceptualized and designed the project. O.R carried out all the biochemical SUMO work. M.S carried [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.859765529632568 --> Section Title: Rhes regulates the expression of gene

Embedding corpus of 39 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 11.64s (0.298s per chunk)
Corpus embedding completed. Shape: (39, 768)
Chunking results: 39 original documents → 39 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.47268533706665 --> Section Title: Results. Content: HLA class Ι peptide ligand identification. (A) Cumulative unique peptides (black), predicted peptide ligands (green),[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.690481185913086 --> Section Title: Introduction. Content: The analysis of HLA class Ι ligandomes benefit in general from advances in shotgun proteomics because not all bu[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.54687213897705 --> Section Title: Results | Peptide Characteristics of HLA Class Ι Peptide Ligands Detected with High-pH Reversed-Phase or SCX Pre-fractionation. Content[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.856579780578613 --> Section Title: Immuno-Affinity Purification. Content: 

Embedding corpus of 68 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 16.15s (0.237s per chunk)
Corpus embedding completed. Shape: (68, 768)
Chunking results: 68 original documents → 68 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.4976325035095215 --> Section Title: 3.2. Primary and In Vitro Datasets Are Only Slightly Correlated. Content: Similarity between stroma datasets of detected genes changing[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.425812721252441 --> Section Title: 3.2. Primary and In Vitro Datasets Are Only Slightly Correlated. Content: First, we quantified gene expression changes between normal a[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.466500282287598 --> Section Title: 2.4. Meta-Analysis Pipeline. Content: The meta-analysis involves the following steps: (1) define the conditions to be compared (e.g., t[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.6354756355285645 --> Section Title: 3.1. Database Construction. Content

Embedding corpus of 47 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 18.88s (0.402s per chunk)
Corpus embedding completed. Shape: (47, 768)
Chunking results: 47 original documents → 47 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.5577850341796875 --> Section Title: Results > Identification of Breast Cancer Survival-Related Adenosine-to-Inosine RNA Editing Sites. Content: Results > Identification of[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.399251937866211 --> Section Title: Results > Identification of Breast Cancer Survival-Related Adenosine-to-Inosine RNA Editing Sites. Content: Results > Identification of[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.817184925079346 --> Section Title: Results > Adenosine-to-Inosine RNA Editing-Based Risk Score Construction and Its Association With Breast Cancer Overall Survival and Di[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.881102561950684 --> Section Title: Material and Methods > Gene Set Enri

Embedding corpus of 77 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 22.44s (0.291s per chunk)
Corpus embedding completed. Shape: (77, 768)
Chunking results: 77 original documents → 77 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.479111671447754 --> Section Title: Results > Identification of apaQTL/eQTL-SNPs in APA-related LUAD genes. Content: Results > Identification of apaQTL/eQTL-SNPs in APA-re[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.742191791534424 --> Section Title: Results > Expression analysis of CISD2 and NIT2. Content: Results > Expression analysis of CISD2 and NIT2
a , b CISD2 mRNA expression ([0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.886385440826416 --> Section Title: Introduction. Content: Introduction
Lung adenocarcinoma (LUAD) and lung squamous cell carcinoma (LUSC) have shown distinct incidence tr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.972764015197754 --> Section Title: Methods > Fluorescent enzyme reporter

Embedding corpus of 71 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 13.84s (0.195s per chunk)
Corpus embedding completed. Shape: (71, 768)
Chunking results: 71 original documents → 71 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.656396389007568 --> Section Title: Materials and methods > Human miRNA expression profile and data preprocessing. Content: Materials and methods > Human miRNA expression [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.999017238616943 --> Section Title: Materials and methods > Brain and neuronal sample preparation and biochemical analysis. Content: Materials and methods > Brain and neur[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.1535162925720215 --> Section Title: Materials and methods > Human brain and CSF sample preparation. Content: Materials and methods > Human brain and CSF sample preparation[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.949723720550537 --> Section Title: Materials and methods > Differential

Embedding corpus of 52 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 21.01s (0.404s per chunk)
Corpus embedding completed. Shape: (52, 768)
Chunking results: 52 original documents → 52 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.775222778320312 --> Section Title: Computational analysis of proteomics data. Content: Statistical analysis was performed using the R software environment. Correlation co[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.155433654785156 --> Section Title: SDS-PAGE and western blotting. Content: Proteins were resolved on 4-12% gradient SDS-PAGE gels (NuPAGE Bis-Tris Precast Gels, Life Tech[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.26128387451172 --> Section Title: Peptide pull downs. Content: Biotinylated NELFE peptide (QPFQRSIpSADDLQE) was synthesized (GenScript) and bound to NeutrAvidin agarose.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.498855590820312 --> Section Title: Author contribution. Content: P.B.

Embedding corpus of 33 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 10.31s (0.312s per chunk)
Corpus embedding completed. Shape: (33, 768)
Chunking results: 33 original documents → 33 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.8973894119262695 --> Section Title: Methods > Convolutional neural network (CNN).. Content: Methods > Convolutional neural network (CNN).
Similar to our previous study 38 [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.950045585632324 --> Section Title: Results > Classification results using combined features.. Content: Results > Classification results using combined features.
Differenc[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.347223281860352 --> Section Title: Results > Classification results using spectral features.. Content: Results > Classification results using spectral features.
Four type[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.674473762512207 --> Section Title: Discussion > Limitations.. Content: 

Embedding corpus of 73 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 27.43s (0.376s per chunk)
Corpus embedding completed. Shape: (73, 768)
Chunking results: 73 original documents → 73 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.95240592956543 --> Section Title: Analysis of susceptibility to chemicals and antifungal agents. Content: Analyses of susceptibility to the presence of antifungal drugs [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.033077239990234 --> Section Title: Author contributions. Content: All authors contributed to the data analysis. P.H., H.L., L.C., G.-J.H., X.T., X.Y., C.T., E.Y., and L.W[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.807262420654297 --> Section Title: Chromatin immunoprecipitation sequencing and data analysis. Content: Library sequencing was performed on an Illumina Hi-Seq 2000. The r[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.840773582458496 --> Section Title: RNA purification and quantitative RT-P

Embedding corpus of 42 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 10.71s (0.255s per chunk)
Corpus embedding completed. Shape: (42, 768)
Chunking results: 42 original documents → 42 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.154438018798828 --> Section Title: 1. Introduction. Content: Generally MS/MS spectrum identification can be considered a nearest neighbor task: for a given query spectrum[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.905954360961914 --> Section Title: 2.4.2. SpectraST. Content: We compared the performance of ANN-SoLo against the popular spectral library search engine SpectraST [ 42 ].[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.930069923400879 --> Section Title: 3.2. Approximate nearest neighbor indexing speeds up open search. Content: The previous timing results do not include the time required[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.050975799560547 --> Section Title: 1. Introduction. Content: Here we pre

Embedding corpus of 32 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 20.93s (0.654s per chunk)
Corpus embedding completed. Shape: (32, 768)
Chunking results: 32 original documents → 32 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.688931465148926 --> Section Title: Grade II. Content: Out of 16 patients, 7 showed a resolution of the clinical picture by means of antibiotic therapy alone. In seven pat[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.977043151855469 --> Section Title: Study design. Content: We retrieved medical records of all patients admitted to our Emergency Department for Acute Cholecystitis (AC) f[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.146045684814453 --> Section Title: Grade III. Content: Fifty percent (4/8) of cases showed a resolution of the clinical picture with conservative treatment alone. A young[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.464113235473633 --> Section Title: Discussion. Content: The Rose Surgica

Embedding corpus of 89 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 36.00s (0.405s per chunk)
Corpus embedding completed. Shape: (89, 768)
Chunking results: 89 original documents → 89 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.12853479385376 --> Section Title: Methods > Single-cell RNA-seq with DNBelab C4 system. Content: Methods > Single-cell RNA-seq with DNBelab C4 system
The DNBelab C Serie[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.2782793045043945 --> Section Title: Methods > Pathway enrichment analysis. Content: Methods > Pathway enrichment analysis
Enriched pathways on the whole gene-set were dete[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.424923896789551 --> Section Title: Results > Similar disease signatures in PBMCs and neural cells. Content: Results > Similar disease signatures in PBMCs and neural cells[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.499225616455078 --> Section Title: Methods > Cohort. Content: Methods > 

Embedding corpus of 74 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 19.24s (0.260s per chunk)
Corpus embedding completed. Shape: (74, 768)
Chunking results: 74 original documents → 74 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.119657516479492 --> Section Title: Analysis of somatic retrotransposition > Detection of mobile element insertions using TraFiC-mem. Content: Analysis of somatic retrotra[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.452957630157471 --> Section Title: Results > Dissecting the genomic features that influence the landscape of L1 retrotranspositions in cancer. Content: Results > Dissecti[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.821849822998047 --> Section Title: Results > The landscape of somatic retrotransposition in a large cancer whole-genome dataset. Content: Results > The landscape of somat[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.166179656982422 --> Section Title: Analysis of somatic retrotranspositio

Embedding corpus of 53 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 30.46s (0.575s per chunk)
Corpus embedding completed. Shape: (53, 768)
Chunking results: 53 original documents → 53 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.707798004150391 --> Section Title: N-glycosylation is a prerequisite for the proper cell surface expression of SIDT1. Content: N -glycosylation is important for cell surf[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.859999656677246 --> Section Title: Discussion. Content: Our results show that the occupancy of glycosylation at each N -glycosite is different, and the N -glycosylation o[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.004440307617188 --> Section Title: Protein expression and purification. Content: Full-length SIDT1 and SIDT2 expression were performed in HEK293F cells in the same manner[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.272541046142578 --> Section Title: N-glycosylation contributes to RNA bi

Embedding corpus of 64 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 27.00s (0.422s per chunk)
Corpus embedding completed. Shape: (64, 768)
Chunking results: 64 original documents → 64 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.605284690856934 --> Section Title: An encyclopedia of enhancer-gene regulatory interactions. Content: An encyclopedia of enhancer-gene regulatory interactions
A new bench[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.1442060470581055 --> Section Title: Supervised classification of enhancer-gene regulatory interactions. Content: Supervised classification of enhancer-gene regulatory inte[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.345572471618652 --> Section Title: Applying ENCODE-rE2G to new cell types. Content: Applying ENCODE-rE2G to new cell types
Toward guiding further data collection and mode[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.190919876098633 --> Section Title: Supervised classification of enhance

Embedding corpus of 92 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 34.02s (0.370s per chunk)
Corpus embedding completed. Shape: (92, 768)
Chunking results: 92 original documents → 92 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.368783473968506 --> Section Title: Results > Chromatic features identified master transcriptional factors in PRCC, ChRCC, and TFE3-RCC. Content: Results > Chromatic featu[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.5677170753479 --> Section Title: Results > Mutational signature analysis of Japanese RCCs. Content: Results > Mutational signature analysis of Japanese RCCs
a Signal-mu[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.677219390869141 --> Section Title: Results > Somatic genomic aberrations in Japanese RCCs. Content: Results > Somatic genomic aberrations in Japanese RCCs
The bottom heat[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.059279441833496 --> Section Title: Methods > Definitions of ATAC scores, d

Embedding corpus of 76 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 31.34s (0.412s per chunk)
Corpus embedding completed. Shape: (76, 768)
Chunking results: 76 original documents → 76 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.6733527183532715 --> Section Title: 3’ mRNA sequencing and analysis. Content: 2 million shControl and shPCID2 J-Lat 11.1 cells were collected for RNA sequencing analysis i[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.385955810546875 --> Section Title: FISH-Flow. Content: To analyze the dynamics of viral RNA (vRNA) and/or GFP-producing cells by FISH-Flow, five million control and PCID2[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.879549026489258 --> Section Title: Experimental model and study participant details. Content: Jurkat cells and latent HIV-1 infected Jurkat clones J-Lat 11.1, 10.6 and A2[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.196355819702148 --> Section Title: PCID2 represses HIV-1 gene expressio

Embedding corpus of 118 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 32.69s (0.277s per chunk)
Corpus embedding completed. Shape: (118, 768)
Chunking results: 118 original documents → 118 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 4.900213241577148 --> Section Title: Methods > Antibodies and reagents. Content: Methods > Antibodies and reagents
AAV2/9-containing LRP10 and AAV2/9-control vectors were g[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.195831298828125 --> Section Title: Supplementary Information > No Title. Content: Supplementary Information > No Title
Additional file 17 : Supplemental Figure 2. Venn di[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.2944231033325195 --> Section Title: Supplementary Information > No Title. Content: Supplementary Information > No Title
Additional file 18 : Supplemental Figure 3. Differe[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.46827507019043 --> Section Title: Supplementary Information > No Title.

Embedding corpus of 79 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 31.18s (0.395s per chunk)
Corpus embedding completed. Shape: (79, 768)
Chunking results: 79 original documents → 79 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 4.773425102233887 --> Section Title: Brain Region-Specific Comparison of Aging and Alzheimer’s Disease Signatures > Functional Enrichment of Aging Specific, Aging/Alzheimer[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.258943557739258 --> Section Title: Subgroup Identification in Normal Aging Brain Hippocampus Samples > Function Annotation of Genotype-Tissue Expression and UK Subgroup D[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.524110317230225 --> Section Title: Results > Brain Region-Specific Comparison of Aging and Alzheimer’s Disease Signatures. Content: Results > Brain Region-Specific Compar[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.236513137817383 --> Section Title: Results > Brain Region-Specific Compa

Embedding corpus of 89 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 27.63s (0.310s per chunk)
Corpus embedding completed. Shape: (89, 768)
Chunking results: 89 original documents → 89 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.39344596862793 --> Section Title: Quantification of DAF-16::GFP fluorescence.. Content: For quantification of GFP fluorescence in fixed animals, 25-30 worms were transfe[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.10775089263916 --> Section Title: C. elegans day 10.. Content: cDNA was synthesized Clontech SmartSeq v4 reagents from of 2ng RNA, due to low yields from aged worms. Ful[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.230978012084961 --> Section Title: Neural excitation and longevity. Content: Neural excitation and longevity in humans and C. elegans . a, Analysis of the cortical transc[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.444913864135742 --> Section Title: Electroencephalogram (EEG) Telemetry Un

Embedding corpus of 46 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 23.81s (0.518s per chunk)
Corpus embedding completed. Shape: (46, 768)
Chunking results: 46 original documents → 46 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.685453414916992 --> Section Title: Materials and Methods > Inhibition of Purified Sialidases.. Content: Materials and Methods > Inhibition of Purified Sialidases.
Sialida[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.859899997711182 --> Section Title: Materials and Methods > Bacterial Genomic Database and HMM-Based Sialidase Searches of Isolate Genomes.. Content: Materials and Methods[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.136732578277588 --> Section Title: Results > No Title. Content: Results > No Title
To further confirm the functional assignment of these enzymes, we examined their suscep[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.165614604949951 --> Section Title: Materials and Methods > Heterologous 

Embedding corpus of 54 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 26.09s (0.483s per chunk)
Corpus embedding completed. Shape: (54, 768)
Chunking results: 54 original documents → 54 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.861261367797852 --> Section Title: Results > Overview of method. Content: Results > Overview of method
Overview of CellWalker. a Cells (circles) are connected based on si[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.045609951019287 --> Section Title: Results > Identification of cell types in the developing brain. Content: Results > Identification of cell types in the developing brain[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.0790300369262695 --> Section Title: Results > Identification of cell types in the developing brain. Content: Results > Identification of cell types in the developing brain[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.309962272644043 --> Section Title: Availability of data and materials. 

Embedding corpus of 83 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 43.06s (0.519s per chunk)
Corpus embedding completed. Shape: (83, 768)
Chunking results: 83 original documents → 83 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.824221134185791 --> Section Title: Authors’ contributions. Content: Authors’ contributions
KI conducted a survey; performed clinical examination, data collection and data[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.048834800720215 --> Section Title: Results > Antibody characteristics. Content: Results > Antibody characteristics
The majority (84.46%) of patients had Raynaud’s phenome[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.081320762634277 --> Section Title: Materials and methods > Statistical analysis. Content: Materials and methods > Statistical analysis
Statistical analysis was performed [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.083999633789062 --> Section Title: Materials and methods > Methods. Cont

Embedding corpus of 98 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 29.22s (0.298s per chunk)
Corpus embedding completed. Shape: (98, 768)
Chunking results: 98 original documents → 98 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.05985164642334 --> Section Title: Methods > Cell culture, siRNA transfection, and EMT induction. Content: Methods > Cell culture, siRNA transfection, and EMT induction
I[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.488955497741699 --> Section Title: Results > Single cell analysis highlights dynamic changes in carnitine and fatty acid metabolism across EMT. Content: Results > Single [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.813482761383057 --> Section Title: Results > Literature mining and CRISPR knockout screens support metabolic targets inferred using both bulk and single-cell simulations.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.820040702819824 --> Section Title: Results > Single cell analysis highlig

Embedding corpus of 93 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 33.14s (0.356s per chunk)
Corpus embedding completed. Shape: (93, 768)
Chunking results: 93 original documents → 93 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 4.682888031005859 --> Section Title: RNAscope in situ hybridization. Content: Fluorescent in situ hybridization was performed using RNAscope Multiplex Fluorescent Reagent K[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.362314701080322 --> Section Title: Visium spatial transcriptomics data processing. Content: Samples were processed using Scanpy 1.7.2. The seven sample data matrices were[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.751852035522461 --> Section Title: DSB-bearing neurons stimulate glial activation. Content: By mapping the density of γH2AX + capture areas across all spatial clusters, w[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.389046669006348 --> Section Title: Single-cell RNA-seq analysis in DSB-b

Embedding corpus of 41 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 41.55s (1.013s per chunk)
Corpus embedding completed. Shape: (41, 768)
Chunking results: 41 original documents → 41 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.415267944335938 --> Section Title: Accuracy testing. Content: Area under ROC (A) and PR (B) curves are ranked across different methods: the highest value is ranked first,[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.938369750976562 --> Section Title: Accuracy testing. Content: Yeast static synthetic: 2,000 genes ×2,000 experiments dataset generated using GNW simulator ( Schaffter, Ma[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.08564281463623 --> Section Title: Accuracy testing. Content: Previously we compared accuracy of BNFinder algorithm with Banjo ( Wilczyński & Dojer, 2009 ) on data provid[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.233062744140625 --> Section Title: Acknowledgments. Content: For distribu

Embedding corpus of 58 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 29.60s (0.510s per chunk)
Corpus embedding completed. Shape: (58, 768)
Chunking results: 58 original documents → 58 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.825325012207031 --> Section Title: Proteomics | Multidimensional protein identification technology. Content: All labeled samples were combined prior to multidimensional p[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.894470691680908 --> Section Title: Experiment 1.1: Nuclear proteins differentially expressed in GVs from pre-antral versus antral follicles. Content: ( A ) Correlation of[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.0499420166015625 --> Section Title: Experiment 1.3: Protein interaction network of differentially expressed proteins. Content: The interaction network constructed by STRIN[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.385479927062988 --> Section Title: Experiment 1.2: Gene ontology analys

Embedding corpus of 98 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 37.54s (0.383s per chunk)
Corpus embedding completed. Shape: (98, 768)
Chunking results: 98 original documents → 98 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.498572826385498 --> Section Title: Method details > Calculation of XAR-PSX and XAR-PIX. Content: Method details > Calculation of XAR-PSX and XAR-PIX
First, median express[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.716204643249512 --> Section Title: Method details > Association of XAR with somatic mutations. Content: Method details > Association of XAR with somatic mutations
Damagin[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.767905235290527 --> Section Title: Method details > Defining PIX, PSX, and autosomal genes for dosage compensation estimation. Content: Method details > Defining PIX, PSX[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.126319408416748 --> Section Title: Results > Evolutionally conserved dif

Embedding corpus of 116 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 26.44s (0.228s per chunk)
Corpus embedding completed. Shape: (116, 768)
Chunking results: 116 original documents → 116 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 4.892776966094971 --> Section Title: Co-expression miRNA network analysis > Identification of AD-associated consensus modules across two independent datasets and replicatio[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.669248104095459 --> Section Title: Co-expression miRNA network analysis > Preservation of consensus modules. Content: Co-expression miRNA network analysis > Preservation [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.809338569641113 --> Section Title: Co-expression miRNA network analysis > Machine learning analysis for AD classification. Content: Co-expression miRNA network analysis >[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.9241156578063965 --> Section Title: Co-expression miRNA network analysis

Embedding corpus of 37 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 8.44s (0.228s per chunk)
Corpus embedding completed. Shape: (37, 768)
Chunking results: 37 original documents → 37 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.737588405609131 --> Section Title: 2.1. Clustering algorithms for scRNA-seq analysis. Content: provides built-in functions for clustering scRNA-seq data [ 17 , 18 ]. Thes[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.870368003845215 --> Section Title: 3.1. Results on PBMCs: Seurat-generated networks. Content: Finally, we applied the algorithms to the complete dataset, encompassing all[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.392495155334473 --> Section Title: 3.2. Results on PBMCs: Alternative preprocessing. Content: ARI across different configurations of alternative preprocessing pipelines f[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.367655754089355 --> Section Title: 4. Discussion and Conclusion. Content

Embedding corpus of 79 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 14.08s (0.178s per chunk)
Corpus embedding completed. Shape: (79, 768)
Chunking results: 79 original documents → 79 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 14.37177848815918 --> Section Title: METHOD DETAILS > Multi-omics analysis. Content: METHOD DETAILS > Multi-omics analysis
We identified different microglia subtypes with t[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 15.403209686279297 --> Section Title: Multi-omics analysis > Differential expression analyses from brain proteomic analyses. Content: Multi-omics analysis > Differential exp[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 15.438015937805176 --> Section Title: METHOD DETAILS > Multi-omics analysis. Content: METHOD DETAILS > Multi-omics analysis
Differential expression analyses of brain proteom[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 15.628152847290039 --> Section Title: Multi-omics analysis > Differentia

Embedding corpus of 16 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 3.16s (0.198s per chunk)
Corpus embedding completed. Shape: (16, 768)
Chunking results: 16 original documents → 16 embedded chunks


[97m3210742098.py - line 131 - INFO - L2 Norm 8.827048301696777 --> Section Title: 3.3. Correlation between neoantigen load and specific subtypes of CN-low/endometrioid and CN-high/serous-like endometrial cancers. Cont[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.126357078552246 --> Section Title: 1. Introduction. Content: Cancer specific neoantigens result from genetic alterations accumulated by tumor cells that create altered op[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.1275634765625 --> Section Title: 3.2. Prognostic significance of neoantigen load in CN-low/endometrioid and CN-high/serous-like. Content: We evaluated the association o[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.41118049621582 --> Section Title: 3.1. Predicted neoantigen load of CN-low/endometrioid and CN-high/serous-like endometrial cancers. Content: We accessed whole-exome seq[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.83537483215332 --> Section Title: 4. Discussion. Content: 

Embedding corpus of 27 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 4.67s (0.173s per chunk)
Corpus embedding completed. Shape: (27, 768)
Chunking results: 27 original documents → 27 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.886743068695068 --> Section Title: Outside data sets. Content: Data sets from the GTex consortium ( 15 ) and the Human Proteome Map ( 16 ) were downloaded from their resp[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.561114311218262 --> Section Title: Web and data analysis tools. Content: The DKK server runs Red Hat Linux, with the Apache HTTP server providing access to the web resour[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.621297836303711 --> Section Title: RESULTS | Sample kinase page components. Content: In addition to the phylogenetic tree and PRM peptides, protein interaction networks f[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.080913543701172 --> Section Title: MATERIALS AND METHODS. Content: All o

Embedding corpus of 32 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 5.49s (0.172s per chunk)
Corpus embedding completed. Shape: (32, 768)
Chunking results: 32 original documents → 32 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 15.83541488647461 --> Section Title: Results | Associations with pathology. Content: Associations in the VEGF family with cognition, AD dementia, and AD pathology. This fig[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.411558151245117 --> Section Title: Participants. Content: Data were acquired from two well-characterized cohort studies of aging and dementia. The Religious Orders Study [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.922292709350586 --> Section Title: Neuropsychological composites. Content: Composite measures of cognition have been calculated in ROS/MAP [ 22 ]. Briefly, global cogniti[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 19.56149673461914 --> Section Title: Measures of neural and cerebrovascu

Embedding corpus of 47 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 7.70s (0.164s per chunk)
Corpus embedding completed. Shape: (47, 768)
Chunking results: 47 original documents → 47 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 16.89535140991211 --> Section Title: Data Records. Content: Folder structure: PolypGen dataset is divided into two folders – positive frames and negative frames. Later, it [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.94745445251465 --> Section Title: Competing interests. Content: J. E. East has served on clinical advisory board for Lumendi, Boston Scientific and Paion; Clinical advis[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 19.673236846923828 --> Section Title: Code availability | Data Availability Statement. Content: To help users with the evaluate the generalizability of detection and segment[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 20.158550262451172 --> Section Title: Footnotes. Content: Publisher’s not

Embedding corpus of 83 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 16.03s (0.193s per chunk)
Corpus embedding completed. Shape: (83, 768)
Chunking results: 83 original documents → 83 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 4.81510066986084 --> Section Title: Methods > Microbubble preparation and characterization. Content: Methods > Microbubble preparation and characterization
The MBs used in[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.128409385681152 --> Section Title: Results > Ultrasound frequency can be tuned to elicit differential gene expression in the BBB. Content: Results > Ultrasound frequency [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.842194557189941 --> Section Title: Methods > Bulk RNA sequencing of brain endothelial cells. Content: Methods > Bulk RNA sequencing of brain endothelial cells
Immediately[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.976006031036377 --> Section Title: Methods > Bulk RNA sequencing analysis

Embedding corpus of 35 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 7.28s (0.208s per chunk)
Corpus embedding completed. Shape: (35, 768)
Chunking results: 35 original documents → 35 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.423349380493164 --> Section Title: DISCUSSION. Content: Exploration of the enhanced biological pathways by DAVID and IPA emphasizes the commonalities and differences betw[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.431478500366211 --> Section Title: DISCUSSION. Content: Our comprehensive analysis of the transcriptional response of the endothelium to diet-induced diabetes has identif[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 10.03658390045166 --> Section Title: DISCUSSION. Content: We also found that serum galectin-3 levels were substantially increased in our HFD model. A recent study has simil[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 10.6160888671875 --> Section Title: Microarray analysis.. Content: Microar

Embedding corpus of 40 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 6.82s (0.171s per chunk)
Corpus embedding completed. Shape: (40, 768)
Chunking results: 40 original documents → 40 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.06619119644165 --> Section Title: Introduction. Content: Philadelphia-like (Ph-like) ALL is a high-risk subtype of B-cell acute lymphoblastic leukemia (B-ALL) associated[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.90816068649292 --> Section Title: SFPQ-ABL1 transforms cytokine-dependent cell lines but is a relatively weaker driver of proliferation compared with BCR-ABL1. Content: [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.367853164672852 --> Section Title: SFPQ-ABL1 and BCR-ABL1 activate distinct signaling networks. Content: We performed 2 complementary approaches to identify sites that we[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.587947845458984 --> Section Title: SFPQ-ABL1 transforms cytokine-dependent

Embedding corpus of 51 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 10.74s (0.211s per chunk)
Corpus embedding completed. Shape: (51, 768)
Chunking results: 51 original documents → 51 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.357816219329834 --> Section Title: Methods > Mass spectrometry. Content: Methods > Mass spectrometry
A schematic description of our mass spectrometry workflow is shown in[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.945775985717773 --> Section Title: Mass spectrometry > Sample collection. Content: Mass spectrometry > Sample collection
Cells grown in their recommended growth medium to[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.961637020111084 --> Section Title: Usage Notes > Availability of data at different levels of processing. Content: Usage Notes > Availability of data at different levels o[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.0780439376831055 --> Section Title: Methods > Mass spectrometry. Content

Embedding corpus of 169 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 28.75s (0.170s per chunk)
Corpus embedding completed. Shape: (169, 768)
Chunking results: 169 original documents → 169 embedded chunks


[97m3210742098.py - line 131 - INFO - L2 Norm 5.920191764831543 --> Section Title: Results > Diverse deconvolution algorithmic cores perform well. Content: Results > Diverse deconvolution algorithmic cores perform well[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.074817657470703 --> Section Title: Methods > In vitro validation admixture generation. Content: Methods > In vitro validation admixture generation
60 biological admixture[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.123251914978027 --> Section Title: Results > Purified and admixed expression profiles enable unbiased assessment of deconvolution methods. Content: Results > Purified and[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.163987636566162 --> Section Title: Methods > In silico validation admixture generation. Content: Methods > In silico validation admixture generation
Insufficient RNA was [0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 1[0m
[97m3210742098.py - line 153 - INF

Embedding corpus of 55 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 12.69s (0.231s per chunk)
Corpus embedding completed. Shape: (55, 768)
Chunking results: 55 original documents → 55 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.05840301513672 --> Section Title: Botrytisinfection. Content: For inoculation of 4-week-old Arabidopsis plants , B. cinerea spores were collected in Vogel buffer as desc[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 19.456729888916016 --> Section Title: Oxidative burst assay. Content: The production of ROS was measured by a luminol-based assay on leaf discs from 4-week-old Arabidopsis a[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 19.469966888427734 --> Section Title: RT-PCR and RT-qPCR. Content: RT-qPCR was performed using cDNA from the reverse transcriptase reaction described above and SsoAdvanced U[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 20.727745056152344 --> Section Title: Bacterial pathogen infection assay

Embedding corpus of 69 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 15.87s (0.230s per chunk)
Corpus embedding completed. Shape: (69, 768)
Chunking results: 69 original documents → 69 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 14.58004093170166 --> Section Title: Introduction. Content: Introduction
Frontotemporal lobar degeneration with tau pathology (FTLD-tau) is characterized clinically by beha[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 15.200249671936035 --> Section Title: Author contributions. Content: Author contributions
SM designed experiments performed, analyzed immunostaining and immunoblotting, and [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.54913330078125 --> Section Title: Materials and Methods > Antibodies. Content: Materials and Methods > Antibodies
The following antibodies were used in this study: Tau5 [0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.135833740234375 --> Section Title: Introduction. Content: Introduction

Embedding corpus of 68 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 13.88s (0.204s per chunk)
Corpus embedding completed. Shape: (68, 768)
Chunking results: 68 original documents → 68 embedded chunks


[97m3210742098.py - line 155 - INFO - Missed citations: {'syn4164376'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 3[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8024253, Top-k 3: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn4164376'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 5[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8024253, Top-k 5: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn4164376'}[0m
[97m3210742098.py - line 135 - INFO - Evaluating with top-k = 9[0m
[97m3210742098.py - line 153 - INFO - Publication pmc8024253, Top-k 9: Found 0 matches out of 1 ground truth[0m
[97m3210742098.py - line 155 - INFO - Missed citations: {'syn4164376'}[0m
[97m3210742098.py - line 118 - INFO - Query 1 query_augmented: Dataset or data repository information including: deposited in, uploaded to, archiv

Embedding corpus of 101 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 20.93s (0.207s per chunk)
Corpus embedding completed. Shape: (101, 768)
Chunking results: 101 original documents → 101 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.974399089813232 --> Section Title: Single-cell data science: recurring themes | Scaling to higher dimensionalities: more cells, more features, and broader coverage. Conte[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.004376411437988 --> Section Title: Status. Content: For integrating across multiple measurement types from separate cells (approach +M+C), all of which stem from a popula[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.43214225769043 --> Section Title: Challenge X: Integration of single-cell data across samples, experiments, and types of measurement. Content: For integrating across mul[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.543375015258789 --> Section Title: Single-cell data science: recurring th

Embedding corpus of 49 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 13.90s (0.284s per chunk)
Corpus embedding completed. Shape: (49, 768)
Chunking results: 49 original documents → 49 embedded chunks


[97m3210742098.py - line 131 - INFO - L2 Norm 8.59088134765625 --> Section Title: RNA isolation and quantitative RT-PCR analysis. Content: 3T3L1 adipocytes: Total RNA was purified from cultured cells using Trizol Reag[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.735733032226562 --> Section Title: Western blots. Content: 3T3-L1 adipocytes were lysed in ice-cold RIPA buffer with 1x protease inhibitor (Sigma-Aldrich). 25 μg total pr[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 8.83700942993164 --> Section Title: Generation of CRISPR/Cas9 KO in hAPCs. Content: Isolation, proliferation, and differentiation of human adipocyte progenitor cells (hAPC[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.115739822387695 --> Section Title: RNA-seq analysis and gene set enrichment analysis. Content: The Kallisto/Sleuth differential expression pipeline analysis was performed[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 9.251184463500977 --> Section Title: Bckdhadeficiency alter

Embedding corpus of 68 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 17.53s (0.258s per chunk)
Corpus embedding completed. Shape: (68, 768)
Chunking results: 68 original documents → 68 embedded chunks


[97m3210742098.py - line 131 - INFO - L2 Norm 6.381348609924316 --> Section Title: Calculation of HR deficiency associated copy number events (scores). Content: To calculate the number of LoH events, TAI events and LST[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.832138538360596 --> Section Title: Joint quality control metrics. Content: To pass quality control, we required samples to pass four separate criteria. GATK3.7 ( https://[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.978957176208496 --> Section Title: Significantly Mutated Genes in Melanoma Genomic Subtypes | Triple Wild-Type (TWT) Subtype. Content: a) The co-mutation plot of TWT sign[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.0372514724731445 --> Section Title: Significantly Mutated Genes in Melanoma Genomic Subtypes | V600E and V600K mutant melanomas. Content: We then examined BRAF V600E ( NC_[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 7.124807357788086 --> Section Title: Whole-Genome sequen

Embedding corpus of 21 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m
[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar

Embedding time: 6.36s (0.303s per chunk)
Corpus embedding completed. Shape: (21, 768)
Chunking results: 21 original documents → 21 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 17.552188873291016 --> Section Title: Competing interest statement. Content: G.H.S., D.S.M., D.M.S., P.K.R., M.P., A.G., and M.V. were employees of IBM Research Africa at th[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 18.74881362915039 --> Section Title: Data analysis hackathon as a cross-domain, peer-learning environment. Content: Average normalized survey marks with confidence interval[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 21.40662384033203 --> Section Title: Contributor Information. Content: Collaborators: Taoufik Bensellak , Anita Ghansah , Kais Ghedira , Ashley Gritzman , Itunuoluwa Isewon[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 21.91313362121582 --> Section Title: Acknowledgments. Content: The genera

Embedding corpus of 55 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[97m3210742098.py - line 118 - INFO - Query 0 query_ontology_aware: Data Availability Statement or mentions of dataset repositories/portals, identifiers, or accession codes, including PRIDE, ProteomeXchange, MassIVE, iProX, JPOST, Proteomic Data Commons (PDC), Genomic Data Commons (GDC), Cancer Imaging Archive (TCIA), Imaging Data Commons (IDC), Gene Expression Omnibus (GEO), ArrayExpress, dbGaP, Sequence Read Archive (SRA), Protein Data Bank (PDB), Mendeley Data, Synapse, European Genome-Phenome Archive (EGA), BIGD, and ProteomeCentral. 
Also include dataset identifiers or links such as PXD, MSV, GSE, GSM, GPL, GDS, phs, syn, PDC, PRJNA, DOI, or accession code. 
Look for phrases like deposited in, available at, submitted to, uploaded to, archived in, hosted by, retrieved from, accessible via, or publicly available. 
Capture statements indicating datasets, repositories, or data access locations.
[0m


Embedding time: 13.13s (0.239s per chunk)
Corpus embedding completed. Shape: (55, 768)
Chunking results: 55 original documents → 55 embedded chunks


[97membeddings_retriever.py - line 184 - INFO - Searching for top-9 passages similar to the query by embeddings.[0m
[97membeddings_retriever.py - line 166 - INFO - Computing L2 distances using numpy.[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 5.869191646575928 --> Section Title: Methods > Assessment of GZMK and GZMB expression by stimulated versus unstimulated human CD8. Content: Methods > Assessment of GZMK and[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.396605968475342 --> Section Title: Methods > Fluid-phase complement C2, C3, and C4 cleavage assays. Content: Methods > Fluid-phase complement C2, C3, and C4 cleavage assa[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.803619384765625 --> Section Title: Methods > Assessment of complement C2, C3, and C4 secretion by synovial fibroblasts. Content: Methods > Assessment of complement C2, C3[0m
[97m3210742098.py - line 131 - INFO - L2 Norm 6.839570999145508 --> Section Title: Methods > Comparison of complement ac

Embedding corpus of 116 documents using deepset/roberta-base-squad2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
print("="*80)
print("VECTOR RETRIEVAL EVALUATION RESULTS")
print("="*80)

# Group results by query type - Updated logic for new structure
query_results = {'ontology_aware': {}, 'augmented': {}, 'base': {}}

for result_key, model_results in results.items():
    # Extract model name and query name from the combined key
    if '_query_' in result_key:
        model_name, query_name = result_key.split('_query_', 1)
    else:
        # Fallback: extract from stored data
        first_result = list(model_results.values())[0]
        model_name = first_result['model']
        query_name = first_result['query']
    
    if query_name not in query_results:
        query_results[query_name] = {}
    if model_name not in query_results[query_name]:
        query_results[query_name][model_name] = {}
    
    for topk, metrics in model_results.items():
        query_results[query_name][model_name][topk] = metrics

print(f"Found query types: {list(query_results.keys())}")
print(f"Results structure check:")
for q_type, q_data in query_results.items():
    print(f"  {q_type}: {len(q_data)} models")

# Create and save tables for each query type
all_tables = {}

for query_name, query_data in query_results.items():
    if not query_data:  # Skip empty query results
        print(f"Skipping empty query: {query_name}")
        continue
        
    print(f"\n{'='*80}")
    print(f"RESULTS FOR {query_name.upper().replace('_', ' ')}")
    print(f"{'='*80}")
    
    # Create table data
    table_data = []
    for model_name, model_results in query_data.items():
        row = {'Model': model_name.split('/')[-1]}  # Just the model name without org
        
        # Add recall for each top-k value
        for topk in sorted(model_results.keys()):
            row[f'Top-K {topk}'] = f"{model_results[topk]['recall']:.4f}"
        
        # Add processing info
        first_result = list(model_results.values())[0]
        row['Processed Docs'] = first_result['processed_docs']
        
        table_data.append(row)
    
    # Convert to DataFrame and display
    df = pd.DataFrame(table_data)
    print(df.to_string(index=False))
    
    # Find best model for this query
    best_recall = 0
    best_config = None
    for model_name, model_results in query_data.items():
        for topk, metrics in model_results.items():
            if metrics['recall'] > best_recall:
                best_recall = metrics['recall']
                best_config = (model_name.split('/')[-1], topk)
    
    if best_config:
        print(f"\n🏆 Best for {query_name}: {best_recall:.4f} - {best_config[0]} (Top-K: {best_config[1]})")

print(f"\n{'='*80}")
print("ALL QUERIES EVALUATION COMPLETE!")
print("Tables saved to scripts/output/")
print(f"{'='*80}")

# Debug: Show actual results structure
print(f"\nDEBUG - Raw results structure:")
for result_key, model_results in results.items():
    print(f"Result Key: {result_key}")
    for topk, metrics in model_results.items():
        print(f"  Top-K {topk}: query={metrics['query']}, model={metrics['model']}, recall={metrics['recall']:.4f}")

VECTOR RETRIEVAL EVALUATION RESULTS
Found query types: ['ontology_aware', 'augmented', 'base', 'base_adj']
Results structure check:
  ontology_aware: 5 models
  augmented: 5 models
  base: 5 models
  base_adj: 5 models

RESULTS FOR ONTOLOGY AWARE
                                              Model Top-K 1 Top-K 3 Top-K 5 Top-K 9  Processed Docs
                                   all-MiniLM-L6-v2  0.4931  0.6131  0.6946  0.7636             100
BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext  0.4325  0.6380  0.7231  0.7992              95
                         msmarco-distilbert-base-v4  0.4648  0.6459  0.7913  0.8474              98
                            paraphrase-MiniLM-L3-v2  0.4234  0.5791  0.6436  0.7345              99
                          multi-qa-MiniLM-L6-cos-v1  0.5042  0.7113  0.8123  0.8426              99

🏆 Best for ontology_aware: 0.8474 - msmarco-distilbert-base-v4 (Top-K: 9)

RESULTS FOR AUGMENTED
                                              Model Top

In [None]:
# 7. Define your trainset (queries and expected retrievals)
trainset = [
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Data Availability Statement\nThe datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: NCBI GEO repository,\nGSE123128\n."]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data Availability Statement\nRaw sequencing data from this study have been deposited in the GEO database with the accession number\nGSE171155\n. The mass spectrometry proteomics data have been deposited to the ProteomeXchange Consortium via the PRIDE [1] partner repository with the data set identifier PXD024161 and 10.6019/PXD024161.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Associated Data\nThis section collects any data citations, data availability statements, or supplementary materials included in this article.\nSupplementary Materials\nDocument S1. Figures\xa0S1–S6 and Tables\xa0S1–S6\nmmc1.pdf\n(2.5MB, pdf)\nDocument S2. Article plus supplemental information\nmmc2.pdf\n(9.1MB, pdf)\nData Availability Statement\n•\nThe next-generation DNA sequencing dataset generated during this study is available at the National Genomics Data Center: HRA003231 (URL:\nhttps://ngdc.cncb.ac.cn\n). The mass spectrometry proteomics data reported in this paper have been deposited to the ProteomeXchange Consortium: PXD037076(\nhttp://proteomecentral.proteomexchange.org\n) via iProx partner repository\n61\n.\n•\nThis paper does not report the original code.\n•\nAny additional information required to reanalyze the data reported in this work paper is available from the\nlead contact\nupon request.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data and Code Availability\nRNA-seq data generated in this study are available at NCBI GEO database with the accession number\nGSE151029\n. The 53BP1 mass spectrometry data have been deposited to the ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD020090. The accession number for the FOXK1 and FOXK2 MS data reported in this paper is PRIDE: PXD001383']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["METHODS\nConstruction of Plasmids\nThe protein-coding regions of the NST3 gene were amplified from the Arabidopsis thaliana cDNA library with appropriate primers (see Supplemental Table 2 online). The 5′ upstream region of 3027 bp, which extended from the site of initiation of translation of the NST3 gene, was used for preparation of the ProNST3:GUS, ProNST3:NST3, and ProNST3:NST3SRDX gene constructs. These genes and 35S:NST3 were constructed from modified vectors derived from pGreenII0029 (Hellens et al., 2000) and p35SSRDXG (Mitsuda et al., 2006). For complementation analysis, we used genomic fragments including NST1 (9580 bp) and NST3 (5199 bp), which contained 6523 and 3069 bp of the respective promoter regions. The region corresponding to the transgene of each vector, with the exception of the pGreen-based vectors, was transferred to the pBCKH plant expression vector (Mitsuda et al., 2006) using the Gateway system (Invitrogen).\n\nConditions for Plant Growth and Transformation\nArabidopsis plants were grown in soil at 22°C with 16 h (long-day condition) or 8 h (short-day condition) of light daily. Unless otherwise stated, plants were grown under the long-day condition. For transformation, a T-DNA vector carrying the appropriate construct was introduced into Agrobacterium tumefaciens strain GV3101 by electroporation, and the resultant Agrobacterium was infiltrated into Arabidopsis using the floral dip method (Clough and Bent, 1998).\n\n\nAssessment of the Mechanical Strength of Inflorescence Stems\nWe used the bottom 5 cm of inflorescence stems taller than 25 cm for measurement of Young's modulus according to a previously described method (Kojima and Yamamoto, 2004).\n\nExamination of the Crystal State of Cellulose Microfibrils of Inflorescence Stems\nThe bottom region of the inflorescence stems, as described above, was used for x-ray diffraction analysis according to a previously described method (Abe and Yamamoto, 2005). Nickel-filtered Cu Kα radiation (wavelength, 0.154 nm) at 30 kV and 35 mA was used with the reflection technique.\n\nIsolation of RNA, Microarray Experiments, and Analysis\nTotal RNA was isolated with Trizol as described previously (Fukuda et al., 1991) from the bottom 4 cm of the inflorescence stems of three independent plants grown under the short-day condition and with a height of between 13 and 17 cm. Microarray analyses were performed with the Arabidopsis 2 Oligo Microarray (Agilent Technologies). All microarray experiments and the analysis of data were performed as described previously (Mitsuda et al., 2005) with the exceptions summarized below. P values for differences between nst1-1 nst3-1 and wild-type plants were calculated by Welch's t test, based on a two-tailed distribution (n = 3). To minimize type-I family-wise errors in multiple and simultaneous statistical tests, we adopted a strategy for suppression of false positives. We calculated a Q-value to estimate the false discovery rate from the P value described above using QVALUE software (Storey and Tibshirani, 2003) with the default setting. We considered genes with a Q-value of <0.1 to be genes expressed at different levels in nst1-1 nst3-1 and wild-type plants. Comprehensive gene group analysis using Fisher's exact test was performed with the R program package (http://www.r-project.org/). Quantitative RT-PCR was performed as described previously (Mitsuda et al., 2005). For the analysis of NST transcripts in the mutant lines, RT-PCR was performed with appropriate primers (see Supplemental Table 2 online).\n\nLight and Fluorescence Microscopy\nFor observations of lignin autofluorescence, we used a filter with the following specifications: glass, 365; dichroic mirror, 395; long-pass, 400. To observe ectopic secondary wall thickening, we cleared tissues by incubating them overnight in 70% lactic acid at 50°C. To prepare 70- to 150-μm sections of inflorescence stems and hypocotyls, we embedded the tissues in 3% agar then sectioned them on a vibrating microtome (HM-650V; Microm). Assays of GUS activity were performed with T1 or T2 transgenic plants. Plant tissues were fixed briefly, in some cases, in solution containing 0.3% formalin, 0.2% MES, pH 5.8, and 0.3 M mannitol before incubation in 100 mM sodium phosphate buffer, pH 7.0, containing 0.1% Triton X-100, 1 mM 5-bromo-4-chloro-3-indolyl-β-d-glucuronide, and 0.5 mM potassium ferricyanide at 37°C for up to 12 h. Stained stems and hypocotyls were embedded in 3% agar and sectioned. All observations by light and fluorescence microscopy were made with the Axioskop2 plus system (Carl Zeiss).\n\nUltrastructural Observation by Transmission Electron Microscopy\nShort pieces of inflorescence stems were fixed in 30 mM HEPES buffer containing 2% paraformaldehyde and 2% glutaraldehyde then fixed in HEPES buffer containing 2% osmium tetroxide. Fixed tissues were embedded in Q651 resin (Nissin EM). Sections of 80 to 90 nm thick were post-stained with uranyl acetate and lead citrate and observed by a JEM1200EX transmission electron microscope (JEOL) at an accelerating voltage of 80 kV.\n\nIdentification of NST Homologs in Poplar\nPoplar NAC genes resembling the Arabidopsis NST genes were collected using the Advanced Search tool of the Joint Genome Initiative poplar database (http://genome.jgi-psf.org/Poptr1/Poptr1.home.html) with the command, “find by homology to related protein with E-value <1.0e-20”; the database for Populus trichocarpa; and the query “At2g46770.” The 62 extracted sequences and amino acid sequences of subfamily IIb of NAC transcription factors of Arabidopsis, as defined in a previous study (Mitsuda et al., 2005), were aligned using the ClustalW program with default settings (Chenna et al., 2003). The amino acid sequences corresponding to conserved NAC domains were extracted and realigned. A phylogenetic tree was built by neighboring-joining method using ClustalW with default settings (an alignment and the sequences are shown in Supplemental Table 3 online). Bootstrap values were calculated from 100 trials. The subtree including the NST and VND genes is shown in Figure 7.\n\nAccession Numbers and Data Deposition\nNST1 and NST3 reported in this study correspond to the Arabidopsis Genome Initiative locus identifiers At2g46770 and At1g32770, respectively. Microarray data performed in this study can be found in the National Center for Biotechnology Information Gene Expression Omnibus data library under accession number GSE5187.\n"]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Materials and methods\nMaterials\nDilution series\nIllumina HumanCNV370-Duo BeadChip Infinium SNP data for dilution series of 12 mixtures of cancer cell line (HCC1395) mixed with its paired normal cell line (HCC1395BL) were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE11976]. We excluded chromosome 6 and 16 from analysis due to copy genomic aberrations present in the normal cell line HCC1395BL.\n\nCancer cell lines\nIllumina HumanHap300 data for the promyelocytic leukemia cancer cell HL-60 and colon cancer cell line HT-29 were obtained from Illumina, and Human-610 Quad SNP genotyping data for the colon cancer cell lines SW403, SW480, SW620, SW837, SW1417 and LIM1863 were generated at the Ludwig Institute of Cancer Research using standard processing protocols. The genotyping data for breast cancer cell lines MDA-175 and MDA-468 were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE18799] [23].\n\nPrimary breast tumors\nThree breast tumors (cases 114, 601 and 3,364) that had not received non-neoadjuvant therapy were analyzed in detail using material derived from microdissection. For each case, material containing pure tumor and pure stroma cells respectively was microdissected and compared to data obtained from surgically obtained material from the same tumors. Case 114 was of Luminal B type (23 mm tumor, moderately differentiated infiltrating ductal carcinoma with an extensive in-situ component. Node +ve, ER +ve (6.8 fm/mg protein), EGFR -ve (7.8 fm/mg protein)). Case 601 (20 mm 30 mm tumor, grade 3 with intraductal in-situ ca. and in filtrating ductal carcinoma, node +ve, ER -ve (1.5 fm/mg protein), Her2 +ve (histoscore of 3), EGFR +ve (histoscore of 208)) was classified as ERBB2 positive based on expression microarray data with a fractional rank of 0.982, Case 3,364 was 25 mm grade 3 infiltrating ductal carcinoma, ER positive (8 fm/mg protein), PR positive (histoscore 8/8), Her2 positive (histoscore 3+, one of ten axillary nodes +ve). For each case, DNA was extracted from microdissected stroma and tumor, as well as the original non-dissected sample and analyzed using Illumina Human-610 Quad SNP arrays applying standard protocols.\n\nData processing\nGenome Alteration Print was downloaded [43] and used to analyze all datasets using default settings and the highest ranked copy number and LOH predictions used for comparisons. However, for the cancer cell line dilution series, we re-used the results that had previously been generated by [23] and made available on the aforementioned website.\n\nGenoCN v1.06 was downloaded [44] and used with default settings and stromal contamination settings on for all datasets generated using Illumina Infonaut II SNP arrays. Adjusted GenoCN parameters for the Log R Ratio levels were used for Infonaut HD SNP array processing and in these instances we used the same levels that we specified for OncoSNP. The copy number and LOH predictions from the Viterbi sequence were used for comparisons.\n\nOncoSNP was run on all datasets using 15 EM iterations and with both stromal and intra-tumor heterogeneity options. In all cases, the ploidy prediction with the highest maximum likelihood was chosen and the Viterbi sequence of tumor states used for comparisons. We filtered detected aberrations using a Log Bayes Factor of 30.\n\nStatistical model\nA complete description of our statistical model is provided in Supplementary Information in Additional file 1.\n\nLet xi denote the tumor state at the i-th probe location and (xi, n, xi, t) denote the associated normal and tumor copy numbers. Furthermore, let zi = (zi, n,zi, t) denote the B allele count for the normal and tumor genotype respectively. The combinations (zi, n, (xi, n) and (zi, t, xi, t) fully define the normal and tumor genotypes respectively. The tumor state at each probe denotes the allowable combinations of normal-tumor genotypes at that location as shown in Table 1.\n\nLet π0 denote the normal DNA fraction of the tumor sample due to stromal contamination and 𝜋={𝜋𝑖}𝑛\n𝑖=1 denote the proportion of tumor cells having the normal genotype at each probe. The data 𝑦={𝑦𝑖}𝑛\n𝑖=1 consists of a set of two-dimensional vectors yi = [ri, bi]' whose elements correspond to the Log R Ratio and B allele frequency respectively.\n\nGiven (x, z, π, π0) the data is assumed to be distributed according to a (K + 1)-component mixture of Student t-distributions, where ki indicates the mixture component assignment of the i-th data point,\n\n𝑦𝑖|𝑥𝑖,𝑧𝑖,𝑘𝑖,𝑚,𝛿, 𝛴={ \n𝑆⁢𝑡(𝑚⁡(𝑥𝑖,𝑧𝑖)+𝛿(𝑙𝑙)\n𝑘𝑙,∑(𝑙𝑖)\n𝑘𝑖,𝜈),	𝑘≠0,\n𝑈𝑟⁡(𝑟min,𝑟max)×U𝑏⁢(0,1),	𝑘=0,\n \n(1)\nwhere 𝑆⁢𝑡⁡(𝛿(𝑙)\n𝑘,𝛴(𝑙)\n𝑘,𝑣) is the probability density function of the Student t-distribution with mean 𝛿(𝑙)\n𝑘 and covariance matrix 𝛴(𝑙)\n𝑘 associated with the k-th mixture component and the l-th genotype class and v degrees of freedom. The 0-th component is an outlier class which assumes uniformly distributed data over a specified range.\n\nThe elements of the mean vectors m(xi, zi) = [mr(xi), mb(zi, xi)]' are given by the following:\n\n𝑚𝑟⁡(𝑥𝑖)=(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢\n̅\n𝑟\n𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢\n̅\n𝑟\n𝑥𝑥𝑖,⁢𝑡+𝛽0+𝛽1⁢𝑔𝑖,\n(2)\nwhere gi is the local GC content at the i-th probe location and\n\n𝑚𝑏⁡(𝑧𝑖,𝑥𝑖)=\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑧𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑧𝑖,𝑡\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑥𝑖,𝑡\n \n.\n(3)\nPrior distributions\nThe prior distribution on the mixture weights is given by a Dirichlet distribution:\n\n𝑤(𝑙)|𝛼~𝐷⁢𝑖⁢𝑟⁡(𝛼),\n(4)\nwhere α is a concentration parameter which in the numerical results we used α = 1 to give a at prior on the mixture weights.\n\nThe prior distributions on the mixture centers and covariance matrices are given by standard conjugate Normal-Inverse Wishart distributions:\n\n𝛿(𝑙)\n𝑘|𝜏, 𝛴(𝑙)\n𝑘~𝑁⁡(0,𝜏 𝛴(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(5)\n𝛴(𝑙)\n𝑘|𝛾, 𝑆(𝑙)\n𝑘~𝐼⁢𝑊⁡(𝛾,𝑆(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(6)\nwhere τ is a hyperparameter that controls the strength of the prior and IW(γ, Λ) denotes the Inverse-Wishart distribution with parameter γ and scale matrix Λ.\n\nA beta prior is assumed for the outlier rate,\n\n𝜂|𝛼𝜂, 𝛽𝜂~𝐵⁢𝑒⁡(𝛼𝜂,𝛽𝜂),\n(7)\nwhere (αn, βn) are hyperparameters associated with the Beta prior. For the numerical results we set these as (1,1) to give a uniform distribution. \n\nA normal prior is assumed for the local GC content regression parameters,\n\n𝛽|𝜆𝛽~𝑁⁡(0,𝜆𝛽⁢𝐼2),\n(8)\nwhere Ip is a p × p identity matrix.\n\nA discrete prior is assumed for the stromal contamination content and intra-tumour heterogeneity levels,\n\n𝑝⁡(𝜋0)={ \n𝛼𝜋0,𝜋0=0,\n𝛽𝜋0,𝜋0>0,\n \n(9)\nand\n\n𝑝⁡(𝜋𝑖)={ \n𝛼𝜋,𝜋𝑖=0,\n𝛽𝜋,𝜋𝑖>0,\n  𝑖=1,…,𝑛,\n(10)\nwhere in the numerical results we have used απ0 = βπ0 = 1 and απ = 1, βπ = 2.\n\nThe tumor states are assumed to form an inhomogeneous Markov Chain with transition matrix,\n\n𝑝⁡(𝑥𝑖|𝑥𝑖−1)={ \n1−𝜌,𝑥𝑖=𝑥𝑖−1,\n𝜌,𝑥𝑖≠𝑥𝑖−1,\n \n(11)\nwhere ρ = (1/2) (1-exp(-(1/2L) (si-si-1) and si is the physical coordinate of the i-th probe and L is a characteristic length which we set as L = 2,000,000 for the numerical results.\n\nPosterior inference\nWe estimated the unknown model parameters using an expectation-maximization algorithm. Multiple restarts were used to explore different baseline of the Log R Ratio and the baseline with the greatest likelihood was chosen for the calculation of summary statistics.\n\nSummary statistics\nWe used the Viterbi algorithm to extract the most likely sequence of tumors states and for each aberrant segment in the Viterbi sequence we calculated an approximate Bayes Factor (score) of that segment belonging to each of the tumor states. In addition we also recorded the maximum a posteriori estimates of the Log R Ratio baseline adjustment β0 and the stromal contamination π0.\n\nAvailability\nA MATLAB based implementation (for 64 bit Linux systems) of our software is available for academic and non-commercial use from the associated website [45]. In addition, SNP data analyzed in this paper are also available from this website and from the Gene Expression Omnibus Database under Accession No.[GEO:GSE23785]."]
        )
]