# Vector Retrieval Experiment

In [1]:
import pandas as pd
from data_gatherer.parser.xml_parser import XMLParser
from data_gatherer.parser.html_parser import HTMLParser
from data_gatherer.logger_setup import setup_logging
from data_gatherer.retriever.embeddings_retriever import EmbeddingsRetriever
from lxml import etree
import dspy
import logging
import re
import os

## 1. Load corpus and ground truth

In [2]:
input_corpus = pd.read_parquet('exp_input/Local_fetched_data.parquet')  # or load HTML and extract text
ground_truth = pd.read_parquet('exp_input/dataset_citation_records_Table.parquet')  # adjust as needed

# Add a warning about input data:
print("Corpus shape:", input_corpus.shape)
print("Ground truth shape:", ground_truth.shape)

Corpus shape: (1314, 6)
Ground truth shape: (397263, 5)


In [3]:
ground_truth['pmc_id'] = ground_truth['citing_publication_link'].str.extract(r'(PMC\d+)', flags=re.IGNORECASE)

In [4]:
input_corpus.head()  # Check the structure of the corpus

Unnamed: 0,file_name,raw_cont,format,length,path,publication
0,miR-33b-3p Acts as a Tumor Suppressor by Targe...,"<html lang=""en"" class=""""><head>\n\n <me...",html,205313,../html_xml_samples/PMC/miR-33b-3p Acts as a T...,pmc8595470
1,Murine neuronatin deficiency is associated wit...,"<html lang=""en"" class=""""><head>\n\n <me...",html,238825,../html_xml_samples/PMC/Murine neuronatin defi...,pmc8413370
2,Using patient-derived organoids to predict loc...,"<html lang=""en"" class=""""><head>\n\n <me...",html,302206,../html_xml_samples/PMC/Using patient-derived ...,pmc9975107
3,FOXK1 Participates in DNA Damage Response by C...,"<html lang=""en"" class=""""><head>\n\n <me...",html,249460,../html_xml_samples/PMC/FOXK1 Participates in ...,pmc7458625
4,JAK-STAT Pathway Inhibition Partially Restores...,"<html lang=""en"" class=""""><head>\n\n <me...",html,231479,../html_xml_samples/PMC/JAK-STAT Pathway Inhib...,pmc7911100


In [5]:
ground_truth.head()  # Check the structure of the ground truth

Unnamed: 0,identifier,repository,citing_publication_link,citation_record_source,citation_record_from_doi,pmc_id
0,PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1,proteomexchange_search.tsv,1,
1,PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312,proteomexchange_search.tsv,1,
2,PXD051312,PRIDE,https://dx.doi.org/10.1002/prca.202400095,proteomexchange_search.tsv,1,
3,PXD051312,PRIDE,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,proteomexchange_search.tsv,0,PMC11895760
4,PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571,proteomexchange_search.tsv,1,


In [6]:
setup_logging('vector_retrieval_experiment', '../logs/vector_retrieval_experiment.log')

<Logger vector_retrieval_experiment (INFO)>

In [7]:
xml_parser = XMLParser('open_bio_data_repos.json', logging, llm_name='gemini-2.0-flash', use_portkey=True)
html_parser = HTMLParser('open_bio_data_repos.json', logging, llm_name='gemini-2.0-flash', use_portkey=True)

Note: some files are being skipped because of ground truth incompleteness.

In [8]:
# model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu") 
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device="cpu") 
# model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2', device="cpu") 
# model = SentenceTransformer('sentence-transformers/sentence-t5-base', device="cpu") 
# model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4', device="cpu") 
#model = SentenceTransformer(
#    modules=[
#        models.Transformer("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", max_seq_length=512),
#        models.Pooling("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", pooling_mode='mean')
#    ], device="cpu"
#)

In [9]:
recall, cnt = 0, 0 

if os.path.exists("corpus_embeddings.npy"):
    os.remove("corpus_embeddings.npy")

topk_docs_to_retrieve = 3

for i,publication in input_corpus.iterrows():
    if i == 10:
        break
    
    gt = ground_truth[ground_truth['pmc_id'].str.lower() == publication['publication'].lower()]
    idnts = gt['identifier'].tolist()
    
    
    if publication['format'] == 'xml':
        sections = xml_parser.extract_sections_from_xml(etree.fromstring(publication['raw_cont'].encode('utf-8')))
    
    elif publication['format'] == 'html':
        clean_html = html_parser.normalize_HTML(publication['raw_cont'])
        sections = html_parser.extract_sections_from_html(clean_html)
    
    else:
        logging.warning(f"Unsupported format {publication['format']} for publication {publication['publication']}. Skipping.")
        continue
    
    idnts_in_cont = []
    skip = False
    for idnt in idnts:
        if idnt not in publication['raw_cont']:
            skip = True
    if skip:
        print(f"Skipping file {i+1} as no identifiers found in content.")
        continue
        
    cnt += 1
    
    
    corpus = []
    for section in sections:
        corpus.append(
            'section_title: ' + section['section_title'] + 
            '.\nsection_type: ' + section['sec_type'] + 
            '.\ncontent: ' + section['sec_txt']
        )
        
    print(f"Processing file {i+1} with {len(corpus)} sections.")
    
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    query = "Explicitly identify all the datasets by their database accession codes, repository names, and links to deposited datasets mentioned in this paper."
    
    retriever = EmbeddingsRetriever(
        model_name=model_name,  # or any other model you prefer
        device="cpu",
        corpus=corpus
    )

    # "Available data, accession code, data repository, deposited data"
    # "Explicitly identify all database accession codes, repository names, and links to deposited datasets or supplementary data mentioned in this paper."
    # "Deposited data will be available in the repository XYZ, with accession code ABC123."
    
    result = retriever.search(
        query=query,
        k=topk_docs_to_retrieve
    )
    
    sections_snapshot = [str(x['Faiss_index']) + '->' + x['text'][:30] for x in result]
            
    matches = set()
    # Check if the ground truth identifier is in the result
    for j,row in gt.iterrows():
        for passage in result:
            if row['identifier'].lower() in passage['text'].lower():
                recall += 1/len(gt)
                matches.add(row['identifier'])
                break
                #print(f"Found ground truth {row['identifier']} in passage")
    print(f"Found {len(matches)} matches in sections {sections_snapshot} of file {publication['publication']} --> Recall: {len(matches)/len(gt) if len(gt) > 0 else 1}")
print(f"Recall of model {model_name}: {recall/cnt}")  # Average recall across all document
    
    # Add a note for yourself:
    # If you always get the same performance, double-check:
    # - Are the input texts (sections/corpus) actually different for each paper?
    # - Are you sure the retrieval is not dominated by a single section or artifact?
    # - Are you using the correct model object in dspy.Embedder?
    # - Are you passing the correct device and not accidentally reusing a cached model?
    # - Try running with a completely different model (e.g., a random vectorizer) to see if results change at all.

Processing file 1 with 43 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 1 matches in sections ['0.7291100025177002->section_title: Data Availabili', '0.753530740737915->section_title: Data Availabili', '1.4991044998168945->section_title: RNA-Seq Analysi'] of file pmc8595470 --> Recall: 1.0
Processing file 2 with 42 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 2 matches in sections ['1.1604387760162354->section_title: Data Availabili', '1.1781458854675293->section_title: Data availabili', '1.3184823989868164->section_title: Supplementary I'] of file pmc8413370 --> Recall: 1.0
Processing file 3 with 54 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 1 matches in sections ['1.2177165746688843->section_title: Kyoto encyclope', '1.2675576210021973->section_title: Gene ontology (', '1.3030074834823608->section_title: Data Availabili'] of file pmc9975107 --> Recall: 1.0
Processing file 4 with 54 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 2 matches in sections ['0.966753363609314->section_title: Data and Code A', '1.009289026260376->section_title: Data Availabili', '1.4252426624298096->section_title: Footnotes.\nsect'] of file pmc7458625 --> Recall: 1.0
Processing file 5 with 32 sections.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Found 2 matches in sections ['1.1621068716049194->section_title: Data Availabili', '1.1832464933395386->section_title: Data Availabili', '1.2181322574615479->section_title: 2.6. Bioinforma'] of file pmc7911100 --> Recall: 1.0
Processing file 6 with 22 sections.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Found 0 matches in sections ['1.3826327323913574->section_title: Supplementary M', '1.441547155380249->section_title: Funding.\nsectio', '1.4739614725112915->section_title: Acknowledgement'] of file pmc10500535 --> Recall: 0.0
Processing file 7 with 42 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 0 matches in sections ['1.1727302074432373->section_title: Code availabili', '1.2020015716552734->section_title: Source data.\nse', '1.340444564819336->section_title: Supplementary i'] of file pmc7385494 --> Recall: 0.0
Processing file 8 with 54 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 0 matches in sections ['1.1490538120269775->section_title: Data Availabili', '1.1588064432144165->section_title: Data and code a', '1.458694338798523->section_title: On this page.\ns'] of file pmc9017214 --> Recall: 0.0
Processing file 9 with 43 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 6 matches in sections ['1.2081634998321533->section_title: Data Availabili', '1.2186120748519897->section_title: Data availabili', '1.3036179542541504->section_title: Supplementary i'] of file pmc10333231 --> Recall: 1.0
Processing file 10 with 36 sections.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Found 0 matches in sections ['0.9561635851860046->section_title: Data Availabili', '0.9705162644386292->section_title: Data Availabili', '1.358206033706665->section_title: On this page.\ns'] of file pmc11425778 --> Recall: 1
Recall of model sentence-transformers/all-MiniLM-L6-v2: 0.6000000000000002


In [10]:
# 7. Define your trainset (queries and expected retrievals)
trainset = [
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Data Availability Statement\nThe datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: NCBI GEO repository,\nGSE123128\n."]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data Availability Statement\nRaw sequencing data from this study have been deposited in the GEO database with the accession number\nGSE171155\n. The mass spectrometry proteomics data have been deposited to the ProteomeXchange Consortium via the PRIDE [1] partner repository with the data set identifier PXD024161 and 10.6019/PXD024161.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Associated Data\nThis section collects any data citations, data availability statements, or supplementary materials included in this article.\nSupplementary Materials\nDocument S1. Figures\xa0S1–S6 and Tables\xa0S1–S6\nmmc1.pdf\n(2.5MB, pdf)\nDocument S2. Article plus supplemental information\nmmc2.pdf\n(9.1MB, pdf)\nData Availability Statement\n•\nThe next-generation DNA sequencing dataset generated during this study is available at the National Genomics Data Center: HRA003231 (URL:\nhttps://ngdc.cncb.ac.cn\n). The mass spectrometry proteomics data reported in this paper have been deposited to the ProteomeXchange Consortium: PXD037076(\nhttp://proteomecentral.proteomexchange.org\n) via iProx partner repository\n61\n.\n•\nThis paper does not report the original code.\n•\nAny additional information required to reanalyze the data reported in this work paper is available from the\nlead contact\nupon request.']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=['Data and Code Availability\nRNA-seq data generated in this study are available at NCBI GEO database with the accession number\nGSE151029\n. The 53BP1 mass spectrometry data have been deposited to the ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD020090. The accession number for the FOXK1 and FOXK2 MS data reported in this paper is PRIDE: PXD001383']
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["METHODS\nConstruction of Plasmids\nThe protein-coding regions of the NST3 gene were amplified from the Arabidopsis thaliana cDNA library with appropriate primers (see Supplemental Table 2 online). The 5′ upstream region of 3027 bp, which extended from the site of initiation of translation of the NST3 gene, was used for preparation of the ProNST3:GUS, ProNST3:NST3, and ProNST3:NST3SRDX gene constructs. These genes and 35S:NST3 were constructed from modified vectors derived from pGreenII0029 (Hellens et al., 2000) and p35SSRDXG (Mitsuda et al., 2006). For complementation analysis, we used genomic fragments including NST1 (9580 bp) and NST3 (5199 bp), which contained 6523 and 3069 bp of the respective promoter regions. The region corresponding to the transgene of each vector, with the exception of the pGreen-based vectors, was transferred to the pBCKH plant expression vector (Mitsuda et al., 2006) using the Gateway system (Invitrogen).\n\nConditions for Plant Growth and Transformation\nArabidopsis plants were grown in soil at 22°C with 16 h (long-day condition) or 8 h (short-day condition) of light daily. Unless otherwise stated, plants were grown under the long-day condition. For transformation, a T-DNA vector carrying the appropriate construct was introduced into Agrobacterium tumefaciens strain GV3101 by electroporation, and the resultant Agrobacterium was infiltrated into Arabidopsis using the floral dip method (Clough and Bent, 1998).\n\n\nAssessment of the Mechanical Strength of Inflorescence Stems\nWe used the bottom 5 cm of inflorescence stems taller than 25 cm for measurement of Young's modulus according to a previously described method (Kojima and Yamamoto, 2004).\n\nExamination of the Crystal State of Cellulose Microfibrils of Inflorescence Stems\nThe bottom region of the inflorescence stems, as described above, was used for x-ray diffraction analysis according to a previously described method (Abe and Yamamoto, 2005). Nickel-filtered Cu Kα radiation (wavelength, 0.154 nm) at 30 kV and 35 mA was used with the reflection technique.\n\nIsolation of RNA, Microarray Experiments, and Analysis\nTotal RNA was isolated with Trizol as described previously (Fukuda et al., 1991) from the bottom 4 cm of the inflorescence stems of three independent plants grown under the short-day condition and with a height of between 13 and 17 cm. Microarray analyses were performed with the Arabidopsis 2 Oligo Microarray (Agilent Technologies). All microarray experiments and the analysis of data were performed as described previously (Mitsuda et al., 2005) with the exceptions summarized below. P values for differences between nst1-1 nst3-1 and wild-type plants were calculated by Welch's t test, based on a two-tailed distribution (n = 3). To minimize type-I family-wise errors in multiple and simultaneous statistical tests, we adopted a strategy for suppression of false positives. We calculated a Q-value to estimate the false discovery rate from the P value described above using QVALUE software (Storey and Tibshirani, 2003) with the default setting. We considered genes with a Q-value of <0.1 to be genes expressed at different levels in nst1-1 nst3-1 and wild-type plants. Comprehensive gene group analysis using Fisher's exact test was performed with the R program package (http://www.r-project.org/). Quantitative RT-PCR was performed as described previously (Mitsuda et al., 2005). For the analysis of NST transcripts in the mutant lines, RT-PCR was performed with appropriate primers (see Supplemental Table 2 online).\n\nLight and Fluorescence Microscopy\nFor observations of lignin autofluorescence, we used a filter with the following specifications: glass, 365; dichroic mirror, 395; long-pass, 400. To observe ectopic secondary wall thickening, we cleared tissues by incubating them overnight in 70% lactic acid at 50°C. To prepare 70- to 150-μm sections of inflorescence stems and hypocotyls, we embedded the tissues in 3% agar then sectioned them on a vibrating microtome (HM-650V; Microm). Assays of GUS activity were performed with T1 or T2 transgenic plants. Plant tissues were fixed briefly, in some cases, in solution containing 0.3% formalin, 0.2% MES, pH 5.8, and 0.3 M mannitol before incubation in 100 mM sodium phosphate buffer, pH 7.0, containing 0.1% Triton X-100, 1 mM 5-bromo-4-chloro-3-indolyl-β-d-glucuronide, and 0.5 mM potassium ferricyanide at 37°C for up to 12 h. Stained stems and hypocotyls were embedded in 3% agar and sectioned. All observations by light and fluorescence microscopy were made with the Axioskop2 plus system (Carl Zeiss).\n\nUltrastructural Observation by Transmission Electron Microscopy\nShort pieces of inflorescence stems were fixed in 30 mM HEPES buffer containing 2% paraformaldehyde and 2% glutaraldehyde then fixed in HEPES buffer containing 2% osmium tetroxide. Fixed tissues were embedded in Q651 resin (Nissin EM). Sections of 80 to 90 nm thick were post-stained with uranyl acetate and lead citrate and observed by a JEM1200EX transmission electron microscope (JEOL) at an accelerating voltage of 80 kV.\n\nIdentification of NST Homologs in Poplar\nPoplar NAC genes resembling the Arabidopsis NST genes were collected using the Advanced Search tool of the Joint Genome Initiative poplar database (http://genome.jgi-psf.org/Poptr1/Poptr1.home.html) with the command, “find by homology to related protein with E-value <1.0e-20”; the database for Populus trichocarpa; and the query “At2g46770.” The 62 extracted sequences and amino acid sequences of subfamily IIb of NAC transcription factors of Arabidopsis, as defined in a previous study (Mitsuda et al., 2005), were aligned using the ClustalW program with default settings (Chenna et al., 2003). The amino acid sequences corresponding to conserved NAC domains were extracted and realigned. A phylogenetic tree was built by neighboring-joining method using ClustalW with default settings (an alignment and the sequences are shown in Supplemental Table 3 online). Bootstrap values were calculated from 100 trials. The subtree including the NST and VND genes is shown in Figure 7.\n\nAccession Numbers and Data Deposition\nNST1 and NST3 reported in this study correspond to the Arabidopsis Genome Initiative locus identifiers At2g46770 and At1g32770, respectively. Microarray data performed in this study can be found in the National Center for Biotechnology Information Gene Expression Omnibus data library under accession number GSE5187.\n"]
        ),
    dspy.Example(
        question="Data is available with accession code ABC0123 in Repository XYZ",            
        references=["Materials and methods\nMaterials\nDilution series\nIllumina HumanCNV370-Duo BeadChip Infinium SNP data for dilution series of 12 mixtures of cancer cell line (HCC1395) mixed with its paired normal cell line (HCC1395BL) were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE11976]. We excluded chromosome 6 and 16 from analysis due to copy genomic aberrations present in the normal cell line HCC1395BL.\n\nCancer cell lines\nIllumina HumanHap300 data for the promyelocytic leukemia cancer cell HL-60 and colon cancer cell line HT-29 were obtained from Illumina, and Human-610 Quad SNP genotyping data for the colon cancer cell lines SW403, SW480, SW620, SW837, SW1417 and LIM1863 were generated at the Ludwig Institute of Cancer Research using standard processing protocols. The genotyping data for breast cancer cell lines MDA-175 and MDA-468 were downloaded from the NCBI Gene Expression Omnibus accession [GEO:GSE18799] [23].\n\nPrimary breast tumors\nThree breast tumors (cases 114, 601 and 3,364) that had not received non-neoadjuvant therapy were analyzed in detail using material derived from microdissection. For each case, material containing pure tumor and pure stroma cells respectively was microdissected and compared to data obtained from surgically obtained material from the same tumors. Case 114 was of Luminal B type (23 mm tumor, moderately differentiated infiltrating ductal carcinoma with an extensive in-situ component. Node +ve, ER +ve (6.8 fm/mg protein), EGFR -ve (7.8 fm/mg protein)). Case 601 (20 mm 30 mm tumor, grade 3 with intraductal in-situ ca. and in filtrating ductal carcinoma, node +ve, ER -ve (1.5 fm/mg protein), Her2 +ve (histoscore of 3), EGFR +ve (histoscore of 208)) was classified as ERBB2 positive based on expression microarray data with a fractional rank of 0.982, Case 3,364 was 25 mm grade 3 infiltrating ductal carcinoma, ER positive (8 fm/mg protein), PR positive (histoscore 8/8), Her2 positive (histoscore 3+, one of ten axillary nodes +ve). For each case, DNA was extracted from microdissected stroma and tumor, as well as the original non-dissected sample and analyzed using Illumina Human-610 Quad SNP arrays applying standard protocols.\n\nData processing\nGenome Alteration Print was downloaded [43] and used to analyze all datasets using default settings and the highest ranked copy number and LOH predictions used for comparisons. However, for the cancer cell line dilution series, we re-used the results that had previously been generated by [23] and made available on the aforementioned website.\n\nGenoCN v1.06 was downloaded [44] and used with default settings and stromal contamination settings on for all datasets generated using Illumina Infonaut II SNP arrays. Adjusted GenoCN parameters for the Log R Ratio levels were used for Infonaut HD SNP array processing and in these instances we used the same levels that we specified for OncoSNP. The copy number and LOH predictions from the Viterbi sequence were used for comparisons.\n\nOncoSNP was run on all datasets using 15 EM iterations and with both stromal and intra-tumor heterogeneity options. In all cases, the ploidy prediction with the highest maximum likelihood was chosen and the Viterbi sequence of tumor states used for comparisons. We filtered detected aberrations using a Log Bayes Factor of 30.\n\nStatistical model\nA complete description of our statistical model is provided in Supplementary Information in Additional file 1.\n\nLet xi denote the tumor state at the i-th probe location and (xi, n, xi, t) denote the associated normal and tumor copy numbers. Furthermore, let zi = (zi, n,zi, t) denote the B allele count for the normal and tumor genotype respectively. The combinations (zi, n, (xi, n) and (zi, t, xi, t) fully define the normal and tumor genotypes respectively. The tumor state at each probe denotes the allowable combinations of normal-tumor genotypes at that location as shown in Table 1.\n\nLet π0 denote the normal DNA fraction of the tumor sample due to stromal contamination and 𝜋={𝜋𝑖}𝑛\n𝑖=1 denote the proportion of tumor cells having the normal genotype at each probe. The data 𝑦={𝑦𝑖}𝑛\n𝑖=1 consists of a set of two-dimensional vectors yi = [ri, bi]' whose elements correspond to the Log R Ratio and B allele frequency respectively.\n\nGiven (x, z, π, π0) the data is assumed to be distributed according to a (K + 1)-component mixture of Student t-distributions, where ki indicates the mixture component assignment of the i-th data point,\n\n𝑦𝑖|𝑥𝑖,𝑧𝑖,𝑘𝑖,𝑚,𝛿, 𝛴={ \n𝑆⁢𝑡(𝑚⁡(𝑥𝑖,𝑧𝑖)+𝛿(𝑙𝑙)\n𝑘𝑙,∑(𝑙𝑖)\n𝑘𝑖,𝜈),	𝑘≠0,\n𝑈𝑟⁡(𝑟min,𝑟max)×U𝑏⁢(0,1),	𝑘=0,\n \n(1)\nwhere 𝑆⁢𝑡⁡(𝛿(𝑙)\n𝑘,𝛴(𝑙)\n𝑘,𝑣) is the probability density function of the Student t-distribution with mean 𝛿(𝑙)\n𝑘 and covariance matrix 𝛴(𝑙)\n𝑘 associated with the k-th mixture component and the l-th genotype class and v degrees of freedom. The 0-th component is an outlier class which assumes uniformly distributed data over a specified range.\n\nThe elements of the mean vectors m(xi, zi) = [mr(xi), mb(zi, xi)]' are given by the following:\n\n𝑚𝑟⁡(𝑥𝑖)=(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢\n̅\n𝑟\n𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢\n̅\n𝑟\n𝑥𝑥𝑖,⁢𝑡+𝛽0+𝛽1⁢𝑔𝑖,\n(2)\nwhere gi is the local GC content at the i-th probe location and\n\n𝑚𝑏⁡(𝑧𝑖,𝑥𝑖)=\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑧𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑧𝑖,𝑡\n(𝜋𝑖⁢(1−𝜋0)+𝜋0)⁢𝑥𝑖,𝑛+(1−𝜋𝑖)⁢(1−𝜋0)⁢𝑥𝑖,𝑡\n \n.\n(3)\nPrior distributions\nThe prior distribution on the mixture weights is given by a Dirichlet distribution:\n\n𝑤(𝑙)|𝛼~𝐷⁢𝑖⁢𝑟⁡(𝛼),\n(4)\nwhere α is a concentration parameter which in the numerical results we used α = 1 to give a at prior on the mixture weights.\n\nThe prior distributions on the mixture centers and covariance matrices are given by standard conjugate Normal-Inverse Wishart distributions:\n\n𝛿(𝑙)\n𝑘|𝜏, 𝛴(𝑙)\n𝑘~𝑁⁡(0,𝜏 𝛴(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(5)\n𝛴(𝑙)\n𝑘|𝛾, 𝑆(𝑙)\n𝑘~𝐼⁢𝑊⁡(𝛾,𝑆(𝑙)\n𝑘), 𝑘=1,…, 𝐾, 𝑙 =1,2,3,\n(6)\nwhere τ is a hyperparameter that controls the strength of the prior and IW(γ, Λ) denotes the Inverse-Wishart distribution with parameter γ and scale matrix Λ.\n\nA beta prior is assumed for the outlier rate,\n\n𝜂|𝛼𝜂, 𝛽𝜂~𝐵⁢𝑒⁡(𝛼𝜂,𝛽𝜂),\n(7)\nwhere (αn, βn) are hyperparameters associated with the Beta prior. For the numerical results we set these as (1,1) to give a uniform distribution. \n\nA normal prior is assumed for the local GC content regression parameters,\n\n𝛽|𝜆𝛽~𝑁⁡(0,𝜆𝛽⁢𝐼2),\n(8)\nwhere Ip is a p × p identity matrix.\n\nA discrete prior is assumed for the stromal contamination content and intra-tumour heterogeneity levels,\n\n𝑝⁡(𝜋0)={ \n𝛼𝜋0,𝜋0=0,\n𝛽𝜋0,𝜋0>0,\n \n(9)\nand\n\n𝑝⁡(𝜋𝑖)={ \n𝛼𝜋,𝜋𝑖=0,\n𝛽𝜋,𝜋𝑖>0,\n  𝑖=1,…,𝑛,\n(10)\nwhere in the numerical results we have used απ0 = βπ0 = 1 and απ = 1, βπ = 2.\n\nThe tumor states are assumed to form an inhomogeneous Markov Chain with transition matrix,\n\n𝑝⁡(𝑥𝑖|𝑥𝑖−1)={ \n1−𝜌,𝑥𝑖=𝑥𝑖−1,\n𝜌,𝑥𝑖≠𝑥𝑖−1,\n \n(11)\nwhere ρ = (1/2) (1-exp(-(1/2L) (si-si-1) and si is the physical coordinate of the i-th probe and L is a characteristic length which we set as L = 2,000,000 for the numerical results.\n\nPosterior inference\nWe estimated the unknown model parameters using an expectation-maximization algorithm. Multiple restarts were used to explore different baseline of the Log R Ratio and the baseline with the greatest likelihood was chosen for the calculation of summary statistics.\n\nSummary statistics\nWe used the Viterbi algorithm to extract the most likely sequence of tumors states and for each aberrant segment in the Viterbi sequence we calculated an approximate Bayes Factor (score) of that segment belonging to each of the tumor states. In addition we also recorded the maximum a posteriori estimates of the Log R Ratio baseline adjustment β0 and the stromal contamination π0.\n\nAvailability\nA MATLAB based implementation (for 64 bit Linux systems) of our software is available for academic and non-commercial use from the associated website [45]. In addition, SNP data analyzed in this paper are also available from this website and from the Gene Expression Omnibus Database under Accession No.[GEO:GSE23785]."]
        )
]