In [3]:
from dataset_loader import load_arxpr_data

from llama_index.core import Document
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes
from llama_index.core.node_parser import (SentenceSplitter, MarkdownNodeParser, MarkdownElementNodeParser)

from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core.schema import TextNode
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.ingestion import IngestionPipeline

from llama_index.core.extractors import (
    # SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
    # BaseExtractor,
)


import nest_asyncio
nest_asyncio.apply()



## Markdown Parser

In [223]:

def _pseudo_markdown_splitter(text: str, chunk_size=2000, chunk_overlap=125, markdown_headers=[], exclude_headers=[], verbose=False):
    """
    Borrowing the LangChain markdown splitter to spot header str in lieu of #, ##, ###, etc. 
    Keeps the md-formated header in medatdata dict if found, else nothing.
    Then recursively splits the text without breaking paragraphs.
    
    Args:
        text: str
            The text from UnstructuredXMLLoader in line-separated format header, subheader, pargaraphs.
        ...
        headers_to_split_on: list
            A list of tuples of the form (header, metadata_key) where header is a str that
            will be used to split the text and metadata_key is the key in the metadata dict
            that will be used to store ONLY markdown-formatted header.

    Returns:
        splits: list
            A list of Langchain doc objects, each containing paragraphs and artifact sub-headers according to 
            chunk size; and 'metadata' key would contain real markdown headers IF any.
    
    """

    from langchain_text_splitters import MarkdownHeaderTextSplitter
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # split by headers
    markdown_splitter = MarkdownHeaderTextSplitter(markdown_headers)
    md_header_splits = markdown_splitter.split_text(text)

    # filter by exclusion headers
    toss = [doc for doc in md_header_splits for exclusion in exclude_headers if exclusion in doc.metadata] 
    md_filtered_splits = [doc for doc in md_header_splits if doc not in toss]    

    if verbose:
        print(f"md_header_splits: {len(md_header_splits)}")
        print(f"toss: {len(toss)} \n{toss}")
        print(f"md_filtered_splits: {len(md_filtered_splits)} \n{md_filtered_splits}")

    # chunk
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splits = text_splitter.split_documents(md_filtered_splits)
    
    return splits


In [225]:
exclude_headers = []
splits = _pseudo_markdown_splitter(text, chunk_size=2000, chunk_overlap=125, markdown_headers=markdown_headers, 
                                   exclude_headers=exclude_headers, verbose=True)


md_header_splits: 16
toss: 0 
[]
md_filtered_splits: 16 
[Document(page_content='BioC-API\ncollection.key\nCC BY-NC-SA\nSplicing role of mitotic regulator kills tumor cells\nThis article is distributed under the terms of an Attribution-Noncommercial-Share Alike-No Mirror Sites license for the first six months after the publication date (see http://www.rupress.org/terms). After six months it is available under a Creative Commons License (Attribution-Noncommercial-Share Alike 3.0 Unported license, as described at http://creativecommons.org/licenses/by-nc-sa/3.0/).\nsurname:Wan;given-names:Yihan\nsurname:Zheng;given-names:Xiaobin\nsurname:Chen;given-names:Haiyang\nsurname:Guo;given-names:Yuxuan\nsurname:Jiang;given-names:Hao\nsurname:He;given-names:Xiaonan\nsurname:Zhu;given-names:Xueliang\nsurname:Zheng;given-names:Yixian\nTITLE\nfront\nSplicing function of mitotic regulators links R-loop-mediated DNA damage to tumor cell killing\nABSTRACT\nabstract\nMitotic regulators BuGZ and Bub3 play

In [108]:

def _match_sequence(seq1: str, seq2_list: list, threshold=0.7):
    """
    Gives a sequence similarity match between seq1 and seq2 from seq2_list.

    Args: 
        seq1: str
            The sequence to match
        seq_list: list
            A list of sequences to match against seq1
        threshold: float
            The minimum similarity ratio; <.8 for short sequences seems to work well.
    Return:
        seq2: str
            The sequence that passes the matching threshold or None if insufficient match.
            
    """

    from difflib import SequenceMatcher

    for seq2 in seq2_list:
        if SequenceMatcher(None, seq1, seq2).ratio() > threshold:
            return seq2
    return None
        


def _split_non_md_headers(doc_lc, headers=[]):
    """ A secondery cleaner to extract leftover non-markdown headers from the text. 
    
    Args:
        doc_lc: LangChain Document 
            A list of Langchain doc objects, each containing paragraphs and artifact sub-headers according to 
            chunk size; and 'metadata' key would contain real markdown headers IF any.
        headers: list    
            A list of possible headers (e.g. headers = ["introduction", "paragraph", "title_1", "title_2", "fig_caption"])
    Return:
        cleaned_docs: dict
            Contains core paragraphs under dict['text'] and associated headers under dict['headers']

    """

    cleaned_docs = {}
    cleaned_docs['text'] = ""
    cleaned_docs['headers'] = []
    text = doc_lc.page_content

    for line in text.split("\n"):
        match = _match_sequence(line.lower(), headers)
        if match:
            cleaned_docs['headers'].append(match)
        else:
            cleaned_docs['text'] += line
    return cleaned_docs

In [227]:
t,l = load_arxpr_data(5)

# keys for manual check on papers
paper_keys = ['25918225', '18618715', '28845460', '25977295', '18631455']
text = t['25918225']

markdown_headers = [
    ("METHODS", "methods"),
    ("METHODOLOGY", "methods"),   
    ("RESULT", "result"),
    ("RESULTS", "result"),
    ("FIG", "figure"),
    ("FIGURE", "figure"),
    ("INTRO", "introduction"),
    ("INTRODUCTION", "introduction"),
    ("REF", "reference"),
    ("REFERENCES", "reference"),
    ("DISCUSS", "discussion"),
    ("DISCUSSION", "discussion"),
    ("SUPPL", "supplement"),
    ("SUPPLEMENT", "supplement"),
    ("abstract_title_1", "abstract"),
]

exclude_headers = ['reference', 'supplement']


# TODO: need a script to generate a list of headers from a large set or sampled papers.
other_headers = ["introduction",
                "paragraph",
                "title_1", "title_2",
                "fig_caption",
                "abstract",
                "supplementary material",
                "materials and methods",
                "results",
                "discussion",
                "results and discussion",
                "footnote_title",
                "figures and tables",
                "summary", 
                "ref",
                "lancetref",
                "experimental procedures"
                "fig1", "fig2", "fig3", "fig4", "fig5", "fig6", "fig7", "fig8", "fig9", "fig10",
                ]

Settings.embed_model = OllamaEmbedding(model_name="llama3.1:8b")
Settings.llm = Ollama(model="llama3.1:8b") 
llm = Ollama(model="llama3.1:8b") 
embed_model = OllamaEmbedding(model_name="llama3.1:8b")


N datasets with exactly one label, for each field:
{'assay_by_molecule_14': 3,
 'assay_count_7': 4,
 'experimental_design_10': 0,
 'experimental_factors_20': 2,
 'hardware_4': 1,
 'name_19': 4,
 'no_of_samples_22': 0,
 'no_of_samples_23': 0,
 'organism_16': 3,
 'releasedate_12': 4,
 'sample_count_13': 4,
 'sex_2': 0,
 'study_type_18': 3,
 'technology_15': 4,
 'type_21': 0,
 'type_9': 3}
N datasets with at least one label, for each field
{'assay_by_molecule_14': 5,
 'assay_count_7': 4,
 'experimental_design_10': 0,
 'experimental_factors_20': 4,
 'hardware_4': 1,
 'name_19': 5,
 'no_of_samples_22': 0,
 'no_of_samples_23': 0,
 'organism_16': 5,
 'releasedate_12': 5,
 'sample_count_13': 5,
 'sex_2': 1,
 'study_type_18': 5,
 'technology_15': 5,
 'type_21': 5,
 'type_9': 5}


### Pipeline: Markdown + Recursive 


In [228]:
# Pipeline: Markdown + Recursive 

metadata_extractors = [
    # QuestionsAnsweredExtractor(questions=3),
    # KeywordExtractor(keywords=5)
    ]

pipeline = IngestionPipeline(transformations=metadata_extractors)

nodes = []
splits = _pseudo_markdown_splitter(text, chunk_size=2000, chunk_overlap=125, markdown_headers=markdown_headers, exclude_headers=exclude_headers)

for doc in splits:
    split = _split_non_md_headers(doc, headers=other_headers)

    print(f'\nMD Headers: {doc.metadata} \nOther Headers: {split["headers"]} \nSize change: {len(doc.page_content)} ? {len(split["text"])}')
    print(f'{split["text"]}')

    # Avoid creating a node with empty text
    if len(split["text"]) == 0: continue

    nodes.append(TextNode(text=split['text']))

docs = pipeline.run(documents=nodes)
index = VectorStoreIndex(nodes=docs)



MD Headers: {} 
Other Headers: ['title_1', 'abstract', 'abstract', 'abstract', 'abstract'] 
Size change: 1061 ? 1004
BioC-APIcollection.keyCC BY-NC-SASplicing role of mitotic regulator kills tumor cellsThis article is distributed under the terms of an Attribution-Noncommercial-Share Alike-No Mirror Sites license for the first six months after the publication date (see http://www.rupress.org/terms). After six months it is available under a Creative Commons License (Attribution-Noncommercial-Share Alike 3.0 Unported license, as described at http://creativecommons.org/licenses/by-nc-sa/3.0/).surname:Wan;given-names:Yihansurname:Zheng;given-names:Xiaobinsurname:Chen;given-names:Haiyangsurname:Guo;given-names:Yuxuansurname:Jiang;given-names:Haosurname:He;given-names:Xiaonansurname:Zhu;given-names:Xueliangsurname:Zheng;given-names:YixianfrontSplicing function of mitotic regulators links R-loop-mediated DNA damage to tumor cell killingMitotic regulators BuGZ and Bub3 play a critical role in 

In [112]:
from llama_index.core.response.notebook_utils import display_source_node

base_retriever = index.as_retriever(similarity_top_k=3)

query_str = "What is the use of RT-PCR kit?"
base_nodes = base_retriever.retrieve(query_str)


In [None]:
for node in base_nodes: 
    print('')
    print(node.get_content(metadata_mode='all'))
    # print(node.metadata)
    # print(node.embedding)
    # display_source_node(node, source_length=500)

### Pipeline: Topic Node Parsing

In [163]:
from llama_index.node_parser.topic import TopicNodeParser

def topic_parse(documents, chunk_size=1000, similarity_threshold=0.8, window_size=2, similarity_method="embedding"):
    """ A function to parse the text into topic nodes. """
    
    node_parser = TopicNodeParser.from_defaults(
        embed_model=embed_model,
        llm=llm,
        max_chunk_size=chunk_size,
        similarity_method=similarity_method,  # can be "llm" or "embedding"
        similarity_threshold=similarity_threshold,
        window_size=window_size,  # paper suggests window_size=5
    )
    return {"nodes": node_parser.get_nodes_from_documents(documents, show_progress=True), 
            "params": {"chunk_size": chunk_size, "threshold": similarity_threshold, "window": window_size}}

    # return {"nodes": node_parser(documents), 
    #         "params": {"chunk_size": chunk_size, "threshold": similarity_threshold, "window": window_size}}


def print_topic_nodes(parsed):
    print(parsed['params'])
    for i in range(len(parsed['nodes'])):
        print(f'node {i}:\n{parsed["nodes"][i].get_content()}')

In [None]:
whole_text = ""
nodes = []
splits = _pseudo_markdown_splitter(text, chunk_size=1000, chunk_overlap=0, markdown_headers=markdown_headers, exclude_headers=exclude_headers)

for doc in splits:
    split = _split_non_md_headers(doc, headers=other_headers)

    # print(f'\nMD Headers: {doc.metadata} \nOther Headers: {split["headers"]} \nSize change: {len(doc.page_content)} ? {len(split["text"])}')
    # print(f'{split["text"]}')

    # Avoid creating a node with empty text
    if len(split["text"]) == 0: continue

    nodes.append(Document(text=split['text']))

# print(whole_text)
nodes[1].get_content

<bound method TextNode.get_content of Document(id_='c1ab428e-a608-484d-bb6e-5a5b3fda3b35', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Mitotic regulators BuGZ and Bub3 play a critical role in RNA splicing during interphase, and disruption of this function leads to R-loop formation, DNA damage, and p53 activation.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')>

In [None]:
nodes 

In [231]:
parsed = topic_parse(nodes, chunk_size=2000, similarity_threshold=0.8, window_size=5, similarity_method="llm")
print_topic_nodes(parsed)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

No valid JSON found in the response: assistant: It seems there is no content provided for me to decompose. Please provide the content, and I will follow the rules to create a list of simple propositions.

If you're ready to proceed, please go ahead and share the content, and I'll get started!
Failed to parse JSON: [ 
"The solution contained the wash buffer."
"The concentration of the wash buffer was 0.5 milligrams per milliliter." ]
{'chunk_size': 2000, 'threshold': 0.8, 'window': 5}
node 0:
The article is distributed under a specific license for the first six months after publication. The license is Attribution-Noncommercial-Share Alike-No Mirror Sites. After six months, the article will be available under another Creative Commons License. This other license is Attribution-Noncommercial-Share Alike 3.0 Unported.
node 1:
Wan Yihang has a surname and given name. Yihang Wan's surname is Wan and his given name is Yihang. Zheng Xiaobin has a surname and given name. Xiaobin Zheng's surname 

In [None]:
# WITHOUT excluding references & supplements
parsed = topic_parse(nodes, chunk_size=2000, similarity_threshold=0.8, window_size=5, similarity_method="llm")
print_topic_nodes(parsed)

Parsing nodes:   0%|          | 0/72 [00:00<?, ?it/s]

No valid JSON found in the response: assistant: It seems you forgot to provide the content for decomposition.

To proceed, please provide the text that needs to be broken down into simple propositions. I'll apply the rules to transform it into a list of strings in JSON format.
Failed to parse JSON: [ 
"The control or BuGZ siRNA was used in an experiment." ,
"The error bars indicate the standard deviation (SD) of the results." ,
"A Student's t test was performed to determine the significance of the results." ,
"In the t test, not significant was determined when P > 0.05." ,
"In the t test, a P value less than 0.05 was considered significant (*)" ,
"In the t test, a P value less than 0.01 was considered highly significant (**)" ,
"In the t test, a P value less than 0.001 was considered very highly significant (***)" ,
"The results of three independent experiments were used to determine significance." ,
"BuGZ siRNA was used in an experiment involving HFF cells." ,
"Lentivirus-expressed sh

Failed to parse JSON: 


ReadTimeout: timed out

In [155]:
parsed = topic_parse(nodes, chunk_size=2000, similarity_threshold=0.8, window_size=5, similarity_method="embedding")
print_topic_nodes(parsed)

Failed to parse JSON: ["We used the Senescence beta-Galactosidase Staining kit", "The kit was purchased from Cell Signaling and had product number 9860S.", "We detected beta-galactosidase activity at pH 6.0 using the kit.", "Cells were fixed in a fixative solution for 10-15 min at room temperature.", "The fixed cells were then incubated with a beta-galactosidase staining solution containing X-gal at 37 C overnight.", "As the blue color developed, bright-field cell images were taken using an Axiovert 25 microscope connected to a Canon camera.", "We used a 0.4% Trypan blue stock solution from Life Technologies with product number 15250061 to stain dead cells.", "The stock solution was added to a cell suspension in the ratio of 10 microl of Trypan blue stock solution per 100 microl of cell suspension.", "Cells negative for Trypan blue staining were counted as live cells using hemocytometer.", "We used two methods to confirm the interaction between BuGZ or Bub3 and spliceosome components."

TypeError: sequence item 0: expected str instance, dict found

In [126]:

text = """In this paper, we introduce a novel graph RAG method for applying LLMs to the medical domain, which we refer to as Medical Graph RAG (MedRAG). This technique improves LLM performance in the medical domain by response queries with grounded source citations and clear interpretations of medical terminology, boosting the transparency and interpretability of the results. This approach involves a three-tier hierarchical graph construction method. Initially, we use documents provided by users as our top-level source to extract entities. These entities are then linked to a second level consisting of more basic entities previously abstracted from credible medical books and papers. Subsequently, these entities are connected to a third level—the fundamental medical dictionary graph—that provides detailed explanations of each medical term and their semantic relationships. We then construct a comprehensive graph at the highest level by linking entities based on their content and hierarchical connections. This method ensures that the knowledge can be traced back to its sources and the results are factually accurate.

To respond to user queries, we implement a U-retrieve strategy that combines top-down retrieval with bottom-up response generation. The process begins by structuring the query using predefined medical tags and indexing them through the graphs in a top-down manner. The system then generates responses based on these queries, pulling from meta-graphs—nodes retrieved along with their TopK related nodes and relationships—and summarizing the information into a detailed response. This technique maintains a balance between global context awareness and the contextual limitations inherent in LLMs.

Our medical graph RAG provides Intrinsic source citation can enhance LLM transparency, interpretability, and verifiability. The results provides the provenance, or source grounding information, as it generates each response, and demonstrates that an answer is grounded in the dataset. Having the cited source for each assertion readily available also enables a human user to quickly and accurately audit the LLM’s output directly against the original source material. It is super useful in the field of medicine that security is very important, and each of the reasoning should be evidence-based. By using such a method, we construct an evidence-based Medical LLM that the clinician could easiely check the source of the reasoning and calibrate the model response to ensure the safty usage of llm in the clinical senarios.

To evaluate our medical graph RAG, we implemented the method on several popular open and closed-source LLMs, including ChatGPT OpenAI (2023a) and LLaMA Touvron et al. (2023), testing them across mainstream medical Q&A benchmarks such as PubMedQA Jin et al. (2019), MedMCQA Pal et al. (2022), and USMLE Kung et al. (2023). For the RAG process, we supplied a comprehensive medical dictionary as the foundational knowledge layer, the UMLS medical knowledge graph Lindberg et al. (1993) as the foundamental layer detailing semantic relationships, and a curated MedC-K dataset Wu et al. (2023) —comprising the latest medical papers and books—as the intermediate level of data to simulate user-provided private data. Our experiments demonstrate that our model significantly enhances the performance of general-purpose LLMs on medical questions. Remarkably, it even surpasses many fine-tuned or specially trained LLMs on medical corpora, solely using the RAG approach without additional training.
"""

documents = [Document(text=text)]
print(documents[0].get_content())


In this paper, we introduce a novel graph RAG method for applying LLMs to the medical domain, which we refer to as Medical Graph RAG (MedRAG). This technique improves LLM performance in the medical domain by response queries with grounded source citations and clear interpretations of medical terminology, boosting the transparency and interpretability of the results. This approach involves a three-tier hierarchical graph construction method. Initially, we use documents provided by users as our top-level source to extract entities. These entities are then linked to a second level consisting of more basic entities previously abstracted from credible medical books and papers. Subsequently, these entities are connected to a third level—the fundamental medical dictionary graph—that provides detailed explanations of each medical term and their semantic relationships. We then construct a comprehensive graph at the highest level by linking entities based on their content and hierarchical connec

In [30]:
parsed = topic_parse(documents, chunk_size=2000, similarity_threshold=0.8, window_size=5)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
len(parsed['nodes'])

42

In [32]:
print(parsed['params'])
for i in range(len(parsed['nodes'])):
    print(f'node {i}:\n{parsed["nodes"][i].get_content()}')

{'chunk_size': 2000, 'threshold': 0.8, 'window': 5}
node 0:
We introduce a novel graph RAG method for applying LLMs to the medical domain, which we refer to as Medical Graph RAG (MedRAG).
node 1:
This technique improves LLM performance in the medical domain by response queries with grounded source citations and clear interpretations of medical terminology.
node 2:
This approach involves a three-tier hierarchical graph construction method.
node 3:
Initially, we use documents provided by users as our top-level source to extract entities.
node 4:
These entities are then linked to a second level consisting of more basic entities previously abstracted from credible medical books and papers. Subsequently, these entities are connected to a third level—the fundamental medical dictionary graph—that provides detailed explanations of each medical term and their semantic relationships. We then construct a comprehensive graph at the highest level by linking entities based on their content and hiera

In [127]:

node_parser = TopicNodeParser.from_defaults(
    llm=llm,
    max_chunk_size=1000,
    similarity_method="llm",  # can be "llm" or "embedding"
    window_size=5,  # paper suggests window_size=5
)

nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# window_size = 5
for i in range(len(nodes)):
    print(f'node {i}:\n{nodes[i].get_content()}')


node 0:
We introduce a novel graph RAG method for applying LLMs to the medical domain, which we refer to as Medical Graph RAG (MedRAG). The technique improves LLM performance in the medical domain by response queries with grounded source citations and clear interpretations of medical terminology. This approach involves a three-tier hierarchical graph construction method.
node 1:
Initially, we use documents provided by users as our top-level source to extract entities. These entities are then linked to a second level consisting of more basic entities previously abstracted from credible medical books and papers. Subsequently, these entities are connected to a third level—the fundamental medical dictionary graph—that provides detailed explanations of each medical term and their semantic relationships. We then construct a comprehensive graph at the highest level by linking entities based on their content and hierarchical connections. This method ensures that the knowledge can be traced bac

In [None]:
# window_size = 2
for i in range(len(nodes)):
    print(f'node {i}:\n{nodes[i].get_content()}')



node 0:
We introduce a novel graph RAG method for applying LLMs to the medical domain, which we refer to as Medical Graph RAG (MedRAG). The MedRAG technique improves LLM performance in the medical domain by response queries with grounded source citations and clear interpretations of medical terminology. This approach involves a three-tier hierarchical graph construction method.
node 1:
Initially, we use documents provided by users as our top-level source to extract entities. These entities are then linked to a second level consisting of more basic entities previously abstracted from credible medical books and papers. Subsequently, these entities are connected to a third level—the fundamental medical dictionary graph—that provides detailed explanations of each medical term and their semantic relationships. We then construct a comprehensive graph at the highest level by linking entities based on their content and hierarchical connections. This method ensures that the knowledge can be tra