In [2]:
pip install pdfminer.six pymupdf pdfplumber


Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.7 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_

In [5]:
import pdfminer.high_level
import fitz  # PyMuPDF
import pdfplumber
import re
import json
import os
from multiprocessing import Pool

# Function to extract structured text while streaming to save memory
def extract_text_with_structure(pdf_path, start_page=0, end_page=None):
    """Stream text extraction to save memory."""
    structured_text = []

    with open(pdf_path, "rb") as f:
        text = pdfminer.high_level.extract_text(f, page_numbers=range(start_page, end_page))
        for line in text.split("\n"):
            if re.match(r"^\d+\.\s+\w+", line):  # Heading detection (e.g., "1. Introduction")
                structured_text.append({"type": "heading", "text": line.strip()})
            elif line.strip():
                structured_text.append({"type": "paragraph", "text": line.strip()})

    return structured_text

# Function to extract tables in smaller batches
def extract_tables(pdf_path, start_page=0, end_page=None):
    """Extract tables in batches to reduce memory usage."""
    tables_data = []

    with pdfplumber.open(pdf_path) as pdf:
        if end_page is None:
            end_page = len(pdf.pages)

        for i in range(start_page, min(end_page, len(pdf.pages))):
            tables = pdf.pages[i].extract_tables()
            for table in tables:
                tables_data.append({"page": i + 1, "table": table})

    return tables_data

# Function to extract and save images instead of storing them in memory
def extract_figures(pdf_path, start_page=0, end_page=None, output_folder="figures3"):
    """Extract images and save to disk to reduce memory usage."""
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    figures = []

    for i in range(start_page, min(end_page, len(doc))):
        for img_index, img in enumerate(doc[i].get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            img_filename = os.path.join(output_folder, f"page_{i+1}_img_{img_index}.png")

            with open(img_filename, "wb") as img_file:
                img_file.write(base_image["image"])

            figures.append({"page": i + 1, "image_index": img_index, "file_path": img_filename})

    return figures

# Function to process a specific range of pages
def process_page_range(pdf_path, start, end):
    return {
        "text": extract_text_with_structure(pdf_path, start, end),
        "tables": extract_tables(pdf_path, start, end),
        "figures": extract_figures(pdf_path, start, end)
    }

# Function to extract and save all data efficiently
def save_extracted_data(pdf_path, output_json="extracted_medical_text3.json", batch_size=500):
    """Parallelize extraction to improve speed and efficiency."""
    num_pages = fitz.open(pdf_path).page_count
    ranges = [(pdf_path, i, min(i + batch_size, num_pages)) for i in range(0, num_pages, batch_size)]

    with Pool(processes=4) as pool:  # Adjust the number of processes based on CPU
        results = pool.starmap(process_page_range, ranges)

    # Merge results
    extracted_data = {
        "text": sum([res["text"] for res in results], []),
        "tables": sum([res["tables"] for res in results], []),
        "figures": sum([res["figures"] for res in results], [])
    }

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, ensure_ascii=False, indent=4)

    print(f"Extracted data saved to {output_json}")

# Run the extraction on your PDF
pdf_file = "/content/Book3.pdf"  # Replace with your actual PDF path
save_extracted_data(pdf_file)


Extracted data saved to extracted_medical_text3.json


In [1]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import json
import numpy as np
import re
import torch
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from pymongo import MongoClient
from collections import defaultdict

# Load extracted medical text
def load_corpus(filename="extracted_medical_text1.json"):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [item["text"] for item in data["text"]]

# Preprocess text for BM25 indexing
def preprocess_text(texts):
    lemmatizer = WordNetLemmatizer()
    def clean_text(text):
        words = re.findall(r"\b\w+\b", text.lower())
        return [lemmatizer.lemmatize(word) for word in words]
    return [clean_text(text) for text in texts]

# Query Expansion using WordNet synonyms
def expand_query(query):
    synonyms = set()
    for word in query.split():
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().replace('_', ' '))
    return query + " " + " ".join(synonyms)

# Initialize BM25
def init_bm25(corpus):
    tokenized_corpus = preprocess_text(corpus)
    return BM25Okapi(tokenized_corpus), tokenized_corpus

# Initialize Sentence-BERT for dense retrieval
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def encode_corpus_sbert(corpus):
    return sbert_model.encode(corpus, convert_to_tensor=True)

# Retrieve documents using BM25 and Sentence-BERT
def hybrid_retrieval(query, bm25, tokenized_corpus, corpus, sbert_embeddings, top_k=10):
    query_expanded = expand_query(query)
    bm25_scores = bm25.get_scores(query_expanded.split())
    sbert_query_embedding = sbert_model.encode(query, convert_to_tensor=True)
    sbert_scores = util.pytorch_cos_sim(sbert_query_embedding, sbert_embeddings)[0].numpy()
    
    combined_scores = bm25_scores + sbert_scores  # Hybrid approach
    top_indices = np.argsort(combined_scores)[-top_k:][::-1]
    
    return [(corpus[i], combined_scores[i]) for i in top_indices]

# Save results to MongoDB
def save_results_to_mongo(results, query, mongo_db="retrieval_db", collection_name="results"):
    client = MongoClient("mongodb://localhost:27017/")
    db = client[mongo_db]
    collection = db[collection_name]
    collection.insert_one({"query": query, "results": results})
    print("Results saved to MongoDB.")

# Main pipeline
def main():
    corpus = load_corpus()
    bm25, tokenized_corpus = init_bm25(corpus)
    sbert_embeddings = encode_corpus_sbert(corpus)
    
    query = "heart attack symptoms"
    results = hybrid_retrieval(query, bm25, tokenized_corpus, corpus, sbert_embeddings)
    
    for doc, score in results:
        print(f"Score: {score:.4f} - {doc[:200]}...")  # Show preview
    
    save_results_to_mongo(results, query)

if __name__ == "__main__":
    main()


In [3]:
import json
import re
import sqlite3
from pymongo import MongoClient

class Node:
    """Represents a node in the hierarchical tree."""
    def __init__(self, node_id, text, level, parent=None):
        self.node_id = node_id
        self.text = text
        self.level = level  # 0: root, 1: chapter, 2: section, 3: subsection, 4: paragraph
        self.children = []
        self.parent = parent

    def add_child(self, child):
        """Adds a child node."""
        self.children.append(child)

    def to_dict(self):
        """Converts node to dictionary format for JSON storage."""
        return {
            "node_id": self.node_id,
            "text": self.text,
            "level": self.level,
            "children": [child.to_dict() for child in self.children]
        }

class HierarchicalTree:
    """Creates and manages the hierarchical structure of the textbook."""
    def __init__(self):
        self.root = Node("root", "Textbook", 0)

    def parse_structure(self, extracted_data):
        """Parses structured text and organizes it into a hierarchical tree."""
        current_chapter = None
        current_section = None
        current_subsection = None
        node_counter = 1  # Unique identifier for nodes

        for item in extracted_data["text"]:
            text = item["text"]

            # Identify structure based on patterns
            if re.match(r"^\d+\.\s+[A-Z]", text):  # Chapter (e.g., "1. Introduction")
                current_chapter = Node(f"ch_{node_counter}", text, 1, self.root)
                self.root.add_child(current_chapter)
                current_section = None
                current_subsection = None
                node_counter += 1

            elif re.match(r"^\d+\.\d+\s+[A-Z]", text):  # Section (e.g., "1.1 Background")
                if current_chapter:
                    current_section = Node(f"sec_{node_counter}", text, 2, current_chapter)
                    current_chapter.add_child(current_section)
                    current_subsection = None
                    node_counter += 1

            elif re.match(r"^\d+\.\d+\.\d+\s+[A-Z]", text):  # Subsection (e.g., "1.1.1 Definition")
                if current_section:
                    current_subsection = Node(f"subsec_{node_counter}", text, 3, current_section)
                    current_section.add_child(current_subsection)
                    node_counter += 1

            else:  # Paragraphs
                if current_subsection:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_subsection)
                    current_subsection.add_child(paragraph_node)
                elif current_section:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_section)
                    current_section.add_child(paragraph_node)
                elif current_chapter:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_chapter)
                    current_chapter.add_child(paragraph_node)
                else:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, self.root)
                    self.root.add_child(paragraph_node)

                node_counter += 1

    def to_json(self, filename="hierarchical_tree.json"):
        """Stores hierarchical structure as JSON."""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.root.to_dict(), f, ensure_ascii=False, indent=4)
        print(f"Hierarchical tree saved to {filename}")

    def save_to_sqlite(self, db_name="hierarchical_tree.db"):
        """Stores hierarchical structure in SQLite."""
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        cursor.execute("""
        CREATE TABLE IF NOT EXISTS hierarchy (
            node_id TEXT PRIMARY KEY,
            text TEXT,
            level INTEGER,
            parent_id TEXT
        )
        """)

        def insert_node(node, parent_id=None):
            cursor.execute("INSERT INTO hierarchy VALUES (?, ?, ?, ?)", (node.node_id, node.text, node.level, parent_id))
            for child in node.children:
                insert_node(child, node.node_id)

        insert_node(self.root)
        conn.commit()
        conn.close()
        print(f"Hierarchical tree stored in {db_name}")

    def save_to_mongodb(self, mongo_db="textbook_db", collection_name="hierarchical_tree"):
        """Stores hierarchical structure in MongoDB."""
        client = MongoClient("mongodb+srv://anant22067:db6aM9UfKxitrBw0@cluster0.oyo8k.mongodb.net/")
        db = client[mongo_db]
        collection = db[collection_name]

        collection.delete_many({})  # Clear previous data
        def insert_node(node, parent_id=None):
                """Recursively insert nodes into MongoDB as separate documents."""
                document = {
                    "node_id": node.node_id,
                    "text": node.text,
                    "level": node.level,
                    "parent_id": parent_id  # Store parent reference
                }
                collection.insert_one(document)

                for child in node.children:
                    insert_node(child, node.node_id)

        insert_node(self.root)  # Start inserting from the root node

        print(f"Hierarchical tree saved to MongoDB (DB: {mongo_db}, Collection: {collection_name})")

# Usage Example
if __name__ == "__main__":
    # Load extracted data from JSON
    with open("extracted_medical_text1.json", "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Build hierarchical tree
    tree = HierarchicalTree()
    tree.parse_structure(extracted_data)

    # Save to different storage formats
    tree.to_json()  # Save as JSON
    tree.save_to_sqlite()  # Save to SQLite
    tree.save_to_mongodb()  # Save to MongoDB


Hierarchical tree saved to hierarchical_tree.json
Hierarchical tree stored in hierarchical_tree.db
Hierarchical tree saved to MongoDB (DB: textbook_db, Collection: hierarchical_tree)


In [4]:
import json
import re
import sqlite3
from pymongo import MongoClient

class Node:
    """Represents a node in the hierarchical tree."""
    def __init__(self, node_id, text, level, parent=None):
        self.node_id = node_id
        self.text = text
        self.level = level  # 0: root, 1: chapter, 2: section, 3: subsection, 4: paragraph
        self.children = []
        self.parent = parent

    def add_child(self, child):
        """Adds a child node."""
        self.children.append(child)

    def to_dict(self):
        """Converts node to dictionary format for JSON storage."""
        return {
            "node_id": self.node_id,
            "text": self.text,
            "level": self.level,
            "children": [child.to_dict() for child in self.children]
        }

class HierarchicalTree:
    """Creates and manages the hierarchical structure of the textbook."""
    def __init__(self):
        self.root = Node("root", "Textbook", 0)

    def parse_structure(self, extracted_data):
        """Parses structured text and organizes it into a hierarchical tree."""
        current_chapter = None
        current_section = None
        current_subsection = None
        node_counter = 1  # Unique identifier for nodes

        for item in extracted_data["text"]:
            text = item["text"]

            # Identify structure based on patterns
            if re.match(r"^\d+\.\s+[A-Z]", text):  # Chapter (e.g., "1. Introduction")
                current_chapter = Node(f"ch_{node_counter}", text, 1, self.root)
                self.root.add_child(current_chapter)
                current_section = None
                current_subsection = None
                node_counter += 1

            elif re.match(r"^\d+\.\d+\s+[A-Z]", text):  # Section (e.g., "1.1 Background")
                if current_chapter:
                    current_section = Node(f"sec_{node_counter}", text, 2, current_chapter)
                    current_chapter.add_child(current_section)
                    current_subsection = None
                    node_counter += 1

            elif re.match(r"^\d+\.\d+\.\d+\s+[A-Z]", text):  # Subsection (e.g., "1.1.1 Definition")
                if current_section:
                    current_subsection = Node(f"subsec_{node_counter}", text, 3, current_section)
                    current_section.add_child(current_subsection)
                    node_counter += 1

            else:  # Paragraphs
                if current_subsection:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_subsection)
                    current_subsection.add_child(paragraph_node)
                elif current_section:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_section)
                    current_section.add_child(paragraph_node)
                elif current_chapter:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_chapter)
                    current_chapter.add_child(paragraph_node)
                else:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, self.root)
                    self.root.add_child(paragraph_node)

                node_counter += 1

    def to_json(self, filename="hierarchical_tree2.json"):
        """Stores hierarchical structure as JSON."""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.root.to_dict(), f, ensure_ascii=False, indent=4)
        print(f"Hierarchical tree saved to {filename}")

    def save_to_sqlite(self, db_name="hierarchical_tree2.db"):
        """Stores hierarchical structure in SQLite."""
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        cursor.execute("""
        CREATE TABLE IF NOT EXISTS hierarchy (
            node_id TEXT PRIMARY KEY,
            text TEXT,
            level INTEGER,
            parent_id TEXT
        )
        """)

        def insert_node(node, parent_id=None):
            cursor.execute("INSERT INTO hierarchy VALUES (?, ?, ?, ?)", (node.node_id, node.text, node.level, parent_id))
            for child in node.children:
                insert_node(child, node.node_id)

        insert_node(self.root)
        conn.commit()
        conn.close()
        print(f"Hierarchical tree stored in {db_name}")

    def save_to_mongodb(self, mongo_db="textbook2_db", collection_name="hierarchical_tree2"):
        """Stores hierarchical structure in MongoDB."""
        client = MongoClient("mongodb+srv://anant22067:db6aM9UfKxitrBw0@cluster0.oyo8k.mongodb.net/")
        db = client[mongo_db]
        collection = db[collection_name]

        collection.delete_many({})  # Clear previous data
        def insert_node(node, parent_id=None):
                """Recursively insert nodes into MongoDB as separate documents."""
                document = {
                    "node_id": node.node_id,
                    "text": node.text,
                    "level": node.level,
                    "parent_id": parent_id  # Store parent reference
                }
                collection.insert_one(document)

                for child in node.children:
                    insert_node(child, node.node_id)

        insert_node(self.root)  # Start inserting from the root node

        print(f"Hierarchical tree saved to MongoDB (DB: {mongo_db}, Collection: {collection_name})")

# Usage Example
if __name__ == "__main__":
    # Load extracted data from JSON
    with open("extracted_medical_text2.json", "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Build hierarchical tree
    tree = HierarchicalTree()
    tree.parse_structure(extracted_data)

    # Save to different storage formats
    tree.to_json()  # Save as JSON
    tree.save_to_sqlite()  # Save to SQLite
    tree.save_to_mongodb()  # Save to MongoDB


Hierarchical tree saved to hierarchical_tree2.json
Hierarchical tree stored in hierarchical_tree2.db
Hierarchical tree saved to MongoDB (DB: textbook2_db, Collection: hierarchical_tree2)


In [1]:
import json
import re
import sqlite3
from pymongo import MongoClient

class Node:
    """Represents a node in the hierarchical tree."""
    def __init__(self, node_id, text, level, parent=None):
        self.node_id = node_id
        self.text = text
        self.level = level  # 0: root, 1: chapter, 2: section, 3: subsection, 4: paragraph
        self.children = []
        self.parent = parent

    def add_child(self, child):
        """Adds a child node."""
        self.children.append(child)

    def to_dict(self):
        """Converts node to dictionary format for JSON storage."""
        return {
            "node_id": self.node_id,
            "text": self.text,
            "level": self.level,
            "children": [child.to_dict() for child in self.children]
        }

class HierarchicalTree:
    """Creates and manages the hierarchical structure of the textbook."""
    def __init__(self):
        self.root = Node("root", "Textbook", 0)

    def parse_structure(self, extracted_data):
        """Parses structured text and organizes it into a hierarchical tree."""
        current_chapter = None
        current_section = None
        current_subsection = None
        node_counter = 1  # Unique identifier for nodes

        for item in extracted_data["text"]:
            text = item["text"]

            # Identify structure based on patterns
            if re.match(r"^\d+\.\s+[A-Z]", text):  # Chapter (e.g., "1. Introduction")
                current_chapter = Node(f"ch_{node_counter}", text, 1, self.root)
                self.root.add_child(current_chapter)
                current_section = None
                current_subsection = None
                node_counter += 1

            elif re.match(r"^\d+\.\d+\s+[A-Z]", text):  # Section (e.g., "1.1 Background")
                if current_chapter:
                    current_section = Node(f"sec_{node_counter}", text, 2, current_chapter)
                    current_chapter.add_child(current_section)
                    current_subsection = None
                    node_counter += 1

            elif re.match(r"^\d+\.\d+\.\d+\s+[A-Z]", text):  # Subsection (e.g., "1.1.1 Definition")
                if current_section:
                    current_subsection = Node(f"subsec_{node_counter}", text, 3, current_section)
                    current_section.add_child(current_subsection)
                    node_counter += 1

            else:  # Paragraphs
                if current_subsection:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_subsection)
                    current_subsection.add_child(paragraph_node)
                elif current_section:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_section)
                    current_section.add_child(paragraph_node)
                elif current_chapter:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, current_chapter)
                    current_chapter.add_child(paragraph_node)
                else:
                    paragraph_node = Node(f"para_{node_counter}", text, 4, self.root)
                    self.root.add_child(paragraph_node)

                node_counter += 1

    def to_json(self, filename="hierarchical_tree3.json"):
        """Stores hierarchical structure as JSON."""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.root.to_dict(), f, ensure_ascii=False, indent=4)
        print(f"Hierarchical tree saved to {filename}")

    def save_to_sqlite(self, db_name="hierarchical_tree3.db"):
        """Stores hierarchical structure in SQLite."""
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        cursor.execute("""
        CREATE TABLE IF NOT EXISTS hierarchy (
            node_id TEXT PRIMARY KEY,
            text TEXT,
            level INTEGER,
            parent_id TEXT
        )
        """)

        def insert_node(node, parent_id=None):
            cursor.execute("INSERT INTO hierarchy VALUES (?, ?, ?, ?)", (node.node_id, node.text, node.level, parent_id))
            for child in node.children:
                insert_node(child, node.node_id)

        insert_node(self.root)
        conn.commit()
        conn.close()
        print(f"Hierarchical tree stored in {db_name}")

    def save_to_mongodb(self, mongo_db="textbook3_db", collection_name="hierarchical_tree3"):
        """Stores hierarchical structure in MongoDB."""
        client = MongoClient("mongodb+srv://anant22067:db6aM9UfKxitrBw0@cluster0.oyo8k.mongodb.net/")
        db = client[mongo_db]
        collection = db[collection_name]

        collection.delete_many({})  # Clear previous data
        def insert_node(node, parent_id=None):
                """Recursively insert nodes into MongoDB as separate documents."""
                document = {
                    "node_id": node.node_id,
                    "text": node.text,
                    "level": node.level,
                    "parent_id": parent_id  # Store parent reference
                }
                collection.update_one({"node_id": node.node_id}, {"$set": document}, upsert=True)

                for child in node.children:
                    insert_node(child, node.node_id)

        insert_node(self.root)  # Start inserting from the root node

        print(f"Hierarchical tree saved to MongoDB (DB: {mongo_db}, Collection: {collection_name})")

# Usage Example
if __name__ == "__main__":
    # Load extracted data from JSON
    with open("extracted_medical_text3.json", "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Build hierarchical tree
    tree = HierarchicalTree()
    tree.parse_structure(extracted_data)

    # Save to different storage formats
    tree.to_json()  # Save as JSON
    tree.save_to_sqlite()  # Save to SQLite
    tree.save_to_mongodb()  # Save to MongoDB


Hierarchical tree saved to hierarchical_tree3.json
Hierarchical tree stored in hierarchical_tree3.db


AutoReconnect: SSL handshake failed: cluster0-shard-00-02.oyo8k.mongodb.net:27017: [WinError 10054] An existing connection was forcibly closed by the remote host (configured timeouts: connectTimeoutMS: 20000.0ms)

In [3]:
pip install pdfminer.six pymupdf pdfplumber


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
pip install rank_bm25

Note: you may need to restart the kernel to use updated packages.Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2




[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 660.6 kB/s eta 0:00:03
   ------ --------------------------------- 0.2/1.5 MB 2.5 MB/s eta 0:00:01
   ---------------------------------------  1.5/1.5 MB 10.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 9.6 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [10]:
pip install gensim

Collecting gensimNote: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip



  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.6 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 60.6/60.6 kB 645.4 kB/s eta 0:00:00
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- --------------------------

In [12]:
import json
import nltk
import torch
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from collections import defaultdict


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
nltk.download("wordnet")
nltk.download("omw-1.4")
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A13na\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\A13na\AppData\Roaming\nltk_data...


In [14]:
pip install faiss-cpu whoosh pymedtermino 

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Collecting pymedtermino
  Downloading PyMedTermino-0.3.3.tar.gz (34.0 MB)
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/34.0 MB ? eta -:--:--
     --------------------------------------- 0.1/34.0 MB 423.5 kB/s eta 0:01:20
     - -------------------------------------- 0.8/34.0 MB 2.7 MB/s eta 0:00:13
     - -------------------------------------- 1.0/34.0 MB 3.2 MB/s eta 0:00:11
    


[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import json
import os
import re
import numpy as np
import faiss
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from sentence_transformers import SentenceTransformer, CrossEncoder
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
import pandas as pd
from transformers import DPRQuestionEncoderTokenizer, DPRQuestionEncoder
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load Sentence-BERT & Cross-Encoder
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Load SNOMED-CT and MeSH datasets
snomed_df = pd.read_csv("SNOMED-CT_cleaned.csv")  # Ensure this file is available
mesh_df = pd.read_csv("MeSH_terms.csv")  # Ensure this file is available

def load_text_data(filename):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    def extract_text(node):
        """ Recursively extract text from hierarchical tree """
        texts = [{"id": node["node_id"], "text": node["text"]}]
        for child in node.get("children", []):
            texts.extend(extract_text(child))  # Recursively process children
        return texts

    return extract_text(data)  # Start extraction from the root node

# Query Expansion using SNOMED-CT, MeSH, and WordNet
def expand_query(query):
    expanded_terms = set()
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    for word in query.split():
        lemma = lemmatizer.lemmatize(word)
        stem = stemmer.stem(word)
        expanded_terms.update([word, lemma, stem])

        # Add WordNet synonyms
        syns = wordnet.synsets(word)
        for syn in syns:
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name())

        # Add SNOMED-CT synonyms
        snomed_synonyms = snomed_df[snomed_df['Concept'] == word]['Synonyms'].values
        if len(snomed_synonyms) > 0:
            expanded_terms.update(snomed_synonyms[0].split(','))

        # Add MeSH synonyms
        mesh_synonyms = mesh_df[mesh_df['Term'] == word]['Synonyms'].values
        if len(mesh_synonyms) > 0:
            expanded_terms.update(mesh_synonyms[0].split(','))

    return " ".join(expanded_terms)

# BM25 using Whoosh
def build_bm25_index(text_data, index_dir="bm25_index"):
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
        schema = Schema(id=ID(stored=True), content=TEXT)
        ix = create_in(index_dir, schema)
        writer = ix.writer()
        for  item in text_data:
            writer.add_document(id=item["id"], content=item["text"])
        writer.commit()

def bm25_search(query, top_k=10, index_dir="bm25_index"):
    ix = open_dir(index_dir)
    qp = QueryParser("content", ix.schema)
    with ix.searcher() as searcher:
        results = searcher.search(qp.parse(query), limit=top_k)
        return [(hit["id"], hit.score) for hit in results]

# Dense Passage Retrieval (DPR) Indexing with FAISS
def build_dpr_index(text_data, index_path="dpr_index.faiss"):
    encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

    embeddings = []
    for item in text_data:
        input_ids = tokenizer(item["text"], return_tensors="pt", truncation=True, padding=True)
        embedding = encoder(**input_ids).pooler_output.detach().numpy()
        embeddings.append(embedding)

    embeddings = np.vstack(embeddings)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)

def dpr_search(query, top_k=10, index_path="dpr_index.faiss"):
    index = faiss.read_index(index_path)
    encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    input_ids = tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    query_embedding = encoder(**input_ids).pooler_output.detach().numpy()
    _, idxs = index.search(query_embedding, top_k)
    return list(idxs[0])

# Cross-Encoder Re-Ranking
def rerank_results(query, results, text_data):
    pairs = [(query, text_data[int(doc_id)]["text"]) for doc_id, _ in results]
    scores = cross_encoder.predict(pairs)
    reranked_results = sorted(zip(results, scores), key=lambda x: x[1], reverse=True)
    return reranked_results

# Main Retrieval Pipeline
def retrieve(query, text_data, top_k=10):
    expanded_query = expand_query(query)

    bm25_results = bm25_search(expanded_query, top_k)
    dpr_results = [(doc_id, 1.0) for doc_id in dpr_search(query, top_k)]

    combined_results = list(set(bm25_results + dpr_results))
    reranked_results = rerank_results(query, combined_results, text_data)
    return reranked_results

if __name__ == "__main__":
    text_data = load_text_data("hierarchical_tree2.json")
    build_bm25_index(text_data)
    build_dpr_index(text_data)

    query = "heart attack treatment"
    results = retrieve(query, text_data, top_k=10)
    for rank, ((doc_id, score), re_score) in enumerate(results):
        print(f"{rank+1}. Doc {doc_id} - Score: {re_score:.4f}")


PermissionError: [WinError 5] Access is denied: 'bm25_index\\MAIN.tmp'

: 

In [16]:
import json

def flatten_json(node, results=None):
    """
    Recursively flatten a hierarchical JSON structure to extract `text`.
    :param node: Current node in the hierarchy.
    :param results: List to store the flattened results.
    :return: Flattened list of dictionaries with `id` and `content`.
    """
    if results is None:
        results = []

    # Add the current node's text to the results
    if "text" in node and "node_id" in node:
        results.append({"id": node["node_id"], "content": node["text"]})

    # Recursively process children
    for child in node.get("children", []):
        flatten_json(child, results)

    return results

# Load the JSON file
with open("hierarchical_tree.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the hierarchy
text_data = flatten_json(data)
print(f"Flattened text data: {text_data[:5]}")  # Print first 5 entries for verification


Flattened text data: [{'id': 'root', 'content': 'Textbook'}, {'id': 'para_1', 'content': 'Goodman & Gilman’s'}, {'id': 'para_2', 'content': 'The'}, {'id': 'para_3', 'content': 'Pharmacological'}, {'id': 'para_4', 'content': 'Basis of'}]


In [17]:
import faiss
import numpy as np
import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from tqdm import tqdm

def build_dpr_index(text_data, index_path="dpr_index.faiss", batch_size=128):
    """Build the FAISS index for DPR."""
    encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

    index = faiss.IndexFlatL2(768)  # 768-dimensional embeddings
    for i in tqdm(range(0, len(text_data), batch_size), desc="Building DPR Index"):
        batch = text_data[i:i + batch_size]
        batch_texts = [item["content"] for item in batch]  # Extract 'content' field from JSON
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            embeddings = encoder(**inputs).pooler_output.numpy()
        index.add(embeddings)

    # Save the FAISS index
    faiss.write_index(index, index_path)
    print(f"FAISS index saved to {index_path}")

build_dpr_index(text_data[:50], index_path="dpr_index_test.faiss")  # Use only the first 50 entries


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

FAISS index saved to dpr_index_test.faiss





In [18]:
def multi_document_retrieval(query, bm25_top_k=5, dpr_top_k=5):
    # BM25 Retrieval
    bm25_results = bm25_search(query, top_k=bm25_top_k)

    # DPR Retrieval
    dpr_results = dpr_search(query, top_k=dpr_top_k, index_path="dpr_index.faiss")

    # Combine results (e.g., union of BM25 and DPR with scores)
    combined_results = {result["id"]: result for result in bm25_results + dpr_results}
    return list(combined_results.values())


In [19]:
def hierarchical_traversal(node, query, results=None):
    """
    Traverse the hierarchy and extract relevant nodes based on the query.
    """
    if results is None:
        results = []

    # Check if the current node is relevant (basic matching or scoring)
    if query.lower() in node.get("text", "").lower():
        results.append({"id": node["node_id"], "content": node["text"]})

    # Traverse children recursively
    for child in node.get("children", []):
        hierarchical_traversal(child, query, results)

    return results


In [20]:
# Traverse the JSON hierarchy for the query
query_results = hierarchical_traversal(data, query="pharmacological")


In [21]:
from whoosh.index import open_dir


In [22]:
from whoosh.qparser import QueryParser
from whoosh.index import open_dir

def bm25_search(query, top_k=10, index_dir="bm25_index2"):
    """
    Search the BM25 index for the top_k relevant documents.
    """
    ix = open_dir(index_dir)  # Open the BM25 index directory
    qp = QueryParser("content", ix.schema)  # Query parser for the "content" field
    with ix.searcher() as searcher:
        results = searcher.search(qp.parse(query), limit=top_k)
        return [{"id": hit["id"], "content": hit["content"], "score": hit.score} for hit in results]
    



In [24]:
def multi_document_retrieval(query, bm25_top_k=5, dpr_top_k=5):
    bm25_results = bm25_search(query, top_k=bm25_top_k)
    dpr_results = dpr_search(query, top_k=dpr_top_k, index_path="dpr_index_test.faiss")

    combined_results = {result["id"]: result for result in (bm25_results + dpr_results)}

    # Debug: Print results to ensure no nested structures
    print("Combined Results:", combined_results)
    return list(combined_results.values())


In [25]:
def extract_context(retrieved_content):
    """
    Extract and combine the content from the retrieved documents into a single context string.
    Ensures the content field is properly handled as a string.
    """
    if not retrieved_content:
        print("No retrieved content found. Returning an empty context.")
        return ""

    # Extract content from retrieved results and join into a single string
    try:
        context = " ".join([
            item["content"] if isinstance(item["content"], str) else str(item["content"])
            for item in retrieved_content
        ])
    except Exception as e:
        print(f"Error during context extraction: {e}")
        return ""

    return context


In [26]:
response = full_rag_pipeline(query="pharmacological treatments")
print("Answer:", response["answer"])
print("Evidence:", response["evidence"])


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Combined Results: {5: {'id': 5, 'content': {'id': 'para_5', 'content': 'THERAPEUTICS'}}, 27: {'id': 27, 'content': {'id': 'para_27', 'content': 'THERAPEUTICS'}}, 3: {'id': 3, 'content': {'id': 'para_3', 'content': 'Pharmacological'}}, 25: {'id': 25, 'content': {'id': 'para_25', 'content': 'Pharmacological'}}, 20: {'id': 20, 'content': {'id': 'para_20', 'content': 'Chief, Division of Endocrinology and Metabolism'}}}
Context extracted successfully: {'id': 'para_5', 'content': 'THERAPEUTICS'} {'id': 'para_27', 'content': 'THERAPEUTICS'} {'id': 'para_3', 'content': 'Pharmacological'} {'id': 'para_25', 'content': 'Pharmacological'} {'id': 'para_20', 'content': 'Chief, Division of Endocrinology and Metabolism'}
Answer: THERAPEUTICS
Evidence: [{'id': 5, 'content': {'id': 'para_5', 'content': 'THERAPEUTICS'}}, {'id': 27, 'content': {'id': 'para_27', 'content': 'THERAPEUTICS'}}, {'id': 3, 'content': {'id': 'para_3', 'content': 'Pharmacological'}}, {'id': 25, 'content': {'id': 'para_25', 'conten

In [1]:
from transformers import pipeline

# Load the Flan-T5 model
llm = pipeline("text2text-generation", model="google/flan-t5-small")

def generate_answer_with_flan_t5(query, context=""):
    """
    Generate an answer using Flan-T5.
    If context is provided, it will use it to generate a more accurate response.
    """
    if context:
        prompt = f"Answer the following question based on the provided context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    else:
        prompt = f"Answer the following question:\n\nQuestion: {query}\nAnswer:"
    
    try:
        # Generate the answer
        response = llm(prompt, max_length=200, truncation=True)
        return response[0]["generated_text"]
    except Exception as e:
        print(f"Flan-T5 Error: {e}")
        return "Sorry, I couldn't generate an answer at this time."


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [9]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import faiss

def dpr_search(query, top_k=10, index_path="dpr_index_test.faiss"):
    # Load the FAISS index
    index = faiss.read_index(index_path)

    # Load the DPR Question Encoder and Tokenizer
    question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    # Encode the user query
    inputs = question_tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        query_embedding = question_encoder(**inputs).pooler_output.numpy()

    # Search the FAISS index
    if index.ntotal == 0:
        print("FAISS index is empty.")
        return []

    distances, idxs = index.search(query_embedding, top_k)
    if len(idxs[0]) == 0:
        print("No results found in FAISS index.")
        return []

    # Return the document IDs from the FAISS index
    return [{"id": idx, "content": text_data[idx]} for idx in idxs[0] if idx < len(text_data)]


In [10]:
def extract_context(retrieved_content):
    """
    Extract and combine the content from the retrieved documents into a single context string.
    Ensures the content field is properly handled as a string.
    """
    if not retrieved_content:
        print("No retrieved content found. Returning an empty context.")
        return ""

    # Safely extract content and handle potential nested dictionaries
    context = " ".join([
        item["content"] if isinstance(item["content"], str) else str(item["content"])
        for item in retrieved_content
    ])

    return context


In [1]:
from transformers import pipeline

# Load the Flan-T5 model
llm = pipeline("text2text-generation", model="google/flan-t5-small")

def generate_answer_with_flan_t5(query, context=""):
    """
    Generate an answer using Flan-T5.
    If context is provided, it will use it to generate a more accurate response.
    """
    if context:
        prompt = f"Answer the following question based on the provided context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    else:
        prompt = f"Answer the following question:\n\nQuestion: {query}\nAnswer:"
    
    try:
        # Generate the answer
        response = llm(prompt, max_length=200, truncation=True)
        return response[0]["generated_text"]
    except Exception as e:
        print(f"Flan-T5 Error: {e}")
        return "Sorry, I couldn't generate an answer at this time."


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [11]:
def full_rag_pipeline(query):
    # Step 1: Retrieve relevant content
    retrieved_content = multi_document_retrieval(query)

    if not retrieved_content:  # No relevant results found
        print("No results found in BM25 or DPR. Falling back to Flan-T5.")
        return {
            "answer": generate_answer_with_flan_t5(query=query, context=""),
            "evidence": []
        }

    # Step 2: Extract context
    try:
        context = extract_context(retrieved_content)
        print("Context extracted successfully:", context)
    except Exception as e:
        print(f"Error during context extraction: {e}")
        return {
            "answer": "Error: Could not extract context.",
            "evidence": retrieved_content
        }

    # Step 3: Generate answer using the extracted context
    answer = generate_answer_with_flan_t5(query=query, context=context)

    return {"answer": answer, "evidence": retrieved_content}




In [53]:
response = full_rag_pipeline(query="country of New Delhi?")
print("Answer:", response["answer"])
print("Evidence:", response["evidence"])



Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Combined Results: {32: {'id': 32, 'content': {'id': 'para_32', 'content': 'San Juan   Seoul   Singapore   Sydney   Toronto'}}, 31: {'id': 31, 'content': {'id': 'para_31', 'content': 'New York   Chicago   San Francisco   Lisbon  London   Madrid   Mexico City   Milan   New Delhi'}}, 39: {'id': 39, 'content': {'id': 'para_39', 'content': 'names  in  an  editorial  fashion  only,  and  to  the  benefit  of  the  trademark  owner,  with  no  intention  of  infringement  of  the  trademark.  Where  such'}}, 40: {'id': 40, 'content': {'id': 'para_40', 'content': 'designations appear in this book, they have been printed with initial caps.'}}, 49: {'id': 49, 'content': {'id': 'para_49', 'content': 'THE WORK IS PROVIDED “AS IS.” McGRAW-HILL AND ITS LICENSORS MAKE NO GUARANTEES OR WARRANTIES AS TO THE ACCU-'}}}
Context extracted successfully: {'id': 'para_32', 'content': 'San Juan   Seoul   Singapore   Sydney   Toronto'} {'id': 'para_31', 'content': 'New York   Chicago   San Francisco   Lisbon  L

In [55]:
response = full_rag_pipeline(query="treatments for heart attack")
print("Answer:", response["answer"])
print("Evidence:", response["evidence"])

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Combined Results: {5: {'id': 5, 'content': {'id': 'para_5', 'content': 'THERAPEUTICS'}}, 27: {'id': 27, 'content': {'id': 'para_27', 'content': 'THERAPEUTICS'}}, 20: {'id': 20, 'content': {'id': 'para_20', 'content': 'Chief, Division of Endocrinology and Metabolism'}}, 18: {'id': 18, 'content': {'id': 'para_18', 'content': 'Professor of Internal Medicine and Pharmacology'}}, 42: {'id': 42, 'content': {'id': 'para_42', 'content': 'training programs. For more information, please contact George Hoare, Special Sales, at george_hoare@mcgraw-hill.com or (212) 904-4069.'}}}
Context extracted successfully: {'id': 'para_5', 'content': 'THERAPEUTICS'} {'id': 'para_27', 'content': 'THERAPEUTICS'} {'id': 'para_20', 'content': 'Chief, Division of Endocrinology and Metabolism'} {'id': 'para_18', 'content': 'Professor of Internal Medicine and Pharmacology'} {'id': 'para_42', 'content': 'training programs. For more information, please contact George Hoare, Special Sales, at george_hoare@mcgraw-hill.co

In [54]:
pip install rouge-score sacrebleu


Collecting rouge-scoreNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip



  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.8 kB ? eta -:--:--
     ------- -------------------------------- 10.2/51.8 kB ? eta -:--:--
     ------- -------------------------------- 10.2/51.8 kB ? eta -:--:--
     -------------------------------------  51.2/51.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------  51.2/51.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------  51.2/51.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 51.8/51.8 kB 204.9 kB/s eta 0:00:00

In [63]:

# Define the evaluation function
from rouge_score import rouge_scorer
import sacrebleu

def evaluate_generated_answer(reference, generated):
    """
    Evaluate the generated answer using ROUGE and BLEU scores.
    Handles cases where the input is nested (e.g., dictionaries instead of plain strings).
    """
    # Extract the plain text if inputs are nested dictionaries
    if isinstance(reference, dict):
        reference = reference.get("content", reference)
    if isinstance(generated, dict):
        generated = generated.get("content", generated)

    # Calculate ROUGE scores
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = scorer.score(reference, generated)

    # Calculate BLEU score
    bleu_score = sacrebleu.corpus_bleu([generated], [[reference]])

    return {
        "ROUGE-1": rouge_scores["rouge1"].fmeasure,
        "ROUGE-2": rouge_scores["rouge2"].fmeasure,
        "ROUGE-L": rouge_scores["rougeL"].fmeasure,
        "BLEU": bleu_score.score,
    }

# Example usage
reference_answer = {"content": "Therapeutics is the treatment of disease"}
generated_answer = response["answer"]

# Evaluate the answers
evaluation_scores = evaluate_generated_answer(reference_answer, generated_answer)
print("Evaluation Scores:", evaluation_scores)


Evaluation Scores: {'ROUGE-1': 0.2857142857142857, 'ROUGE-2': 0.0, 'ROUGE-L': 0.2857142857142857, 'BLEU': 0.0}


In [64]:
pip install streamlit






[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: C:\Users\A13na\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
