In [None]:
!pip install PyPDF2
!pip install faiss-cpu
!pip install -U langchain-community
!pip install rouge-score
!pip install llama-index-vector-stores-faiss
!pip install llama_index llama-index-llms-openai
!pip install python-dotenv

In [None]:
import os
import torch
import json
import faiss
import numpy as np
import PyPDF2
import json
import torch

from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import TextNode
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModel


In [None]:
import openai
import os

# If needed, install:
# pip install openai

# Resolve the OpenAI API key without hard-coding it in the notebook:
# use OPENAI_API_KEY from the environment when it is already set, otherwise
# prompt for it interactively (the typed value is not echoed).
from getpass import getpass

openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
if not openai_api_key:
    raise ValueError("An OpenAI API key is required (set OPENAI_API_KEY or enter it when prompted).")
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
# prompt: from "/content/drive/MyDrive/sample_pdf_rag" copy the folder and contents to current directory

import shutil
import os

# Copy the sample PDF folder from the mounted Drive into the working directory.
source_path = "/content/drive/MyDrive/sample_pdf_rag"
destination_path = os.getcwd()

if not os.path.exists(source_path):
  # Nothing to copy: report and carry on.
  print(f"Source folder '{source_path}' not found.")
else:
  try:
    shutil.copytree(source_path, os.path.join(destination_path, "sample_pdf_rag"))
  except FileExistsError:
    # A previous run already created the target folder.
    print(f"Folder already exists in the destination directory")
  except OSError as e:
    print(f"Error copying folder: {e}")
  else:
    print(f"Folder '{source_path}' copied to '{destination_path}' successfully.")


In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

###############################################
# Step 1: Configuration
###############################################
# Paths: where the PDFs are read from and where the Faiss index and the
# per-chunk metadata are written.
PDF_FOLDER = "/content/sample_pdf_rag"  # folder containing your PDF files
INDEX_SAVE_PATH = "faiss_index.bin"
DOC_META_PATH = "doc_metadata.json"
# Hugging Face model identifiers for the article encoder, query encoder and
# cross encoder loaded further below.
ARTICLE_ENCODER_MODEL = "ncbi/MedCPT-Article-Encoder"
QUERY_ENCODER_MODEL = "ncbi/MedCPT-Query-Encoder"
CROSS_ENCODER_MODEL = "ncbi/MedCPT-Cross-Encoder"

# Adjust these parameters as needed
# MAX_ARTICLE_LENGTH / MAX_QUERY_LENGTH are passed as `max_length` to the
# tokenizers (with truncation enabled), so they are measured in tokens.
MAX_ARTICLE_LENGTH = 512
MAX_QUERY_LENGTH = 64
# CHUNK_SIZE / OVERLAP are consumed by chunk_text(), which splits on whitespace,
# so they are measured in words, not tokens.
# NOTE(review): a 500-word chunk may tokenize to more than MAX_ARTICLE_LENGTH
# tokens, in which case its tail would be truncated at embedding time - confirm.
CHUNK_SIZE = 500  # number of whitespace-separated words per chunk (adjust as needed)
OVERLAP = 50      # number of words shared between consecutive chunks (to maintain context)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer the GPU when one is available

###############################################
# Step 2: PDF Text Extraction Utility
###############################################
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages whose extraction yields nothing (``None`` or an empty string) are
    skipped; the text of every retained page is followed by a newline.
    """
    with open(pdf_path, 'rb') as pdf_file:
        page_texts = (page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages)
        # The generator is consumed by join() while the file is still open.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

###############################################
# Step 3: Text Chunking
###############################################
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must be
            smaller than ``chunk_size`` so that each step moves forward.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and overlap must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; another step would only
            # emit a tail that is fully contained in the chunk just added.
            break
    return chunks

###############################################
# Step 4: Embedding Using MedCPT Article Encoder
###############################################
# Load Article Encoder
# The tokenizer and model are loaded once at module level and reused by
# embed_documents(); the model is moved to DEVICE.
article_tokenizer = AutoTokenizer.from_pretrained(ARTICLE_ENCODER_MODEL)
article_model = AutoModel.from_pretrained(ARTICLE_ENCODER_MODEL).to(DEVICE)
article_model.eval()  # evaluation (inference) mode; the model is only used for encoding here

def embed_documents(doc_chunks, batch_size=8):
    """Embed text chunks with the article encoder.

    Args:
        doc_chunks: List of strings to embed.
        batch_size: Number of chunks encoded per forward pass (default 8).

    Returns:
        A 2-D numpy array with one row per chunk, in input order. Each row is
        the model's last hidden state at the first token position. For empty
        input the result has shape ``(0, hidden_size)`` so that callers can
        still read ``.shape[1]``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive (got {batch_size})")

    all_embeds = []
    for i in range(0, len(doc_chunks), batch_size):
        batch = doc_chunks[i:i + batch_size]
        with torch.no_grad():
            encoded = article_tokenizer(batch, truncation=True, padding=True, return_tensors='pt', max_length=MAX_ARTICLE_LENGTH)
            # Move every input tensor to the device the model lives on.
            encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
            first_token_states = article_model(**encoded).last_hidden_state[:, 0, :]
        all_embeds.append(first_token_states.cpu().numpy())

    if all_embeds:
        return np.vstack(all_embeds)

    # Keep the result two-dimensional even when there is nothing to embed.
    hidden_size = getattr(article_model.config, "hidden_size", 0)
    return np.empty((0, hidden_size), dtype=np.float32)

###############################################
# Step 5: Building the Vector Store (Faiss)
###############################################
def build_faiss_index(embeddings):
    """Build a flat inner-product Faiss index over ``embeddings``.

    Args:
        embeddings: Array-like of shape ``[N, D]`` with at least one row.

    Returns:
        A ``faiss.IndexFlatIP`` of dimension ``D`` holding all ``N`` vectors in
        input order.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty 2-D array.
    """
    # Faiss expects C-contiguous float32 input; coerce instead of relying on the caller.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array of shape [N, D], got shape {vectors.shape}"
        )
    index = faiss.IndexFlatIP(vectors.shape[1])  # Inner product index
    index.add(vectors)
    return index

###############################################
# Step 6: Process PDFs and Create the Index
###############################################
# We will create a metadata structure mapping each embedding to which doc and which chunk it came from.
# doc_metadata and doc_text_chunks grow in lockstep, so row i of the embedding
# matrix (and of the Faiss index) is described by doc_metadata[i].
doc_metadata = []
doc_text_chunks = []
doc_id = 0

# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make doc_id assignment and index row order vary between runs and machines.
for filename in sorted(os.listdir(PDF_FOLDER)):
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(PDF_FOLDER, filename)
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)
    for chunk_id, chunk in enumerate(chunks):
        doc_metadata.append({
            "doc_id": doc_id,
            "filename": filename,
            "chunk_id": chunk_id,
            "text": chunk
        })
    doc_text_chunks.extend(chunks)
    doc_id += 1

# Fail early with a clear message instead of a cryptic error while indexing.
if not doc_text_chunks:
    raise RuntimeError(f"No extractable text found in any PDF under '{PDF_FOLDER}'.")

# Compute embeddings for all chunks
all_embeddings = embed_documents(doc_text_chunks)
print("Total embeddings shape:", all_embeddings.shape)
assert all_embeddings.shape[0] == len(doc_metadata), "Each embedding row must have a metadata entry."

# Build the Faiss index
index = build_faiss_index(all_embeddings)

# Save index and metadata if needed
faiss.write_index(index, INDEX_SAVE_PATH)
with open(DOC_META_PATH, 'w') as f:
    json.dump(doc_metadata, f)

###############################################
# Step 7: Query the Index Using MedCPT Query Encoder
###############################################
# Load the query encoder once at module level; embed_query() reuses both objects.
query_tokenizer = AutoTokenizer.from_pretrained(QUERY_ENCODER_MODEL)
query_model = AutoModel.from_pretrained(QUERY_ENCODER_MODEL).to(DEVICE)
query_model.eval()  # evaluation (inference) mode; the model is only used for encoding here

def embed_query(queries):
    """Encode a list of query strings with the query encoder.

    Returns a numpy array with one row per query, taken from the model's last
    hidden state at the first token position.
    """
    tokenized = query_tokenizer(queries, truncation=True, padding=True, return_tensors='pt', max_length=MAX_QUERY_LENGTH)
    with torch.no_grad():
        # Move every input tensor to the device the model lives on.
        model_inputs = {name: tensor.to(DEVICE) for name, tensor in tokenized.items()}
        first_token_states = query_model(**model_inputs).last_hidden_state[:, 0, :]
    return first_token_states.cpu().numpy()

###############################################
# Step 8: Re-rank the Results Using Cross Encoder
###############################################
# Load Cross Encoder
# Loaded once at module level as a sequence-classification model; rerank()
# reads its `.logits` as relevance scores for (query, text) pairs.
cross_tokenizer = AutoTokenizer.from_pretrained(CROSS_ENCODER_MODEL)
cross_model = AutoModelForSequenceClassification.from_pretrained(CROSS_ENCODER_MODEL).to(DEVICE)
cross_model.eval()  # evaluation (inference) mode; the model is only used for scoring here

def rerank(query, candidates):
    """Score how relevant each candidate is to ``query`` with the cross encoder.

    Args:
        query: The query string.
        candidates: Sequence of dicts, each with a ``"text"`` entry.

    Returns:
        A 1-D numpy array of relevance logits, one per candidate and in the
        same order. An empty ``candidates`` sequence yields an empty array
        without invoking the tokenizer or the model.
    """
    if not candidates:
        return np.empty((0,), dtype=np.float32)

    # Combine query and candidate text into pairs
    pairs = [[query, candidate["text"]] for candidate in candidates]
    with torch.no_grad():
        encoded = cross_tokenizer(pairs, truncation=True, padding=True, return_tensors="pt", max_length=512)
        # Move every input tensor to the device the model lives on.
        encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
        logits = cross_model(**encoded).logits.squeeze(dim=1)  # Relevance scores
    return logits.cpu().numpy()

###############################################
# Step 9: Combined Search with Re-ranking
###############################################
def search_with_rerank(query, k=5):
    """Retrieve up to ``k`` chunks for ``query`` from the Faiss index and re-rank them.

    Args:
        query: The query string.
        k: Number of nearest neighbours requested from the index.

    Returns:
        A list of candidate dicts sorted by descending ``"rerank_score"``. Each
        dict is a copy of the matching ``doc_metadata`` entry extended with
        ``"retrieval_score"`` and ``"rerank_score"`` (both Python floats). The
        list is shorter than ``k`` when the index holds fewer vectors, and
        empty when nothing was retrieved.
    """
    # Step 1: Dense retrieval using Faiss
    query_embedding = embed_query([query])  # one row for the single query
    scores, inds = index.search(query_embedding, k)

    # Retrieve top-k candidates
    candidates = []
    for score, ind in zip(scores[0], inds[0]):
        if ind < 0:
            # Faiss pads missing neighbours with label -1; indexing doc_metadata
            # with it would silently return the last chunk.
            continue
        # Copy so the per-query scores never leak into the shared metadata
        # (and results of earlier queries are not overwritten by later ones).
        entry = dict(doc_metadata[ind])
        entry["retrieval_score"] = float(score)
        candidates.append(entry)

    if not candidates:
        return candidates

    # Step 2: Re-rank using Cross Encoder
    rerank_scores = rerank(query, candidates)
    for candidate, score in zip(candidates, rerank_scores):
        candidate["rerank_score"] = float(score)

    # Sort candidates by re-rank score
    return sorted(candidates, key=lambda candidate: candidate["rerank_score"], reverse=True)

###############################################
# Step 10A: LLM-based Answer Generation
###############################################
from llama_index.llms.openai import OpenAI

def get_llm_answer(query, retrieved_candidates, model="gpt-4", temperature=0):
    """
    Use an OpenAI chat model to generate a concise answer from the retrieved text chunks.

    Args:
        query: The question to answer.
        retrieved_candidates: Iterable of dicts, each with a "text" entry; the
            texts are joined with single spaces to form the prompt context.
        model: OpenAI model name passed to the LLM wrapper (default "gpt-4").
        temperature: Sampling temperature passed to the LLM wrapper (default 0).

    Returns:
        A tuple ``(response, context_text)``: the object returned by
        ``llm.complete`` (convert with ``str()`` to get the answer text) and the
        joined context string that was placed in the prompt.
    """
    # Combine the top chunks into one context
    context_text = " ".join(cand["text"] for cand in retrieved_candidates)

    # Create a prompt
    prompt = f"""
    You are a knowledgeable assistant. Use the context below to answer the question in one word or few.

    Context:
    {context_text}

    Question: {query}

    Answer (in one word or few):
    """
    # Call OpenAI LLM (via llama_index)
    llm = OpenAI(model=model, temperature=temperature)
    response = llm.complete(prompt)
    return response, context_text



###############################################
# Example Usage
###############################################
# if __name__ == "__main__":
#     sample_query = "What metabolites are associated with diabetes and cancer?"
#     results = search_with_rerank(sample_query, k=5)

#     print("Top results after re-ranking:")
#     for res in results:
#         print(f"Filename: {res['filename']}, Chunk: {res['chunk_id']}, "
#               f"Retrieval Score: {res['retrieval_score']:.4f}, "
#               f"Re-rank Score: {res['rerank_score']:.4f}")
#         print("Text snippet:", res['text'][:200], "...")
#         print("------")

#     # NEW: Get the final LLM-generated answer with minimal words
#     final_response = get_llm_answer(sample_query, results)
#     print("\nFinal LLM Answer (in minimal words):")
#     print(final_response)


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

###############################################
# Step 10: Evaluation with ROUGE and BLEU (Updated)
###############################################
# Sample ground truth data (Replace this with your actual ground truth dataset)
# `queries` and `ground_truths` are parallel lists: ground_truths[i] is the
# reference answer for queries[i]. evaluate_results_with_rerank() asserts that
# both lists have the same length.
queries = [
    "Which biomarker is significantly elevated in the plasma of Gaucher Disease Type 1 patients?",
    "What therapy reduces urinary GlcSph levels in Gaucher Disease patients?",
    "Which biomarker is considered the gold standard for monitoring Gaucher Disease Type 1?",
    "What technique is used to quantify lyso-Gb1 and its analogs in plasma?",
    "Which urinary biomarker is highlighted for monitoring Gaucher Disease progression?",
    "What urinary biomarker class is elevated in untreated Gaucher Disease patients?",
    "What type of molecule is lyso-Gb1, which is associated with Gaucher Disease Type 1?",
    "Which class of biomarkers does lyso-Gb1 belong to in the context of Gaucher Disease Type 1?",
    "Which amino acids are elevated in patients with NASH compared to NAFLD?",
    "What metabolic pathway is closely associated with NASH progression?",
    "What urinary biomarker distinguishes between NAFLD and NASH?",
    "What metabolic pathway is altered in the progression from NAFLD to NASH?",
    "Which sulfated steroid increases with the progression of fibrosis in NAFLD?",
    "Which metabolite ratio is associated with fibrosis severity in NAFLD?",
    "Which biomarker is commonly used for early detection of Type 2 Diabetes (T2D)?",
    "What metabolic pathway is associated with 3-hydroxybutyrate in T2D?",
    "Which amino acids are identified as predictors of future diabetes in metabolomic studies?",
    "What diagnostic method is used for metabolic profiling in diabetes research?",
    "Which metabolite is associated with the progression of diabetic kidney disease?",
    "What technology is used for identifying lipid metabolism-related biomarkers in diabetes?"
]

# Reference answers, in the same order as `queries`.
ground_truths = [
    "Glucosylsphingosine (GlcSph).",
    "Enzyme Replacement Therapy (ERT).",
    "Lyso-Gb1.",
    "UPLC-MS/MS.",
    "Lyso-Gb1 analogs.",
    "Polycyclic Lyso-Gb1 analogs.",
    "A glucosylsphingosine derivative.",
    "Lipid biomarkers.",
    "Glutamate and phenylalanine.",
    "Amino acid metabolism.",
    "Pyroglutamic acid.",
    "Pentose phosphate pathway.",
    "16-OH-DHEA-S.",
    "16-OH-DHEA-S/DHEA-S.",
    "HbA1c.",
    "Ketogenesis.",
    "Branched-chain amino acids (BCAAs) like isoleucine, leucine, and valine.",
    "Nuclear Magnetic Resonance (NMR) spectroscopy.",
    "Phenylalanine.",
    "Liquid Chromatography-Mass Spectrometry (LC-MS)."
]


def evaluate_results_with_rerank(queries, ground_truths, k=3):
    """Run retrieval + answer generation per query and score it against its reference.

    For every (query, ground truth) pair, the top ``k`` re-ranked chunks are
    retrieved, an LLM answer is generated from them, and ROUGE-1/2/L F-measures
    plus a smoothed sentence-level BLEU are computed against the ground truth.

    Returns a list with one dict per query holding the keys "query",
    "ground_truth", "response", "retrieved_context", "rouge1", "rouge2",
    "rougeL" and "bleu".
    """
    assert len(queries) == len(ground_truths), "Queries and ground truths must have the same length."

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoother = SmoothingFunction().method4  # For BLEU smoothing

    def _evaluate_one(query, ground_truth):
        # Retrieve, generate, then score a single query.
        hits = search_with_rerank(query, k=k)
        raw_answer, context_text = get_llm_answer(query, hits)
        answer = str(raw_answer)

        rouge = scorer.score(ground_truth, answer)
        bleu = sentence_bleu(
            [ground_truth.split()],  # single reference
            answer.split(),          # LLM answer
            smoothing_function=smoother
        )
        return {
            "query": query,
            "ground_truth": ground_truth,
            "response": answer.strip(),
            "retrieved_context": context_text,
            "rouge1": rouge['rouge1'].fmeasure,
            "rouge2": rouge['rouge2'].fmeasure,
            "rougeL": rouge['rougeL'].fmeasure,
            "bleu": bleu
        }

    return [_evaluate_one(query, ground_truth) for query, ground_truth in zip(queries, ground_truths)]

# Evaluate the queries with the re-ranker (uses the function's default k=3).
# Each query triggers one retrieval and one LLM completion.
evaluation_results = evaluate_results_with_rerank(queries, ground_truths)

# Print results (optional; keys match the dicts built by evaluate_results_with_rerank)
# for result in evaluation_results:
#     print("Query:", result["query"])
#     print("Ground Truth:", result["ground_truth"])
#     print("Retrieved Context:", result["retrieved_context"][:200], "...")  # Limiting output for readability
#     print(f"ROUGE-1: {result['rouge1']:.4f}, ROUGE-2: {result['rouge2']:.4f}, ROUGE-L: {result['rougeL']:.4f}, BLEU: {result['bleu']:.4f}")
#     print("------")


In [None]:
import csv

# Destination file for the per-query ROUGE/BLEU table
output_csv_path = "quantitative_evaluation_metrics_MedCPT_openAI.csv"

# Dump one header row followed by one row per evaluated query
with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, escapechar='\\', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(["Query", "Ground Truth", "Response", "Retrieved Context", "ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU"])

    for result in evaluation_results:
        # Text columns are written verbatim; metric columns are fixed to four decimals.
        text_fields = [result[key] for key in ("query", "ground_truth", "response", "retrieved_context")]
        score_fields = [f"{result[key]:.4f}" for key in ("rouge1", "rouge2", "rougeL", "bleu")]
        writer.writerow(text_fields + score_fields)

print(f"Evaluation metrics saved to {output_csv_path}")


In [None]:
!pip install ragas
!pip install datasets

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, context_recall, context_precision, answer_relevancy
import pandas as pd

# Step 1: Reshape `evaluation_results` into the column-oriented layout `ragas` expects
data_samples = {'question': [], 'answer': [], 'contexts': [], 'ground_truth': []}
for result in evaluation_results:
    data_samples['question'].append(result["query"])
    data_samples['answer'].append(result["response"])                # generated LLM response
    data_samples['contexts'].append([result["retrieved_context"]])   # one-element list holding the joined context
    data_samples['ground_truth'].append(result["ground_truth"])      # reference answer

# Step 2: Wrap the columns in a `datasets.Dataset`
dataset = Dataset.from_dict(data_samples)

# Step 3: Score the dataset with the selected RAGAS metrics
score = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_correctness,
        context_recall,
        context_precision,
        answer_relevancy,
    ],
)

# Step 4: Export the scores as a DataFrame and persist them to CSV
df = score.to_pandas()
output_score_csv_path = "qualitative_ragas_evaluation_scores_MedCPT_openAI.csv"
df.to_csv(output_score_csv_path, index=False)

print(f"Evaluation scores saved to {output_score_csv_path}")
