In [1]:
!pip install rouge_score
!pip install pinecone-client
!pip install openai
!pip install PyPDF2

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=be753f8e69db56abfccddccace51468541dfee0619c6f7d4cb8939ad9c7e2c07
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-no

In [2]:
# Import required packages
from openai import OpenAI
import torch
from transformers import AutoTokenizer, AutoModel
from pinecone import Pinecone
import hashlib
from PyPDF2 import PdfReader

# Initialize OpenAI client
client = OpenAI(api_key="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")

# Initialize Pinecone
PINECONE_API_KEY = "pcsk_sRUxC_4yWVW7sSX7EE1Y8VvvRDCa94ApPrZDfZkRFfUwdvXNH2ABowupy766Jdp2ivAUn"
PINECONE_ENVIRONMENT = "us-west1-gcp"
INDEX_NAME = "knowledge-base"

pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Initialize the embedding model
model_name = "WhereIsAI/UAE-Large-V1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embed_model = AutoModel.from_pretrained(model_name)

def generate_embedding(text):
    """Generate embeddings for given text"""
    inputs = tokenizer(text,
                      return_tensors="pt",
                      truncation=True,
                      max_length=512)
    with torch.no_grad():
        outputs = embed_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

def initialize_pinecone_index():
    """Initialize or connect to Pinecone index"""
    if INDEX_NAME not in [index.name for index in pc.list_indexes()]:
        dimension = 1024  # UAE-Large-V1 embedding dimension
        spec = {
            "metric": "cosine",
            "pods": {"replicas": 1, "shards": 1},
            "capacity_mode": "serverless"
        }
        pc.create_index(INDEX_NAME, dimension=dimension, spec=spec)
        print(f"Index '{INDEX_NAME}' created successfully!")
    else:
        print(f"Index '{INDEX_NAME}' already exists.")

    return pc.Index(INDEX_NAME)

def retrieve_single_best_context(query, index):
    """Retrieve the single most relevant context"""
    query_embedding = generate_embedding(query)
    results = index.query(
        vector=query_embedding.tolist(),
        top_k=1,
        include_metadata=True
    )
    return results['matches'][0]['metadata']['text'] if results['matches'] else ""

def query_rag_system(query: str, index):
    """RAG query function using OpenAI"""
    try:
        # Get best matching context
        context = retrieve_single_best_context(query, index)

        # Create messages for OpenAI
        messages = [
            {"role": "system", "content": "You are a helpful assistant that provides clear and concise answers based on the given context. Only provide direct answers without mentioning the context or using phrases like 'based on the context' or 'according to the document'."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}
        ]

        # Generate response using OpenAI
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # You can change to gpt-4 if needed
            messages=messages,
            temperature=0.7,
            max_tokens=150
        )

        # Return clean response
        return {
            "answer": response.choices[0].message.content.strip()
        }

    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

def upload_pdf_to_db(pdf_path, index):
    """Upload PDF content to Pinecone database"""
    reader = PdfReader(pdf_path)
    document_text = ""

    # Extract text from PDF
    for page in reader.pages:
        document_text += page.extract_text()

    # Split text into chunks
    chunk_size = 500
    text_chunks = [document_text[i:i + chunk_size]
                  for i in range(0, len(document_text), chunk_size)]

    # Generate embeddings and upload to Pinecone
    for chunk in text_chunks:
        vector_id = hashlib.sha256(chunk.encode('utf-8')).hexdigest()
        embedding = generate_embedding(chunk)
        index.upsert([(vector_id, embedding.tolist(), {"text": chunk})])

    print(f"PDF uploaded successfully! {len(text_chunks)} chunks added to the database.")

def ask(question):
    """Helper function to ask a single question"""
    index = initialize_pinecone_index()
    result = query_rag_system(question, index)

    if "error" in result:
        print(f"\nError: {result['error']}")
    else:
        print("\nAnswer:", result["answer"])

def test_rag():
    """Interactive query function"""
    index = initialize_pinecone_index()

    while True:
        query = input("\nEnter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        result = query_rag_system(query, index)

        if "error" in result:
            print(f"\nError: {result['error']}")
        else:
            print("\nAnswer:", result["answer"])

# Example usage
if __name__ == "__main__":
    # Initialize the index
    index = initialize_pinecone_index()

    # To upload a new PDF (uncomment and modify path)
    # pdf_path = "/path/to/your/document.pdf"
    # upload_pdf_to_db(pdf_path, index)

    # To ask a single question
    # ask("What is the main topic of this document?")

    # Or to start interactive mode


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Index 'knowledge-base' already exists.


In [4]:
# Import required libraries
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from typing import Dict, List
import pandas as pd

# Download required NLTK data
import nltk
nltk.download('punkt')
nltk.download("punkt_tab")

class RAGEvaluator:
    def __init__(self, index, query_system):
        self.index = index
        self.query_system = query_system
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def calculate_bleu(self, reference: str, candidate: str) -> float:
        """Calculate BLEU score"""
        reference_tokens = [nltk.word_tokenize(reference.lower())]
        candidate_tokens = nltk.word_tokenize(candidate.lower())
        return sentence_bleu(reference_tokens, candidate_tokens)

    def calculate_rouge_scores(self, reference: str, candidate: str) -> Dict:
        """Calculate ROUGE scores with detailed metrics"""
        scores = self.rouge_scorer.score(reference, candidate)

        metrics = {}
        for rouge_type in ['rouge1', 'rouge2', 'rougeL']:
            metrics[f'{rouge_type}_precision'] = scores[rouge_type].precision
            metrics[f'{rouge_type}_recall'] = scores[rouge_type].recall
            metrics[f'{rouge_type}_f1'] = scores[rouge_type].fmeasure

        return metrics

    def calculate_semantic_similarity(self, reference: str, candidate: str) -> float:
        """Calculate semantic similarity using embeddings"""
        ref_embedding = generate_embedding(reference)
        cand_embedding = generate_embedding(candidate)

        return float(cosine_similarity(
            ref_embedding.reshape(1, -1),
            cand_embedding.reshape(1, -1)
        )[0][0])

    def evaluate_single_response(self, query: str, ground_truth: str) -> Dict:
        """Evaluate a single response with all metrics"""
        # Get system response
        result = self.query_system(query, self.index)
        if "error" in result:
            return {"error": result["error"]}

        response = result["answer"]

        # Calculate all metrics
        metrics = {
            "query": query,
            "ground_truth": ground_truth,
            "system_response": response,
            "bleu_score": self.calculate_bleu(ground_truth, response),
            "semantic_similarity": self.calculate_semantic_similarity(ground_truth, response)
        }

        # Add ROUGE scores
        metrics.update(self.calculate_rouge_scores(ground_truth, response))

        return metrics

    def evaluate_test_set(self, test_cases: List[Dict]) -> Dict:
        """Evaluate a set of test cases and compute aggregate metrics"""
        all_results = []

        for test_case in test_cases:
            result = self.evaluate_single_response(
                test_case["query"],
                test_case["ground_truth"]
            )
            all_results.append(result)

        # Convert results to DataFrame for easy analysis
        df = pd.DataFrame(all_results)

        # Calculate aggregate metrics
        numeric_columns = [
            'bleu_score', 'semantic_similarity',
            'rouge1_precision', 'rouge1_recall', 'rouge1_f1',
            'rouge2_precision', 'rouge2_recall', 'rouge2_f1',
            'rougeL_precision', 'rougeL_recall', 'rougeL_f1'
        ]

        summary_metrics = df[numeric_columns].agg([
            'mean', 'std', 'min', 'max'
        ]).round(4)

        return {
            "detailed_results": all_results,
            "summary_metrics": summary_metrics.to_dict()
        }

def format_evaluation_results(results: Dict) -> str:
    """Format evaluation results for display"""
    output = "RAG System Evaluation Results\n"
    output += "=" * 50 + "\n\n"

    # Format summary metrics
    output += "Summary Metrics:\n"
    output += "-" * 20 + "\n\n"

    metrics_df = pd.DataFrame(results["summary_metrics"])
    output += str(metrics_df) + "\n\n"

    # Format detailed results
    output += "Detailed Results:\n"
    output += "-" * 20 + "\n\n"

    for i, result in enumerate(results["detailed_results"], 1):
        output += f"Test Case {i}:\n"
        output += f"Query: {result['query']}\n"
        output += f"Ground Truth: {result['ground_truth']}\n"
        output += f"System Response: {result['system_response']}\n"
        output += f"BLEU Score: {result['bleu_score']:.4f}\n"
        output += f"Semantic Similarity: {result['semantic_similarity']:.4f}\n"
        output += f"ROUGE-1: P={result['rouge1_precision']:.4f}, R={result['rouge1_recall']:.4f}, F1={result['rouge1_f1']:.4f}\n"
        output += f"ROUGE-2: P={result['rouge2_precision']:.4f}, R={result['rouge2_recall']:.4f}, F1={result['rouge2_f1']:.4f}\n"
        output += f"ROUGE-L: P={result['rougeL_precision']:.4f}, R={result['rougeL_recall']:.4f}, F1={result['rougeL_f1']:.4f}\n"
        output += "-" * 50 + "\n"

    return output

# Example usage
def run_evaluation(test_cases):
    """Run evaluation with the provided test cases"""
    # Initialize evaluator
    index = initialize_pinecone_index()
    evaluator = RAGEvaluator(index, query_rag_system)

    # Run evaluation
    results = evaluator.evaluate_test_set(test_cases)

    # Print formatted results
    print(format_evaluation_results(results))

    return results

# Example test cases
test_cases = [
    {
        "query": "What is group health insurance?",
        "ground_truth": "Group health insurance provides health coverage to employees of an organization, typically paid for by the company, with options to include dependents."
    },
    {
        "query": "What factors influence the premiums of group health insurance?",
        "ground_truth": "Premiums are influenced by the type of plan, sum insured, nature of the job, add-ons, claim history, and the average age of employees."
    },
    {
        "query": "What is the difference between group health insurance and individual health insurance in terms of premium payment?",
        "ground_truth": "Group insurance premiums are often deducted from the employee's salary, while individual insurance premiums are paid directly by the policyholder."
    },
    {
        "query": "What does the employee-only group health insurance policy cover?",
        "ground_truth": "It covers only the employee, including contractual employees, but not their family members."
    },
    {
        "query": "What is the significance of the claim settlement ratio?",
        "ground_truth": "It indicates the percentage of claims settled by an insurer annually, with a ratio above 80% considered good."
    },
    {
        "query": "What are daycare and domiciliary expenses in group health insurance?",
        "ground_truth": "Daycare covers hospitalization expenses for less than 24 hours, and domiciliary covers treatment expenses at home."
    },
    {
        "query": "How does the average age of employees affect group health insurance premiums?",
        "ground_truth": "A younger workforce results in lower premiums due to reduced medical risks, while an older workforce increases the premiums."
    },
    {
        "query": "What is the coverage for pre-existing diseases under group health insurance?",
        "ground_truth": "Pre-existing diseases are covered from day one of the policy."
    },
    {
        "query": "What should be considered when comparing group health insurance policies?",
        "ground_truth": "Consider factors such as coverage, sum insured, claim settlement ratio, TPA, network hospitals, and incurred claim ratio."
    },
    {
        "query": "What are some common add-ons available in group health insurance policies?",
        "ground_truth": "Add-ons include maternity insurance, dental coverage, wellness programs, doctor consultations, and room rent capping."
    }
]


# Run evaluation
# results = run_evaluation(test_cases)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:

results = run_evaluation(test_cases)

Index 'knowledge-base' already exists.


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

RAG System Evaluation Results

Summary Metrics:
--------------------

      bleu_score  semantic_similarity  rouge1_precision  rouge1_recall  \
mean      0.1386               0.8447            0.4062         0.5507   
std       0.1664               0.0813            0.1289         0.2227   
min       0.0000               0.7249            0.2333         0.2353   
max       0.5332               0.9468            0.6667         0.8667   

      rouge1_f1  rouge2_precision  rouge2_recall  rouge2_f1  rougeL_precision  \
mean     0.4553            0.1916         0.2652     0.2174            0.3468   
std      0.1549            0.1730         0.2156     0.1880            0.1409   
min      0.2857            0.0345         0.0588     0.0435            0.2000   
max      0.6957            0.6364         0.7000     0.6667            0.6667   

      rougeL_recall  rougeL_f1  
mean         0.4669     0.3879  
std          0.2054     0.1591  
min          0.1765     0.2143  
max          0.7273  