Using Ragas for evaluation: https://docs.ragas.io/en/latest/getstarted/index.html

In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# whose APIs differ from the ones this notebook was written against.
%pip install langchain_community
%pip install langchain_text_splitters
%pip install langchain-openai
%pip install langchainhub
%pip install chromadb
%pip install langchain
%pip install python-dotenv
# NOTE(review): presumably removed because uvloop interferes with the event-loop
# patching that async libraries rely on inside notebooks - confirm before dropping.
%pip uninstall uvloop -y
%pip install PyPDF2 -q --user
%pip install rank_bm25
%pip install ragas
%pip install tqdm -q --user
%pip install matplotlib

In [None]:
import os
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

## new
import tqdm as notebook_tqdm
import pandas as pd
import matplotlib.pyplot as plt
from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    LLMContextPrecisionWithoutReference,
    SemanticSimilarity
)

In [None]:
# variables
# Load secrets from the local env file, then fail fast with a clear message if
# the OpenAI key is still missing (previously this surfaced as an opaque
# "str expected, not NoneType" TypeError when assigning None into os.environ).
_ = load_dotenv(dotenv_path='env.txt')
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )
openai.api_key = os.environ['OPENAI_API_KEY']

# Default models; `embedding_function` and `llm` are re-bound with explicit
# model names in the LLMs/Embeddings cell that follows.
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Source document, vector-store collection name and the demo question.
pdf_path = "google-2023-environmental-report.pdf"
collection_name = "google_environmental_report"
str_output_parser = StrOutputParser()
user_query = "What are Google's environmental initiatives?"

In [None]:
# LLMs/Embeddings
# Model identifiers. Note that `model_gpt4` currently points at "gpt-4o-mini".
embedding_ada = "text-embedding-ada-002"
model_gpt35 = "gpt-3.5-turbo"
model_gpt4 = "gpt-4o-mini"

# Models used by the RAG pipeline itself (plain LangChain objects).
embedding_function = OpenAIEmbeddings(model=embedding_ada, openai_api_key=openai.api_key)
llm = ChatOpenAI(model=model_gpt35, openai_api_key=openai.api_key, temperature=0.0)

# Models used for synthetic test-set generation. Both are wrapped for ragas:
# the generator LLM was previously passed raw while its embeddings were wrapped,
# but the ragas TestsetGenerator and query synthesizers take ragas LLM wrappers.
generator_llm = LangchainLLMWrapper(
    ChatOpenAI(model=model_gpt35, openai_api_key=openai.api_key, temperature=0.0)
)
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model=embedding_ada,openai_api_key=openai.api_key))

# Models used to score the evaluation metrics (raw objects plus ragas wrappers).
critic_llm_raw = ChatOpenAI(model=model_gpt4,openai_api_key=openai.api_key, temperature=0.0)
embeddings_raw = OpenAIEmbeddings(model=embedding_ada,openai_api_key=openai.api_key)
critic_llm_wrapped = LangchainLLMWrapper(critic_llm_raw)
embeddings_wrapped = LangchainEmbeddingsWrapper(embeddings_raw)


In [None]:
#### INDEXING ####

In [None]:
# Load the PDF and concatenate the extracted text of every page, in page order
pdf_reader = PdfReader(pdf_path)
text = "".join(page.extract_text() for page in pdf_reader.pages)

In [None]:
# Split
# Break the extracted report text into overlapping chunks for indexing.
# Separators are listed from coarsest (blank line) to finest (empty string).
# No custom length function is passed, so chunk_size / chunk_overlap are
# presumably measured in characters - confirm against the splitter's defaults.
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200
)
splits = character_splitter.split_text(text)

In [None]:
# Wrap every chunk twice: one copy tagged for the dense (vector) index and one
# for the sparse (BM25) index. Both copies share the chunk's position as "id".
dense_documents = [
    Document(page_content=chunk, metadata={"id": str(chunk_index), "source": "dense"})
    for chunk_index, chunk in enumerate(splits)
]
sparse_documents = [
    Document(page_content=chunk, metadata={"id": str(chunk_index), "source": "sparse"})
    for chunk_index, chunk in enumerate(splits)
]

In [None]:
# Build the dense index: embed every dense document with `embedding_function`
# and store it in the `collection_name` collection of this Chroma client.
# NOTE(review): chromadb.Client() is created without persistence settings, so
# the index is presumably in-memory and rebuilt on every run - confirm.
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    collection_name=collection_name,
    client=chroma_client
)

In [None]:
# Dense and sparse retrievers are each configured with k=10; the ensemble
# combines their results with equal weights.
# NOTE(review): `c=0` is presumably the constant of the ensemble's rank-fusion
# formula, overriding the library default - confirm the intended value.
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)
ensemble_retriever = EnsembleRetriever(retrievers=[dense_retriever, sparse_retriever], weights=[0.5, 0.5], c=0)

In [None]:
#### RETRIEVAL and GENERATION ####

In [None]:
# Prompt
# Pull the answer-generation prompt from the LangChain Hub. In the chain below
# it receives a dict holding `context` and `question`.
# NOTE(review): hub.pull presumably needs network access, and the prompt content
# is controlled by the hub repo owner - confirm it is stable enough for evaluation.
prompt = hub.pull("jclemens24/rag-prompt")

In [None]:
# Relevance check prompt
# Asks the LLM to grade, on a 1-5 scale, how relevant the retrieved context is
# to the question and to reply with the number only. The reply is parsed by
# extract_score() and compared against a threshold in conditional_answer().
# Template variables: {question}, {retrieved_context}.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [None]:
# Post-processing
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The documents' contents, in order, separated by a blank line.
    """
    contents = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(contents)

In [None]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the grader LLM's reply.

    The relevance prompt asks for a bare number, but models sometimes add
    wrapper text ("Relevance Score: 4", "4/5"). A reply that is exactly a
    number is parsed as before; otherwise the first number found in the reply
    is used, so a valid grade is not silently turned into 0.

    Args:
        llm_output: The raw reply text (``None`` is tolerated).

    Returns:
        The score as a float, or 0 when no number can be found.
    """
    import re  # local import keeps this cell self-contained

    if llm_output is None:
        return 0

    reply = str(llm_output).strip()
    try:
        return float(reply)
    except ValueError:
        pass

    # Fall back to the first integer/decimal that appears anywhere in the reply.
    number = re.search(r"-?\d+(?:\.\d+)?", reply)
    return float(number.group()) if number else 0

# Chain it all together with LangChain
def conditional_answer(x):
    """Gate the generated answer on the relevance score of the retrieved context.

    Args:
        x: Mapping with the raw ``relevance_score`` reply and the generated
            ``answer``.

    Returns:
        ``"I don't know."`` when the parsed score is below 4, otherwise the
        generated answer unchanged.
    """
    score = extract_score(x['relevance_score'])
    return "I don't know." if score < 4 else x['answer']

In [None]:
# Generation stage shared by both retrieval strategies.
# Input:  dict with "context" (retrieved documents) and "question".
# Output: dict with "relevance_score", "answer" and "final_answer".
rag_chain_from_docs = (
    # 1. Replace the list of documents with a single formatted context string.
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    # 2. Run the relevance grading and the answer generation side by side on the same input.
    | RunnableParallel(
        {"relevance_score": (
            RunnablePassthrough()
            | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
            | llm
            | str_output_parser
        ), "answer": (
            RunnablePassthrough()
            | prompt
            | llm
            | str_output_parser
        )}
    )
    # 3. Add "final_answer": the generated answer, or "I don't know." when the relevance score is too low.
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [None]:
# Dense-only pipeline: retrieve with the vector store, keep the question as-is,
# then attach the generation stage's output under "answer".
rag_chain_similarity = RunnableParallel(
    context=dense_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [None]:
# Hybrid pipeline: retrieve with the dense + sparse ensemble, keep the question
# as-is, then attach the generation stage's output under "answer".
rag_chain_hybrid = RunnableParallel(
    context=ensemble_retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [None]:
# Run the demo question through the similarity (dense vector) chain and show
# the relevance score, the gated answer and every retrieved chunk.
result = rag_chain_similarity.invoke(user_query)
retrieved_docs = result['context']

print(f"Original Question to Similarity Search: {user_query}\n")
print(f"Relevance Score: {result['answer']['relevance_score']}\n")
print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
print("Retrieved Documents:")
for doc_number, doc in enumerate(retrieved_docs, 1):
    print(
        f"Document {doc_number}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}",
        f"Content:\n{doc.page_content}\n",
        sep="\n",
    )

In [None]:
# Question - Submitted to the hybrid (dense + sparse ensemble) search
# Runs the same demo question through the hybrid chain and shows the relevance
# score, the gated answer and every retrieved chunk for comparison.
result = rag_chain_hybrid.invoke(user_query)
retrieved_docs = result['context']

# The heading previously said "Dense Search", mislabelling the hybrid run.
print(f"Original Question to Hybrid Search: {user_query}\n")
print(f"Relevance Score: {result['answer']['relevance_score']}\n")
print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}")
    print(f"Content:\n{doc.page_content}\n")

#### SIMILARITY SEARCH ONLY
Google's environmental initiatives include empowering individuals to take action, working together with partners and customers, operating sustainably, achieving net-zero carbon emissions, water stewardship, and promoting a circular economy. They have implemented sustainability features in products like Google Maps, Google Nest thermostats, and Google Flights to help individuals make more sustainable choices. Google also supports various environmental organizations and initiatives, such as the iMasons Climate Accord, ReFED, and The Nature Conservancy, to accelerate climate action and address environmental challenges. Additionally, Google is involved in public policy advocacy and is committed to reducing its environmental impact through its operations and value chain.


#### HYBRID SEARCH

Google's environmental initiatives include empowering individuals to take action, working together with partners and customers, operating sustainably, achieving net-zero carbon emissions, focusing on water stewardship, promoting a circular economy, engaging with suppliers to reduce energy consumption and greenhouse gas emissions, and reporting environmental data. They also support public policy and advocacy for low-carbon economies, participate in initiatives like the iMasons Climate Accord and ReFED, and support projects with organizations like The Nature Conservancy. Additionally, Google is involved in initiatives with the World Business Council for Sustainable Development and the World Resources Institute to improve well-being for people and the planet. They are also working on using technology and platforms to organize information about the planet and make it actionable to help partners and customers create a positive impact.

### SYNTHETIC DATA GENERATION

In [None]:
# Create the synthetic test-set generator from the generator LLM and embeddings.
# NOTE(review): ragas' TestsetGenerator presumably expects ragas-wrapped models;
# `generator_embeddings` is wrapped where it is defined - verify that
# `generator_llm` is wrapped as well.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings
)

In [None]:
# Prepare documents
# Re-wrap the text chunks as metadata-free Documents for the generator.
documents = [Document(page_content=chunk) for chunk in splits]

# Define query distribution (only single-hop)
# A single synthesizer with weight 1.0, so every generated question is of
# the single-hop "specific" kind.
query_distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]

# Generate testset
# Requests 10 synthetic samples. This calls the LLM, so it is presumably slow
# and billable - the cells that follow persist the result to CSV so it can be
# reloaded instead of regenerated.
testset = generator.generate_with_langchain_docs(
    documents,
    testset_size=10,
    query_distribution=query_distribution
)

In [None]:
# Convert the generated testset to a DataFrame
testset_df = testset.to_pandas()

# Persist it next to the notebook so later runs can skip generation
testset_df.to_csv('testset_data.csv', index=False)

print("testset DataFrame saved successfully in the local directory.")

In [None]:
# Reload the saved testset from CSV instead of regenerating it with the cell above
saved_testset_df = pd.read_csv('testset_data.csv')
print("testset DataFrame loaded successfully from local directory.")
saved_testset_df.head(5)

### PREPARE SIMILARITY SEARCH DATASET

In [None]:
# PREPARE EVALUATION DATASETS
# Function to generate samples
def generate_ragas_sample(question, ground_truth, rag_chain):
    """Run one question through a RAG chain and package the outcome for ragas.

    Args:
        question: The user question to send to the chain.
        ground_truth: The reference answer stored with the sample.
        rag_chain: A chain whose ``invoke`` result holds ``"context"``
            (retrieved documents) and ``"answer"["final_answer"]``.

    Returns:
        A ``SingleTurnSample`` carrying the question, the chain's final
        answer, the retrieved chunk texts and the reference answer.
    """
    chain_output = rag_chain.invoke(question)
    final_answer = chain_output["answer"]["final_answer"]
    context_texts = [document.page_content for document in chain_output["context"]]

    return SingleTurnSample(
        user_input=question,
        response=final_answer,
        retrieved_contexts=context_texts,
        reference=ground_truth,
    )

### EVAL SETS FOR EACH CHAIN

In [None]:
# Build the evaluation samples for the similarity (dense-only) chain.
# A failing question is reported and skipped so one bad row does not abort the run.
similarity_samples = []
for _, testset_row in saved_testset_df.iterrows():
    try:
        similarity_sample = generate_ragas_sample(
            testset_row["user_input"],
            testset_row["reference"],
            rag_chain_similarity,
        )
    except Exception as e:
        print(f"Error: {e}")
    else:
        similarity_samples.append(similarity_sample)

evaluation_dataset_similarity = EvaluationDataset(samples=similarity_samples)

In [None]:
# Build the evaluation samples for the hybrid (dense + sparse) chain.
# A failing question is reported and skipped so one bad row does not abort the run.
hybrid_samples = []
for _, testset_row in saved_testset_df.iterrows():
    try:
        hybrid_sample = generate_ragas_sample(
            testset_row["user_input"],
            testset_row["reference"],
            rag_chain_hybrid,
        )
    except Exception as e:
        print(f"Error: {e}")
    else:
        hybrid_samples.append(hybrid_sample)

evaluation_dataset_hybrid = EvaluationDataset(samples=hybrid_samples)

### EVAL SCORING

In [None]:
# Initialize metrics with wrapped LLMs
# Every LLM-judged metric uses the critic model; ResponseRelevancy and
# SemanticSimilarity also use the wrapped embeddings.
# NOTE(review): the analysis cells further down select result columns by
# hard-coded names - keep those names in sync with the metrics listed here.
metrics = [
    Faithfulness(llm=critic_llm_wrapped),
    ResponseRelevancy(llm=critic_llm_wrapped, embeddings=embeddings_wrapped),
    LLMContextPrecisionWithoutReference(llm=critic_llm_wrapped),
    LLMContextRecall(llm=critic_llm_wrapped),
    FactualCorrectness(llm=critic_llm_wrapped),
    SemanticSimilarity(embeddings=embeddings_wrapped)
]

In [None]:
# Evaluate similarity search
# Score the similarity-chain dataset against all configured metrics, then
# convert the result to a DataFrame, which is displayed as the cell's output.
print("Evaluating similarity search...")
score_similarity = evaluate(
    dataset=evaluation_dataset_similarity,
    metrics=metrics
)

similarity_df = score_similarity.to_pandas()
similarity_df

In [None]:
# Evaluate hybrid search
# Score the hybrid-chain dataset against the same metrics, then convert the
# result to a DataFrame, which is displayed as the cell's output.
print("Evaluating hybrid search...")
score_hybrid = evaluate(
    dataset=evaluation_dataset_hybrid,
    metrics=metrics
)

hybrid_df = score_hybrid.to_pandas()
hybrid_df

### ANALYSIS

In [None]:
# ANALYSIS
# Metric columns to average for each run.
key_columns = [
    'faithfulness',
    'answer_relevancy',
    'llm_context_precision_without_reference',
    'context_recall',
    'factual_correctness(mode=f1)',
    'semantic_similarity'
]

similarity_means = similarity_df[key_columns].mean()
hybrid_means = hybrid_df[key_columns].mean()

# One row per metric; a positive Difference means the similarity run scored higher.
comparison_df = pd.DataFrame(
    {'Similarity Run': similarity_means, 'Hybrid Run': hybrid_means}
).assign(Difference=lambda runs: runs['Similarity Run'] - runs['Hybrid Run'])

# Persist the per-sample scores and the comparison (the comparison keeps its
# metric-name index so it can be reloaded with index_col=0).
similarity_df.to_csv('similarity_run_data.csv', index=False)
hybrid_df.to_csv('hybrid_run_data.csv', index=False)
comparison_df.to_csv('comparison_data.csv', index=True)

print("Dataframes saved successfully in the local directory.")

In [None]:
# Reload the saved results and print the comparison grouped by pipeline stage.
# `sem_df` holds the similarity-run scores and `rec_df` the hybrid-run scores.
sem_df = pd.read_csv('similarity_run_data.csv')
rec_df = pd.read_csv('hybrid_run_data.csv')
comparison_df = pd.read_csv('comparison_data.csv', index_col=0)

print("Dataframes loaded successfully from the local directory.")
print("Performance Comparison:")
for section_heading, section_metrics in (
    ("\n**Retrieval**:", ['llm_context_precision_without_reference', 'context_recall']),
    ("\n**Generation**:", ['faithfulness', 'answer_relevancy']),
    ("\n**End-to-end evaluation**:", ['factual_correctness(mode=f1)', 'semantic_similarity']),
):
    print(section_heading)
    print(comparison_df.loc[section_metrics])

In [None]:
# Visualization - one subplot per evaluation stage, with the similarity and
# hybrid runs drawn as side-by-side bars for each metric.
fig, axes = plt.subplots(3, 1, figsize=(12, 18), sharex=False)
bar_width = 0.35
categories = ['Retrieval', 'Generation', 'End-to-end evaluation']
metrics_list = [
    ['llm_context_precision_without_reference', 'context_recall'],
    ['faithfulness', 'answer_relevancy'],
    ['factual_correctness(mode=f1)', 'semantic_similarity']
]

# Friendlier tick labels for unwieldy column names; any metric not listed
# falls back to a title-cased version of its column name.
display_label_overrides = {
    'llm_context_precision_without_reference': 'Context Precision',
    'context_recall': 'Context Recall',
    'answer_relevancy': 'Answer Relevancy',
    'factual_correctness(mode=f1)': 'Factual Correctness',
}

# (column in comparison_df, horizontal offset, bar colour) for each run.
run_styles = (
    ('Similarity Run', 0, '#D51900'),
    ('Hybrid Run', bar_width, '#992111'),
)


def _label_bar_heights(ax, bars):
    """Write each bar's height, formatted as a percentage, just above the bar."""
    for bar in bars:
        bar_height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height,
            f'{bar_height:.1%}',
            ha='center',
            va='bottom',
            fontsize=10,
        )


for ax, category, metric_list in zip(axes, categories, metrics_list):
    base_positions = list(range(len(metric_list)))

    for run_name, offset, bar_color in run_styles:
        run_bars = ax.bar(
            [position + offset for position in base_positions],
            comparison_df.loc[metric_list, run_name],
            width=bar_width,
            label=run_name,
            color=bar_color,
        )
        _label_bar_heights(ax, run_bars)

    ax.set_title(category, fontsize=14, pad=20)
    # Centre each tick between the two bars of its metric.
    ax.set_xticks([position + bar_width / 2 for position in base_positions])

    display_labels = [
        display_label_overrides.get(metric, metric.replace('_', ' ').title())
        for metric in metric_list
    ]
    ax.set_xticklabels(display_labels, rotation=45, ha='right', fontsize=12)
    ax.legend(fontsize=12, loc='lower right', bbox_to_anchor=(1, 0))

# Shared y-axis caption and overall title for the three subplots.
fig.text(0.04, 0.5, 'Scores', va='center', rotation='vertical', fontsize=14)
fig.suptitle('Performance Comparison', fontsize=16)

plt.tight_layout(rect=[0.05, 0.03, 1, 0.95])
plt.subplots_adjust(hspace=0.6, top=0.92)
plt.show()