## Set up the RAG pipeline

In [None]:
import os

from dotenv import load_dotenv, find_dotenv
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain_openai import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain


# Load the variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Embedding deployment; its embed_query method is handed to the vector store below.
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002", api_version="2023-05-15"
)

# Azure AI Search index used for retrieval; endpoint and key are read from the environment.
search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
search_key = os.getenv("AZURE_SEARCH_KEY")
vector_store = AzureSearch(
    index_name="engineering_fundamentals",
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    embedding_function=embedding_model.embed_query,
)

# Conversation memory: exposed to the chain as "chat_history" and fed from the
# chain's "answer" output.
message_history = ChatMessageHistory()
conversation_buffer_memory = ConversationBufferMemory(
    chat_memory=message_history,
    memory_key="chat_history",
    output_key="answer",
    return_messages=True,
)

# Chat deployment that produces the answers (temperature 0.0).
chat_model = AzureChatOpenAI(
    deployment_name="gpt-35-turbo", api_version="2023-12-01-preview", temperature=0.0
)

# The RAG chain: retriever + chat model + memory. Source documents are returned
# alongside each answer.
retriever = vector_store.as_retriever()
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=chat_model,
    retriever=retriever,
    memory=conversation_buffer_memory,
    chain_type="stuff",
    return_source_documents=True,
)

## Set up a test set with RAGAS

In [None]:
# load the original data - split into chunks
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter


def get_md_files(directory):
    """Collect every Markdown file under ``directory`` together with its topics.

    The topics of a file are the names of the folders between ``directory``
    and the file itself, e.g. ``a/b/page.md`` yields ``["a", "b"]``. They are
    read from ``Path.parts`` so the result is the same on Windows and POSIX;
    splitting the path string on a backslash only works on Windows and yields
    no topics at all elsewhere.

    Args:
        directory: Root folder to search recursively (``str`` or ``Path``).

    Returns:
        A list of ``[path, topics]`` pairs ordered by path, where ``path`` is
        a ``Path`` to a ``*.md`` file and ``topics`` is a list of folder names
        (empty for files located directly in ``directory``).
    """
    directory = Path(directory)
    files = []
    for path in sorted(directory.rglob("*.md")):
        # Every component of the relative path except the file name itself.
        topics = list(path.relative_to(directory).parts[:-1])
        files.append([path, topics])
    return files

def get_chunks(path, topics):
    """Split one markdown file into chunks on its ``#`` and ``##`` headers.

    The splitter is created with ``strip_headers=False``. Every chunk gets two
    extra metadata entries: ``"topics"`` (the topics joined with commas) and
    ``"path"`` (the file path as a string).

    Args:
        path: Location of the markdown file; it is opened as UTF-8 text.
        topics: Iterable of topic strings for this file.

    Returns:
        The list of chunks produced by the splitter. If reading or splitting
        raises, the path and the error are printed and an empty list is
        returned. A failure to open the file is not caught.
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Title"), ("##", "Subheader")],
        strip_headers=False,
    )

    sections = []
    with open(path, "r", encoding="utf8") as handle:
        try:
            sections = splitter.split_text(handle.read())
        except Exception as ex:
            # Best effort: report the failing file and carry on with no chunks.
            print(path, ex)

    for section in sections:
        section.metadata.update({"topics": ",".join(topics), "path": str(path)})

    return sections


# Find the markdown files (and their topics) under the docs folder.
doc_dir = Path("../data/docs")
files = get_md_files(doc_dir)

# Split every file into chunks and gather them in one flat list.
docs = []
for path, topic in files:
    docs.extend(get_chunks(path, topic))

In [None]:
# Preview the first five chunks to sanity-check the split and the metadata.
docs[:5]

## Create the Test Set

In [None]:
from ragas.testset.generator import TestsetGenerator

# Ideally a stronger model (e.g. GPT-4) would generate and critique the test
# set for a GPT-3.5 pipeline; for this demo the same chat model fills both roles.
generator = TestsetGenerator.from_langchain(
    generator_llm=chat_model,
    critic_llm=chat_model,
    embeddings=embedding_model,
)

In [None]:
from ragas.testset.evolutions import simple, reasoning, multi_context

# Question mix: half simple, a quarter reasoning, a quarter multi-context.
question_distribution = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate ten test samples from the chunked documents.
testset = generator.generate_with_langchain_docs(
    docs,
    test_size=10,
    distributions=question_distribution,
    raise_exceptions=False,
    with_debugging_logs=False,
)

In [None]:
# Convert the generated test set to a pandas DataFrame and display it.
dataset = testset.to_pandas()
dataset

In [None]:
# Question of the first test sample.
dataset.iloc[0].question

In [None]:
# Ground-truth answer of the first test sample.
dataset.iloc[0].ground_truth

In [None]:
# Contexts stored with the first test sample.
dataset.iloc[0].contexts

## Infer the queries in the test set

In [None]:
# Questions and reference answers from the generated test set, in matching order.
questions = [dr.question for dr in testset.test_data]
ground_truth = [dr.ground_truth for dr in testset.test_data]

answers = []
contexts = []

for question in questions:
    # The test questions are independent of each other. The chain was built
    # with conversation memory, which would otherwise feed every earlier
    # question and answer into later calls as chat history, so wipe it
    # before each query.
    conversation_buffer_memory.clear()
    result = rag_chain.invoke({"question": question})
    answers.append(result["answer"])
    # Keep only the text of the documents returned with the answer.
    contexts.append([doc.page_content for doc in result["source_documents"]])

In [None]:
# Column-oriented view of the inference results: one list per column, all in
# the order of `questions`.
inference_data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truth=ground_truth,
)

In [None]:
from datasets import Dataset


# Build a `datasets.Dataset` from the column dict and display its summary.
inference_dataset = Dataset.from_dict(inference_data)
inference_dataset

## Evaluate the inferred results

> NOTE: Ideally the evaluator should be a stronger model than the one under test, e.g. GPT-4 evaluating a GPT-3.5 pipeline.

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Metrics to compute on the inference dataset.
evaluation_metrics = [
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
]

# The pipeline's own chat and embedding models are used for the evaluation.
result = evaluate(
    dataset=inference_dataset,
    metrics=evaluation_metrics,
    llm=chat_model,
    embeddings=embedding_model,
)

evaluation_result = result.to_pandas()

In [None]:
# Show the evaluation scores as a DataFrame.
evaluation_result

In [None]:
# Summary statistics of the numeric score columns.
evaluation_result.describe()

Context recall looks a bit low. Keep in mind that the test set is very small, so a single bad sample skews the result; here the low scores all come from samples whose ground truth is `nan`.