In [1]:
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Credentials and project name are read from the environment, which
# load_dotenv() has already populated from the local .env file.
openai_api_key = os.getenv('OPENAI_API_KEY')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
langchain_project  = os.getenv('LANGCHAIN_PROJECT')

# LangSmith picks up its endpoint and tracing switch from environment
# variables, so assigning plain Python variables alone has no effect.
# setdefault() exports them without overriding values already set in .env.
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_TRACING_V2=True
os.environ.setdefault("LANGCHAIN_ENDPOINT", LANGCHAIN_ENDPOINT)
os.environ.setdefault("LANGCHAIN_TRACING_V2", str(LANGCHAIN_TRACING_V2).lower())


In [4]:
# Load the source documents by recursively crawling from a root URL.

url = "https://python.langchain.com/docs/expression_language/"

# Every fetched page is reduced to its plain text with BeautifulSoup;
# max_depth=20 bounds how deep the crawl may go.
loader = RecursiveUrlLoader(
    url=url,
    max_depth=20,
    extractor=lambda raw_html: Soup(raw_html, 'html.parser').text,
)
docs = loader.load()

In [5]:
docs

[Document(metadata={'source': 'https://python.langchain.com/docs/expression_language/', 'content_type': 'text/html; charset=utf-8', 'title': 'Conceptual guide | \uf8ffü¶úÔ∏è\uf8ffüîó LangChain', 'description': 'This guide provides explanations of the key concepts behind the LangChain framework and AI applications more broadly.', 'language': 'en'}, page_content='\n\n\n\n\nConceptual guide | \uf8ffü¶úÔ∏è\uf8ffüîó LangChain\n\n\n\n\n\n\nSkip to main contentIntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1\uf8ffüí¨SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a Simple LLM Application with LCELBuild a Query Analysis SystemBuild a ChatbotConversational RAGBuild an Extraction ChainBuild an AgentTaggingdata_generationBuild a Local RAG ApplicationBuild a PDF ingestion and Question/Answering systemBuild a Retrieval Augmented Generation (RAG) AppVector stores and retr

In [6]:
# Split the loaded documents into overlapping chunks
# (1500 characters per chunk, 200 characters of overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1500)
splits = text_splitter.split_documents(docs)

In [7]:
# Vector store: embed the chunks with OpenAI embeddings and store them in Chroma.

vectorstore = Chroma.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=splits,
)

In [8]:
# Wrap the vector store in a retriever interface; this is what RagBot queries.

retriever = vectorstore.as_retriever()

In [9]:
# building RAG chain

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Minimal RAG pipeline: retrieve documents, then answer with an OpenAI chat model.

    Both steps are decorated with LangSmith's ``traceable`` and the OpenAI
    client is passed through ``wrap_openai``.
    """

    def __init__(self, retriever, model:str = 'gpt-4-0125-preview'):
        """Store the retriever and model name and build the wrapped OpenAI client."""
        self.model = model
        self.retriever = retriever
        # NOTE(review): wrap_openai presumably makes the client's calls show
        # up in LangSmith traces -- confirm against the langsmith docs.
        self.client = wrap_openai(openai.Client())

    @traceable
    def retrieve_docs(self, question):
        """Return whatever the retriever yields for ``question``."""
        return self.retriever.invoke(question)

    @traceable()
    def get_answer(self, question: str):
        """Answer ``question`` using the retrieved documents as context.

        Returns a dict with two keys:
          * ``"answer"``   -- content of the first completion choice
          * ``"contexts"`` -- ``str()`` of each retrieved document
        """
        retrieved = self.retrieve_docs(question)

        # The retrieved documents are interpolated directly into the system prompt.
        system_message = {
            "role": "system",
            "content": "You are a helpful AI code assistant with expertise in LCEL."
            " Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{retrieved}",
        }
        user_message = {"role": "user", "content": question}

        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[system_message, user_message],
        )

        answer_text = completion.choices[0].message.content
        context_strings = []
        for document in retrieved:
            context_strings.append(str(document))

        # Evaluators will expect "answer" and "contexts"
        return {"answer": answer_text, "contexts": context_strings}
    
# Build the bot on top of the retriever, using RagBot's default model.
rag_bot = RagBot(retriever)



In [10]:
# Smoke-test the RAG pipeline with a single question.
response =  rag_bot.get_answer("What is LCEL?")

In [11]:
response

{'answer': 'The LangChain Expression Language (LCEL) is a syntax designed for orchestrating LangChain components. It is particularly useful for creating simpler applications within the LangChain framework. LCEL enables developers to define and execute complex operations involving various LangChain components, such as document loaders, retrieval systems, chat models, and tools, in a more streamlined and efficient manner. This syntax facilitates the manipulation and coordination of these components to build AI applications that can process and respond to data in versatile ways, including text, audio, images, and video. Through LCEL, developers can harness the full capabilities of the LangChain ecosystem to construct sophisticated AI solutions with relative ease.',
 'contexts': ["page_content='Chat models: LLMs exposed via a chat API that process sequences of messages as input and output a message.\nMessages: The unit of communication in chat models, used to represent model input and outp

In [12]:
response['answer']

'The LangChain Expression Language (LCEL) is a syntax designed for orchestrating LangChain components. It is particularly useful for creating simpler applications within the LangChain framework. LCEL enables developers to define and execute complex operations involving various LangChain components, such as document loaders, retrieval systems, chat models, and tools, in a more streamlined and efficient manner. This syntax facilitates the manipulation and coordination of these components to build AI applications that can process and respond to data in versatile ways, including text, audio, images, and video. Through LCEL, developers can harness the full capabilities of the LangChain ecosystem to construct sophisticated AI solutions with relative ease.'

# RAG Evaluators

Components of the evaluation pipeline:
1. Create a dataset
* Datasets are collections of Examples, the core building block for the evaluation workflow in LangSmith. Examples provide the inputs over which you will be running your pipeline, and, if applicable, the outputs that you will be comparing against. All examples in a given dataset should follow the same schema. Examples contain an "inputs" dict and an "outputs" dict, along with (optionally) a metadata dict.
* A dataset can be created by generating examples with an LLM,
* by writing a dictionary of questions and answers and converting it into a dataset,
* or from user-provided examples or feedback.
2. Evaluator
* The inputs to an evaluator consist of:

An Example - the inputs for your pipeline and optionally the reference outputs or labels
A Run - observed output gathered from running the inputs through the Task
An evaluator will then return an EvaluationResult (or similarly shaped dictionary), which is made up of:

    key: The name of the metric being evaluated
    score: The value of the metric on this example
    comment: the reasoning trajectory or other string information motivating the score

* We can create a custom evaluator; see the LangChain documentation for details.
* We can use LangChain's off-the-shelf evaluator functions.
* Most of the time we use an `LLM-as-judge` evaluator to compare questions and answers; LangChain provides many such evaluators, e.g. QA, cot_qa, criteria, labeled criteria, and more.
3. Task: each example is processed through the task using an LLM.
4. Applying evals


The function client.create_examples() is part of the LangSmith SDK (Software Development Kit). It's used to create and add examples to a dataset in LangSmith for evaluating and monitoring your LLM (Large Language Model) applications.

Here's a breakdown of what this function does:

Purpose:

Create Examples: It allows you to programmatically create examples that will be used to test your LLM application. These examples typically consist of inputs (e.g., user prompts, questions) and optionally, expected outputs or reference answers.
Add to Dataset: The examples created by this function are added to a specific dataset within your LangSmith project. Datasets help you organize and manage your evaluation data.

In [13]:
# creating a dataset and adding it to the langsmith datasets.
# never forget to configure all the api_keys required.
#dataset name should be unique always.


from langsmith import Client

# QA
# Evaluation questions about LCEL ...
inputs = [
    "How can I directly pass a string to a runnable and use it to construct the input needed for my prompt?",
    "How can I make the output of my LCEL chain a string?",
    "How can I apply a custom function to one of the inputs of an LCEL chain?",
]

# ... and the reference answer for each question, in the same order.
outputs = [
    "Use RunnablePassthrough. from langchain_core.runnables import RunnableParallel, RunnablePassthrough; from langchain_core.prompts import ChatPromptTemplate; from langchain_openai import ChatOpenAI; prompt = ChatPromptTemplate.from_template('Tell a joke about: {input}'); model = ChatOpenAI(); runnable = ({'input' : RunnablePassthrough()} | prompt | model); runnable.invoke('flowers')",
    "Use StrOutputParser. from langchain_openai import ChatOpenAI; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.output_parsers import StrOutputParser; prompt = ChatPromptTemplate.from_template('Tell me a short joke about {topic}'); model = ChatOpenAI(model='gpt-3.5-turbo') #gpt-4 or other LLMs can be used here; output_parser = StrOutputParser(); chain = prompt | model | output_parser",
    "Use RunnableLambda with itemgetter to extract the relevant key. from operator import itemgetter; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.runnables import RunnableLambda; from langchain_openai import ChatOpenAI; def length_function(text): return len(text); chain = ({'prompt_input': itemgetter('foo') | RunnableLambda(length_function),} | prompt | model); chain.invoke({'foo':'hello world'})",
]

# Pair each question with its reference answer, for inspection in the notebook.
qa_pairs = []
for question_text, reference_answer in zip(inputs, outputs):
    qa_pairs.append({"question": question_text, "answer": reference_answer})

# Create dataset
# Dataset names must be unique, so calling create_dataset() again on a
# notebook re-run would fail. Reuse the dataset if it already exists and only
# upload the examples when it is created for the first time, which also
# avoids duplicating examples.
client = Client()
dataset_name = "practice_erag_evaluation_dataset"
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="QA pairs about LCEL.",
    )
    # One example per question: inputs carry the question, outputs the
    # reference answer.
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )



In [14]:
qa_pairs

[{'question': 'How can I directly pass a string to a runnable and use it to construct the input needed for my prompt?',
  'answer': "Use RunnablePassthrough. from langchain_core.runnables import RunnableParallel, RunnablePassthrough; from langchain_core.prompts import ChatPromptTemplate; from langchain_openai import ChatOpenAI; prompt = ChatPromptTemplate.from_template('Tell a joke about: {input}'); model = ChatOpenAI(); runnable = ({'input' : RunnablePassthrough()} | prompt | model); runnable.invoke('flowers')"},
 {'question': 'How can I make the output of my LCEL chain a string?',
  'answer': "Use StrOutputParser. from langchain_openai import ChatOpenAI; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.output_parsers import StrOutputParser; prompt = ChatPromptTemplate.from_template('Tell me a short joke about {topic}'); model = ChatOpenAI(model='gpt-3.5-turbo') #gpt-4 or other LLMs can be used here; output_parser = StrOutputParser(); chain = prompt | model |

# Evaluator
 Let's compare the answer produced by the RAG chain to a reference answer.

 We use an `LLM-as-Judge` evaluator (`cot_qa` is one such kind), which gives us chain-of-thought (CoT) contextual accuracy.

 This evaluator uses a predefined prompt, which we can pull from the LangChain Hub:

from langchain import hub

prompt = hub.pull("langchain-ai/cot_qa")


Each `LangChainStringEvaluator` takes three inputs:
* input: the question defined in the dataset
* prediction: the answer produced by the RAG chain (its response to the question)
* reference: the ground truth, i.e. the answer from the dataset

In [15]:
# RAG chain

def predict_rag_answer(examples:dict):
    """Run the RAG chain on one dataset example, for answer evaluation.

    Args:
        examples: a single dataset example's inputs; must contain 'question'.

    Returns:
        A dict with the single key 'answer' holding the chain's answer.
    """
    response = rag_bot.get_answer(examples['question'])
    # The key must be spelled 'answer': the evaluator reads
    # run.outputs["answer"], so the previous 'answwer' caused a KeyError.
    return {'answer': response['answer']}

def predict_rag_answe_with_context(examples:dict):
    """Run the RAG chain on one example and return its answer and contexts."""
    question = examples['question']
    rag_output = rag_bot.get_answer(question)
    # Forward only the two fields the evaluators consume.
    return {field: rag_output[field] for field in ('answer', 'contexts')}
    

In [16]:

# RAG chain
# A dataset is a collection of examples; each example's inputs hold one question.
# predict_rag_answer takes one example, sends its question through the RAG bot,
# and returns only the answer part of the response (as a dict).

def predict_rag_answer(example: dict):
    """Answer-evaluation target: return just the RAG chain's answer."""
    question = example["question"]
    rag_output = rag_bot.get_answer(question)
    return {"answer": rag_output["answer"]}


# predict_rag_answer_with_context does the same thing, but returns the
# contexts along with the answer from the response.
# The response is a dict with the keys "answer" and "contexts".
def predict_rag_answer_with_context(example: dict):
    """Target for evaluating retrieved documents and hallucinations."""
    question = example["question"]
    rag_output = rag_bot.get_answer(question)
    return {key: rag_output[key] for key in ("answer", "contexts")}

In [17]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
# "cot_qa" LLM-as-judge evaluator. prepare_data maps each run/example pair
# onto the three fields the string evaluator consumes:
#   prediction -> answer produced by the RAG chain
#   reference  -> ground-truth answer stored in the dataset
#   input      -> the question from the dataset
qa_evalulator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        },
    )
]
dataset_name = "practice_erag_evaluation_dataset"
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="rag-qa-oai",
    # Label the experiment with the model rag_bot actually uses; the previous
    # hard-coded "gpt-3.5-turbo" did not match the bot's configured model.
    metadata={"variant": f"LCEL context, {rag_bot.model}"},
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'rag-qa-oai-6b06aa9d' at:
https://smith.langchain.com/o/0d3dfbd3-48b9-4b11-9c49-27d343c90e06/datasets/5204bc66-d277-4dfe-aae5-1b4d1259d56a/compare?selectedSessions=4ba625b2-65d4-4b02-8ddd-25f2610a5896




3it [00:24,  8.19s/it]


In [21]:
# Type 2: Answer hallucination
# In this scenario, we compare the RAG chain's answer to the retrieved documents.
# To compare RAG chain answers with retrieved documents we commonly use a criteria evaluator;
# here we use `labeled_score_string` as the LLM-as-judge evaluator.

# Here we need only two inputs: the RAG chain answer and the retrieved documents (context).

from langsmith.evaluation import LangChainStringEvaluator, evaluate

# LLM-as-judge evaluator that scores how well the RAG answer is grounded in
# the retrieved documents. The rubric wording was garbled ("contains is not
# at all", "Groun Truth", "upon the in the"); it is corrected here so the
# judge receives an unambiguous grading scale.
answer_hallucination_evaluator = LangChainStringEvaluator(
    'labeled_score_string',
    config={
        "criteria": {
            'accuracy': """Is the Assistant's Answer grounded in the Ground Truth documentation? A score of [[1]] means that the
            Assistant answer is not at all based upon / grounded in the Ground Truth documentation. A score of [[5]] means
            that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth
            documentation. A score of [[10]] means that the Assistant answer is fully based upon the Ground Truth documentation."""
        },
        # NOTE(review): presumably rescales the 1-10 judge score into 0-1 -- confirm in the evaluator docs.
        'normalize_by': 10,
    },
    # The "reference" here is the retrieved contexts from the run itself, not
    # the dataset's ground-truth answer: we are checking grounding only.
    prepare_data=lambda run, example: {
        'prediction': run.outputs['answer'],
        'reference': run.outputs['contexts'],
        'input': example.inputs['question'],
    },
)

In [22]:
# Run the hallucination evaluation: each dataset question goes through the
# RAG chain (answer + contexts) and is scored by the grounding evaluator.
dataset_name = "practice_erag_evaluation_dataset"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-qa-oai-hallucination",
    # Any experiment metadata can be specified here.
    # Label the experiment with the model rag_bot actually uses; the previous
    # hard-coded "gpt-3.5-turbo" did not match the bot's configured model.
    metadata={"variant": f"LCEL context, {rag_bot.model}"},
)

View the evaluation results for experiment: 'rag-qa-oai-hallucination-7bc8b1aa' at:
https://smith.langchain.com/o/0d3dfbd3-48b9-4b11-9c49-27d343c90e06/datasets/5204bc66-d277-4dfe-aae5-1b4d1259d56a/compare?selectedSessions=4b299e61-245c-4bd3-ba96-b614b8f58a8b




3it [00:30, 10.32s/it]
