In [None]:
! pip install langsmith langchain-community langchain chromadb tiktoken

Collecting langsmith
  Downloading langsmith-0.1.63-py3-none-any.whl (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.8/122.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)
Collecting openai<2.0.0,>=1.24.0 (from langchain_openai)
  Downloading openai-1.30.4-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai, langchain_openai
Successfully installed langchain_openai-0.1.7 openai-1.30.4


In [10]:
from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader



# Load
def _html_to_text(raw_html):
    """Return only the text content of a fetched HTML page."""
    return Soup(raw_html, "html.parser").text


# Crawl starting from the LCEL docs root, following links up to max_depth.
url = "https://python.langchain.com/docs/expression_language/"
loader = RecursiveUrlLoader(url=url, max_depth=20, extractor=_html_to_text)
docs = loader.load()


In [11]:
# Split
# Chunk the loaded pages; consecutive chunks share chunk_overlap characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)


In [12]:

# Embed
import getpass
import os

# SECURITY: an API key must never be written into the notebook. The key that
# was previously hardcoded in this cell has to be treated as compromised and
# rotated. Read the key from the environment, prompting only when it is unset.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

# OpenAIEmbeddings picks the key up from OPENAI_API_KEY when none is passed.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Index
retriever = vectorstore.as_retriever()

In [16]:
# Set the environment variable
import getpass
import os

# Only prompt when the key is missing. Assigning a literal here would both
# leak a credential into the notebook and overwrite a key that is already
# configured in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

In [17]:
### RAG

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Small RAG pipeline: fetch documents with a retriever, then answer with an OpenAI chat model."""

    def __init__(self, retriever, model: str = "gpt-4-0125-preview"):
        """Store the retriever and model name and create a wrapped OpenAI client."""
        self._model = model
        self._retriever = retriever
        # wrap_openai instruments the client so the LLM calls are traced.
        self._client = wrap_openai(openai.Client())

    @traceable()
    def retrieve_docs(self, question):
        """Return whatever the retriever yields for ``question``."""
        retrieved = self._retriever.invoke(question)
        return retrieved

    @traceable()
    def get_answer(self, question: str):
        """Answer ``question`` using the retrieved documents as context.

        Returns a dict with the model's reply under "answer" and the
        stringified retrieved documents under "contexts".
        """
        retrieved = self.retrieve_docs(question)

        # The retrieved documents are interpolated into the system message.
        system_prompt = (
            "You are a helpful AI code assistant with expertise in LCEL."
            " Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{retrieved}"
        )
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        completion = self._client.chat.completions.create(
            model=self._model,
            messages=chat_messages,
        )

        # Downstream evaluators read the "answer" and "contexts" keys.
        answer_text = completion.choices[0].message.content
        context_strings = [str(doc) for doc in retrieved]
        return {"answer": answer_text, "contexts": context_strings}


# Shared bot instance used by the prediction functions below; uses RagBot's default model.
rag_bot = RagBot(retriever=retriever)

In [18]:
# Quick sanity check: ask one question and display the start of the answer.
PREVIEW_CHARS = 150
response = rag_bot.get_answer("What is LCEL?")
response["answer"][:PREVIEW_CHARS]

'LangChain Expression Language (LCEL) is a declarative language designed to easily compose chains together. Its primary goal is to support the developm'

In [19]:
import getpass
import os


def _set_env(var: str):
    """Prompt (without echo) for environment variable ``var`` unless it already has a non-empty value."""
    if os.environ.get(var):
        # Already configured; leave the existing value untouched.
        return
    os.environ[var] = getpass.getpass(f"{var}: ")


# Fixed LangSmith tracing settings.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

# Secrets are prompted for only when they are not already set.
for required_key in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    _set_env(required_key)

LANGCHAIN_API_KEY: ··········


In [20]:
from langsmith import Client

# QA
inputs = [
    "How can I directly pass a string to a runnable and use it to construct the input needed for my prompt?",
    "How can I make the output of my LCEL chain a string?",
    "How can I apply a custom function to one of the inputs of an LCEL chain?",
]

outputs = [
    "Use RunnablePassthrough. from langchain_core.runnables import RunnableParallel, RunnablePassthrough; from langchain_core.prompts import ChatPromptTemplate; from langchain_openai import ChatOpenAI; prompt = ChatPromptTemplate.from_template('Tell a joke about: {input}'); model = ChatOpenAI(); runnable = ({'input' : RunnablePassthrough()} | prompt | model); runnable.invoke('flowers')",
    "Use StrOutputParser. from langchain_openai import ChatOpenAI; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.output_parsers import StrOutputParser; prompt = ChatPromptTemplate.from_template('Tell me a short joke about {topic}'); model = ChatOpenAI(model='gpt-3.5-turbo') #gpt-4 or other LLMs can be used here; output_parser = StrOutputParser(); chain = prompt | model | output_parser",
    "Use RunnableLambda with itemgetter to extract the relevant key. from operator import itemgetter; from langchain_core.prompts import ChatPromptTemplate; from langchain_core.runnables import RunnableLambda; from langchain_openai import ChatOpenAI; def length_function(text): return len(text); chain = ({'prompt_input': itemgetter('foo') | RunnableLambda(length_function),} | prompt | model); chain.invoke({'foo':'hello world'})",
]

# zip() silently truncates to the shorter list, so fail loudly on a mismatch.
assert len(inputs) == len(outputs), "every question needs a reference answer"
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]

# Create dataset
client = Client()
dataset_name = "RAG_test_LCEL"
if client.has_dataset(dataset_name=dataset_name):
    # Re-running the cell: reuse the existing dataset instead of failing on a
    # name conflict, and do not upload the examples a second time.
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="QA pairs about LCEL.",
    )
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

In [21]:
# RAG chain
def predict_rag_answer(example: dict):
    """Evaluation target that returns only the bot's answer for ``example["question"]``."""
    question = example["question"]
    return {"answer": rag_bot.get_answer(question)["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Evaluation target that returns the bot's answer together with the retrieved contexts."""
    result = rag_bot.get_answer(example["question"])
    return {field: result[field] for field in ("answer", "contexts")}

In [23]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
# "cot_qa" compares the run's answer with the dataset's reference answer.
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        },
    )
]
# Backward-compatible alias for the original (misspelled) name.
qa_evalulator = qa_evaluator

dataset_name = "RAG_test_LCEL"
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="rag-qa-oai",
    # Derive the label from the bot so the recorded variant always matches the
    # model that actually produced the answers.
    metadata={"variant": f"LCEL context, {rag_bot._model}"},
)

View the evaluation results for experiment: 'rag-qa-oai-37a3e6cd' at:
https://smith.langchain.com/o/e8f9edfe-5705-5c8b-af73-f907cb69efb9/datasets/8a965e50-b609-49da-85f1-2c58b7ae2676/compare?selectedSessions=e0cba317-60d9-4030-b923-8281c17d103b




0it [00:00, ?it/s]

In [24]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Scores how well the answer is grounded in the retrieved contexts, which are
# passed to the grader as the "reference".
answer_hallucination_evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {
            "accuracy": """Is the Assistant's Answer grounded in the Ground Truth documentation? A score of [[1]] means that the
            Assistant answer is not at all based upon / grounded in the Ground Truth documentation. A score of [[5]] means
            that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth
            documentation. A score of [[10]] means that the Assistant answer is fully based upon the Ground Truth documentation."""
        },
        # If you want the score to be saved on a scale from 0 to 1
        "normalize_by": 10,
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["contexts"],
        "input": example.inputs["question"],
    },
)

In [25]:
dataset_name = "RAG_test_LCEL"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-qa-oai-hallucination",
    # Any experiment metadata can be specified here
    metadata={
        # Derived from the bot so the label cannot drift from the model that
        # actually generated the answers.
        "variant": f"LCEL context, {rag_bot._model}",
    },
)

View the evaluation results for experiment: 'rag-qa-oai-hallucination-3d3de570' at:
https://smith.langchain.com/o/e8f9edfe-5705-5c8b-af73-f907cb69efb9/datasets/8a965e50-b609-49da-85f1-2c58b7ae2676/compare?selectedSessions=1b951d32-c771-486e-bf3f-6b15a7e172b4




0it [00:00, ?it/s]

In [26]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate
import textwrap

# Scores the retrieved documents (passed as the "prediction") against the question.
docs_relevance_evaluator = LangChainStringEvaluator(
    "score_string",
    config={
        "criteria": {
            # The string starts with a line continuation so that every line
            # carries the same indentation; otherwise textwrap.dedent finds no
            # common margin and strips nothing.
            "document_relevance": textwrap.dedent(
                """\
                The response is a set of documents retrieved from a vectorstore. The input is a question
                used for retrieval. You will score whether the Assistant's response (retrieved docs) is relevant to the Ground Truth
                question. A score of [[1]] means that none of the Assistant's response documents contain information useful in answering or addressing the user's input.
                A score of [[5]] means that the Assistant answer contains some relevant documents that can at least partially answer the user's question or input.
                A score of [[10]] means that the user input can be fully answered using the content in the first retrieved doc(s)."""
            )
        },
        # If you want the score to be saved on a scale from 0 to 1
        "normalize_by": 10,
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["contexts"],
        "input": example.inputs["question"],
    },
)



In [27]:
dataset_name = "RAG_test_LCEL"

# Free-form experiment metadata recorded alongside the results.
doc_relevance_metadata = {"variant": "LCEL context, gpt-4"}

experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-qa-oai-doc-relevance",
    metadata=doc_relevance_metadata,
)

View the evaluation results for experiment: 'rag-qa-oai-doc-relevance-26235bd1' at:
https://smith.langchain.com/o/e8f9edfe-5705-5c8b-af73-f907cb69efb9/datasets/8a965e50-b609-49da-85f1-2c58b7ae2676/compare?selectedSessions=ce23103a-1ec3-40c5-aa1a-2219ebd406de




0it [00:00, ?it/s]

In [28]:
from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

def document_relevance_grader(root_run: Run, example: Example) -> dict:
    """Grade whether the documents retrieved during a run are relevant to its question.

    Finds the "get_answer" child of ``root_run`` and, beneath it, the
    "retrieve_docs" run. The retrieved page contents and the retrieval question
    are sent to an LLM grader that returns a binary score.

    Args:
        root_run: Traced run whose child runs include "get_answer".
        example: Dataset example; not used by this grader.

    Returns:
        A dict with key "document_relevance" and an integer score.
    """

    # Schema the grader LLM must fill in.
    class GradeDocuments(BaseModel):
        """Binary score for relevance check on retrieved documents."""
        binary_score: int = Field(description="Documents are relevant to the question, 1 or 0")

    # Walk the trace: root -> get_answer -> retrieve_docs.
    pipeline_run = next(child for child in root_run.child_runs if child.name == "get_answer")
    retrieval_run = next(child for child in pipeline_run.child_runs if child.name == "retrieve_docs")
    retrieved_text = "\n\n".join(doc.page_content for doc in retrieval_run.outputs["output"])
    user_question = retrieval_run.inputs["question"]

    # Grader model constrained to the schema above.
    grader_llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
    structured_grader = grader_llm.with_structured_output(GradeDocuments)

    system_msg = """You are a grader assessing relevance of a retrieved document to a user question. \n
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
        It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
        Give a binary score 1 or 0 score, where 1 means that the document is relevant to the question."""
    human_msg = "Retrieved document: \n\n {document} \n\n User question: {question}"
    prompt = ChatPromptTemplate.from_messages([("system", system_msg), ("human", human_msg)])

    grading_chain = prompt | structured_grader
    verdict = grading_chain.invoke({"question": user_question, "document": retrieved_text})
    return {"key": "document_relevance", "score": int(verdict.binary_score)}

def answer_hallucination_grader(root_run: Run, example: Example) -> dict:
    """Grade whether the generated answer is grounded in the retrieved documents.

    Finds the "get_answer" child of ``root_run`` and, beneath it, the
    "retrieve_docs" run. The retrieved page contents and the pipeline's answer
    are sent to an LLM grader that returns a binary score.

    Args:
        root_run: Traced run whose child runs include "get_answer".
        example: Dataset example; not used by this grader.

    Returns:
        A dict with key "answer_hallucination" and an integer score.
    """

    # Schema the grader LLM must fill in.
    class GradeHallucinations(BaseModel):
        """Binary score for hallucination present in generation answer."""

        binary_score: int = Field(description="Answer is grounded in the facts, 1 or 0")

    # Walk the trace: root -> get_answer -> retrieve_docs.
    pipeline_run = next(child for child in root_run.child_runs if child.name == "get_answer")
    retrieval_run = next(child for child in pipeline_run.child_runs if child.name == "retrieve_docs")
    facts_text = "\n\n".join(doc.page_content for doc in retrieval_run.outputs["output"])
    answer_text = pipeline_run.outputs["answer"]

    # Grader model constrained to the schema above.
    grader_llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
    structured_grader = grader_llm.with_structured_output(GradeHallucinations)

    system_msg = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
         Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts."""
    human_msg = "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"
    prompt = ChatPromptTemplate.from_messages([("system", system_msg), ("human", human_msg)])

    grading_chain = prompt | structured_grader
    verdict = grading_chain.invoke({"documents": facts_text, "generation": answer_text})
    return {"key": "answer_hallucination", "score": int(verdict.binary_score)}

from langsmith.evaluation import evaluate

dataset_name = "RAG_test_LCEL"

# Both custom graders inspect the trace of each predict_rag_answer run.
custom_graders = [document_relevance_grader, answer_hallucination_grader]

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=custom_graders,
    experiment_prefix="LCEL context, gpt-3.5-turbo",
)

View the evaluation results for experiment: 'LCEL context, gpt-3.5-turbo-22e157fc' at:
https://smith.langchain.com/o/e8f9edfe-5705-5c8b-af73-f907cb69efb9/datasets/8a965e50-b609-49da-85f1-2c58b7ae2676/compare?selectedSessions=fe92c0a7-6f00-4634-84a1-2c363b2e418c




0it [00:00, ?it/s]