In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")
## Langsmith Tracking
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGSMITH_ENDPOINT"]=os.getenv("LANGSMITH_ENDPOINT")
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")

In [2]:
## Data Ingestion -from website
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
docs=WebBaseLoader("https://docs.langchain.com/langsmith/evaluation-concepts")
docs

<langchain_community.document_loaders.web_base.WebBaseLoader at 0x1a2929a8910>

In [4]:
docs=docs.load()
docs

[Document(metadata={'source': 'https://docs.langchain.com/langsmith/evaluation-concepts', 'title': 'Evaluation concepts - Docs by LangChain', 'language': 'en'}, page_content='Evaluation concepts - Docs by LangChainSkip to main contentDocs by LangChain home pageLangSmithSearch...⌘KSupportGitHubTry LangSmithTry LangSmithSearch...NavigationEvaluation conceptsGet startedObservabilityEvaluationPrompt engineeringDeploymentPlatform setupReferenceOverviewQuickstartConceptsEvaluation approachesDatasetsCreate a datasetManage datasetsCustom output renderingSet up evaluationsRun an evaluationEvaluation typesFrameworks & integrationsEvaluation techniquesImprove evaluatorsTutorialsAnalyze experiment resultsAnalyze an experimentCompare experiment resultsFilter experiments in the UIFetch performance metrics for an experimentUpload experiments run outside of LangSmithAnnotation & human feedbackUse annotation queuesSet up feedback criteriaAnnotate traces and runs inlineAudit evaluator scoresCommon data 

In [5]:
## Load Data --> Docs --> Divide our text into chunks --> textVectors --> Vector Embedding --> vector Storing
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
document = text_splitter.split_documents(docs)


In [6]:
document

[Document(metadata={'source': 'https://docs.langchain.com/langsmith/evaluation-concepts', 'title': 'Evaluation concepts - Docs by LangChain', 'language': 'en'}, page_content='Evaluation concepts - Docs by LangChainSkip to main contentDocs by LangChain home pageLangSmithSearch...⌘KSupportGitHubTry LangSmithTry LangSmithSearch...NavigationEvaluation conceptsGet startedObservabilityEvaluationPrompt engineeringDeploymentPlatform setupReferenceOverviewQuickstartConceptsEvaluation approachesDatasetsCreate a datasetManage datasetsCustom output renderingSet up evaluationsRun an evaluationEvaluation typesFrameworks & integrationsEvaluation techniquesImprove evaluatorsTutorialsAnalyze experiment resultsAnalyze an experimentCompare experiment resultsFilter experiments in the UIFetch performance metrics for an experimentUpload experiments run outside of LangSmithAnnotation & human feedbackUse annotation queuesSet up feedback criteriaAnnotate traces and runs inlineAudit evaluator scoresCommon data 

In [7]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [8]:
from langchain_community.vectorstores import FAISS
vektorstoredb = FAISS.from_documents(document, embeddings)

In [9]:
vektorstoredb

<langchain_community.vectorstores.faiss.FAISS at 0x1a2abb3d000>

In [10]:
## Query From a vector db
query="LLM outputs are non-deterministic, which makes response quality hard to assess."
result=vektorstoredb.similarity_search(query)
result[0].page_content

'LLM-as-judge evaluators require careful review of scores and prompt tuning. Few-shot evaluators, which include examples of inputs, outputs, and expected grades in the grader prompt, often improve performance.\nLearn about how to define an LLM-as-a-judge evaluator.\n\u200bPairwise\nPairwise evaluators compare outputs from two application versions using heuristics (e.g., which response is longer), LLMs (with pairwise prompts), or human reviewers.\nPairwise evaluation works well when directly scoring an output is difficult but comparing two outputs is straightforward. For example, in summarization tasks, choosing the more informative of two summaries is often easier than assigning an absolute score to a single summary.\nLearn how run pairwise evaluations.\n\u200bReference-free vs reference-based evaluators\nUnderstanding whether an evaluator requires reference outputs is essential for determining when it can be used.'

In [11]:
from langchain_openai import ChatOpenAI
llm=ChatOpenAI(model="gpt-4o-mini")

In [12]:
## Retrieval Chain, Document chain

from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

prompt=ChatPromptTemplate.from_template(
    """
Answer the following question based only on the provided context:
<context>
{context}
</context>


"""
)

document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nAnswer the following question based only on the provided context:\n<context>\n{context}\n</context>\n\n\n'), additional_kwargs={})])
| ChatOpenAI(profile={'max_input_tokens': 128000, 'max_output_tokens': 16384, 'image_inputs': True, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True, 'structured_output': True, 'image_url_inputs': True, 'pdf_inputs': True, 'pdf_tool_message': True, 'image_tool_message': True, 'tool_choice': True}, client=<openai.resources.chat.completions.co

In [13]:
from langchain_core.documents import Document
document_chain.invoke({
    "input":"What is Evaluation concepts?",
    "context":[Document(page_content="LLM outputs are non-deterministic, which makes response quality hard to assess. Evaluations (evals) are a way to breakdown what “good” looks like and measure it. LangSmith Evaluation provides a framework for measuring quality throughout the application lifecycle, from pre-deployment testing to production monitoring. ")]
})

'What is LangSmith Evaluation used for? \n\nLangSmith Evaluation is used for measuring quality throughout the application lifecycle, from pre-deployment testing to production monitoring.'

In [14]:
### Input--->Retrieval ---> Document Chain ---> LLM ---> Output

vektorstoredb

<langchain_community.vectorstores.faiss.FAISS at 0x1a2abb3d000>

In [17]:
retriever=vektorstoredb.as_retriever()
from langchain_classic.chains import create_retrieval_chain
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [18]:
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001A2ABB3D000>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nAnswer the following question based only on the provided context:\n<context>\n{context}\n</context>\n\n\n'), additional_kwargs={})])
            | 

In [20]:
response=retrieval_chain.invoke({
    "input":"What is Evaluation concepts?"
})
response['answer']

'What is the primary purpose of evaluations in the context of LangSmith Evaluation? \n\nThe primary purpose of evaluations in the context of LangSmith Evaluation is to breakdown what "good" looks like and measure the quality of systems throughout the application lifecycle, from pre-deployment testing to production monitoring. Evaluations help in assessing response quality, especially in the presence of non-deterministic outputs.'

In [21]:
response

{'input': 'What is Evaluation concepts?',
 'context': [Document(id='dbb2d00a-5dc7-409d-a3bd-f3a634af1a5a', metadata={'source': 'https://docs.langchain.com/langsmith/evaluation-concepts', 'title': 'Evaluation concepts - Docs by LangChain', 'language': 'en'}, page_content='Evaluation concepts - Docs by LangChainSkip to main contentDocs by LangChain home pageLangSmithSearch...⌘KSupportGitHubTry LangSmithTry LangSmithSearch...NavigationEvaluation conceptsGet startedObservabilityEvaluationPrompt engineeringDeploymentPlatform setupReferenceOverviewQuickstartConceptsEvaluation approachesDatasetsCreate a datasetManage datasetsCustom output renderingSet up evaluationsRun an evaluationEvaluation typesFrameworks & integrationsEvaluation techniquesImprove evaluatorsTutorialsAnalyze experiment resultsAnalyze an experimentCompare experiment resultsFilter experiments in the UIFetch performance metrics for an experimentUpload experiments run outside of LangSmithAnnotation & human feedbackUse annotatio

In [22]:
response['context']

[Document(id='dbb2d00a-5dc7-409d-a3bd-f3a634af1a5a', metadata={'source': 'https://docs.langchain.com/langsmith/evaluation-concepts', 'title': 'Evaluation concepts - Docs by LangChain', 'language': 'en'}, page_content='Evaluation concepts - Docs by LangChainSkip to main contentDocs by LangChain home pageLangSmithSearch...⌘KSupportGitHubTry LangSmithTry LangSmithSearch...NavigationEvaluation conceptsGet startedObservabilityEvaluationPrompt engineeringDeploymentPlatform setupReferenceOverviewQuickstartConceptsEvaluation approachesDatasetsCreate a datasetManage datasetsCustom output renderingSet up evaluationsRun an evaluationEvaluation typesFrameworks & integrationsEvaluation techniquesImprove evaluatorsTutorialsAnalyze experiment resultsAnalyze an experimentCompare experiment resultsFilter experiments in the UIFetch performance metrics for an experimentUpload experiments run outside of LangSmithAnnotation & human feedbackUse annotation queuesSet up feedback criteriaAnnotate traces and ru