In [None]:
!pip install "langchain>=0.1.0" langchain-openai "openai>=1" "arize-phoenix[evals]" tiktoken nest-asyncio

In [None]:
import json
import os
from getpass import getpass
from urllib.request import urlopen

import nest_asyncio
import numpy as np
import pandas as pd
import phoenix as px
from langchain.chains import RetrievalQA
from langchain.retrievers import KNNRetriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations
from phoenix.trace.langchain import LangChainInstrumentor
from tqdm import tqdm


from kaggle_secrets import UserSecretsClient

# Patch asyncio so the notebook's already-running event loop can be re-entered;
# needed for concurrent evals in notebook environments.
nest_asyncio.apply()

In [None]:
# Start the Phoenix app and keep a handle to the returned session object.
session = px.launch_app()

In [None]:

# Set up the OpenAI API connection: read the key stored under the Kaggle secret
# "openaivision" and export it as the OPENAI_API_KEY environment variable.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("openaivision")
os.environ["OPENAI_API_KEY"] = api_key

In [None]:
# build the LangChain application

# Knowledge base with one text chunk and its stored embedding per row. Fetched over
# HTTPS (rather than plain HTTP) so the download cannot be tampered with in transit.
df = pd.read_parquet(
    "https://storage.googleapis.com/arize-phoenix-assets/datasets/"
    "unstructured/llm/context-retrieval/langchain/database.parquet"
)

# Retriever over the stored vectors: `text_vector` rows are stacked into a single
# index array, aligned by position with the `text` column.
knn_retriever = KNNRetriever(
    index=np.stack(df["text_vector"]),
    texts=df["text"].tolist(),
    embeddings=OpenAIEmbeddings(),
)

chain_type = "stuff"  # stuff, refine, map_reduce, and map_rerank
chat_model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=chat_model_name)

# Question-answering chain that answers from the retrieved documents.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=chain_type,
    retriever=knn_retriever,
    metadata={"application_type": "question_answering"},
)

In [None]:
# Turn on Phoenix's LangChain instrumentation.
# NOTE(review): presumably this is what records traces for the chain calls below so
# they can be queried through px.Client() later — confirm.
LangChainInstrumentor().instrument()

In [None]:
# Download the sample queries (JSON Lines: one JSON object with a "query" field per
# line). HTTPS is used instead of plain HTTP so the file cannot be altered in transit.
url = "https://storage.googleapis.com/arize-phoenix-assets/datasets/unstructured/llm/context-retrieval/arize_docs_queries.jsonl"
queries = []
with urlopen(url) as response:
    for line in response:
        line = line.decode("utf-8").strip()
        if not line:
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it
            # rather than letting json.loads raise on an empty string.
            continue
        data = json.loads(line)
        queries.append(data["query"])
# Preview the first few queries as the cell output.
queries[:10]

Run a few queries.

In [None]:
# Send the first ten queries through the chain, with a progress bar. The responses
# are not kept here; the calls are made for their side effects.
sample_queries = queries[:10]
for query in tqdm(sample_queries):
    chain.invoke(query)

In [None]:
# Pull the two evaluation inputs from Phoenix through one shared client:
# question/answer pairs with their reference text, and the retrieved documents.
phoenix_client = px.Client()
queries_df = get_qa_with_reference(phoenix_client)
retrieved_documents_df = get_retrieved_documents(phoenix_client)

In [None]:
# One judge model is shared by all three evaluators.
eval_model = OpenAIModel(model="gpt-4-turbo-preview")

# Query-level evals: hallucination and QA correctness are run together over the
# question/answer dataframe; the results are unpacked in evaluator order.
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
query_eval_results = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
hallucination_eval_df, qa_correctness_eval_df = query_eval_results

# Document-level eval: relevance over the retrieved documents. Only one evaluator
# is passed, so the first (and only) result is taken.
relevance_evaluator = RelevanceEvaluator(eval_model)
document_eval_results = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)
relevance_eval_df = document_eval_results[0]

# Send all three result sets back to Phoenix in a single call.
evaluations = [
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
    DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
]
px.Client().log_evaluations(*evaluations)