# Setup

In [None]:
# Uncomment to install the notebook's dependencies into the kernel's own environment:
# %pip install langchain langchain-community faiss-cpu sentence-transformers octoai-sdk langchain-text-splitters lxml tiktoken python-dotenv 'arize-phoenix[evals]' openai

In [1]:
from dotenv import load_dotenv
import os

# Pull any variables defined in a local .env file into the process environment.
load_dotenv()

# Look both credentials up front: a missing variable raises KeyError right here
# instead of surfacing later in the notebook.
OCTOAI_API_TOKEN, OPENAI_API_KEY = (
    os.environ[var_name] for var_name in ("OCTOAI_API_TOKEN", "OPENAI_API_KEY")
)

In [2]:
import phoenix as px
# Start the local Phoenix app; the cell output prints the URL to open in a browser.
session = px.launch_app()
# Alternative: launch with a previously saved trace dataset, loaded by its ID.
# session = px.launch_app(trace=px.TraceDataset.load("5f612e9f-e796-469d-8a5c-16aa2ea234c8"))

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [3]:
from phoenix.trace.langchain import LangChainInstrumentor

# Hook Phoenix's instrumentation into LangChain.
# NOTE(review): presumably this has to run before the chain is invoked so that those
# calls are recorded as traces -- confirm against the Phoenix documentation.
LangChainInstrumentor().instrument()

# Ingest Data

In [4]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

In [5]:
# Build one Document per text chunk for every file in the city data directory.
data_dir = "../city_data"

# The splitter is configured identically for every file, so build it once
# instead of once per loop iteration.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=800, chunk_overlap=200, separator="."
)

# Sort the listing so chunk order is reproducible across machines, and skip hidden
# entries (e.g. ".DS_Store") and sub-directories, which are not city text files.
files = sorted(
    name
    for name in os.listdir(data_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(data_dir, name))
)

file_texts = []
for file in files:
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(os.path.join(data_dir, file), encoding="utf-8") as f:
        file_text = f.read()
    # Strip only the final extension so a title that itself contains "." stays intact.
    doc_title = os.path.splitext(file)[0]
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(
            Document(
                page_content=chunked_text,
                metadata={"doc_title": doc_title, "chunk_num": i},
            )
        )

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [7]:
# No model name is passed, so the embedding model is whatever HuggingFaceEmbeddings
# uses by default. The cell output shows a deprecation warning.
embeddings = HuggingFaceEmbeddings()

  warn_deprecated(


In [8]:
# Embed every chunk and index the resulting vectors in a FAISS store.
vector_store = FAISS.from_documents(file_texts, embedding=embeddings)

In [9]:
# Sanity check: total number of chunks collected in file_texts.
len(file_texts)

543

# Search the Data

In [10]:
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
# LLM used to answer the questions.
# NOTE(review): the cell output warns that `model` was transferred to model_kwargs,
# i.e. it is not a declared field of this class -- check whether `model_name` is the
# intended argument for the installed langchain-community version.
# NOTE(review): no API token is passed explicitly; presumably the endpoint reads
# OCTOAI_API_TOKEN from the environment -- confirm.
llm = OctoAIEndpoint(
        model="meta-llama-3.1-8b-instruct",
        max_tokens=2000,
        presence_penalty=0,
        temperature=0.0,
        top_p=0.9,
    )

                model was transferred to model_kwargs.
                Please confirm that model is what you intended.


In [11]:
# Expose the FAISS store as a retriever. No search arguments are given, so the
# library's default search settings apply.
retriever = vector_store.as_retriever()

In [12]:
from langchain.prompts import ChatPromptTemplate
# Prompt for the tour-guide assistant. {question} and {context} are the two
# placeholders that get filled in before the prompt is sent to the LLM.
template="""You are a helpful tour guide. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
def format_docs(docs):
    """Render retrieved documents as plain text for the prompt's context slot.

    Args:
        docs: Iterable of retrieved documents, each with a ``page_content`` string
            and a ``metadata`` dict (``doc_title`` is read when present).

    Returns:
        One string with a "Source: <title>" line followed by the chunk text for
        each document, with documents separated by a blank line.
    """
    sections = []
    for doc in docs:
        title = doc.metadata.get("doc_title", "unknown")
        sections.append(f"Source: {title}\n{doc.page_content}")
    return "\n\n".join(sections)


# The retriever fetches chunks for the incoming question and format_docs turns them
# into readable text, so {context} receives the city title and chunk text instead of
# the string form of a list of Document objects. RunnablePassthrough forwards the
# question unchanged into {question}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
# Load the evaluation questions, one per line. Read as UTF-8 explicitly, and drop
# blank lines so an empty or trailing line does not become an empty query.
with open("test_qs.txt", "r", encoding="utf-8") as f:
    qs = [line.strip() for line in f if line.strip()]
qs

['List the cities from oldest to youngest: Paris, Berlin, San Francisco.',
 'Which historical monuments should I visit in Cairo?',
 'Is Chicago more or less populated than New York?',
 'Compare and contrast night life in Houston and Moscow.',
 'Which city has a more active tech scene? San Francisco or Lisbon?',
 'Which city has a more active financial sector? London or Boston?',
 'Where is the Eiffel Tower located?',
 'When should I visit the Empire State Building in Houston?',
 'Who is Yujian Tang?']

In [15]:
# Run every test question through the chain, keeping the answers in question order.
responses = [chain.invoke(question) for question in qs]

# Run Evals

In [16]:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations

In [17]:
# Read the recorded question/answer data and the retrieved documents back out of
# Phoenix; both lookups go through the same client.
phoenix_client = px.Client()
queries_df = get_qa_with_reference(phoenix_client)
retrieved_documents_df = get_retrieved_documents(phoenix_client)

In [18]:
# A single judge model is shared by all three evaluators.
eval_model = OpenAIModel(model="gpt-4-turbo")
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

# Query-level evals: run_evals yields one result frame per evaluator, in the order
# the evaluators were passed.
span_eval_frames = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
hallucination_eval_df, qa_correctness_eval_df = span_eval_frames

# Document-level eval: a single evaluator, so only the first frame is needed.
document_eval_frames = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)
relevance_eval_df = document_eval_frames[0]

# Attach the eval results to the traces in Phoenix, then save the trace dataset;
# save() prints the dataset ID and path and its return value is kept in trace_id.
client = px.Client()
client.log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
    DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
trace_id = client.get_trace_dataset().save()

WARNI [phoenix.evals.executors] 🐌!! If running llm_classify inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/18 (0.0%) | ⏳ 00:00<? | ?it/s

WARNI [phoenix.evals.executors] 🐌!! If running llm_classify inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/36 (0.0%) | ⏳ 00:00<? | ?it/s

💾 Trace dataset saved to under ID: c45e3e51-926b-4352-837e-4378167ad7b1
📂 Trace dataset path: /Users/yujian/.phoenix/trace_datasets/trace_dataset-c45e3e51-926b-4352-837e-4378167ad7b1.parquet


  df_attributes = pd.DataFrame.from_records(


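First example run: the trace dataset saved from it is listed below (its ID is the one used in the commented-out `px.launch_app` call in the Setup section).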

💾 Trace dataset saved to under ID: 5f612e9f-e796-469d-8a5c-16aa2ea234c8

📂 Trace dataset path: /Users/yujian/.phoenix/trace_datasets/trace_dataset-5f612e9f-e796-469d-8a5c-16aa2ea234c8.parquet
