In [None]:
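# Dependencies used by this notebook (uncomment to install into the kernel's environment):
# %pip install llama-index llama-index-readers-web llama-index-vector-stores-mongodb firecrawl-py pymongo python-dotenv trulens_eval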

In [None]:
from llama_index.readers.web import FireCrawlWebReader
from dotenv import load_dotenv
import os, pymongo, pprint
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.settings import Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

## Initialize Keys

In [None]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Firecrawl API key and MongoDB Atlas connection string; each is None when its variable is unset.
FIRECRAWL_API = os.environ.get('FIRECRAWL_API')
ATLAS_CONNECTION_STRING = os.environ.get('ATLAS_URI')

# This project stores the OpenAI key as OPENAI_KEY and republishes it as OPENAI_API_KEY.
# Assigning None into os.environ raises an opaque TypeError, so resolve the key first
# (falling back to an already-exported OPENAI_API_KEY) and fail with a clear message.
_openai_key = os.environ.get("OPENAI_KEY") or os.environ.get("OPENAI_API_KEY")
if not _openai_key:
    raise EnvironmentError(
        "OpenAI API key not found: set OPENAI_KEY (or OPENAI_API_KEY) in the environment or .env file."
    )
os.environ["OPENAI_API_KEY"] = _openai_key

## Crawl & Load Documents

In [None]:
# Firecrawl-backed reader. mode="crawl" is used here; "scrape" is the single-page alternative.
firecrawl_reader = FireCrawlWebReader(mode="crawl", api_key=FIRECRAWL_API)

In [None]:
# Load documents starting from the TruEra home page (the reader above is in "crawl" mode,
# so this is not limited to a single page)
documents = firecrawl_reader.load_data(
    url="https://truera.com/",
)

In [None]:
# Blank out the 'ogLocaleAlternate' metadata entry on every loaded document
for crawled_doc in documents:
    crawled_doc.metadata["ogLocaleAlternate"] = None

In [None]:
import pickle

# On-disk cache for the loaded documents
file_path = "documents.pkl"

# NOTE: an existing cache file takes precedence: `documents` is replaced by the pickled
# copy. Delete documents.pkl first if the freshly loaded documents should be kept and saved.
# NOTE(security): unpickling can execute arbitrary code; only load a documents.pkl you created.
if os.path.exists(file_path):
    # Cache hit: deserialize the documents from the binary file
    with open(file_path, "rb") as cache_file:
        documents = pickle.load(cache_file)
    print("Document loaded successfully.")
else:
    # Cache miss: serialize the in-memory documents to the binary file
    with open(file_path, "wb") as cache_file:
        pickle.dump(documents, cache_file)
    print("Document saved successfully.")

## Create Vector Embeddings

In [None]:

# Global LlamaIndex settings: embedding model and LLM
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.llm = OpenAI(model="gpt-4-turbo")
# Chunking is not configured here; set Settings.chunk_size / Settings.chunk_overlap to tune it.

In [None]:
# Client for the Atlas cluster identified by the ATLAS_URI connection string
mongodb_client = pymongo.MongoClient(ATLAS_CONNECTION_STRING)

# Vector store over collection truera_db.web_docs, using the truera_vector_index search index
atlas_vector_search = MongoDBAtlasVectorSearch(
    mongodb_client,
    index_name="truera_vector_index",
    collection_name="web_docs",
    db_name="truera_db",
)

# Storage context that points the index at the Atlas vector store
vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_search)

In [None]:
# Build the index from the loaded documents on top of the Atlas storage context.
# NOTE(review): re-running this cell presumably embeds and stores the documents again;
# confirm before running it against an already populated collection.
vector_store_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
    storage_context=vector_store_context,
)

In [None]:
# Retriever over the index, limited to the 5 most similar results per query
vector_store_retriever = VectorIndexRetriever(similarity_top_k=5, index=vector_store_index)
# Query engine that answers questions from what the retriever returns
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

## Load Embeddings

In [None]:
# Rebuild the query engine from vectors that already exist in Atlas (no re-embedding of documents).
# NOTE(review): Settings.embed_model is not set in this cell, so query embedding relies on
# whatever is already configured. Confirm it matches the model used to build the index.
Settings.llm = OpenAI(model="gpt-4-turbo")

# Client for the Atlas cluster identified by the ATLAS_URI connection string
mongodb_client = pymongo.MongoClient(ATLAS_CONNECTION_STRING)

# Vector store over collection truera_db.web_docs, using the truera_vector_index search index
atlas_vector_search = MongoDBAtlasVectorSearch(
    mongodb_client,
    index_name="truera_vector_index",
    collection_name="web_docs",
    db_name="truera_db",
)

# Index view over the existing vector store
vector_store_index = VectorStoreIndex.from_vector_store(atlas_vector_search)

# Retriever limited to the 5 most similar results, wrapped in a query engine
vector_store_retriever = VectorIndexRetriever(similarity_top_k=5, index=vector_store_index)
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

In [None]:
# Smoke-test the query engine with one question (product name spelled "TruLens", matching the test queries)
query_engine.query("what is TruLens?")

In [None]:
# TruLens entry-point object; later cells use it to fetch records, the leaderboard and the dashboard.
from trulens_eval import Tru
tru = Tru()

In [None]:
import numpy as np

from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback import Groundedness
# Imported under an alias on purpose: binding this class to the bare name `OpenAI` would
# replace the llama_index LLM class imported at the top of the notebook, so re-running a
# `Settings.llm = OpenAI(...)` cell afterwards would silently install the wrong class.
from trulens_eval.feedback.provider.openai import OpenAI as TruLensOpenAI

# Feedback provider, shared by all three feedback functions below
openai = TruLensOpenAI()

# Select the context to be used in feedback. The location of context is app specific.
context = App.select_context(query_engine)

# Groundedness: checks the answer against the retrieved context
grounded = Groundedness(groundedness_provider=openai)
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect())  # collect context chunks into a list
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Question/statement relevance between question and each context chunk,
# averaged over the chunks.
f_qs_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

In [None]:
from trulens_eval import TruLlama

# Recorder around the query engine, registered under the SupportSage_App id and
# evaluated with the three feedback functions.
tru_query_engine_recorder = TruLlama(
    query_engine,
    feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance],
    app_id='SupportSage_App',
)

In [None]:
# Evaluation questions sent through the recorded query engine.
# Every entry needs its own trailing comma: adjacent string literals without one are
# silently concatenated into a single query.
test_queries = [
    "What does TruEra do?",
    "Who uses TruEra?",
    "How does TruEra interact with my models and data?",
    "What are the security and privacy policies?",
    "Can TruEra ingest data from my local files?",
    "How does Truera help HR organizations",
    "Why use TruEra for HR?",
    "Why use TruEra for banking?",
    "What drives the Truera company?",
    "What products does truera offer?",
    "What is truera's culture",
    "When to use TruLens vs TruEra",
    "why should i pick TruEra?",
    "what is TruEra?",
    "what is TruLens?",
]

# Send each test question through the query engine inside the recorder context
with tru_query_engine_recorder as recording:
    for question in test_queries:
        query_engine.query(question)

In [None]:
# The records of the app invocations made inside the recorder context can be read from `recording`:

# rec = recording.get() # use .get if only one record
recs = recording.records # use .records if multiple

display(recs)

In [None]:
# Fetch what TruLens stored for this app id: the records, plus `feedback`
# (presumably the names of the feedback columns; TODO confirm against the TruLens docs).
records, feedback = tru.get_records_and_feedback(app_ids=["SupportSage_App"])

# Preview only the first rows instead of dumping the whole result.
records.head()

In [None]:
# Show the leaderboard restricted to this app id (presumably aggregated feedback scores; confirm in the TruLens docs).
tru.get_leaderboard(app_ids=["SupportSage_App"])

In [None]:
# Launch the TruLens dashboard to browse the recorded results interactively.
tru.run_dashboard()