In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

from trulens_eval import Tru
from trulens_eval import TruCustomApp
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.langchain import Langchain
from trulens_eval.tru_custom_app import instrument
from trulens_eval.feedback import prompts
import custom_prompts

from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

from genai import Client, Credentials
from genai.extensions.langchain import LangChainInterface
from genai.schema import (
    DecodingMethod,
    TextGenerationParameters,
)

load_dotenv()

True

In [2]:
def bam_model(model_id='mistralai/mixtral-8x7b-instruct-v0-1', decoding_method='greedy', max_new_tokens=1000,
              min_new_tokens=1, temperature=0.5, top_k=50, top_p=1, repetition_penalty=1):
    """Build a LangChain-compatible LLM backed by the `genai` client.

    Args:
        model_id: Identifier of the hosted model to call.
        decoding_method: ``'greedy'`` selects greedy decoding; any other value
            selects sampling.
        max_new_tokens: Upper bound on generated tokens.
        min_new_tokens: Lower bound on generated tokens.
        temperature: Sampling temperature (only sent when sampling).
        top_k: Top-k cutoff (only sent when sampling).
        top_p: Nucleus cutoff (only sent when sampling).
        repetition_penalty: Repetition penalty, sent for both decoding modes.

    Returns:
        A ``LangChainInterface`` whose credentials are read from the environment.
    """
    use_greedy = decoding_method == 'greedy'

    # Settings shared by both decoding modes.
    generation_kwargs = {
        "decoding_method": DecodingMethod.GREEDY if use_greedy else DecodingMethod.SAMPLE,
        "max_new_tokens": max_new_tokens,
        "min_new_tokens": min_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    # Sampling knobs are only forwarded when sampling is actually requested.
    if not use_greedy:
        generation_kwargs.update(
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )

    parameters = TextGenerationParameters(**generation_kwargs)

    return LangChainInterface(
        model_id=model_id,
        client=Client(credentials=Credentials.from_env()),
        parameters=parameters,
    )

In [3]:
# Generator LLM for the RAG answers (bam_model defaults, repetition penalty 1.1).
mixtral_llm = bam_model(
    model_id="mistralai/mixtral-8x7b-instruct-v0-1",
    repetition_penalty=1.1,
)

# Optional smoke test - uncomment to check that the endpoint responds:
# prompt = "Tell me about IBM."
# result = mixtral_llm.invoke(prompt)
# print(result)

In [5]:
# Load the MS MARCO sample. Each "contexts" cell is a Python literal stored as
# text; it is parsed and only its first element is kept as the passage.
df = pd.read_csv("data/ms-marco.csv")
df["contexts"] = [ast.literal_eval(raw_contexts)[0] for raw_contexts in df["contexts"]]
df

Unnamed: 0,question,contexts
0,walgreens store sales average,The average Walgreens salary ranges from appro...
1,how much do bartenders make,A bartender’s income is comprised mostly of ti...
2,what is a furuncle boil,"Knowledge center. A boil, also known as a furu..."
3,what can urinalysis detect,Urinalysis: One way to test for bladder cancer...
4,what is vitamin a used for,Since vitamin A is fat-soluble it is not neede...
...,...,...
195,who wrote Nothing compares to you,Nothing Compares 2 U is a song originally wri...
196,cost for relining dentures,When dentures that used to fit are now loose a...
197,what is sherry wine made of,"Sherry vinegar is made from sherry, a fortifie..."
198,dna in bacteria,Bacterial DNA in Human Genomes. A new study fi...


In [6]:
# Initialize the embedding model (configured to run on CPU).
model_name = "intfloat/e5-large-v2"
model_kwargs = dict(device='cpu')

embeddings_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [8]:
# Wrap every context passage in a Document, keeping the originating question as
# metadata. The two columns are iterated in lockstep instead of looking rows up
# with df['question'][i]: that form is a *label* lookup and only works while the
# frame still has the default 0..n-1 index.
data = [
    Document(
        metadata={"question": question},
        page_content=passage,
    )
    for question, passage in zip(df["question"], df["contexts"])
]
data[0]

Document(page_content='The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.The average revenue in 2011 of a Starbuck Store was $1,078,000, up  from $1,011,000 in 2010.    The average ticket (total purchase) at domestic Starbuck stores in  No … vember 2007 was reported at $6.36.    In 2008, the average ticket was flat (0.0% change).In fiscal 2014, Walgreens opened a total of 184 new locations and acquired 84 locations, for a net decrease of 273 after relocations and closings. How big are your stores? The average size for a typical Walgreens is about 14,500 square feet and the sales floor averages about 11,000 square feet. How do we select loca

In [9]:
# FAISS.from_texts needs some text to create the index, so the store is
# bootstrapped with placeholders. They are deleted again immediately: left in
# place they remain searchable and can take up top-k slots in the child-chunk
# search without leading to any parent document.
texts = ["text1", "text2", "text3"]
faiss = FAISS.from_texts(texts, embeddings_model)
faiss.delete(list(faiss.index_to_docstore_id.values()))

# Small child chunks are what gets embedded and searched; the larger parent
# chunks are what the retriever returns.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=25)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=75)

# Parent chunks are kept in memory only (lost when the kernel restarts).
store = InMemoryStore()

# Initialize the ParentDocumentRetriever on top of the (now empty) FAISS index
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=faiss,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Split, embed and index all documents (ids=None lets the retriever assign ids)
parent_document_retriever.add_documents(data, ids=None)

In [10]:
def prompt_generation(context, query):
    """Render the instruction prompt sent to the generator LLM.

    Args:
        context: Retrieved text the model is told to answer from.
        query: The user question; a "?" is appended to it by the template.

    Returns:
        The fully formatted prompt string.
    """
    # NOTE(review): "</s>" is emitted before the "Answer: " slot - confirm this
    # matches the instruct format the target model expects.
    segments = [
        "<s>",
        "[INST] \n",
        "Context: {context} \n",
        "- Take the context above and use that to answer questions in a detailed and professional way. \n",
        "- If you don't know the answer just say \"I don't know\".\n",
        "- Refrain from using any other knowledge other than the text provided.\n",
        "- Don't mention that you are answering from the text, impersonate as if this is coming from your knowledge\n",
        "- For the questions whose answer is not available in the provided context, just say \"I don't know\".\n",
        "Question: {query}? \n",
        "[/INST] \n",
        "</s>\n",
        "Answer: ",
    ]

    qa_template = PromptTemplate.from_template("".join(segments))
    return qa_template.format(context=context, query=query)


def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Consecutive documents are separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"

    rendered = []
    for number, document in enumerate(docs, start=1):
        rendered.append(f"Document {number}:\n\n" + document.page_content)

    print(divider.join(rendered))

In [11]:
# Sanity-check retrieval on a single question before running the full evaluation.
query = "how much do bartenders make"
docs = parent_document_retriever.get_relevant_documents(query)
pretty_print_docs(docs)

Document 1:

the highest incomes in 2011, an average of $26,180 a year. Bartenders employed in full-service restaurants tended to earn somewhat less, averaging about $22,130 a year. Bartenders employed by bars earned an average of $20,230 per year, and bartenders who worked for civic and social organizations
----------------------------------------------------------------------------------------------------
Document 2:

night total (on a good night)...just depends.Report Abuse. no way to tell you how much bartenders make. in wages anything from minimum to $15 an hour. tips, anywhere from $20 to $300 or more a night. depends on a lot of things. those top dollar jobs only come after a lot of experience. If the bar
----------------------------------------------------------------------------------------------------
Document 3:

of bartenders in the U.S. reported annual incomes of between $16,170 and $31,860.Tips make up half or more of bartender's salaries. If a bartender earned $6.00 an h

In [12]:
# Show the raw retrieved objects (text plus metadata) rather than just their text.
docs

[Document(page_content='the highest incomes in 2011, an average of $26,180 a year. Bartenders employed in full-service restaurants tended to earn somewhat less, averaging about $22,130 a year. Bartenders employed by bars earned an average of $20,230 per year, and bartenders who worked for civic and social organizations', metadata={'question': 'how much do bartenders make'}),
 Document(page_content='night total (on a good night)...just depends.Report Abuse. no way to tell you how much bartenders make. in wages anything from minimum to $15 an hour. tips, anywhere from $20 to $300 or more a night. depends on a lot of things. those top dollar jobs only come after a lot of experience. If the bar', metadata={'question': 'how much do bartenders make'}),
 Document(page_content="of bartenders in the U.S. reported annual incomes of between $16,170 and $31,860.Tips make up half or more of bartender's salaries. If a bartender earned $6.00 an hour, their tips generally average out to $12.00 to $18.

In [13]:
# Join the retrieved passages with newlines and render the generator prompt.
context = "\n".join(doc.page_content for doc in docs)
prompt = prompt_generation(context=context, query=query)
print(prompt)

<s>[INST] 
Context: the highest incomes in 2011, an average of $26,180 a year. Bartenders employed in full-service restaurants tended to earn somewhat less, averaging about $22,130 a year. Bartenders employed by bars earned an average of $20,230 per year, and bartenders who worked for civic and social organizations
night total (on a good night)...just depends.Report Abuse. no way to tell you how much bartenders make. in wages anything from minimum to $15 an hour. tips, anywhere from $20 to $300 or more a night. depends on a lot of things. those top dollar jobs only come after a lot of experience. If the bar
of bartenders in the U.S. reported annual incomes of between $16,170 and $31,860.Tips make up half or more of bartender's salaries. If a bartender earned $6.00 an hour, their tips generally average out to $12.00 to $18.00 an hour as additional income. A bartender in an average bar will typically
wants to make. I wouldn't say there is a cap on how much a bartender make in one year. I

In [14]:
# Generate an answer for the prompt built above.
result = mixtral_llm.invoke(prompt)
print("Answer: {}".format(result))

Answer: 
Bartenders can make varying amounts depending on several factors such as their employer and level of experience. On average, bartenders earn between $16,170 and $31,860 per year, with tips making up at least half of their salary. The hourly wage for a bartender can range from minimum wage to around $15, with tips potentially adding another $12 to $18 per hour. It's important to note that high-earning bartenders can make significantly more than this, especially in busy locations or with excellent customer service skills. However, without specific information about the situation, it's difficult to provide an exact figure for how much a bartender makes.


In [15]:
class Processor:
    """Minimal RAG pipeline: retrieve chunks, join them, ask the LLM.

    The three pipeline steps are wrapped with ``@instrument``; the feedback
    selectors in this notebook refer to them by method name
    (``retrieve_chunks``, ``join_chunks``).
    """

    def __init__(self, retriever, llm):
        """Store the collaborators.

        Args:
            retriever: Object exposing ``get_relevant_documents(query)``; its
                ``search_kwargs`` attribute is updated on every retrieval.
            llm: Object exposing ``invoke(prompt)`` that returns a string.
        """
        self._retriever = retriever
        self._llm = llm

    @instrument
    def retrieve_chunks(self, query, num_chunks):
        """Return the text of the documents retrieved for ``query``.

        Args:
            query: Question to search for.
            num_chunks: Value stored under ``"k"`` in the retriever's
                ``search_kwargs`` before searching.

        Returns:
            list[str]: ``page_content`` of every returned document.
        """
        # Override only "k": other search options already configured on the
        # retriever are kept instead of being wiped out.
        existing_kwargs = getattr(self._retriever, "search_kwargs", None) or {}
        self._retriever.search_kwargs = {**existing_kwargs, "k": num_chunks}
        docs = self._retriever.get_relevant_documents(query)
        return [doc.page_content for doc in docs]

    @instrument
    def join_chunks(self, chunks):
        """Concatenate the chunks into one newline-separated context string."""
        return "\n".join(chunks)

    @instrument
    def respond_to_query(self, query, num_chunks=3, max_retries=3):
        """Answer ``query`` from retrieved context.

        Args:
            query: The user question.
            num_chunks: Forwarded to ``retrieve_chunks``.
            max_retries: Number of additional attempts after a failed LLM call
                (default 3, i.e. up to 4 calls in total).

        Returns:
            str: The LLM answer with surrounding whitespace stripped.

        Raises:
            Exception: Whatever the final failed LLM call raised, once all
                retries are used up.
        """
        chunks = self.retrieve_chunks(query, num_chunks=num_chunks)
        context = self.join_chunks(chunks)
        prompt = prompt_generation(context, query)

        # Counts down the retries still available *before* each attempt.
        for retries_left in range(max(max_retries, 0), -1, -1):
            try:
                return self._llm.invoke(prompt).strip()
            except Exception as e:
                print("Error while generating answer", e)
                if retries_left == 0:
                    raise
                print("Retrying. Retries Remaining -", retries_left - 1)
    

In [16]:
# RAG pipeline under evaluation: parent-document retrieval + Mixtral generation.
parent_doc_rag = Processor(
    retriever=parent_document_retriever,
    llm=mixtral_llm,
)

In [17]:
class IBMLangchain(Langchain):
    """Langchain feedback provider adapted to an instruction-tuned judge model.

    Prompts are wrapped in ``[INST] ... [/INST]`` before being sent to the
    chain, and scores written as ``Score: 8/10`` are rewritten to ``Score: 8``.
    """

    def _create_chat_completion(self, prompt = None, messages = None, **kwargs):
        """Send one completion request to the underlying chain.

        Args:
            prompt: Plain prompt text. Takes precedence over ``messages``.
            messages: List of dicts with a ``'content'`` key. The contents of
                all messages are joined with newlines, so a prompt split over
                several messages is sent in full.
            **kwargs: Forwarded unchanged to ``chain.invoke``.

        Returns:
            str: The model output with ``Score: N/M`` rewritten to ``Score: N``.

        Raises:
            ValueError: If ``prompt`` is None and ``messages`` is None or empty.
        """
        if prompt is None:
            if not messages:
                raise ValueError("`prompt` or `messages` must be specified.")
            # Use every message, not just the first one; for a single message
            # this is exactly that message's content.
            prompt = "\n".join(message['content'] for message in messages)

        instruct_prompt = f"[INST]\n{prompt}\n[/INST]"
        predict = self.endpoint.chain.invoke(instruct_prompt, **kwargs)

        # Drop the denominator of fractional scores ("Score: 8/10" -> "Score: 8").
        return re.sub(r'Score: (\d+)/\d+', r'Score: \1', predict)

    def _groundedness_doc_in_out(self, premise: str, hypothesis: str) -> str:
        """
        An LLM prompt using the entire document for premise and entire statement
        document for hypothesis.

        The system part comes from ``custom_prompts.LLM_GROUNDEDNESS_FULL_SYSTEM``;
        the premise/hypothesis part uses ``prompts.LLM_GROUNDEDNESS_FULL_PROMPT``.

        Args:
            premise (str): A source document
            hypothesis (str): A statement to check

        Returns:
            str: An LLM response using a scorecard template
        """
        assert self.endpoint is not None, "Endpoint is not set."

        system_part = str.format(custom_prompts.LLM_GROUNDEDNESS_FULL_SYSTEM,)
        user_part = str.format(
            prompts.LLM_GROUNDEDNESS_FULL_PROMPT,
            premise=premise,
            hypothesis=hypothesis
        )

        # The request is dispatched through the endpoint's run_in_pace wrapper.
        return self.endpoint.run_in_pace(
            func=self._create_chat_completion,
            prompt=system_part + user_part
        )

In [18]:
# Judge LLM used by the feedback functions.
eval_llm = bam_model(
    model_id="mistralai/mixtral-8x7b-instruct-v0-1",
    min_new_tokens=1,
    max_new_tokens=1000,
    repetition_penalty=1.1,
)
# Alternative judge model:
# eval_llm = bam_model(model_id="meta-llama/llama-2-70b-chat", max_new_tokens=1000, repetition_penalty=1.1)

# Feedback provider that routes evaluation prompts to the judge LLM.
langchain_provider = IBMLangchain(chain=eval_llm)

In [23]:
# Create the TruLens session object used for recording and for the leaderboard.
tru = Tru()
# NOTE(review): reset_database() presumably wipes previously logged records -
# confirm before running this cell against a database you want to keep.
tru.reset_database()

In [24]:
# Context relevance: the question is scored against each retrieved chunk
# separately and the per-chunk scores are averaged.
retrieved_chunks = Select.RecordCalls.retrieve_chunks.rets[:]
f_qs_relevance = (
    Feedback(langchain_provider.qs_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(retrieved_chunks)
    .aggregate(np.mean)
)

# Groundedness: the final answer is checked against the joined context.
grounded = Groundedness(groundedness_provider=langchain_provider)
joined_context = Select.RecordCalls.join_chunks.rets  # single string returned by join_chunks
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(joined_context)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: overall question versus final answer.
f_qa_relevance = (
    Feedback(langchain_provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input()
    .on_output()
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.retrieve_chunks.rets[:] .
✅ In Groundedness, input source will be set to __record__.app.join_chunks.rets .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [25]:
# Wrap the pipeline so its instrumented calls are recorded and scored.
tru_rag = TruCustomApp(
    parent_doc_rag,
    app_id='Parent doc retrieval RAG Pipeline',
    feedbacks=[
        f_qs_relevance,
        f_groundedness,
        f_qa_relevance,
    ],
)

In [26]:
# Run every dataset question through the pipeline while recording is active.
questions = df["question"]
with tru_rag as recording:
    for query in tqdm(questions, total=len(questions)):
        ans = parent_doc_rag.respond_to_query(query)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 200/200 [21:40<00:00,  6.50s/it]


In [27]:
# Summarise feedback scores, latency and cost per app.
# NOTE(review): an empty app_ids list appears to mean "all apps" - confirm.
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevance,Groundedness,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Parent doc retrieval RAG Pipeline,0.98191,0.708164,0.721465,5.795,0.0
