In [29]:
from langchain_nvidia_trt.llms import TritonTensorRTLLM

# Connect to the TRT-LLM Llama-2 model running on the Triton server at the url below
# Replace "llm" with the url of the system where llama2 is hosted
triton_url = "llm:8001"
pload = {
            'tokens':500,
            'server_url': triton_url,
            'model_name': "ensemble"
}
llm = TritonTensorRTLLM(**pload)

In [30]:
from langchain.prompts import PromptTemplate

LLAMA_PROMPT_TEMPLATE = (
 "<s>[INST] <<SYS>>"
 "Use the following context to answer the user's question. If you don't know the answer, just say that you don't know, don't try to make up an answer."
 "<</SYS>>"
 "<s>[INST] Context: {context} Question: {question} Only return the helpful answer below and nothing else. Helpful answer:[/INST]"
)

LLAMA_PROMPT = PromptTemplate.from_template(LLAMA_PROMPT_TEMPLATE)

# Document Ingestion

In [32]:
from pathlib import Path
from langchain.document_loaders import UnstructuredFileLoader
import time
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
TEXT_SPLITTER_MODEL = "intfloat/e5-large-v2"
TEXT_SPLITTER_TOKENS_PER_CHUNK = 510
TEXT_SPLITTER_CHUNCK_OVERLAP = 200


documents_path = "./docs"
documents = list(Path(documents_path).glob("*.pdf"))

document_chunks = []
for document in documents:
    loader = UnstructuredFileLoader(document.as_posix())
    data = loader.load()

    text_splitter = SentenceTransformersTokenTextSplitter(
        model_name=TEXT_SPLITTER_MODEL,
        tokens_per_chunk=TEXT_SPLITTER_TOKENS_PER_CHUNK,
        chunk_overlap=TEXT_SPLITTER_CHUNCK_OVERLAP,
    )
    start_time = time.time()
    document_chunks += text_splitter.split_documents(data)

In [34]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus
import torch
import time

#Running the model on CPU as we want to conserve gpu memory.
#In the production deployment (API server shown as part of the 5th notebook we run the model on GPU)
model_name = "intfloat/e5-large-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
start_time = time.time()
vectorstore = Milvus.from_documents(documents=document_chunks, embedding=hf_embeddings,
                                    collection_name="rafay_ragathon",
                                    connection_args={"host": "milvus", "port": "19530"})
print(f"--- {time.time() - start_time} seconds ---")

--- 1282.9010062217712 seconds ---


In [38]:
from langchain_core.runnables import RunnablePassthrough
import time

chain = (
    {"context": vectorstore.as_retriever(), "question": RunnablePassthrough()}
    | LLAMA_PROMPT
    | llm
)
start_time = time.time()
output = ""
for token in chain.stream(question):
    output += token
    
print(f"\n--- {time.time() - start_time} seconds ---")

print(output)


--- 6.757351875305176 seconds ---
  I don't have access to the specific safety evaluation of llama2 chat, as it is a proprietary document owned by Microsoft. However, I can provide some general information on the safety considerations for chatbots and conversational AI systems.

Safety evaluation of chatbots and conversational AI systems involves assessing the potential risks and hazards associated with these technologies, such as:

1. Data privacy and security: Chatbots and conversational AI systems often have access to sensitive user data, which can be vulnerable to cyber attacks, data breaches, or unauthorized access.
2. User safety: Chatbots and conversational AI systems can potentially engage users in harmful or dangerous activities, such as scams, phishing attacks, or cyberbullying.
3. Ethical considerations: Chatbots and conversational AI systems may be used to spread misinformation, propaganda, or hate speech, which can have serious ethical implications.
4. Dependence on AI: O

In [39]:
import pandas as pd

import os
os.environ["OPENAI_API_KEY"] = "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"


qna_data = pd.read_csv("./qna_data.csv")

qna_data = qna_data[qna_data["Source Chunk Type"] == "Text"]
qna_data = qna_data[qna_data["Question Type"] == "Single-Doc Multi-Chunk RAG"]

In [40]:

_data = []
for idx, row in qna_data.iterrows():

    question = row[0]
    answer =  row[4]

    context = []
    _docs = retriever.get_relevant_documents(question)

    for _doc in _docs:
        context.append(_doc.page_content)

    # context = "\n".join(context)

    start_time = time.time()
    output = ""
    for token in chain.stream(question):
        output += token
        
    inference_time = time.time() - start_time

    _data.append([question, answer, context, output, inference_time])

  question = row[0]
  answer =  row[4]


NameError: name 'retriever' is not defined

In [36]:
# Ragas Evaluation

In [37]:
df_eval = pd.DataFrame(_data, columns=["question", "ground_truth", "contexts", "answer"])

NameError: name '_data' is not defined

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from datasets import Dataset

In [None]:
eval_dataset = Dataset.from_pandas(df_eval)
result = evaluate(eval_dataset, metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],)
res = result.to_pandas()[["context_precision","faithfulness","answer_relevancy","context_recall"]]
res.fillna(0.0,inplace=True)

res.mean(axis=0)