In [24]:
import os

# Move up to the parent directory exactly once. Without the guard, every
# re-run of this cell would climb one more level and silently break any
# relative path used afterwards.
if "_PROJECT_ROOT" not in globals():
    os.chdir("../")
    _PROJECT_ROOT = os.getcwd()

# LangSmith tracing settings. They are kept as module-level names for
# backward compatibility, but they only take effect once exported to the
# process environment, which is done right below.
LANGCHAIN_TRACING_V2 = 'True'
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
LANGCHAIN_PROJECT = "rag-pipleline"  # NOTE(review): spelling kept as-is; rename deliberately if desired

os.environ['LANGCHAIN_TRACING_V2'] = "true"  # lowercase is the documented value
os.environ['LANGCHAIN_ENDPOINT'] = LANGCHAIN_ENDPOINT
os.environ['LANGCHAIN_PROJECT'] = LANGCHAIN_PROJECT

# Credentials: never clobber a real key that is already exported in the
# environment. The placeholder (the variable's own name) is only used as a
# fallback so later lookups do not raise KeyError; export the real keys before
# starting the kernel instead of pasting them into the notebook.
for _key_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    os.environ.setdefault(_key_name, _key_name)

In [21]:
from langchain.document_loaders import Docx2txtLoader# Importing Docx loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma

In [None]:
# Read the contract Q&A .docx file into LangChain documents.
# NOTE(review): the path is absolute and machine-specific.
loader = Docx2txtLoader(
    "/home/martin/Contract_Q-A_RAG/data/Raptor Q&A2.docx",
)
docs = loader.load()

In [None]:
# Show only the metadata and the first 500 characters of each loaded document;
# echoing `docs` directly would dump the full document text into the output.
[(doc.metadata, doc.page_content[:500]) for doc in docs]

In [5]:
# Step 1: read the contract Q&A .docx file into LangChain documents.
# NOTE(review): the path is absolute and machine-specific.
loader = Docx2txtLoader(
    "/home/martin/Contract_Q-A_RAG/data/Raptor Q&A2.docx"
)
docs = loader.load()

# Step 2: cut the documents into chunks of at most 400 characters, with no
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=400,
)
splits = text_splitter.split_documents(docs)



In [None]:
# Embed every chunk with OpenAI embeddings and index the result in Chroma.
vectorstore = Chroma.from_documents(splits, embedding=OpenAIEmbeddings())

In [7]:
# Expose the vector store as a retriever; no arguments are passed, so the
# store's default search settings apply.
retriever = vectorstore.as_retriever()

In [None]:
# Chat model used to generate answers; temperature 0 is requested.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Prompt text with two placeholders, {question} and {context}. It is written
# as explicit literals so the trailing spaces and newlines are visible.
template = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Use two sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)

prompt = ChatPromptTemplate.from_template(template)

def _format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot receives the Python repr of
    a list of Document objects (metadata included) rather than plain text.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Setup RAG pipeline: the incoming question is sent to the retriever (whose
# hits are flattened to text) and passed through unchanged as "question";
# both fill the prompt, which goes to the LLM and is parsed to a plain string.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Evaluation set: each entry pairs a question with its reference answer, so
# the two stay side by side when the set is edited.
qa_pairs = [
    (
        "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
        "Except in the case of fraud, the Sellers have no liability for breach of representations and warranties (See section 10.01)",
    ),
    (
        "Are there any conditions to the closing?",
        "No, as the signing and closing are simultaneous.",
    ),
    (
        "Is any of the Sellers bound by a non-competition covenant after the Closing?",
        "No.",
    ),
]

# Parallel lists; every ground truth is wrapped in its own single-item list.
questions = [question for question, _ in qa_pairs]
ground_truths = [[reference] for _, reference in qa_pairs]

answers = []
contexts = []

# Inference: collect exactly one answer and one context list per question.
# Both results are computed before anything is appended, so a failure in
# either call can never leave `answers` and `contexts` with different lengths
# or out of step with `questions`.
for query in questions:
    try:
        answer = rag_chain.invoke(query)
        retrieved_docs = retriever.invoke(query)
    except Exception as e:
        # Best effort: report the failure and record empty placeholders so the
        # remaining questions are still processed.
        print(f"Error processing query '{query}': {e}")
        answer, retrieved_docs = "", []
    answers.append(answer)
    contexts.append([doc.page_content for doc in retrieved_docs])

# Column-oriented payload: one key per column, one list entry per question.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Wrap the columns in a Hugging Face Dataset for the evaluation step.
dataset = Dataset.from_dict(data)

# Score the dataset with the four ragas metrics. Any failure is reported on
# stdout instead of aborting the cell.
try:
    eval_metrics = [context_precision, context_recall, faithfulness, answer_relevancy]
    # raise_exceptions=False: show a warning instead of raising an exception
    result = evaluate(dataset=dataset, metrics=eval_metrics, raise_exceptions=False)
    df = result.to_pandas()
    print(df)
except Exception as err:
    print(f"Exception during evaluation: {err}")
