<h1> Install and import libraries </h1>

In [None]:
!pip install langchain
!pip install pymupdf
!pip install cohere
!pip install pinecone-client
!pip install PyPDF2
!pip install openai
!pip install datasets
!pip install ragas
!pip install --upgrade --quiet  langchain-google-genai pillow
!pip install python-dotenv



from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as PineconeStore
from PyPDF2 import PdfReader
from langchain.embeddings import CohereEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from datasets import Dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv




from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

import os
import random


<h1> Create Pinecone index and load PDF </h1>

In [45]:
# Read the API keys from a local env file so no secret is hard-coded here.
load_dotenv('raga.env')

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

INDEX_NAME = "quickstart"
PDF_PATH = 'example.pdf'
# True keeps the previous behaviour: drop any existing index and re-embed the
# PDF on every run. Set to False to reuse an index that is already populated.
RESET_INDEX = True

pc = Pinecone(api_key=PINECONE_API_KEY)

embeddings = CohereEmbeddings(model = "embed-multilingual-v3.0", cohere_api_key=COHERE_API_KEY)

# Load and chunk the PDF unconditionally: `texts` is sampled by a later cell,
# so it must exist even when an existing index is reused.
# The file is read inside a `with` block so the handle is always closed; the
# text is extracted while the file is still open.
with open(PDF_PATH, 'rb') as pdf_file:
    reader = PdfReader(pdf_file)
    # Concatenate all pages into one large string; a page without extractable
    # text contributes an empty string.
    pages = "".join(page.extract_text() or "" for page in reader.pages)

# Split the PDF text into smaller, non-overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(pages)
text_content = texts[:]

# Only delete the index when it actually exists, so a fresh project (or a
# second run right after a delete) does not fail on a missing index.
if RESET_INDEX and INDEX_NAME in [index.name for index in pc.list_indexes()]:
    pc.delete_index(INDEX_NAME)

if INDEX_NAME not in [index.name for index in pc.list_indexes()]:
    # Create a serverless index.
    # "dimension" needs to match the dimensions of the vectors you upsert.
    pc.create_index(
        name=INDEX_NAME,
        dimension=1024,
        spec=ServerlessSpec(cloud='aws', region='us-west-2'),
    )
    # Embed the chunks and upsert them into the new index.
    docsearch = PineconeStore.from_texts(text_content, embeddings, index_name=INDEX_NAME)
else:
    # Reuse the existing index through the LangChain vector-store wrapper.
    text_field = "text"
    index = pc.Index(INDEX_NAME)
    docsearch = PineconeStore(index, embeddings, text_field)



<h1> Create large language model using Gemini </h1>

In [46]:

# Gemini chat model through the LangChain wrapper. No API key is passed here,
# so the client is presumably configured from the environment (GOOGLE_API_KEY
# loaded by load_dotenv) - TODO confirm.
# NOTE(review): `llm` is rebound to an AzureChatOpenAI instance in the next
# cell, so this object is not the one used by the cells after it.
llm = ChatGoogleGenerativeAI(model="gemini-pro")


In [48]:
from langchain_openai.chat_models import AzureChatOpenAI

# Name of the Azure OpenAI deployment that requests are sent to.
GPT_DEPLOYMENT_NAME="chatgpt_16k"

# Credentials come from the environment only; never paste keys or endpoints
# into the notebook, not even in comments.
# NOTE(review): any key that was ever committed to this file must be treated as
# compromised and rotated.
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')


# Rebinds `llm`, replacing the Gemini model created in the previous cell.
# AZURE_OPENAI_API_KEY is read above but not passed to the constructor;
# presumably the client picks it up from the environment - TODO confirm.
llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=GPT_DEPLOYMENT_NAME,
    model='azure',
    validate_base_url=False
)

<h1> Retrieve k random documents from the index. <br>
For each document, create a question whose answer is within the document content. <br>
Store each question in a list called questions.

</h1>

In [49]:
# Number of chunks to sample, and therefore the number of questions generated.
NUM_QUESTIONS = 2

# Retriever over the vector store; also used by later cells.
retriever = docsearch.as_retriever()

# Sample distinct chunks (random.sample draws without replacement) so the same
# chunk cannot produce two identical evaluation rows. The size is capped at the
# number of available chunks so a very short PDF does not raise.
random_documents = random.sample(texts, k=min(NUM_QUESTIONS, len(texts)))

# The prompt and chain do not depend on the individual chunk, so build them once.
template = "Generate a question in hebrew who's answer is within the following text: {doc}"
prompt = ChatPromptTemplate.from_template(template)

# Question-generation chain. The prompt only consumes {doc}, so the chunk is
# passed straight through; no retrieval is needed to write a question about a
# chunk we already hold.
rag_chain = (
    {"doc": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Create one question per sampled chunk; questions[i] is answered by documents[i].
questions = []
documents = []

for doc in random_documents:
  questions.append(rag_chain.invoke(doc))
  documents.append(doc)


<h1> Create a chat prompt using Gemini. <br>
Using the document as context, send a prompt requesting an answer to each question, respectively.
Save the answers in a list called answers.
</h1>

In [50]:
import google.generativeai as genai

# Configure the Gemini client with the key loaded from the env file.
model = "gemini-pro"
genai.configure(api_key=GOOGLE_API_KEY)
generation_config = {
    "temperature": 1,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 1024,
}
# Only block content rated as high risk in each category.
safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_ONLY_HIGH"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_ONLY_HIGH"},
]

gemini = genai.GenerativeModel(
    model_name=model,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Kept for any later cell that references `chat`. It is intentionally NOT used
# for the answers below: a chat session accumulates history, so every answer
# would be conditioned on the previous questions and contexts.
chat=gemini.start_chat()

# answers[i] is the reference answer to questions[i], generated from the chunk
# the question was written from.
answers = []

for question, document in zip(questions, documents):
  # Define prompt template
  template = f"""You are an assistant for question-answering tasks.
  Use the following pieces of retrieved context and question below to answer the question in hebrew.
  Use two sentences maximum and keep the answer concise.
  Question: {question}
  Context: {document}
  Answer:
  """

  # Stateless call: each question is answered from its own context only.
  response = gemini.generate_content(template)
  answers.append(response.text)


<h1>
Prepare a dataset to be used with metrics evaluation.
</h1>

In [51]:
# Each reference answer is wrapped in a one-element list. Built with a
# comprehension so the builtin `list` is not shadowed by a variable of that name.
ground_truths = [[answer] for answer in answers]

print(len(questions))
print(len(answers))

contexts = []
answersb = []

# Inference: run the actual RAG step for every question. The answer under
# evaluation is generated from the retrieved chunks only, with a stateless
# call, so the model never sees the source chunk or the reference answer
# through earlier conversation history.
for query in questions:
    retrieved = [docs.page_content for docs in retriever.get_relevant_documents(query)]
    contexts.append(retrieved)

    joined_context = "\n\n".join(retrieved)
    rag_prompt = f"""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context and question below to answer the question in hebrew.
Use two sentences maximum and keep the answer concise.
Question: {query}
Context: {joined_context}
Answer:
"""
    response = gemini.generate_content(rag_prompt)
    answersb.append(response.text)

# To dict: one entry per question in every column.
data = {
    "question": questions,
    "answer": answersb,
    "contexts": contexts,
    "ground_truths": ground_truths
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

2
2


<h1> Evaluate metrics using dataset  </h1>

In [None]:
from IPython.display import display

# Metrics to score the dataset with, in the order they are passed to ragas.
selected_metrics = [
    context_precision,
    context_recall,
    answer_relevancy,
    faithfulness,
]

# Run the evaluation with `llm` as the judge model.
result = evaluate(dataset=dataset, metrics=selected_metrics, llm=llm)

# Show the per-question scores as a table.
df = result.to_pandas()
display(df)