In [19]:
# Import necessary modules and define env variables

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain import FAISS
from langchain.callbacks import get_openai_callback
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import os
import io
import chainlit as cl
import PyPDF2
from io import BytesIO


from dotenv import load_dotenv

In [2]:
# Load environment variables from .env file
#load_dotenv()

In [5]:
# Look up the OpenAI key in the process environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [7]:
#OPENAI_API_KEY

### Prompt Template Initialization

In [8]:
# Text splitter configuration (the system template is defined further down in this cell).
# The two tunables are named so they are easy to find and adjust.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)



# System prompt text: instructs the model to answer only from the supplied context,
# to admit when it does not know, and to always append a "SOURCES" line.
# `{summaries}` is the single template placeholder in this string
# (presumably filled with the retrieved document text by the chain -- confirm against the chain used).
system_template = """Use the following pieces of context to answer the users question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    ALWAYS return a "SOURCES" part in your answer.
    The "SOURCES" part should be a reference to the source of the document from which you got your answer.

    Example of your response should be:

    ```
    The answer is foo
    SOURCES: xyz
    ```

    Begin!
    ----------------
    {summaries}"""

In [9]:
# Build the two-part chat prompt: the system instructions followed by the user's question.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]

prompt = ChatPromptTemplate.from_messages(messages)

# Keyword arguments intended for a chain constructor.
# NOTE(review): the RetrievalQA chain created later in this notebook is built without
# `chain_type_kwargs`, so this prompt is not applied there.
chain_type_kwargs = dict(prompt=prompt)

In [10]:
# Location of the sample document, relative to the notebook's working directory.
PDF_PATH = "../Langchain_Tutorials/amazon-rainforest-sample_doc.pdf"

pdf_reader = PdfReader(PDF_PATH)
# Concatenate the extracted text of every page, in page order, into one string.
text = "".join(pdf_page.extract_text() for pdf_page in pdf_reader.pages)

In [11]:
# Split the full extracted document text into chunks with the configured text_splitter.
# `texts` is indexed by position in later cells (e.g. texts[0], len(texts)).
texts = text_splitter.split_text(text)

In [12]:
# Display the first chunk to sanity-check PDF extraction and splitting.
texts[0]

'The Amazon Rainforest \xa0\n \xa0\n The Amazon rainforest is the \u200b largest remaining tropical rainforest in the world\u200b , blanketing the Earth’s \xa0\n surface in approximately\u200b  \u200b three billion trees\u200b . Spanning nine countries in South America, the Amazon is an \xa0\n expansive and incredibly diverse biome— almost twenty-five times the size of the United Kingdom. Through the \xa0\n region snakes the Amazon River, flowing for more than 4,100 miles. \xa0 \xa0\n \xa0\n ● One fifth of world’s flowing water runs through the Amazon. \xa0\n ● About 20% of the planet’s oxygen is produced in the Amazon. \xa0\n \xa0\n Biodiversity in the Amazon \xa0\n \xa0\n As of 2005, the Amazon is home to at least \u200b 10% of the entire planet’s known species\u200b , including, at least: \xa0\n \xa0\n ● 437 mammal species \xa0 \xa0\n ● 1,300 bird species \xa0\n ● 378 reptile species \xa0\n ● 400 amphibian species \xa0 \xa0\n ● 3,000 fish species \xa0 \xa0\n ● 40,000 to 53,000 tree 

In [13]:
# One metadata dict per chunk; the "source" id is the chunk's position followed by "-pl".
metadatas = [{"source": f"{position}-pl"} for position, _ in enumerate(texts)]

In [14]:
# Display the per-chunk metadata (one {"source": "<index>-pl"} entry per chunk).
metadatas

[{'source': '0-pl'},
 {'source': '1-pl'},
 {'source': '2-pl'},
 {'source': '3-pl'},
 {'source': '4-pl'},
 {'source': '5-pl'}]

In [15]:
# Embed every chunk with OpenAI and index the vectors, together with their metadata, in Chroma.
# The environment variable name must be spelt exactly "OPENAI_API_KEY": names are
# case-sensitive on POSIX systems, so the previous "OPENAI_API_kEY" spelling raised
# KeyError there and only worked on platforms that normalise the case of variable names.
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
docsearch = Chroma.from_texts(
    texts, embeddings, metadatas=metadatas
)

  warn_deprecated(


2024-02-20 11:31:49 - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-02-20 11:31:50 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [21]:
# Chat model used to answer questions; temperature 0 is passed explicitly.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125")

# Retriever over the vector store, configured with k=2 via search_kwargs.
retriver = docsearch.as_retriever(search_kwargs=dict(k=2))

# Question-answering chain using the "stuff" chain type with the retriever and model above.
qa = RetrievalQA.from_chain_type(
    retriever=retriver,
    chain_type="stuff",
    llm=llm,
)

In [25]:
# Question sent to the QA chain.
# NOTE(review): "perchentage" is misspelt; left unchanged because the recorded outputs
# in this notebook were produced with this exact query text.
query = "What is the Oxygen perchentage of Amazon?"

In [26]:
 with get_openai_callback() as cost:
    response = qa.run(query)
    print(cost)

2024-02-20 11:36:40 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-02-20 11:36:41 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Tokens Used: 517
	Prompt Tokens: 501
	Completion Tokens: 16
Successful Requests: 1
Total Cost (USD): $0.0002745


In [27]:
# Display the answer string returned by the chain for the query above.
response

"The Amazon rainforest produces more than 20% of the Earth's oxygen."