In [None]:
from dotenv import load_dotenv
import os
import openai
from PyPDF2 import PdfReader
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain import FAISS
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings


# Load environment variables from a local .env file (if one is present) so the
# os.environ lookups for the OpenAI API key in the following cells can succeed.
load_dotenv()

#### Loading the OpenAI API Key from Environment Variables

In [None]:
# Read the OpenAI API key from the process environment. The subscript lookup
# raises KeyError if OPENAI_API_KEY is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Initializing the client object.
# NOTE: this import rebinds the name `OpenAI`, shadowing the LangChain
# `OpenAI` LLM class imported in the first cell.
from openai import OpenAI
import os

# Environment variable names are case-sensitive on POSIX systems, so the key
# must be spelled exactly "OPENAI_API_KEY"; a misspelled name raises KeyError
# even when the real variable is set.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

Splitting Text into Chunks: The function utilizes the CharacterTextSplitter from Langchain to split the input text into smaller chunks. These chunks are separated by newline characters ("\n") and are of a maximum size of 1000 characters (chunk_size=1000). Additionally, there is an overlap of 200 characters between consecutive chunks (chunk_overlap=200).

Converting Text Chunks into Embeddings: After splitting the text into chunks, the function converts each chunk into an embedding vector using OpenAIEmbeddings, the LangChain wrapper around OpenAI's embedding models. A Hugging Face alternative ("all-MiniLM-L6-v2") is left commented out in the code and can be swapped in instead.

Forming a Knowledge Base: Finally, the function creates a knowledge base from the embeddings generated for the text chunks. It uses FAISS to index and retrieve embeddings efficiently: the FAISS.from_texts method embeds the chunks and builds a searchable index from them.

In [None]:
def process_text(text):
    """Build a FAISS knowledge base from raw document text.

    The text is split on newlines into chunks of at most 1000 characters with
    a 200-character overlap, each chunk is embedded with ``OpenAIEmbeddings``,
    and the chunks are indexed via ``FAISS.from_texts``.

    Args:
        text: The full document text to index.

    Returns:
        The FAISS vector store built from the text chunks.

    Raises:
        ValueError: If ``text`` is ``None``, empty, or only whitespace. Such
            input yields no chunks, and building an index from nothing would
            otherwise fail with an obscure error inside the vector store.
    """
    # Fail fast with a clear message instead of a cryptic downstream error.
    if text is None or not text.strip():
        raise ValueError("process_text() requires non-empty text to build a knowledge base")

    # Split the text into chunks using Langchain's CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    # Convert the chunks of text into embeddings to form a knowledge base (OpenAI Embedding)
    embeddings = OpenAIEmbeddings()

    # Huggingface Embedding ("all-MiniLM-L6-v2") alternative:
    # embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    return knowledge_base

In [None]:
pdf_reader = PdfReader("amazon-rainforest-sample_doc.pdf")

# Gather the text of every page and join once instead of growing a string with
# += inside a loop. The `or ""` keeps the join safe should extract_text()
# yield None for a page (presumably possible for pages with no extractable
# text in some PyPDF2 versions — confirm against the installed version).
text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Create the knowledge base object
knowledgeBase = process_text(text)

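### Based on a Query -> We retrieve the most similar chunks from the knowledge base (using OpenAI embeddings by default; the Hugging Face embedding is the commented-out alternative in `process_text`)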

In [None]:
# Question used both to retrieve similar chunks from the knowledge base and to
# prompt the QA chain in the cells below.
# NOTE(review): "perchentage" looks like a typo for "percentage"; left as-is
# because the string is runtime input — confirm before changing.
query = "What is the Oxygen perchentage of Amazon?"

In [None]:
# Retrieve the chunks of the knowledge base most similar to the query.
docs = knowledgeBase.similarity_search(query)

# Check for first two results. Slicing (rather than indexing docs[0] and
# docs[1]) avoids an IndexError when the search returns fewer than two chunks;
# with two or more results the printed output is unchanged.
separator = "___" * 10
for position, doc in enumerate(docs[:2]):
    if position:
        print(separator)
    print(doc)

In [None]:
# Chat model used to answer the question. ChatOpenAI is imported from the
# dedicated langchain_openai package (already used for OpenAIEmbeddings in the
# first cell) rather than the deprecated langchain.chat_models path.
from langchain_openai import ChatOpenAI

# temperature=0 to keep the answers as repeatable as the API allows.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

In [None]:
# Build a question-answering chain around the chat model using the 'stuff'
# chain type; it is run against the retrieved documents in the next cell.
chain = load_qa_chain(llm, chain_type='stuff')

In [None]:
 # Run the QA chain on the retrieved documents inside the OpenAI callback
 # context; the callback object is bound to `cost` (presumably it records token
 # usage and cost for the call — confirm against the LangChain docs).
 # NOTE(review): the `with` line carries one leading space. IPython appears to
 # tolerate this, but it is an IndentationError in a plain .py export.
 with get_openai_callback() as cost:
    response = chain.run(input_documents=docs, question=query)
    # Print the callback object collected for the call above.
    print(cost)

In [None]:
# Display the value returned by chain.run in the previous cell.
response