#### RAG 2: AWS S3 Loader with Pinecone

In [9]:
import os
import openai
from dotenv import load_dotenv

# Read the local .env file so its entries become visible to os.environ.
load_dotenv()

# Keys for the LLM provider and the vector database.
# Each lookup yields None when the variable is not defined.
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_env = os.environ.get("PINECONE_ENV")

# AWS settings, looked up the same way.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_region = os.environ.get("AWS_DEFAULT_REGION")


##### Load Documents from AWS S3

In [2]:
import boto3
from langchain.docstore.document import Document
import os
from io import BytesIO
import PyPDF2

In [12]:
# Initialize the boto3 S3 client. No credentials are passed explicitly, so
# boto3 resolves them on its own.
# NOTE(review): presumably it picks up the AWS_* variables that load_dotenv()
# exported earlier in the notebook - confirm.
s3_client = boto3.client('s3')

bucket_name = "ai-school-project"
prefix = ""  # empty prefix: consider every key in the bucket

# A single list_objects_v2 call returns at most 1,000 keys. The paginator
# follows the continuation tokens, so larger buckets are not silently truncated.
paginator = s3_client.get_paginator("list_objects_v2")
documents = []

for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    # "Contents" is absent from a listing page when no key matches the prefix.
    for obj in listing_page.get("Contents", []):
        key = obj["Key"]
        # Only PDFs are processed; the comparison is case-insensitive so that
        # keys such as "REPORT.PDF" are picked up too.
        if not key.lower().endswith(".pdf"):
            continue

        s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
        pdf_file = BytesIO(s3_object["Body"].read())  # whole PDF is held in memory

        reader = PyPDF2.PdfReader(pdf_file)
        # Keep only the pages for which text was actually extracted.
        page_texts = [
            page_text
            for page_text in (page.extract_text() for page in reader.pages)
            if page_text
        ]
        if not page_texts:
            # An empty document has nothing to embed, so report it and move on.
            print(f"Skipping {key}: no extractable text.")
            continue

        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next.
        content = "\n".join(page_texts)

        # One LangChain Document per PDF, tagged with the S3 key it came from.
        documents.append(Document(page_content=content, metadata={"source": key}))

print(f"Loaded {len(documents)} documents from S3.")

Loaded 1 documents from S3.


#### Create a Vector Store Using Pinecone

In [13]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter


# SECURITY: API keys must never be written into the notebook, not even in
# commented-out code. Any key that has been saved in a notebook should be
# treated as leaked and rotated in the provider's console.
#
# The OpenAI and Pinecone clients created below are given no explicit key, so
# they rely on these variables being present in the process environment
# (load_dotenv() exports them from the .env file). Fail early with a clear
# message instead of an opaque error deep inside a client library.
missing_keys = [
    name for name in ("OPENAI_API_KEY", "PINECONE_API_KEY") if not os.getenv(name)
]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_keys)
    )

# Create an embeddings object
embeddings = OpenAIEmbeddings()

# Name of the existing Pinecone index that receives the vectors
index_name = "ai-project"

# Index passage-sized chunks rather than one vector per PDF: the retriever in
# this notebook asks for the top 3 matches, which is only meaningful when a
# document contributes several vectors.
# NOTE(review): chunking should also keep each record's stored text small;
# confirm against Pinecone's per-vector metadata size limit.
CHUNK_SIZE = 1000     # maximum characters per chunk
CHUNK_OVERLAP = 100   # characters shared by neighbouring chunks
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

# Embed the chunks and upsert them into the Pinecone index
vector_store = PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)

print("Documents indexed in Pinecone.")


Documents indexed in Pinecone.


#### Build the RAG Using Pinecone

In [11]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI


# Prompt for the document-combining chain.
#   {context} - filled with the text of the retrieved documents.
#   {input}   - the user's query; this is the key create_retrieval_chain
#               expects, so the query only has to be supplied once.
# The wording keeps the intended fallback to the model's own knowledge without
# simultaneously demanding "only" context-based answers.
template = """Answer the question using the context below.
If the context does not contain the answer, fall back on your own knowledge.

Context:
{context}

Question: {input}
"""

# from_template infers the input variables ({context}, {input}) from the text.
prompt = PromptTemplate.from_template(template)

# temperature=0 keeps the answers as deterministic as the model allows.
model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


# Retriever over the Pinecone vector store; returns the 3 closest matches per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [7]:
# Chain that feeds the retrieved documents and the prompt to the chat model.
combine_docs_chain = create_stuff_documents_chain(model, prompt)

# End-to-end pipeline: the retriever supplies documents, then the chain above
# produces the answer from them.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [16]:
# Keep the query in a single variable so that retrieval and generation always
# see exactly the same text.
#   "input"    - the key create_retrieval_chain passes to the retriever.
#   "question" - fills a {question} placeholder if the prompt template has one;
#                otherwise it is simply carried through to the result.
user_query = "Which Country is Trump Invading?"
retrieval_chain.invoke({"input": user_query, "question": user_query})

{'input': 'Which Country is Trump Invading?',
 'question': 'Which Country is Trump Invading??',
 'answer': 'Greenland'}