In [1]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

In [19]:
from dotenv import load_dotenv
load_dotenv()
key = os.getenv("OPENAI_API_KEY")
api_key = os.getenv("PINECONE_API_KEY")
INDEX=os.getenv("INDEX_NAME")


In [34]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

if "test" not in pc.list_indexes().names():
    pc.create_index(
        name="test",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


In [22]:
!mkdir pdfs

A subdirectory or file pdfs already exists.


In [23]:
loader=PyPDFDirectoryLoader("pdfs")

In [24]:
data=loader.load()

In [25]:
if data:
    print("Data loaded successfully!")
    display(data[0])
else:
    print("No data loaded. Make sure you have uploaded PDF files to the 'pdfs' directory.")

Data loaded successfully!


Document(metadata={'producer': 'jsPDF 2.5.1', 'creator': 'PyPDF', 'creationdate': '2025-08-02T22:43:29+05:30', 'source': 'pdfs\\converted (1).pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Converted using ILovePDF3.com\nMachine learning approaches are traditionally divided into three broad\ncategories, which correspond to learning paradigms, depending on the\nnature of the "signal" or "feedback" available to the learning system:')

In [26]:
text_splitter = RecursiveCharacterTextSplitter()

In [27]:
text_chunks = text_splitter.split_documents(data)

In [28]:
text_chunks

[Document(metadata={'producer': 'jsPDF 2.5.1', 'creator': 'PyPDF', 'creationdate': '2025-08-02T22:43:29+05:30', 'source': 'pdfs\\converted (1).pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Converted using ILovePDF3.com\nMachine learning approaches are traditionally divided into three broad\ncategories, which correspond to learning paradigms, depending on the\nnature of the "signal" or "feedback" available to the learning system:')]

In [29]:
embeddings = OpenAIEmbeddings()

In [30]:
from langchain_community.vectorstores import Pinecone as PineconeVectorStore

vectorstore = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=INDEX,
)

In [31]:
llm = OpenAI(model="gpt-4o-mini", temperature=0)

  llm = OpenAI(model="gpt-4o-mini", temperature=0)


In [32]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [33]:
query = "What is the main idea discussed in the PDFs?"
result = qa.run(query)

print("Answer:", result)

  result = qa.run(query)


Answer:  The main idea discussed in the PDFs is the categorization of machine learning approaches into three broad categories based on the nature of the signal or feedback available to the learning system.
