In [1]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# The Groq key is looked up under the notebook's original name (GROQ_API)
# first; GROQ_API_KEY -- the name this cell exports below -- is accepted as a
# fallback so a .env that already uses that spelling works too.
GROQ_API_KEY = os.getenv("GROQ_API") or os.getenv("GROQ_API_KEY")

# Fail fast with a clear message instead of hitting an authentication error
# in a later cell.
if not PINECONE_API_KEY:
    raise ValueError("Missing PINECONE_API_KEY in .env")

if not GROQ_API_KEY:
    raise ValueError("Missing GROQ_API (or GROQ_API_KEY) in .env")

# Re-export both keys under fixed names. Later cells build ChatGroq and
# PineconeVectorStore without passing a key explicitly, so they presumably
# read these environment variables -- confirm against the library docs.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

print("Environment Loaded Successfully")


Environment Loaded Successfully


In [2]:
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index shared by the indexing and retrieval cells.
index_name = "genativeai-encyclopedia"
pc = Pinecone(api_key=PINECONE_API_KEY)

# Names of the indexes that already exist for this API key.
existing = [entry["name"] for entry in pc.list_indexes()]

if index_name in existing:
    print("Index already exists.")
else:
    print("Creating new index...")
    # NOTE(review): dimension must equal the embedding model's output size
    # (presumably 384 for all-MiniLM-L6-v2) -- confirm if the model changes.
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index; the trailing expression displays its current stats.
index = pc.Index(index_name)
index.describe_index_stats()


Index already exists.


  from .autonotebook import tqdm as notebook_tqdm


{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '189',
                                    'content-type': 'application/json',
                                    'date': 'Wed, 26 Nov 2025 19:30:33 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '112',
                                    'x-pinecone-request-id': '6965663394103785864',
                                    'x-pinecone-request-latency-ms': '112'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 15351}},
 'storageFullness': 0.0,
 'total_vector_count': 15351,
 'vector_type': 'dense'}

In [4]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain.schema import Document

# Raw string: the backslashes in a Windows path must not be parsed as escape
# sequences ("\G" and "\d" are invalid escapes and trigger a SyntaxWarning on
# recent Python versions). The resulting path value is unchanged.
PDF_PATH = r"D:\GenAiPedia\GenAIPed\data"   # change if needed

# Load every *.pdf directly inside PDF_PATH using PyMuPDFLoader.
loader = DirectoryLoader(PDF_PATH, glob="*.pdf", loader_cls=PyMuPDFLoader)
docs = loader.load()

# NOTE(review): this counts loaded Document objects; PyMuPDFLoader presumably
# yields one Document per page, so the number is likely pages rather than
# files -- confirm before relying on the label.
print("Loaded PDFs:", len(docs))


Loaded PDFs: 1658


In [5]:
# Rebuild every loaded document keeping only its text and a single 'source'
# metadata field (falling back to 'unknown' when the loader set none).
minimal = [
    Document(
        page_content=doc.page_content,
        metadata={'source': doc.metadata.get('source', 'unknown')},
    )
    for doc in docs
]

len(minimal)


1658

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the trimmed documents into chunks of at most 800 characters, with a
# 150-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
chunks = splitter.split_documents(minimal)

print("Chunks:", len(chunks))


Chunks: 5113


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used for both indexing and querying; it is
# pinned to the CPU via model_kwargs.
embedding_model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=embedding_model_kwargs,
)

print("Embeddings Loaded")


Embeddings Loaded


In [8]:
from langchain_pinecone import PineconeVectorStore

import hashlib
from collections import Counter

# Give every chunk a deterministic ID derived from its source and text.
# Without explicit IDs each run of this cell inserts a brand-new copy of every
# chunk; the recorded index stats (15351 vectors for 5113 chunks) are
# consistent with that having happened. With stable IDs a re-run overwrites
# the same vectors instead of duplicating them.
# NOTE: vectors uploaded earlier under other IDs are not removed by this cell.
seen_digests = Counter()
chunk_ids = []
for chunk in chunks:
    fingerprint = f"{chunk.metadata.get('source', 'unknown')}\x00{chunk.page_content}"
    digest = hashlib.sha256(fingerprint.encode("utf-8", errors="replace")).hexdigest()
    # The occurrence counter keeps IDs unique when two chunks share the same
    # source and text.
    chunk_ids.append(f"{digest}-{seen_digests[digest]}")
    seen_digests[digest] += 1

# Embed the chunks and upsert them into the Pinecone index under those IDs.
vector_store = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

print("Indexing Completed!")


Indexing Completed!


In [9]:
from langchain.prompts import ChatPromptTemplate

# System instructions for the model; {context} is a template placeholder for
# the retrieved text.
system_prompt = """
You are a helpful AI assistant. Use ONLY the provided context to answer.
If the answer cannot be found, say "I don't know."

Context:
{context}
"""

# Two-message chat template: the system instructions followed by the user's
# question, supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)


In [10]:
from langchain_groq import ChatGroq

# Groq-hosted chat model; temperature is set to 0.
llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)


In [11]:
# Attach to the already-populated index, so this cell does not depend on the
# indexing cell having been run in the current session.
db = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

# Retriever configured with k=12 (presumably the number of chunks returned
# per query -- confirm against the retriever docs).
search_options = {"k": 12}
retriever = db.as_retriever(search_kwargs=search_options)


In [12]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Chain that combines the retrieved documents with the prompt and sends them
# to the LLM.
docs_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever feeds the document-combining chain above.
rag_chain = create_retrieval_chain(
    combine_docs_chain=docs_chain,
    retriever=retriever,
)


In [13]:
# Send one question through the RAG chain and print only the answer text.
query = "What is Artificial Intelligence?"
payload = {"input": query}
result = rag_chain.invoke(payload)

answer = result["answer"]
print(answer)


Artificial Intelligence refers to the development of computer systems capable of performing tasks that would normally require human intelligence. These tasks include reasoning, understanding language, recognizing patterns, learning from examples, making decisions, and solving problems.


In [14]:
# Send one question through the RAG chain and print only the answer text.
query = "What is Machine learning?"
payload = {"input": query}
result = rag_chain.invoke(payload)

answer = result["answer"]
print(answer)


Machine learning is a subfield of computer science that is concerned with building algorithms which, to be useful, rely on a collection of examples of some phenomenon. These examples can come from nature, be handcrafted by humans or generated by another algorithm. Machine learning can also be defined as the process of solving a practical problem by 1) gathering a dataset, and 2) algorithmically building a statistical model based on that dataset. That statistical model is assumed to be used somehow to solve the practical problem.


In [15]:
# Send one question through the RAG chain and print only the answer text.
query = "What is srijan?"
payload = {"input": query}
result = rag_chain.invoke(payload)

answer = result["answer"]
print(answer)


Srijan is an ML developer recognized for persistence, strong problem-solving skills, and collaborative project leadership.


In [16]:
# Send one question through the RAG chain and print only the answer text.
query = "What is siddhant?"
payload = {"input": query}
result = rag_chain.invoke(payload)

answer = result["answer"]
print(answer)


Siddhant is a full-stack web developer known for strong discipline, teamwork, and delivering high-quality solutions in real projects.


In [17]:
# Send one question through the RAG chain and print only the answer text.
query = "What is labelled data?"
payload = {"input": query}
result = rag_chain.invoke(payload)

answer = result["answer"]
print(answer)


In the context of machine learning, labeled data refers to a dataset where each item or observation has been assigned a label or a target value that corresponds to the input data. This label or target value is used to identify the output or the correct classification of the input data.

In other words, labeled data is a dataset where the output or the correct answer is already known, and it is used to train a machine learning model to make predictions or classify new, unseen data.

For example, in a supervised learning model, each item (such as a car, product, or customer) must have labeled input and output values, known in data science as a "labeled dataset." This means that the input data (such as features or attributes) is paired with the corresponding output or label (such as a classification or a prediction).
