In [29]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [30]:
import os

from langchain_groq import ChatGroq

# Chat model used to generate the final answers.
# The Groq API key is read from the GROQ_API_KEY environment variable so that no
# secret is stored in the notebook; a missing variable raises KeyError right here.
# NOTE: any key that was ever saved in this notebook in plain text should be
# considered leaked and rotated in the Groq console.
llm = ChatGroq(
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name="llama-3.3-70b-versatile",  # Make sure this is a valid Groq model name
    temperature=0.8,
    max_tokens=512,
)


In [31]:
def load_pdf(data):
    """Load the PDF files found directly under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only files matching
            the ``*.pdf`` glob are picked up, each parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (in this notebook's recorded
        output: a list of documents, one per PDF page).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [32]:
# Load every PDF under the relative 'data/' directory (resolved against the kernel's working directory).
extracted_data = load_pdf('data/')

In [33]:
# Quick sanity check of what was loaded. The metadata peek is guarded so that an
# empty result (no PDFs found) is reported clearly instead of raising IndexError.
print(f"Total pages loaded: {len(extracted_data)}")
if extracted_data:
    print(f"First doc metadata: {extracted_data[0].metadata}")
else:
    print("First doc metadata: <none - no documents were loaded>")


Total pages loaded: 440
First doc metadata: {'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2020-08-28T16:45:56-04:00', 'moddate': '2025-04-04T12:11:28+05:30', 'trapped': '/False', 'source': 'data/jehp101.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}


In [34]:
# Preview only the first couple of documents; displaying the whole list writes
# the full text of every loaded page into the notebook output.
extracted_data[:2]

[Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2020-08-28T16:45:56-04:00', 'moddate': '2025-04-04T12:11:28+05:30', 'trapped': '/False', 'source': 'data/jehp101.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='Physical education (PE) aims at the optimum development \nof an individual through continuous process of learning and \nparticipation in guided physical activities. In other words, it \naims at optimum physical, mental and social development of \nan individual. \nWhat is Physical  Education ?\nAccording to Webster’s Dictionary, “Physical education is a \npart of education which gives instructions in the development \nand care of the body rending from simple callisthenic exercises \nto a course of study providing training in hygiene, gymnastics, \nand the performance and management of athletics and \ngames.” Central Advisory Board of Physical Education and \nRecreation defines PE as “an ed

In [35]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size forwarded to the splitter. Defaults to 500, the
            value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 50.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [36]:
# Chunk every loaded page, then report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print(len(text_chunks))

2034


In [37]:
import os

# The Google Generative AI key must be supplied through the GOOGLE_API_KEY
# environment variable; it is deliberately not written into the notebook.
# Fail fast with a clear message when it is missing.
# NOTE: any key that was ever saved in this notebook in plain text should be
# considered leaked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it before starting Jupyter "
        "(or load it from a local, git-ignored .env file)."
    )


In [38]:
def download_genAi():
    """Build the Google Generative AI embeddings client.

    Returns:
        A ``GoogleGenerativeAIEmbeddings`` instance configured with the
        ``models/embedding-001`` model.
    """
    return GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [39]:
# Embeddings client shared by the indexing and retrieval cells below.
embeddings = download_genAi()

In [40]:
# Display the client's repr to check how it was configured (model name, masked API key).
embeddings

GoogleGenerativeAIEmbeddings(client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x149be0ce0>, async_client=<google.ai.generativelanguage_v1beta.services.generative_service.async_client.GenerativeServiceAsyncClient object at 0x148301430>, model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [41]:
# Sanity check only, not part of the pipeline: embed a sample query.
q = embeddings.embed_query('hello world whats up?')

In [42]:
# Length of the sample embedding vector; the index dimension (DIM) set below has to match it.
len(q)

768

In [43]:
import hashlib
import os
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Read the Pinecone key from the environment instead of storing it in the notebook.
# LangChain's internal Pinecone client reads the same PINECONE_API_KEY variable,
# so nothing has to be written back into os.environ. A missing variable raises
# KeyError here.
# NOTE: any key that was ever saved in this notebook in plain text should be
# considered leaked and rotated.
api_key = os.environ["PINECONE_API_KEY"]

# Initialize Pinecone client explicitly (used for index management below)
pc = Pinecone(api_key=api_key)

index_name = "sportsbot"
DIM = 768  # Must match your embeddings dimension

# Create the index only if it does not exist yet, then wait until Pinecone
# reports it ready; upserting into an index that is still initialising can fail.
existing = pc.list_indexes().names()
if index_name not in existing:
    pc.create_index(
        name=index_name,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

texts = [t.page_content for t in text_chunks]

# Deterministic ids derived from each chunk's position and content. Without
# explicit ids every run of this cell would insert a fresh copy of every chunk,
# and the retriever would start returning duplicates. With stable ids a re-run
# overwrites the same vectors instead.
ids = [
    hashlib.sha256(f"{position}:{text}".encode("utf-8")).hexdigest()
    for position, text in enumerate(texts)
]

docsearch = PineconeVectorStore.from_texts(
    texts=texts,
    embedding=embeddings,
    ids=ids,
    index_name=index_name,
)


In [None]:
# Prompt for the QA chain, assembled line by line. The leading empty entry keeps
# the newline that opens the template; {context} and {question} are the two
# placeholders filled in when the template is formatted.
prompt_template = "\n".join(
    [
        "",
        "Use the following pieces of information to answer the user's question.",
        "If you don't know the answer,don't try to make up an answer.",
        "Context: {context}",
        "Question: {question}",
        "Only return the helpful answer below and nothing else.",
        "Helpful answer:",
    ]
)

In [45]:
# Wrap the raw template text in a PromptTemplate and package it the way the
# QA chain constructor expects to receive it (under the 'prompt' key).
PROMPT = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

In [55]:
# Retrieval QA chain: the Pinecone-backed retriever is configured with k=3 and
# the chain uses the custom prompt; source documents are returned alongside
# the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={'k': 3}),
    chain_type='stuff',
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [56]:
# Interactive question loop. Type "quit" (any letter case) to stop.
while True:
    try:
        # Strip surrounding whitespace so inputs like "quit " still end the loop.
        user_input = input("Input Prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End-of-input or an interrupt at the prompt ends the session cleanly
        # instead of surfacing a traceback.
        break
    if not user_input:
        # Ignore blank input rather than running the chain on an empty query.
        continue
    if user_input.lower() == "quit":
        break
    result = qa.invoke({"query": user_input})
    print("Response:", result['result'])


Response: There is not enough information provided to answer the question about a cricket ground.
Response: A basketball court's dimensions can vary depending on the level of play, but here are the standard dimensions: 
- Professional (NBA): 94 feet long and 50 feet wide
- College (NCAA) and International: 94 feet long and 50 feet wide
- High School: 84 feet long and 50 feet wide
Response: A cricket bat is a piece of equipment used in the sport of cricket, typically made of wood, to hit the ball. It usually consists of a long handle and a flat face, with a spine running down the center. The bat is used by the batsmen to score runs and defend against deliveries from the opposing team's bowlers.
Response: A cricket ball is a hard, leather-covered ball with a cork core, used in the sport of cricket. It weighs between 5.5 and 5.75 ounces (155.9 to 163 grams) and has a circumference of between 8.69 and 8.81 inches (22.1 to 22.4 cm).
Response: The information provided does not contain an ans