In [1]:
# Quick sanity check that the kernel is executing cells.
print("Hey its working")

Hey its working


In [2]:
#change the present working directory -> need to work fom root folder
%pwd 

import os
os.chdir("../")

%pwd


'c:\\Users\\Samuel\\Desktop\\Investory\\Investory_bot'

In [3]:
# import orchestration... 
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import importlib
import web_scraping

# Reload the local web_scraping module so edits made to web_scraping.py after
# the first import are picked up without restarting the kernel.
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'bs4'" — presumably web_scraping imports
# BeautifulSoup; confirm beautifulsoup4 is installed in the kernel environment.
importlib.reload(web_scraping)

# Bank product pages to scrape.
urls_to_scrape = [
    "https://www.icicibank.com/personal-banking/deposits/fixed-deposit/fd-interest-rates",
    "https://www.hdfcbank.com/personal/save/deposits/recurring-deposit",
    "https://sbi.bank/web/personal-banking/investments-deposits/deposits/recurring-deposit"
]

# Scrape the pages one at a time, printing a separator line before each.
# An exception raised for one URL propagates and stops the remaining URLs.
# NOTE(review): scrape_and_save_as_pdf presumably writes one PDF per URL into
# the folder that is loaded later — confirm against web_scraping.py.
print("Starting the web scraping process...")
for url in urls_to_scrape:
    print("-" * 20)
    web_scraping.scrape_and_save_as_pdf(url)

print("-" * 20)
print("Scraping complete!")

ModuleNotFoundError: No module named 'bs4'

In [None]:
# Extract the content of the PDF files.
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files matching the
            ``*.pdf`` glob are parsed with ``PyPDFLoader``.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

'''  

Calls the .load() method on the loader object. This method will read all the 
matching PDF files and return their contents, packaged as "documents" (often in LangChain, these are Document objects).

'''

In [None]:
# Load every PDF from the "data" folder (a path relative to the current working directory).
extracted_data = load_pdf_files("data")


In [None]:
# Number of documents that were loaded.
len(extracted_data)

In [None]:
# Preview only the first few documents; echoing the whole list dumps the full
# text of every loaded page into the notebook output.
extracted_data[:3]

In [None]:
#Filtering process

from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of every Document in ``docs``.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict with a single ``"source"`` key (``None`` when the input
    metadata has no such key). The input list itself is left untouched.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

'''    
3. Argument Type in a Filter Function
The filter_to_minimal_docs function you showed expects an argument of type List[Document]:

docs: List[Document] means a list (Python list) whose members are Document objects.
The return type -> List[Document] means it gives back a list of Document objects.

In Python, this type hint is just a suggestion for readers and tools; it doesn't enforce the type at runtime.


The built-in list type is used to create and manipulate lists at runtime.
The List from typing is used only for type hinting, to specify that a variable or parameter expects a list 
of a certain type—like List[int] for a list of integers, or List[Document] for a list of Document objects.
'''

In [None]:
minimal_docs = filter_to_minimal_docs(extracted_data)
# Show a small sample only; echoing the full list floods the output with the
# text of every page.
minimal_docs[:3]

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: Upper bound on the size of each chunk, forwarded to the
            splitter. Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: Overlap between neighbouring chunks, forwarded to the
            splitter. Defaults to 20, the value that used to be hard-coded.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)




In [None]:
# Chunk the filtered documents and report how many chunks were produced.
text_chunk = text_split(minimal_docs)
print(f"The number of chunks :  {len(text_chunk)}")

In [None]:
#Getting Vector Embedding

#Loading the model. 
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """Create the HuggingFace embedding model object.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = download_embeddings() # embedding-model object reused by the later cells


In [None]:
# Embed a sample sentence to see what the resulting vector looks like,
# then print the vector and its length (the embedding dimension).
vector = embeddings.embed_query("Hi, My name is Samuel.")
print(vector)
print(len(vector))

In [None]:
# Load the env file (python-dotenv) so the API keys can be read with os.getenv.

from dotenv import load_dotenv
import os
# As the last expression of the cell, load_dotenv()'s return value is displayed.
load_dotenv()

In [None]:
def _require_env(name):
    """Return the value of environment variable ``name`` or fail with a clear message.

    ``os.getenv`` returns ``None`` when the variable is missing, and assigning
    ``None`` into ``os.environ`` raises an unhelpful ``TypeError``. Checking
    up front names the missing key instead. An empty value is rejected too.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "Add it to your .env file (or export it) and re-run the load_dotenv() cell."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
GROQ_API_KEY = _require_env("GROQ_API_KEY")

# Write the keys back into the process environment, as the original cell did.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# os.getenv(key) returns None if the key is not found (or a default value if provided).
# os.environ[key] raises a KeyError if the key is missing.

In [None]:
# Import the Pinecone SDK, authenticate with the API key, and create a client.
from pinecone import Pinecone

# Alias of the PINECONE_API_KEY value read in the previous cell.
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key = pinecone_api_key)
# Echo the client object as the cell output.
pc


In [None]:
# Create the Pinecone database (index) if it does not exist yet.

from pinecone import ServerlessSpec

index_name = "invest-bot"

# Guarded by has_index, so create_index is skipped when the index already exists.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the embedding length (the len(vector) printed
        # in the embedding test cell) — presumably 384 for all-MiniLM-L6-v2; confirm.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

# Handle to the index, looked up by name.
index = pc.Index(index_name)

In [None]:
# Store everything in Pinecone: embed each chunk and upsert the embeddings into the index.
from langchain_pinecone import PineconeVectorStore

# NOTE(review): nothing guards this call, so re-running the cell presumably
# uploads the same chunks again — confirm, and skip it once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents = text_chunk,
    embedding= embeddings,
    index_name= index_name
)

In [None]:
# Load the existing index instead of embedding and uploading the documents again.

from langchain_pinecone import PineconeVectorStore
# Attach to the index by name; this rebinds `docsearch`, replacing the object
# created by the from_documents cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

'''  
Since the index has already been created and populated, this works much like a cache: retrieve from what is stored rather than starting the whole process again.

'''

In [None]:
# Testing to see how the retrieval of data is.

# Similarity search returning the 3 closest chunks.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

# Probe with a question about bank deposits, the topic of the pages scraped
# earlier in this notebook. The previous medical query ("What is Acne?") was
# off-domain for that content and could not show whether retrieval works.
retrieved_docs = retriever.invoke("What is the interest rate on a fixed deposit?")
retrieved_docs



In [None]:
# Connecting the LLM for better readability.
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Initialize Groq chat model.
# `chatModel` is passed to create_stuff_documents_chain when the QA chain is
# built, but it was never defined, so a fresh "Run All" stopped with a NameError.
# ChatGroq reads GROQ_API_KEY from the environment.
# NOTE(review): the model name is a placeholder choice — set it to the
# Groq-hosted model this project should use.
GROQ_MODEL_NAME = "llama-3.1-8b-instant"
chatModel = ChatGroq(model=GROQ_MODEL_NAME)


In [None]:
# The chat prompt has two parts: 1 -> the system message and 2 -> the human (user) message.

# The system message previously cast the model as "an Medical assistant", a
# leftover from a different project. This notebook builds an investment bot
# over bank-deposit pages, so the role is stated accordingly.
# {context} is the placeholder for the retrieved document text.
system_prompt = (
    "You are a financial assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# {input} is the placeholder for the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# create_stuff_documents_chain(chatModel, prompt)

Creates a "stuff documents" chain (the replacement for the legacy `StuffDocumentsChain` class) that:

- Takes a list of Document objects (text chunks).
- Combines ("stuffs") them into a single input text block.
- Uses the given language model (`chatModel`) to process this combined text.
- The prompt specifies how to formulate the input for the model, typically inserting the combined text into the prompt template.
- This chain is useful for summarization, Q&A, extraction tasks where you want the model to see the entire context at once.

# create_retrieval_chain(retriever, question_answer_chain)

Creates a retrieval augmented generation (RAG) chain that:

- Uses the `retriever` (which fetches relevant document chunks based on a query).
- Passes retrieved documents to the `question_answer_chain`.
- Returns a dict that includes the language model's output under the `answer` key (read as `response["answer"]`), generated from the retrieved context.
- This enables efficient document retrieval + language modeling for tasks like open-domain Q&A, where not all documents are sent to the model but only the relevant ones.


In [None]:
# Chain that combines the retrieved documents into the prompt and sends it to the chat model.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Full RAG chain: the retriever fetches documents for the input, then the QA chain answers from them.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a question about bank deposits, the topic of the pages scraped earlier;
# the previous medical question was a leftover from another project.
response = rag_chain.invoke({"input": "What is the fixed deposit interest rate offered by ICICI Bank?"})
print(response["answer"])

In [None]:
# Definition-style question about bank deposits, replacing the leftover medical query.
response = rag_chain.invoke({"input": "What is a recurring deposit?"})
print(response["answer"])

In [None]:
# How-to question about bank deposits, replacing the leftover medical query.
response = rag_chain.invoke({"input": "How can I open a recurring deposit with HDFC Bank?"})
print(response["answer"])

### Flow: PDF to Document Object

1. **PDF File**  
   You start with a PDF file containing raw text, images, formatting, etc.

2. **Loading & Parsing**  
   Use a PDF parser (like `PyPDFLoader` inside `DirectoryLoader`) to extract text content from the PDF.

3. **Create Document Object**  
   Each piece of extracted content (e.g., pages or chunks) is wrapped into a `Document` object. This object stores text (`page_content`) and metadata such as the document source or page number.

4. **Document Processing**  
   These `Document` objects can be passed around your pipeline (search, embedding, language model input, etc.) in a consistent format.
