In [1]:

!pip install langchain langchain-google-genai chromadb PyMuPDF pillow python-dotenv






In [2]:

import os
import fitz
import shutil
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load variables from a local .env file into the process environment, then
# stop immediately if the Google API key is still missing or empty.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GOOGLE_API_KEY:
    print("API Key Loaded ✅")
    print("Task Cleared ✅")
else:
    raise ValueError("Please set GOOGLE_API_KEY in your .env file")


API Key Loaded ✅
Task Cleared ✅


In [5]:

def extract_pdf_content(pdf_path, image_dir="pdf_images"):
    """Extract all text and embedded images from a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        image_dir: Directory that receives the extracted images. It is deleted
            and recreated on every call, so images from earlier runs are lost.
            Defaults to ``"pdf_images"``.

    Returns:
        A ``(text, image_paths)`` tuple. ``text`` is the text of every page, in
        page order, with a newline appended after each page. ``image_paths``
        lists the PNG files written, named
        ``<image_dir>/page<P>_img<I>.png`` with 1-based page and image numbers.
    """
    # Open the document first: if the path is bad this raises before the
    # images left by a previous successful run are wiped.
    doc = fitz.open(pdf_path)
    try:
        if os.path.exists(image_dir):
            shutil.rmtree(image_dir)
        os.makedirs(image_dir)

        page_texts = []
        image_paths = []
        for page_num, page in enumerate(doc, start=1):
            page_texts.append(page.get_text() + "\n")
            for idx, img in enumerate(page.get_images(full=True), start=1):
                # The first item of each image entry is passed as the image
                # reference when building the pixmap.
                pix = fitz.Pixmap(doc, img[0])
                # More than three colour components (alpha excluded) means
                # CMYK, which PNG cannot store: convert to RGB first. The old
                # `pix.n < 5` test let CMYK-without-alpha (n == 4) through.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                img_name = f"{image_dir}/page{page_num}_img{idx}.png"
                pix.save(img_name)
                image_paths.append(img_name)
    finally:
        # Always release the file handle, even if extraction fails midway.
        doc.close()

    # Join once instead of growing a string page by page.
    return "".join(page_texts), image_paths

# Ask for the document location, run the extraction, and summarise the result.
pdf_path = input("Enter PDF path: ")
text_data, image_files = extract_pdf_content(pdf_path)

for summary_line in (
    f"\nExtracted {len(text_data)} characters of text.",
    f"Extracted {len(image_files)} images.",
    "Task Cleared ✅",
):
    print(summary_line)



Extracted 41424 characters of text.
Extracted 6 images.
Task Cleared ✅


In [6]:

# Split the extracted text into overlapping chunks (size 1000, overlap 100).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
chunks = splitter.split_text(text_data)

print(f"\nTotal Chunks: {len(chunks)}")
# Preview: the first 200 characters of at most the first three chunks.
# zip() stops at the shorter input, so fewer than three chunks is fine.
for i, chunk in zip(range(1, 4), chunks):
    print(f"\nChunk {i}:\n{chunk[:200]}...")
print("\nTask Cleared ✅")



Total Chunks: 51

Chunk 1:
© 2019 IJRAR June 2019, Volume 6, Issue 2                                           www.ijrar.org  (E-ISSN 2348-1269, P- ISSN 2349-5138) 
IJRAR1ARP035 
International Journal of Research and Analytical...

Chunk 2:
applications of Machine Learning were discovered and brought to light, especially in healthcare, finance, speech recognition, 
augmented reality, and more complex 3D and video applications. In machine...

Chunk 3:
can access data and use it learn for themselves. Machine learning (ML) is the scientific study of algorithms and statistical models that 
computer systems use in order to perform a specific task effec...

Task Cleared ✅


In [7]:

# Model identifiers are named up front so they are easy to spot and swap.
EMBEDDING_MODEL_NAME = "models/embedding-001"
LLM_MODEL_NAME = "gemini-1.5-flash"

# One client embeds text; the other generates answers.
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME)
llm_model = GoogleGenerativeAI(model=LLM_MODEL_NAME)

for status_line in ("Models Initialized ✅", "Task Cleared ✅"):
    print(status_line)


Models Initialized ✅
Task Cleared ✅


In [8]:

# Embedding an empty list fails with an obscure downstream error, so stop
# early with a clear message (e.g. a PDF that yielded no extractable text).
if not chunks:
    raise ValueError("No text chunks to index; the PDF yielded no extractable text.")

# Rebuild the on-disk store from scratch so chunks indexed from an earlier
# document cannot show up in this run's retrieval results.
db_path = "chroma_db"
if os.path.exists(db_path):
    shutil.rmtree(db_path)

vector_db = Chroma.from_texts(texts=chunks, embedding=embedding_model, persist_directory=db_path)
# No explicit vector_db.persist(): the call is deprecated (it emitted a warning
# in this notebook), and with Chroma >= 0.4 a store created with a
# persist_directory is written to disk automatically.

print("Chroma Vector Store Ready ✅")
print("Task Cleared ✅")


Chroma Vector Store Ready ✅
Task Cleared ✅


  vector_db.persist()


In [9]:

# Retrieval step returns k=3 chunks per query; the chain passes them to the LLM.
retriever = vector_db.as_retriever(search_kwargs=dict(k=3))
qa_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm_model)

for status_line in ("Retrieval QA Chain Ready ✅", "Task Cleared ✅"):
    print(status_line)


Retrieval QA Chain Ready ✅
Task Cleared ✅


In [None]:

# Interactive question/answer loop over the indexed document.
print("\nChatbot Ready! Ask about the document (type 'exit' to quit).")

while True:
    # Strip whitespace so inputs such as "exit " are still recognised.
    question = input("\nYou: ").strip()
    if not question:
        # Don't spend an LLM call on a blank line; just prompt again.
        continue
    if question.lower() in ("exit", "quit"):
        print("Goodbye!")
        break

    # Chain.run() is deprecated; invoke() takes the question under "query"
    # and returns a dict whose "result" entry holds the answer text.
    response = qa_chain.invoke({"query": f"Answer concisely based on the document: {question}"})
    answer = response["result"]
    print("\nAI:", answer)
    print("Task Cleared ✅")



Chatbot Ready! Ask about the document (type 'exit' to quit).


  answer = qa_chain.run(f"Answer concisely based on the document: {question}")



AI: The provided text describes machine learning (ML), its algorithms (random forests, neural networks, etc.), and its applications.  It does not contain a response to "hi".

Task Cleared ✅

AI: The provided text does not contain information about how "u" (presumably "you") are.

Task Cleared ✅

AI: The provided text does not name the authors of the paper.

Task Cleared ✅
