In [None]:
!pip install -q PyPDF2 sentence-transformers chromadb google-generativeai


In [None]:
import os
import io
import json
import zipfile
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
import google.generativeai as genai

# Embedding model
EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

# ChromaDB persistent client
CHROMA_CLIENT = chromadb.PersistentClient(path="docs_db")
COLLECTION = CHROMA_CLIENT.get_or_create_collection(name="doc_chunks")

# Gemini API key.
# SECURITY: never store a literal key in the notebook -- anyone who can read the
# file can use it. Any key that was previously committed here should be revoked.
# Lookup order: environment variable -> Colab secret -> interactive prompt.
GEN_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not GEN_API_KEY:
    try:
        # Only importable on Colab; raises if the secret is missing or access is denied.
        from google.colab import userdata

        GEN_API_KEY = userdata.get("GEMINI_API_KEY")
    except Exception:
        GEN_API_KEY = None
if not GEN_API_KEY:
    # Last resort: ask the user without echoing the key into the cell output.
    from getpass import getpass

    GEN_API_KEY = getpass("Gemini API key: ").strip()
genai.configure(api_key=GEN_API_KEY)
GEMINI_MODEL = genai.GenerativeModel("gemini-2.5-flash")


In [None]:
def parse_pdf(file_path: str, chunks: list, file_id: str):
    """Extract one text chunk per page of a PDF and append it to ``chunks``.

    Args:
        file_path: Path of the PDF file to read.
        chunks: List that is extended in place with one dict per page that
            yields text. Each dict has the keys ``chunkId``, ``pageNumbers``,
            ``fileName`` and ``text``.
        file_id: Prefix used to build each ``chunkId`` as
            ``"<file_id>_page_<n>"``, where ``n`` is the 1-based page number.

    Returns:
        None. Results are delivered through ``chunks``.
    """
    reader = PdfReader(file_path)
    file_name = os.path.basename(file_path)
    for page_number, page in enumerate(reader.pages, start=1):
        # Strip BEFORE the emptiness check so pages that contain only
        # whitespace are skipped instead of producing an empty-text chunk.
        text = (page.extract_text() or "").strip()
        if not text:
            continue
        chunks.append(
            {
                "chunkId": f"{file_id}_page_{page_number}",
                "pageNumbers": [page_number],
                "fileName": file_name,
                "text": text,
            }
        )

def parse_folder(folder_path: str, json_out="parsed_pdfs.json"):
    """Parse every ``.pdf`` file in a folder and dump the chunks to a JSON file.

    Args:
        folder_path: Directory whose entries are scanned (non-recursively).
            Only names ending in ``.pdf`` (case-insensitive) are parsed.
        json_out: Path of the JSON file to write. It receives an object of
            the form ``{"chunks": [...]}``.

    Returns:
        ``json_out``, so the result can be passed straight to the next step.
    """
    chunks = []
    # os.listdir() returns entries in arbitrary order; sort so that the chunk
    # order in the JSON file is reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue
        # The file name without its extension becomes the chunk-id prefix.
        file_id, _ = os.path.splitext(file_name)
        parse_pdf(os.path.join(folder_path, file_name), chunks, file_id)

    with open(json_out, "w", encoding="utf-8") as f:
        json.dump({"chunks": chunks}, f, indent=2, ensure_ascii=False)
    return json_out


In [None]:
def store_in_vector_db(json_path="parsed_pdfs.json"):
    """Embed the chunks from a parsed-PDF JSON file and store them in ``COLLECTION``.

    Entries whose ``page`` metadata is ``>= 0`` are deleted from the collection
    first, then every chunk with non-blank text is embedded with ``EMBED_MODEL``
    and added together with ``page`` / ``file`` metadata.

    Args:
        json_path: Path of a JSON file shaped like ``{"chunks": [...]}``, where
            each chunk may carry ``chunkId``, ``pageNumbers``, ``fileName`` and
            ``text``.

    Returns:
        The number of chunks that were stored (``0`` when there was nothing
        to store).
    """
    try:
        # Best-effort wipe of previously indexed chunks.
        COLLECTION.delete(where={"page": {"$gte": 0}})
    except Exception as exc:
        # Stay best-effort, but report the failure instead of hiding it and
        # do not swallow KeyboardInterrupt/SystemExit like a bare except would.
        print(f"⚠️ Could not clear existing chunks: {exc}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts, ids, metadatas = [], [], []
    for i, chunk in enumerate(data.get("chunks", [])):
        # Tolerate a missing or null "text" value.
        text = chunk.get("text") or ""
        if not text.strip():
            continue
        # Tolerate a missing, null or empty "pageNumbers" list.
        page_numbers = chunk.get("pageNumbers") or [None]
        ids.append(chunk.get("chunkId", f"chunk_{i}"))
        metadatas.append({
            "page": page_numbers[0],
            "file": chunk.get("fileName", "unknown")
        })
        texts.append(text)

    if not texts:
        # Nothing to embed; skip the model and the collection entirely.
        return 0

    embeddings = EMBED_MODEL.encode(texts, show_progress_bar=True).tolist()
    COLLECTION.add(ids=ids, documents=texts, embeddings=embeddings, metadatas=metadatas)
    return len(texts)


In [None]:
def retrieve(query: str, n_results=5):
    """Look up the stored chunks closest to ``query``.

    The query string is embedded with ``EMBED_MODEL`` and the resulting vector
    is handed to ``COLLECTION.query``; whatever that call returns is passed
    back to the caller unchanged.

    Args:
        query: Text to search for.
        n_results: Number of results requested from the collection.
    """
    query_vector = EMBED_MODEL.encode(query).tolist()
    return COLLECTION.query(query_embeddings=[query_vector], n_results=n_results)

def generate_answer(query: str, results):
    """Answer ``query`` with the Gemini model, using retrieved chunks as context.

    Args:
        query: The user's question.
        results: Mapping with ``"documents"`` and ``"metadatas"`` entries, each
            a list whose first element holds the hits for this query (the
            structure returned by ``retrieve``). ``"metadatas"`` may be
            missing, ``None``, shorter than the documents, or contain ``None``
            entries; such documents are cited with ``None`` file and page.

    Returns:
        The model's answer text, ``"No relevant context found."`` when there
        are no documents, or an error string starting with ``"❌ LLM Error:"``
        when the model call fails.
    """
    documents = results.get("documents") or [[]]
    if not documents[0]:
        return "No relevant context found."
    docs = documents[0]

    metadatas = results.get("metadatas") or [[]]
    metas = list(metadatas[0] or [])
    # Pad the metadata so zip() never drops a document that lacks a metadata entry.
    metas.extend([None] * (len(docs) - len(metas)))

    context_texts = []
    for doc, meta in zip(docs, metas):
        meta = meta or {}  # an individual entry may be None
        context_texts.append(f"{doc}\n(File: {meta.get('file')}, Page: {meta.get('page')})")

    context = "\n\n".join(context_texts)
    prompt = f"""
You are a documentation assistant.
Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer clearly and concisely.
"""
    try:
        response = GEMINI_MODEL.generate_content(prompt)
        # Accessing .text stays inside the try so a failure there is reported too.
        return response.text
    except Exception as e:
        return f"❌ LLM Error: {str(e)}"


In [None]:
from google.colab import files
import os

# Create folder for uploaded PDFs
os.makedirs("uploaded_pdfs", exist_ok=True)

print("📂 Upload one or more PDFs (hold Ctrl/Cmd to select multiple)")

# Upload files; the result is indexed by file name and holds each file's raw bytes
uploaded = files.upload()

# Save all uploaded PDFs (anything that is not a .pdf is ignored)
pdf_files = []
for filename, content in uploaded.items():
    if not filename.lower().endswith(".pdf"):
        continue
    path = os.path.join("uploaded_pdfs", filename)
    with open(path, "wb") as f:
        f.write(content)
    pdf_files.append(filename)
    print(f"✅ Saved: {filename}")

if not pdf_files:
    # Nothing new to index: stop here instead of re-parsing the folder and
    # rebuilding the collection after reporting that no PDFs were uploaded.
    print("❌ No PDFs were uploaded.")
else:
    print(f"📄 PDFs ready for processing: {pdf_files}")

    # Parse PDFs and store in vector DB.
    # NOTE: parse_folder() reads every PDF in "uploaded_pdfs", so files saved
    # by earlier runs of this cell are indexed again alongside the new ones.
    json_file = parse_folder("uploaded_pdfs")
    count = store_in_vector_db(json_file)
    print(f"✅ Stored {count} chunks from uploaded PDFs.")


📂 Upload one or more PDFs (hold Ctrl/Cmd to select multiple)


Saving INDHUJA A AND DEEPIKA M.pdf to INDHUJA A AND DEEPIKA M (4).pdf
Saving HamsaVardhiniM_Proj_Doc.pdf to HamsaVardhiniM_Proj_Doc (4).pdf
Saving Divit_team (1).pdf to Divit_team (1) (4).pdf
✅ Saved: INDHUJA A AND DEEPIKA M (4).pdf
✅ Saved: HamsaVardhiniM_Proj_Doc (4).pdf
✅ Saved: Divit_team (1) (4).pdf
📄 PDFs ready for processing: ['INDHUJA A AND DEEPIKA M (4).pdf', 'HamsaVardhiniM_Proj_Doc (4).pdf', 'Divit_team (1) (4).pdf']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Stored 32 chunks from uploaded PDFs.


In [None]:
# One-off demo question: fetch the 8 closest chunks, then let the model answer from them.
query = "Summarize all uploaded documents."
results = retrieve(query, n_results=8)
answer = generate_answer(query, results)
print("\n🤖 Answer:\n", answer)



🤖 Answer:
 The uploaded documents describe two distinct projects:

1.  **Amazon Review Scraper with GAN-Based Sentiment Analysis (ReviewVisionGAN)**: This Python-based tool automates Amazon login, scrapes product reviews (including author, rating, title, text, date) from multiple pages, and uses a Generative Adversarial Network (GAN) model to classify review sentiment as positive, negative, or neutral. The enriched data is saved in JSON format. Key features include asynchronous scraping with Playwright and asyncio, robust error handling with logging and screenshots, and randomized user-agent/locale settings for realistic browsing. Future improvements aim to automate CAPTCHA/MFA, rotate proxies, and offer additional export formats or direct NLP integration.

2.  **Integration of Large Language Models (LLMs) for Predictive Capabilities**: This enhancement focuses on integrating LLMs into an existing framework that uses a Feed-Forward Neural Network (FNN) to predict adsorption efficiency

In [None]:
# Interactive question/answer loop; ends on "exit"/"quit", EOF, or an interrupt.
while True:
    try:
        query = input("❓ Ask a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closing or interrupting the prompt ends the session without a traceback.
        break
    if not query:
        # Blank input: ask again instead of spending a retrieval and an LLM call.
        continue
    if query.lower() in ("exit", "quit"):
        break
    results = retrieve(query, n_results=8)
    print("\n🤖 Answer:\n", generate_answer(query, results), "\n")


❓ Ask a question (or type 'exit'): hamsa vardhini

🤖 Answer:
 Hamsa Vardhini M is a B.Tech ADS Final Year student with Registration No: 312322201052.

She is working on a project titled "Adsorptive Removal of Heavy Metals from Wastewater: A Review on Artificial Neural Networks Based Predictive Models". This project aims to develop a predictive model for the efficient removal of heavy metals, specifically gold, from wastewater using artificial neural networks (ANNs) and biochar. The project utilizes a synthetic dataset of 5,000 rows, including parameters such as Initial Metal Concentration, Adsorbent Dosage, Contact Time, Solution pH, Temperature, Mixing Speed, and Pyrolysis Temperature. 

