In [14]:
import os
from pathlib import Path
from typing import List
from io import BytesIO
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
from pymongo import MongoClient
from langchain.tools import tool

# -- Gemini API Key Configuration --
# The key is taken from the GOOGLE_API_KEY environment variable so that no
# credential is stored in the notebook or in version control.
# NOTE(review): a literal key used to be assigned here; treat that key as
# leaked and revoke/rotate it.
_google_api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not _google_api_key:
    # Fail early with an actionable message instead of a late API error.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment before running this notebook."
    )
genai.configure(api_key=_google_api_key)

# -- System Instruction --
# Grounding prompt passed as `system_instruction` to the Gemini models created
# in this file: answer only from the supplied documents, never guess.
SYSTEM_INSTRUCTION = " ".join(
    [
        "You are a helpful college assistant.",
        "Only answer using the content from the provided Sahyadri college PDFs and website pages.",
        "If the answer is not available in the documents, clearly say that you don't have the "
        "information instead of guessing or making up answers.",
    ]
)

# Absolute, fully resolved working directory of the running process; the PDF
# loader resolves stored document paths against its parent.
BASE_DIR = Path.cwd().resolve()

# -- MongoDB Connection --
# One client for the local MongoDB server, plus handles to the database and
# the two knowledge-management collections read by the loaders below.
mongo_client = MongoClient("mongodb://localhost:27017")
db = mongo_client.get_database("chatbot_platform")
KM_documents_collection = db.get_collection("KM_documents")  # PDF records
KM_URLs_collection = db.get_collection("KM_URLs")  # website URL records

# -- Load PDFs from MongoDB --
# Maps each stored filename to {"file": uploaded Gemini file handle,
# "description": free-text description used for relevance filtering}.
# Loading is best-effort: a record that cannot be loaded is reported and
# skipped so that one bad entry never prevents the others from loading.
pdfs = {}
for doc in KM_documents_collection.find():
    # Read the keys defensively. Indexing doc['filename'] inside the error
    # handler would raise a second KeyError for an incomplete record and
    # abort the whole loop.
    filename = doc.get("filename")
    relative_path = doc.get("path")
    if not filename or not relative_path:
        print(f"[ERROR] Skipping KM_documents record without filename/path: {doc.get('_id')}")
        continue
    try:
        # Stored paths are resolved against the parent of BASE_DIR.
        file_path = (BASE_DIR.parent / relative_path).resolve()
        if not file_path.exists():
            print(f"[ERROR] File not found: {file_path}")
            continue
        file = genai.upload_file(file_path, mime_type="application/pdf")
    except Exception as e:
        print(f"[ERROR] Failed to load PDF {filename}: {e}")
        continue
    pdfs[filename] = {
        "file": file,
        # A description stored as null is normalised to "" so that later
        # `.lower()` calls on it cannot fail.
        "description": doc.get("description") or "",
    }

@tool
def filter_relevant_pdfs(query: str) -> List[str]:
    """Returns relevant PDF filenames based on query."""
    # The docstring above is left verbatim: the @tool decorator exposes it as
    # the tool description, so it is part of runtime behaviour.
    #
    # A PDF is relevant when its filename occurs inside the lower-cased query,
    # or when any whitespace-separated word of its description does.
    # Matching is by substring and case-insensitive; results keep the
    # iteration order of `pdfs`.
    needle = query.lower()

    def _is_match(filename, entry):
        if filename.lower() in needle:
            return True
        for token in entry["description"].lower().split():
            if token in needle:
                return True
        return False

    return [filename for filename, entry in pdfs.items() if _is_match(filename, entry)]

@tool
def query_pdfs(query: str, relevant_pdfs: List[str]) -> str:
    """Answer using Gemini over relevant PDF files."""
    # Args:
    #   query: the user's question, passed to the model after the files.
    #   relevant_pdfs: filenames (keys of `pdfs`) to ground the answer on.
    # Returns:
    #   The model's answer text with surrounding whitespace removed, or a
    #   fixed notice when no usable PDF was supplied.
    #
    # Filenames that were never loaded (typos, failed uploads) are skipped
    # rather than raising KeyError; None is treated like an empty list.
    files = [pdfs[name]["file"] for name in (relevant_pdfs or []) if name in pdfs]
    if not files:
        return "No relevant PDF documents found."
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        system_instruction=SYSTEM_INSTRUCTION,
    )
    # Files go first and the question last, in a single multi-part prompt.
    response = model.generate_content([*files, query])
    return response.text.strip()

# -- Load and Process Website Content --
# Maps each URL to {"file": uploaded Gemini file handle, "description": stored
# description, "text": extracted paragraph text used for relevance checks}.
# Loading is best-effort: a page that cannot be fetched or parsed is reported
# and skipped.
webpages = {}
for doc in KM_URLs_collection.find():
    # Read the key defensively. Indexing doc['url'] inside the error handler
    # would raise a second KeyError for an incomplete record and abort the loop.
    url = doc.get("url")
    if not url:
        print(f"[ERROR] Skipping KM_URLs record without url: {doc.get('_id')}")
        continue
    try:
        response = requests.get(url, timeout=10)
        # Reject 4xx/5xx answers; otherwise the body of an error page would be
        # ingested as if it were real site content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Only the text of <p> elements is kept.
        text = " ".join(p.get_text() for p in soup.find_all("p")).strip()
        if not text:
            print(f"[WARN] No text content extracted from: {url}")
            continue
        buffer = BytesIO(text.encode("utf-8"))
        file = genai.upload_file(buffer, mime_type="text/plain")
    except Exception as e:
        print(f"[ERROR] Could not load website {url}: {e}")
        continue
    webpages[url] = {
        "file": file,
        # A description stored as null is normalised to "" so that later
        # `.lower()` calls on it cannot fail.
        "description": doc.get("description") or "",
        "text": text,  # Store content for relevance check
    }

@tool
def filter_relevant_websites(query: str) -> List[str]:
    """Returns relevant website URLs based on query and content."""
    # A page is relevant when ANY of these holds (all case-insensitive
    # substring tests):
    #   1. its URL occurs inside the query,
    #   2. a word of its description occurs inside the query,
    #   3. a word of the query occurs inside the page text.
    # NOTE(review): rule 3 matches on substrings, so common short query words
    # ("a", "is", "the") match almost any page - confirm that is intended.
    relevant = []
    query_lower = query.lower()
    query_words = query_lower.split()
    for url, data in webpages.items():
        description_words = (data.get("description") or "").lower().split()
        if url.lower() in query_lower or any(word in query_lower for word in description_words):
            relevant.append(url)
            continue
        # Lower-case the page text once per page instead of once per query
        # word, and only when the cheaper checks above did not already match.
        page_text = (data.get("text") or "").lower()
        if any(word in page_text for word in query_words):
            relevant.append(url)
    return relevant

@tool
def query_websites(query: str, relevant_websites: List[str]) -> str:
    """Answer using Gemini over relevant websites."""
    # Args:
    #   query: the user's question, passed to the model after the files.
    #   relevant_websites: URLs (keys of `webpages`) to ground the answer on.
    # Returns:
    #   The model's answer text with surrounding whitespace removed, or a
    #   fixed notice when nothing usable was supplied.
    if not relevant_websites:
        return "No relevant website content found."
    # URLs that were never loaded are skipped instead of raising KeyError, as
    # are entries without an uploaded file.
    files = [
        webpages[url]["file"]
        for url in relevant_websites
        if url in webpages and webpages[url].get("file")
    ]
    if not files:
        return "No website content could be processed."
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        system_instruction=SYSTEM_INSTRUCTION,
    )
    # Files go first and the question last, in a single multi-part prompt.
    response = model.generate_content([*files, query])
    return response.text.strip()

@tool
def classify_data_source(query: str) -> str:
    """Decide if the question should be answered using PDFs or Websites."""
    # The docstring above is left verbatim: the @tool decorator exposes it as
    # the tool description, so it is part of runtime behaviour.
    #
    # Returns "pdf" or "web". Markers are matched as case-insensitive
    # substrings of the query. PDF markers take precedence over web markers,
    # and a query with no marker at all falls back to "pdf".
    text = query.lower()
    pdf_markers = ("syllabus", "notes", "pdf", "lecture", "module", "exam")
    web_markers = ("admission", "fee", "placements", "faculty", "website", "about", "college")

    def _mentions(markers):
        return any(marker in text for marker in markers)

    # "web" is chosen only when a web marker is present and no PDF marker is.
    if not _mentions(pdf_markers) and _mentions(web_markers):
        return "web"
    return "pdf"

# -- Chatbot CLI --
def chatbot_interaction():
    """Run the interactive question/answer loop until the user quits.

    Each question is routed to either the PDF or the website tool chain
    (as decided by ``classify_data_source``), and the matched sources and the
    answer are rendered as Markdown. Typing ``exit``, ``quit`` or ``bye``
    (any case, surrounding whitespace ignored) ends the loop, as does a
    closed input stream or an interrupt. Errors raised while answering a
    question are displayed and the loop continues.
    """
    # Imported lazily so the rest of the file does not require IPython.
    from IPython.display import display, Markdown

    display(Markdown("**🤖 Ask me anything about Sahyadri college. Type 'exit' to quit.**"))
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming: leave cleanly instead of surfacing a traceback.
            display(Markdown("**👋 Goodbye!**"))
            break

        if not user_input:
            # Nothing was asked; do not spend a model call on an empty prompt.
            continue
        if user_input.lower() in {"exit", "quit", "bye"}:
            display(Markdown("**👋 Goodbye!**"))
            break

        try:
            source = classify_data_source.invoke(user_input)

            if source == "pdf":
                relevant = filter_relevant_pdfs.invoke(user_input)
                display(Markdown(f"**📄 Relevant PDFs:** {', '.join(relevant) or 'None'}"))
                response = query_pdfs.invoke({"query": user_input, "relevant_pdfs": relevant})
            else:
                relevant = filter_relevant_websites.invoke(user_input)
                display(Markdown(f"**✨ Relevant Websites:** {', '.join(relevant) or 'None'}"))
                response = query_websites.invoke({"query": user_input, "relevant_websites": relevant})

            display(Markdown(f"**🤖 Answer:** {response}"))

        except Exception as e:
            # Top-level boundary of the loop: report the failure and keep the session alive.
            display(Markdown(f"**❌ Error:** {e}"))

# Start the interactive chat loop only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    chatbot_interaction()


**🤖 Ask me anything about Sahyadri college. Type 'exit' to quit.**

**✨ Relevant Websites:** https://www.sahyadri.edu.in/

**🤖 Answer:** Sahyadri College of Engineering and Management (SCEM), established in 2007 by the Bhandary Foundation, is located on the Mangalore-Bangalore National Highway 48, on the banks of the Nethravathi River.  It's a NAAC 'A' grade accredited institution with NBA approval for five engineering programs.  The college offers a range of engineering courses, including Mechanical Engineering (established 2007, implementing Outcome Based Education (OBE) with Continuous Quality Improvement (CQI)), Information Science & Engineering (established 2007, offering BE and PhD degrees, accredited by NAAC and IE, with OBE since 2016), Electronics and Communication Engineering (established 2007, continually upgrading labs), and Computer Science & Engineering (established 2007,  with an increased student intake from 60 to 240 by 2022, adding specialized programs).  A Department of Business Administration, offering a two-year MBA program, was established in 2008 and is affiliated with Visvesvaraya Technological University (VTU).  The college boasts a strong placement record, with over 1000 students receiving placement offers and a highest package of 43.98 LPA (Belc Co Ltd).  The college also highlights its support for student startups and projects,  including funding from GoK.

**📄 Relevant PDFs:** Artificial Intelligence 1.pdf

**🤖 Answer:** Here's the fifth question from the provided Artificial Intelligence 1.pdf:

**Q.5 Solve Any Two of the following.**

A) What is Natural Language Processing? Explain various types of NLP techniques. (Understand, 6 marks)

B) What is Expert system? Explain its Architecture, Features & applications. (Understand, 6 marks)

C) Comparison between Syntactic Processing & Semantic Processing. (Remember, 6 marks)

**👋 Goodbye!**

In [None]:
from bson.binary import Binary

# Debug aid: print the filename and path of the first five KM_documents
# records, and report whether each one carries an embedded 'file' payload
# (with its type and length when present).
for doc in KM_documents_collection.find().limit(5):
    print(f"Filename: {doc.get('filename')}")
    print(f"Path: {doc.get('path')}")
    print(f"Has 'file' field: {'file' in doc}")
    if "file" in doc:
        print(f"Type of 'file': {type(doc['file'])}")
        print(f"Size in bytes: {len(doc['file'])}")
    print("-" * 40)


Filename: Artificial Intelligence 1.pdf
Path: uploads\Artificial Intelligence 1.pdf
Has 'file' field: False
----------------------------------------
Filename: Big Data Analytics 1.pdf
Path: uploads\Big Data Analytics 1.pdf
Has 'file' field: False
----------------------------------------


In [4]:
# Debug aid: resolve the stored path of `doc` to an absolute path and print it.
# `doc` is not assigned in this cell; it is whatever record an earlier cell's
# loop left bound.
# NOTE(review): a relative path is resolved here against the current working
# directory, whereas the PDF loader joins it onto BASE_DIR.parent - confirm
# which base directory the stored paths are meant to be relative to.
file_path = Path(doc['path']).resolve()
print(file_path)


C:\Users\sameer.kavale\git clone collab\Smart-FAQ-Bot\backend\notebook\uploads\Big Data Analytics 1.pdf
