<a href="https://colab.research.google.com/github/VPIITB24/InstiGuru---RAG-based-Conversation-ChatBot/blob/main/InstiGuru_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# InstiGuru:Chatbot

In [2]:
# import the library
import os
import re
import spacy
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.document_loaders import PyMuPDFLoader, PDFPlumberLoader
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from google.colab import drive # Import the drive object

# set the path folder :
nlp = spacy.load("en_core_web_sm")
drive.mount('/content/drive')
pdf_folder = "/content/drive/MyDrive/IITB_pdf"
faiss_folder = "/content/drive/MyDrive/IITB_vector_db2"
groq_api_key = os.getenv("GROQ_API_KEY")

Mounted at /content/drive


In [3]:
'''from langchain_community.document_loaders import PyMuPDFLoader
import os

def load_pdfs(folder):
    docs = []
    for root, _, files in os.walk(folder):
        for f in files:
            if f.lower().endswith(".pdf"):
                path = os.path.join(root, f)
                loader = PyMuPDFLoader(path)  # preserves more hyperlinks
                pages = loader.load()
                for p in pages:
                    p.metadata["source"] = f
                    docs.append(p)
    return docs

docs = load_pdfs(pdf_folder)
print(f"Loaded {len(docs)} pages.")'''


'from langchain_community.document_loaders import PyMuPDFLoader\nimport os\n\ndef load_pdfs(folder):\n    docs = []\n    for root, _, files in os.walk(folder):\n        for f in files:\n            if f.lower().endswith(".pdf"):\n                path = os.path.join(root, f)\n                loader = PyMuPDFLoader(path)  # preserves more hyperlinks\n                pages = loader.load()\n                for p in pages:\n                    p.metadata["source"] = f\n                    docs.append(p)\n    return docs\n\ndocs = load_pdfs(pdf_folder)\nprint(f"Loaded {len(docs)} pages.")'

In [None]:
# Load the pdf Document of the IITB from the google drive

def load_pdfs(folder):
    docs = []
    for root, _, files in os.walk(folder):
        for f in files:
            if f.lower().endswith(".pdf"):
                path = os.path.join(root, f)
                loader = PyPDFLoader(path)
                pages = loader.load()
                for p in pages:
                    p.metadata["source"] = f
                    docs.append(p)
    return docs

docs = load_pdfs(pdf_folder)
print(f"Loaded {len(docs)} pages.")


In [None]:
# Keep critical stopwords for meaning
important_stopwords = {"not", "no", "if", "but"}
custom_stopwords = set(spacy.lang.en.stop_words.STOP_WORDS) - important_stopwords

def clean_basic(text):
    if not text:
        return ""
    text = text.replace("\xa0", " ")  # non-breaking space
    text = re.sub(r"\s+", " ", text)  # multiple spaces -> one
    return text.strip()

def nlp_preprocess_safe(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        # Keep emails, URLs, and numbers exactly as they are
        if token.like_email or token.like_url or token.is_digit:
            tokens.append(token.text)
        elif token.is_alpha:
            if token.text.lower() not in custom_stopwords:
                tokens.append(token.lemma_.lower())  # normalize words
        else:
            # Keep important punctuation that can signal structure
            if token.text in {"?", "!", ".", ",", "@", ":", "-", "/"}:
                tokens.append(token.text)
    return " ".join(tokens)

def clean_and_nlp_safe(text):
    cleaned = clean_basic(text)
    return nlp_preprocess_safe(cleaned)

# Apply to all docs without losing info
docs = [
    Document(page_content=clean_and_nlp_safe(d.page_content), metadata=d.metadata)
    for d in docs
]

print("Text cleaned safely — emails, URLs, and numbers preserved.")


In [None]:
# chunk the pdf text using semantic and recursive text splitter

# Function for hybrid splitting
def hybrid_split_documents(docs, embeddings, chunk_size=1000, chunk_overlap=250):
    semantic_splitter = SemanticChunker(embeddings=embeddings)
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    semantic_chunks = semantic_splitter.split_documents(docs)

    final_chunks = []
    for chunk in semantic_chunks:
        if len(chunk.page_content) > chunk_size:
            final_chunks.extend(recursive_splitter.split_documents([chunk]))
        else:
            final_chunks.append(chunk)

    return final_chunks

In [None]:

# Create embeddings
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# First, call the hybrid splitter to get the chunks
final_chunks = hybrid_split_documents(docs, embeddings, chunk_size=1000, chunk_overlap=250)


In [None]:

# Then build FAISS index
print("Creating FAISS index from chunks...")
faiss_index = FAISS.from_documents(final_chunks, embeddings)

# save the index.file as FAISS  in the vector form :

print(f"Saving FAISS index locally to {faiss_folder} ...")
faiss_index.save_local(faiss_folder)
print("FAISS index saved.")

In [4]:
# Load the FIASS index from the disk
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
faiss_index = FAISS.load_local(faiss_folder, embeddings, allow_dangerous_deserialization=True)
print("FAISS index loaded.")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

FAISS index loaded.


In [5]:
# set the retriver and memory

retriever = faiss_index.as_retriever(search_kwargs={"k":3})
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [6]:
# Corrected Prompt Template
prompt = PromptTemplate(
    input_variables=["chat_history", "context", "question"],
    template="""
You are a knowledgeable and respectful assistant specializing in IIT Bombay.

**Answering Rules (Follow Strictly):**
- ONLY use information from the "Context from Documents" section below.
- DO NOT use outside knowledge, assumptions, or fabricated details.
- If the question is about any student, provide all information explicitly available in the context.
- If the answer is NOT explicitly in the context, reply exactly with: "Hmm, I'm not sure."
- Never guess or infer beyond the given context.

**Style Guidelines:**
- Keep tone warm, polite, and respectful.
- If context supports it, write answers in detail with clear explanations; otherwise, give a concise single-line response.
- Present the answer line-by-line in a well-structured manner.

---

Chat History:
{chat_history}

Context from Documents:
{context}

User Question:
{question}

Answer:
"""
)


In [15]:
import os
from langchain_groq import ChatGroq
from google.colab import userdata

# Fetch API key from Colab secrets
groq_api_key = userdata.get("GROQ_API_KEY")

# Initialize LLM
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0.3,
    api_key=groq_api_key
)

In [16]:
print("Groq API Key Loaded:", groq_api_key[:5] + "*****")  # shows only partial key


Groq API Key Loaded: gsk_q*****


In [17]:
# Runnable pipeline

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

chain = (
    RunnablePassthrough.assign(
        chat_history=lambda x: memory.chat_memory.messages,
        context=lambda x: "\n".join([doc.page_content for doc in retriever.get_relevant_documents(x["question"])]),
        question=lambda x: x["question"]
    )
    | prompt
    | llm
    | StrOutputParser()
)


In [18]:
# now  ask our query

response = chain.invoke({"question": "can  best place in the campus for visit "})

print(response)

Here are some great places to visit on the IIT Bombay campus:

1. **Guest Houses**: The campus has three charming guest houses - Jalvihar, Vanvihar, and Padmavihar - that offer a serene environment and modern amenities.

2. **Gulmohar Café**: Located near the guest houses, it's a great spot to relax and unwind.

3. **LHC Food Court (LHCFC)**: Situated in the Atrium Lecture Hall Complex, opposite the Kresit building, it features various outlets offering juice, beverages, North Indian, South Indian, Asian cuisine, and chaat.

4. **Quantum Café**: This new dining space is located on the floor of the Rahul building and provides a pleasant atmosphere.

5. **Institute Central Library**: This fully air-conditioned library is a great place to explore, with a vast collection of books, journals, and theses. It's open till 10 pm on weekdays and 5 pm on weekends and holidays, and also provides high-speed Wi-Fi internet access.

These are some of the best places to visit on the IIT Bombay campus.


In [None]:
# vindeshwari _end project_ chabot

In [1]:
! pip install os-sys   # (usually comes pre-installed with Python, so you can
! pip install regex    # for advanced regex if needed, but re is built-in
! pip install spacy
! python -m spacy download en_core_web_sm

! pip install langchain
! pip install langchain-community
! pip install langchain-experimental
! pip install langchain-groq
! pip install faiss-cpu
! pip install fastembed
! pip install pymupdf
! pip install pdfplumber


Collecting os-sys
  Downloading os_sys-2.1.4-py3-none-any.whl.metadata (9.9 kB)
Collecting pygubu (from os-sys)
  Downloading pygubu-0.38.2-py3-none-any.whl.metadata (7.3 kB)
Collecting progress (from os-sys)
  Downloading progress-1.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting progressbar (from os-sys)
  Downloading progressbar-2.5.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jupyter (from os-sys)
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting Eel (from os-sys)
  Downloading eel-0.18.2.tar.gz (26 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting extract-zip (from os-sys)
  Downloading extract_zip-1.0.0-py3-none-any.whl.metadata (403 bytes)
INFO: pip is looking at multiple versions of os-sys to determine which version is compatible with other requirements. This could take a while.
Collecting os-sys
  Downloading os_sys-2.1.3-py3-none-any.whl.metadata (9.9 kB)
  Downloading os_sys-2.1.2-py3-none-any.

In [13]:
response = chain.invoke({"question": "What are the library hours?"})
print(response)

According to the context, the library hours are as follows:

* On weekdays, the library is open in the evening until a certain time in the night (exact timing not specified).
* On weekends and holidays, the library is also open (exact timing not specified).

Please note that the exact timings are not provided in the context. If you need more information, I suggest visiting the IIT Bombay Central Library website or contacting the library directly.
