In [1]:
# !pip install groq

In [2]:
!pip install langchain -q
!pip install langchain_chroma -q
!pip install langchain_community -q
!pip install langchain_groq -q
!pip install grandalf -q
!pip install numpy -q
!pip install pandas -q
!pip install pypdf -q
!pip install sentence-transformers -q #takes 2 min to exec
# !pip install groq-gradio

In [3]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop while the chains are invoked -- confirm it is still required.
nest_asyncio.apply()

In [4]:
import os
from getpass import getpass

# Credentials must never be committed with the notebook: use the key already
# present in the environment and only prompt (without echo) when it is missing.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ_API_KEY: ")
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To suppress huggingface warnings

import warnings
warnings.filterwarnings("ignore")

from groq import Groq

In [5]:
from langchain_groq import ChatGroq
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings


# Embedding model used to vectorise the document chunks and the queries.
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Chat model used for RAG answers.
rag_llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0.1,
)


In [6]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the rulebook PDF; the loader returns a list of pages.
pdf_path = "./ugrulebook.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Break the long document into smaller chunks that fit into the model's
# context window. The two hyperparameters are the chunk size and the overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)
docs_spl = text_splitter.split_documents(docs)
len(docs_spl)


92

In [7]:

from langchain_chroma import Chroma

# Embed the chunks and store them in a Chroma collection (takes about one
# minute to run), then expose the store as a retriever.
vectorstore = Chroma.from_documents(
    documents=docs_spl,
    embedding=embed_model,
    collection_name="groq_rag",
)
retriever = vectorstore.as_retriever()
print(f"Documents indexed: {len(docs_spl)}")

Documents indexed: 92


In [8]:
await retriever.ainvoke("What are the eligibility criteria for applying for a change of branch/ programme?")

# splitting can be improved ... eg \n is part of a word

[Document(id='10c149a0-ff0a-412e-81e5-239fc9ab2c76', metadata={'author': 'pritap', 'creationdate': '2025-01-27T11:14:55+05:30', 'creator': 'Microsoft® Word 2016', 'moddate': '2025-01-27T11:14:55+05:30', 'page': 34, 'page_label': '35', 'producer': 'Microsoft® Word 2016', 'source': './ugrulebook.pdf', 'total_pages': 52}, page_content='there are valid requests. \nE) All changes of branch can b e effected only once at the beginning of the second academic \nyear. No application for change of branch during the subsequent academic years will be \nentertained. \nF) Branch change decisions will be final and will not be reversed. \nG) To run the LASE programme, the minimum student strength for the LASE programme should \nbe 10. If less than 10 students are allotted the LASE programme after branch change then \nthe result will be considered as null and void.'),
 Document(id='883e3aa9-9612-45ba-9098-008f401f7ddb', metadata={'author': 'pritap', 'creationdate': '2025-01-27T11:14:55+05:30', 'creator'

In [9]:
from langchain_core.documents import Document
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from typing import List, Dict

# System message template. `{context}` is the placeholder that receives the
# retrieved text. The trailing backslashes are line continuations inside the
# string literal, so no newline is emitted at those points (the string has no
# leading or trailing newline).
RAG_SYSTEM_PROMPT = """\
You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context given within delimiters to answer the human's questions.
```
{context}
```
If you don't know the answer, just say that you don't know.\
""" # adapted from https://smith.langchain.com/hub/rlm/rag-prompt-llama3

# Human message template: the user's question is inserted verbatim as `{input}`.
RAG_HUMAN_PROMPT = "{input}"

# Two-message chat prompt (system + human) with the variables 'context' and 'input'.
RAG_PROMPT = ChatPromptTemplate.from_messages([
    ("system", RAG_SYSTEM_PROMPT),
    ("human", RAG_HUMAN_PROMPT)
])

def format_docs(docs: List[Document]):
    """Merge the text of the retrieved documents into one newline-separated string."""
    page_texts = [document.page_content for document in docs]
    return "\n".join(page_texts)

# RAG pipeline: the incoming question is mapped onto the prompt variables,
# the prompt is rendered, sent to the LLM and reduced to a plain string.
rag_chain = (
    {
        # Retrieve documents from the vectorstore and format them into a single string
        "context": retriever | format_docs,
        # Propagate the question unchanged as the 'input' variable
        "input": RunnablePassthrough(),
    }
    | RAG_PROMPT         # format the prompt with the 'context' and 'input' variables
    | rag_llm            # get the response from the LLM using the formatted prompt
    | StrOutputParser()  # keep only the string content of the LLM response
)

In [10]:
await rag_chain.ainvoke("What are the time for holding lectures for first year UG students ?")


'According to the provided context, the lectures for first-year UG students are to be held only between 8:30 am and 5:30 pm, and only on working days.'

In [11]:
await rag_chain.ainvoke("How many candidates does IIT Bombay take in annually ?")


'According to the provided context, IIT Bombay annually admits:\n\n* More than 1000 candidates for undergraduate programmes (B.Tech./Dual Degree and B.S.) through the Joint Entrance Examination (JEE)\n* More than 30 candidates for B.Des. Programme through the Undergraduate Common Entrance Exam for Design (UCEED)\n* Around 300 candidates for M.Sc. and M.Sc. Ph.D. Dual Degree programmes\n* More than 1000 candidates for postgraduate programmes\n* Around 300 candidates for Ph.D. programmes'

In [12]:
await rag_chain.ainvoke("What are the requirements of getting a degree ?")


'According to the provided context, the requirements for the degree are:\n\na) The student should have taken and passed all the courses prescribed for the degree under the general institutional and departmental requirements.\nb) The student should have satisfactorily fulfilled other academic requirements such as practical training, NSS/NSO/NCC, work visits, seminar and projects, as specified for the discipline/programme.\nc) The student should have paid all the Institute dues.\n\nAdditionally, the student must complete the academic requirements of the B.Tech. Degree.'

In [13]:
await rag_chain.ainvoke("Explain the organisational structure for academic matters.")


'According to the provided context, the organisational structure for academic matters at the Institute is as follows:\n\n* The Senate is the supreme body that governs all academic matters of the Institute. It is a statutory body that approves rules and regulations for academic programmes.\n* The Senate has two Institute-level sub-committees:\n\t+ Undergraduate Programmes Committee (UGPC) for undergraduate programmes\n\t+ Post-Graduate Programmes Committee (PGPC) for post-graduate programmes\n* The Dean of Academic Programmes (Dean, AP) and the Associate Dean of Academic Programmes (Associate Dean, AP) are the Conveners & Co-conveners respectively of these committees.\n* The Senate also has two Institute-level committees for performance and evaluation:\n\t+ Undergraduate Academic Performance Evaluation Committee (UGAPEC)\n\t+ Postgraduate Academic Performance Evaluation Committee (PGAPEC)\n* Conveners for these committees are designated from among Senate members.\n* Each department has 

In [16]:
response = await rag_chain.ainvoke("What is SPI and what is CPI ? Refer to the Glossary")
print(response)

According to the provided context, SPI stands for Semester Performance Index, which is a number that indicates the performance of a student in a semester. It is the weighted average of the grade points obtained in all the courses registered by the student during the semester.

CPI stands for Cumulative Performance Index, which is an up-to-date assessment of the overall performance of a student from the time they entered the Institute. It is calculated in a manner similar to the calculation of SPI, considering all the courses registered by the student towards the minimum requirement of the degree they have enrolled for, since they entered the Institute.


In [17]:
def add_docs(path, persist_directory="output/general_knowledge", chunk_size=1000, chunk_overlap=200):
    """Split a PDF into chunks and store their embeddings in a persisted Chroma store.

    Args:
        path: Location of the PDF. Either a path (``str`` / ``os.PathLike``) or
            an object that exposes the path as ``.name``.
            NOTE(review): some Gradio versions are believed to hand upload
            callbacks a temp-file wrapper instead of a string -- confirm
            against the installed version.
        persist_directory: Directory of the Chroma store. The default is the
            directory ``answer_query`` reads from.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The Chroma vector store built from the chunks.

    Raises:
        ValueError: If no file was supplied or no text chunks could be
            extracted from the PDF.
    """
    if path is None:
        raise ValueError("No PDF file was provided.")

    # A plain path is used as-is; anything else is expected to carry the
    # on-disk location in its `.name` attribute.
    if isinstance(path, (str, os.PathLike)):
        file_path = path
    else:
        file_path = getattr(path, "name", path)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = PyPDFLoader(file_path=file_path).load_and_split(text_splitter=splitter)
    if not chunks:
        # Fail with a clear message instead of passing an empty list to the store.
        raise ValueError(f"No text could be extracted from {file_path!r}.")

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embed_model,
        persist_directory=persist_directory,
    )
    return vectorstore

def answer_query(message, chat_history):
    """Answer a chat message from the persisted document store and record the turn.

    Args:
        message: The question typed by the user.
        chat_history: The chatbot history; ``None`` is treated as empty. The
            new turn is appended as a ``(user, assistant)`` pair.
            NOTE(review): this assumes the Chatbot component uses the
            pair-based history format -- confirm against the installed Gradio
            version (newer releases use role/content dicts).

    Returns:
        A ``("", chat_history)`` tuple: the empty string is written back to
        the textbox (clearing it) and the updated history to the chatbot.
    """
    if chat_history is None:
        chat_history = []
    if not message or not message.strip():
        # Nothing to ask: leave the history untouched.
        return "", chat_history

    # Query the store that add_docs writes to, not the notebook-level retriever.
    vectorstore = Chroma(persist_directory="output/general_knowledge", embedding_function=embed_model)
    retriever = vectorstore.as_retriever()
    context = format_docs(retriever.invoke(message))

    answer_chain = RAG_PROMPT | rag_llm | StrOutputParser()
    response = answer_chain.invoke({"context": context, "input": message})

    chat_history.append((message, response))
    return "", chat_history


In [18]:
import gradio as gr
import groq_gradio

# Chat UI: a PDF upload control wired to add_docs, plus a textbox/chatbot pair
# wired to answer_query.
# NOTE(review): no `demo.launch()` is visible in this part of the notebook --
# confirm the app is launched elsewhere.
with gr.Blocks() as demo:
    gr.HTML("<h1 align = 'center'>Smart Assistant</h1>")

    with gr.Row():

        upload_files = gr.File(label = 'Upload a PDF',file_types=['.pdf'],file_count='single')

    chatbot = gr.Chatbot()
    msg = gr.Textbox(label = "Enter your question here")
    # On upload, pass the uploaded file to add_docs; no output component is updated.
    upload_files.upload(add_docs,upload_files)
    # On submit, answer_query receives (msg, chatbot) and its two return values
    # are written back to (msg, chatbot).
    msg.submit(answer_query,[msg,chatbot],[msg,chatbot])