# Installing the Essential Libraries

In [33]:
!pip install PyPDF2 langchain groq transformers faiss-cpu > /dev/null
!pip install -U langchain-community > /dev/null
!pip install -U langchain-huggingface > /dev/null

# Importing Libraries

In [34]:
import os
import PyPDF2
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from google.colab import files
from transformers import AutoTokenizer, AutoModel
import torch
from pprint import pprint
from IPython.display import Markdown as md

In [3]:
# Never hardcode the Groq API key in the notebook: anything committed or shared
# with the notebook leaks with it. Take it from the environment when it is already
# set, otherwise prompt for it without echoing the input.
from getpass import getpass

groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ")
os.environ["GROQ_API_KEY"] = groq_api_key

In [72]:
# Ask the user to upload the PDF through Colab. The keys of `uploaded` are the
# uploaded file names; the first one is used as the PDF path further down.
from google.colab import files
uploaded = files.upload()

Saving Student_Handbook_latest.pdf to Student_Handbook_latest.pdf


# Converting PDF to Text

In [73]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF and return it as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by newlines.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe if a page yields no text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page (which would corrupt word-based chunking).
    return "\n".join(page_texts)

In [9]:
# `uploaded` may be empty (e.g. no file was chosen in the upload cell); fail with a
# clear message instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a PDF.")
pdf_path = next(iter(uploaded))  # name of the first uploaded file
print(f"pdf_path: {pdf_path}")
pdf_text = pdf_to_text(pdf_path)

pdf_path: Student_Handbook_latest Extract[1-12].pdf


# Converting the Entire Text to Chunks

In [38]:
import re
def split_text_into_sentences(text):
    """Split text into whitespace-normalised sentences.

    A sentence boundary is any run of whitespace that directly follows
    `.`, `?` or `!`.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentences. Inside each sentence every run of
        whitespace (including newlines) is collapsed to a single space and
        leading/trailing whitespace is removed.
    """
    cleaned_sentences = []
    for sent in re.split(r'(?<=[.?!])\s+', text):  # split at sentence boundaries
        # `\s+` also matches newlines, so this both removes line breaks and
        # collapses repeated spaces.
        sent = re.sub(r'\s+', ' ', sent).strip()
        if sent:  # skip empty pieces, e.g. from trailing whitespace or empty input
            cleaned_sentences.append(sent)
    return cleaned_sentences

In [39]:
def chunk_text_with_some_overlap(text, chunk_size=5, overlap=2):
    """Split a long text into chunks of at most `chunk_size` words with `overlap`.

    A window of `chunk_size` words slides over the text in steps of
    `chunk_size - overlap` words, so each chunk starts with the last `overlap`
    words of the previous one. The final chunk may be shorter, and every word
    of the text ends up in at least one chunk.

    Args:
        text: The text to chunk.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            `0 <= overlap < chunk_size`.

    Returns:
        A list of chunks, each a string of space-separated words. Empty when
        `text` contains no words.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"Need chunk_size > 0 and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )

    # str.split() with no arguments splits on any run of whitespace, so newlines
    # and repeated spaces are normalised here without a separate cleaning pass.
    words = text.split()

    step = chunk_size - overlap
    final_chunks = []
    for start in range(0, len(words), step):
        final_chunks.append(' '.join(words[start:start + chunk_size]))
        # Stop once a window reaches the end of the text; a further window
        # would only repeat words that are already covered.
        if start + chunk_size >= len(words):
            break

    return final_chunks


In [75]:
# Chunk the handbook text (chunk_size=300 words, overlap=60 words) and print
# three of the chunks, each followed by a separator line, as a visual sanity check.
chunks = chunk_text_with_some_overlap(pdf_text, chunk_size=300, overlap=60)
for chunk in chunks[7:10]:
  print(chunk, end="\n********\n")

courses can be credited within the 142 credits required for the BS degree. The Minor can be earned only with the BS degree and not the Bsc degree. A separate document will be issued by IITM stating that the Minor has been completed. There will be no change in transcript or the degree certificate for those who do the minor. 4. Fees for the entire programme ● First year fees kept low to enable learners to try out the programme ● Entry fee for Qualifier exam: Rs. 3000 (non-refundable) - with suitable waivers ● Fee waivers for learners belonging to certain categories and economic backgrounds Number of credits in each level: Level Theory Project Apprenticeship NPTEL MG/HS/HM Foundation 32 Diploma in DS 23 4 Diploma in P 23 4 BSc 28-X X (0-4) BS 28-A-B A (0/4/12) B=4 *Only maximum of 24 credits can be transferred towards non counted CGP A course Foundation: Rs 32000/- Diploma Level: 62500 x 2 = Rs 125,000/- BSc Level: Rs 2.21L - 2.47L BS Level: 3.25L - 3.87L Cost per credit (Rs) Number of cr

In [60]:
# Number of chunks the text was split into.
len(chunks)

13

In [61]:
# Embedding model used to turn text into numerical vectors: the pre-trained
# "all-MiniLM-L6-v2" model, loaded through the Hugging Face integration.
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Creating the Vector Store from the Text Chunks Using Hugging Face Embeddings

In [62]:
# Build the FAISS index: `hf_embeddings` converts every chunk in `chunks` into an
# embedding vector, and the vectors are stored for similarity search.
vector_store = FAISS.from_texts(chunks, hf_embeddings)

In [63]:
# Instructions given to the model with every question: answer only from the
# handbook content and do not invent information.
system_prompt = """
You are an AI assistant trained to answer questions based on the content of the PDF document provided, which is a student handbook.
The document is highly detailed and contains knowledge about the entire IIT Madras BS Degree.
When asked a question, you should refer to the document and provide the most accurate and relevant answer based on the information in the document.
Answer only based on the content from the document and do not make up any information.
"""


# Preparing the Model with RAG

In [70]:
from groq import Groq

# Groq API client used for all chat completions below; authenticated with the
# key held in `groq_api_key`.
client = Groq(api_key=groq_api_key)

def query_model_with_rag(query, k=3):
    """Answer `query` with the Groq chat model, using chunks retrieved from the vector store as context.

    Args:
        query: The question to answer.
        k: Number of most similar chunks to retrieve as context (default 3).

    Returns:
        A tuple `(relevant_chunks, answer)`: the retrieved documents and the
        model's reply with surrounding whitespace stripped.
    """
    relevant_chunks = vector_store.similarity_search(query, k=k)
    # Concatenate the text of the retrieved chunks into a single context string.
    context = "\n".join(doc.page_content for doc in relevant_chunks)
    user_prompt = "Context:\n" + context + "\n\nQuestion: " + query + "\nAnswer:"
    chat_completion = client.chat.completions.create(
        messages=[
            # Behavioural instructions go in the system role; the retrieved
            # context and the question go in the user role.
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt},
        ],
        model="llama-3.3-70b-versatile",
        stream=False
    )
    # Fall back to an empty answer if the reply carries no content.
    answer = chat_completion.choices[0].message.content or ""
    return relevant_chunks, answer.strip()

In [65]:
from pprint import pprint
from IPython.display import Markdown as md

# Evaluation

In [76]:
# Single-fact lookup: the total number of credits for the BS degree.
query = "What is the total amount of credits required to pass the BS Degree?"
relevant_chunks, answer = query_model_with_rag(query)
print(answer)

According to the document, the total number of credits required to graduate with the BS degree is 142.


In [77]:
# Single-fact lookup: the fee for level 3 courses.
query = "What is the fee of level 3 courses?"
relevant_chunks, answer = query_model_with_rag(query)
print(answer)

The fee for level 3 courses is Rs 2500/credit.


In [78]:
# Broader question whose answer has to combine several facts; the reply is
# rendered as Markdown because the model formats it as a list.
query = "Explain the fee structure for each level of the program, including fee waivers and additional costs."
relevant_chunks, answer = query_model_with_rag(query)
md(answer)

The fee structure for the IIT Madras BS Degree program is as follows:

- For Level 3 courses, the fee is Rs 2500 per credit.
- For Level 4 courses, the fee is Rs 5000 per credit.
- For NPTEL courses, the fee is Rs 1000 per credit.

Fee waivers are available based on the category of learner and family income. The fee waiver structure is as follows:

- General category: No waiver
- EWS + Family Income > 1 LPA and <= 5 LPA: 50% waiver
- EWS + Family Income <= 1 LPA: 75% waiver
- OBC-NCL + Family Income > 1 LPA and <= 5 LPA: 50% waiver
- OBC-NCL + Family Income <= 1 LPA: 75% waiver
- SC/ST: 50% waiver for family income > 5 LPA, 75% waiver for family income <= 1 LPA
- PwD: 50% waiver for family income > 5 LPA, 75% waiver for family income <= 1 LPA

Additional costs include:

- Qualifier exam fee: Rs 3000 (with waivers for SC/ST and PwD with 40% disability backgrounds)
- SBI loan interest: starting from 8.10% with no collateral security and simple interest till course completion

Note: The fee structure and waivers are subject to change, and learners should check the official website for the most up-to-date information.

In [79]:
# Listing-style question about which courses include a project; rendered as Markdown.
query = "In which courses are project required?"
relevant_chunks, answer = query_model_with_rag(query)
md(answer)

According to the document, project courses are required in the following:

1. Diploma in Programming:
   - Project course in Application Development - 1
   - Project course in Application Development - 2

2. Diploma in Data Science:
   - Project course in Business Data Management
   - Project course in Machine Learning Practice

Additionally, in the BS Degree level, Apprenticeship has been split into 2 courses - one of 4 credits that corresponds to the 4 month internship and another as 8 credits for the term of apprenticeship that is continued from months 5-8.