# Installing the Essential Libraries

In [1]:
!pip install PyPDF2 langchain groq transformers faiss-cpu > /dev/null
!pip install -U langchain-community > /dev/null
!pip install -U langchain-huggingface > /dev/null

# Importing Libraries

In [2]:
import os
import PyPDF2
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from google.colab import files
from transformers import AutoTokenizer, AutoModel
import torch
from pprint import pprint
from IPython.display import Markdown as md

In [3]:
# Never store the API key in the notebook itself: anyone who can read the file
# (or its version history) can use it. Any key that was ever committed in this
# cell must be treated as leaked and revoked.
# The key is taken from the GROQ_API_KEY environment variable when present,
# otherwise it is requested through a hidden interactive prompt.
from getpass import getpass

groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ")
os.environ["GROQ_API_KEY"] = groq_api_key

In [4]:
# Ask the user to upload the handbook PDF through google.colab.
# A later cell takes the first key of `uploaded` as the path of the PDF.
from google.colab import files
uploaded = files.upload()

Saving IITM BS Degree Programme - Student Handbook.pdf to IITM BS Degree Programme - Student Handbook.pdf


# Converting PDF to Text

In [5]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines. A page
        that yields no text contributes an empty string.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ""` is a defensive guard for pages whose extraction yields nothing,
        # so a single image-only page cannot abort the whole conversion.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; this also avoids repeated string concatenation.
    return "\n".join(page_texts)

In [6]:
# Use the first uploaded file as the handbook PDF and extract its text.
# Fail with a clear message when the upload was cancelled or left empty,
# instead of an IndexError on an empty key list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a PDF.")
pdf_path = next(iter(uploaded))
print(f"pdf_path: {pdf_path}")
pdf_text = pdf_to_text(pdf_path)

pdf_path: IITM BS Degree Programme - Student Handbook.pdf


# Converting the Entire Text to Chunks

In [7]:
import re
def split_text_into_sentences(text):
    """Break `text` into sentences and normalise the whitespace inside each one.

    A sentence boundary is any run of whitespace that directly follows
    '.', '?' or '!'. Inside each sentence, newlines become spaces and every
    run of whitespace is collapsed to a single space.

    Returns:
        The cleaned sentences, in their original order.
    """
    pieces = re.split(r'(?<=[.?!])\s+', text)
    return [
        re.sub(r'\s+', ' ', piece.replace("\n", " "))
        for piece in pieces
    ]

In [9]:
def chunk_text_with_some_overlap(text, chunk_size=5, overlap=2):
    """Split `text` into word-based chunks where consecutive chunks share words.

    The text is normalised with `split_text_into_sentences`, tokenised on
    whitespace, and then cut with a sliding window: each chunk holds up to
    `chunk_size` words and starts `chunk_size - overlap` words after the
    previous one, so neighbouring chunks share `overlap` words.

    Args:
        text: The text to chunk.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of chunk strings (words joined by single spaces). Every word of
        the input appears in at least one chunk; the final chunk may be shorter
        than `chunk_size`. Empty input yields an empty list.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    clean_sentences = split_text_into_sentences(text)
    words = " ".join(clean_sentences).split()

    step = chunk_size - overlap
    final_chunks = []
    for start in range(0, len(words), step):
        final_chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the end of the text: later windows would
        # only repeat words that are already covered. Emitting this last window
        # (instead of dropping it) is what keeps the trailing words.
        if start + chunk_size >= len(words):
            break

    return final_chunks

In [10]:
# Chunk the handbook text with chunk_size=300 and overlap=60 (both counted in words).
chunks = chunk_text_with_some_overlap(pdf_text, chunk_size=300, overlap=60)

# Spot-check three chunks (indices 7-9), each followed by a separator line.
preview_chunks = chunks[7:10]
for preview in preview_chunks:
    print(preview, end="\n********\n")

courses can be credited within the 142 credits required for the BS degree. The Minor can be earned only with the BS degree and not the Bsc degree. A separate document will be issued by IITM stating that the Minor has been completed. There will be no change in transcript or the degree certificate for those who do the minor. 4. Fees for the entire programme ● First year fees kept low to enable learners to try out the programme ● Entry fee for Qualifier exam: Rs. 3000 (non-refundable) - with suitable waivers ● Fee waivers for learners belonging to certain categories and economic backgrounds Number of credits in each level: Level Theory Project Apprenticeship NPTEL MG/HS/HM Foundation 32 Diploma in DS 23 4 Diploma in P 23 4 BSc 28-X X (0-4) BS 28-A-B A (0/4/12) B=4 *Only maximum of 24 credits can be transferred towards non counted CGP A course Foundation: Rs 32000/- Diploma Level: 62500 x 2 = Rs 125,000/- BSc Level: Rs 2.21L - 2.47L BS Level: 3.25L - 3.87L Cost per credit (Rs) Number of cr

In [11]:
# Number of chunks that will be embedded and indexed.
len(chunks)

54

In [12]:
# Embedding model used to vectorise the chunks for the FAISS vector store.
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Creating the Vector Store from the Text Chunks Using Hugging Face Embeddings

In [13]:
# Build the FAISS index from the chunk strings, embedding each with `hf_embeddings`.
# `query_model_with_rag` searches this store for the chunks closest to a question.
vector_store = FAISS.from_texts(chunks, hf_embeddings)

In [14]:
# Instructions placed at the start of every prompt built by `query_model_with_rag`:
# they tell the model to answer only from the handbook content it is given.
system_prompt = """
You are an AI assistant trained to answer questions based on the content of the PDF document provided, which is a student handbook.
The document is highly detailed and contains knowledge about the entire IIT Madras BS Degree.
When asked a question, you should refer to the document and provide the most accurate and relevant answer based on the information in the document.
Answer only based on the content from the document and do not make up any information.
"""


# Preparing the Model with RAG

In [15]:
from groq import Groq

# Groq API client shared by every call to `query_model_with_rag`.
client = Groq(api_key=groq_api_key)

def query_model_with_rag(query, k=3, model="llama-3.3-70b-versatile"):
    """Answer `query` with the LLM, using the most similar handbook chunks as context.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve from `vector_store` as context.
        model: Name of the Groq chat model to call.

    Returns:
        The model's answer with surrounding whitespace removed
        (an empty string if the completion carries no content).

    Raises:
        ValueError: If `query` is empty or contains only whitespace.
    """
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")

    relevant_chunks = vector_store.similarity_search(query, k=k)
    context = "\n".join(doc.page_content for doc in relevant_chunks)

    # Keep the instructions in the system role and put only the retrieved
    # context and the question in the user turn, instead of concatenating
    # everything into a single user message.
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        stream=False,
    )

    # Guard against a completion without content so `.strip()` cannot fail.
    answer = chat_completion.choices[0].message.content
    return (answer or "").strip()

In [16]:
from pprint import pprint

# Evaluation

In [17]:
# Evaluation: total credit requirement; the answer is printed as plain text.
query = "What is the total amount of credits required to pass the BS Degree?"
answer = query_model_with_rag(query)
print(answer)

The total amount of credits required to pass the BS Degree is 142.


In [18]:
# Evaluation: fee lookup; the answer is printed as plain text.
query = "What is the fee of level 3 courses?"
answer = query_model_with_rag(query)
print(answer)

The fee for the level 3 courses is Rs 2500/credit.


In [20]:
# Evaluation: requirements for a minor; the answer is rendered as Markdown.
query = "How can I get minor in Economics and Finance"
answer = query_model_with_rag(query)
md(answer)

To get a minor in Economics and Finance, you need to complete 2 mandatory courses: Corporate Finance and Managerial Economics, and one elective course from the list provided. The elective course options currently include Game Theory and Strategy, and more courses will be added shortly. These 3 courses can be credited within the 142 credits required for the BS degree. Note that the Minor can be earned only with the BS degree and not the BSc degree, and a separate document will be issued by IITM stating that the Minor has been completed.

In [21]:
# Evaluation: validity period of the qualifier score; the answer is rendered as Markdown.
query = "What is the validity of the qualifier score? By when can I used it?"
answer = query_model_with_rag(query)
md(answer)

The qualifier marks will be valid for the 3 terms that come subsequent to the qualifier exam date for the learner to register to the Foundation level. This score will be invalid after this period and the learner will have to go through the entire qualifier process again if they wish to join the programme. For students in std XII who take the qualifier exam, the validity is for 3 terms from when they pass std XII.

In [22]:
# Evaluation: a broad question about the learner life cycle; the answer is rendered as Markdown.
query = "What is the learner life cycle?"
answer = query_model_with_rag(query)
md(answer)

The text does not provide a detailed definition of the "Learner Life Cycle" in this specific section. However, it is mentioned in section 11, which is not provided in the given context. Therefore, I cannot provide a detailed answer based on the given information.

In [23]:
# Evaluation: assignment submission and grading; the answer is rendered as Markdown.
query = "How will the assignments have to be submitted and how will the be graded?"
answer = query_model_with_rag(query)
md(answer)

According to the document, the assignments will have to be submitted online within the due date specified. The grading of assignments will be as follows: 

- A 12-week course will have one or more weekly assignments.
- The average score of the best 5 out of the first 7 weekly assignments should be >= 40/100 to be eligible to write the final exam.
- The score in any unattempted assignment will be counted as 0.
- The assignments will contribute to the Final score and course grade.

It's also mentioned that the details of the grading may change from course to course, and students should check the grading document for actual details.

In [24]:
# Evaluation: per-course passing criteria; the answer is rendered as Markdown.
query = "Explain to me the entire passing criteria for each course?"
answer = query_model_with_rag(query)
md(answer)

According to the document, the passing criteria for each course is as follows:

1. A candidate is deemed to have passed a course if the Total Course Score (T) is greater than or equal to 40/100.
2. The Total Course Score (T) is the sum of all assessment components in every course as defined in the Grading document of that term.
3. The absolute grading system is followed, which is as follows:
   - T ≥ 90: Pass with Grade S and 10 grade points
   - 90 > T ≥ 80: Pass with Grade A and 9 grade points
   - 80 > T ≥ 70: Pass with Grade B and 8 grade points
   - 70 > T ≥ 60: Pass with Grade C and 7 grade points
   - 60 > T ≥ 50: Pass with Grade D and 6 grade points
   - 50 > T ≥ 40: Pass with Grade E and 4 grade points
   - T < 40: Fail with Grade U and 0 grade points

Additionally, there are specific conditions for failing, such as:
- Weekly average assignment score < 40/100 or 0 quizzes attended: Fail with Grade WA/WQ and 0 grade points

Note that the relaxations in pass criteria are only applicable for the qualifier process and not for the courses within the program.