In [5]:
##Environment Setup

!pip install langchain faiss-cpu sentence-transformers transformers evaluate streamlit pyyaml langchain-community pdfminer.six
!apt install poppler-utils

Collecting pdfminer.six
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20251107
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [3]:
##Importing Libraries

import os, json, re
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from evaluate import load

In [9]:
##PDF Parsing & Text Cleaning

from pdfminer.high_level import extract_text
pdf_path = '/content/class8.pdf'
raw_text = extract_text(pdf_path)

# Clean text.
# The '2024-25' stamp is removed BEFORE whitespace is collapsed; doing it the
# other way round leaves a double space wherever the stamp stood between words.
text = re.sub(r'\s+', ' ', raw_text.replace('2024-25', '')).strip()

# Split into chapters using NCERT headers.
# The zero-width lookahead keeps each header at the start of its own piece.
# Pieces that are empty (e.g. when the text begins with a header) are dropped so
# they do not turn into empty chapter records downstream.
# NOTE(review): every occurrence of a header string splits the text, so headers
# that recur inside the body would produce extra pieces - verify on the real PDF.
chapters = [
    piece
    for piece in re.split(r'(?=CROP PRODUCTION AND MANAGEMENT|MICROORGANISMS|COAL AND PETROLEUM)', text)
    if piece.strip()
]

In [10]:
##Chapter Structuring

# The cleaning step collapses all whitespace (newlines included) into single
# spaces, so "first line of the chapter" is not available as a title. Instead,
# take the leading run of ALL-CAPS words (each at least two letters long, joined
# by spaces or light punctuation), e.g. "CROP PRODUCTION AND MANAGEMENT".
heading_pattern = re.compile(r"[A-Z]{2,}(?:[ :,&-]+[A-Z]{2,})*")

data = []
for ch in chapters:
    chapter_body = ch.strip()
    if not chapter_body:
        # Nothing to store for an empty split piece.
        continue
    heading = heading_pattern.match(chapter_body)
    if heading:
        title = heading.group(0)
    else:
        # Fallback for pieces without an upper-case heading (e.g. front matter):
        # first line of the piece, as before.
        title = chapter_body.split('\n')[0].strip()
    data.append({"chapter_title": title[:80], "text": chapter_body})

with open('class8_science.json', 'w') as f:
    json.dump(data, f, indent=2)

In [11]:
##Embedding Model Initialization

# Sentence-embedding model used to turn text chunks into vectors.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

def chunk_text(text, size=500, overlap=0):
    """Split ``text`` into consecutive character windows for retrieval.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < size``. The default of 0 yields
            back-to-back, non-overlapping chunks.

    Returns:
        A list of substrings in document order; the last one may be shorter
        than ``size``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is out of range.
    """
    if size <= 0:
        # A negative size would otherwise silently produce no chunks at all.
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")
    step = size - overlap
    return [text[start:start + size] for start in range(0, len(text), step)]

# One record per chunk, tagged with the title of the chapter it came from.
docs = [
    {"chapter": chapter["chapter_title"], "content": chunk}
    for chapter in data
    for chunk in chunk_text(chapter['text'])
]

# Encode all chunks in a single batched call instead of one model call per
# chunk. The result is unpacked back into a list with one vector per chunk, so
# `embeddings[i]` still corresponds to `docs[i]`.
embeddings = list(embedder.encode([d["content"] for d in docs])) if docs else []

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
##FAISS Vector Store Creation

from langchain.vectorstores import FAISS
import numpy as np
import faiss

# FAISS expects a contiguous 2-D float32 matrix with one row per vector.
embedding_matrix = np.ascontiguousarray(embeddings, dtype="float32")
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    # Fail with an explanation rather than an IndexError on embeddings[0].
    raise ValueError("No embeddings to index: check that the PDF produced text chunks.")

dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

In [None]:
##Llama-2 Model Setup

from huggingface_hub import login
import os

# SECURITY: never hard-code an access token in a notebook - it ends up in every
# copy, export and git history. The token is read from the HF_TOKEN environment
# variable, falling back to an interactive prompt that does not echo the input.
# Any token that was previously pasted into this cell must be treated as leaked
# and revoked at https://huggingface.co/settings/tokens.
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

# Chat-tuned Llama-2 checkpoint loaded from the Hugging Face Hub.
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Wrap the model in a text-generation pipeline (at most 256 new tokens per
# call) and expose it to LangChain.
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

In [None]:
##RAG Pipeline

# Build the LangChain vector store directly from the chunk texts.
# FAISS.load_local cannot be used here: no index was ever saved to disk, and it
# expects an Embeddings object rather than a list of pre-computed vectors.
# The embeddings wrapper names the same MiniLM model as `embedder`, so queries
# and chunks are encoded by the same model.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(
    texts=[d["content"] for d in docs],
    embedding=embedding_model,
    metadatas=[{"chapter": d["chapter"]} for d in docs],
)

# RetrievalQA needs a retriever, not the vector store itself.
retriever = vector_store.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

query = "Explain irrigation methods used in agriculture."
result = qa.run(query)
print("Query:", query)
print("Answer:", result)

In [None]:
##Evaluation (BLEU + ROUGE-L)

# Load the BLEU and ROUGE metric implementations through `evaluate.load`.
bleu, rouge = (load(metric_name) for metric_name in ("bleu", "rouge"))

def evaluate_response(predictions, references):
    """Score generated answers against reference answers.

    Both the module-level ``bleu`` and ``rouge`` metrics are computed on the
    same ``predictions`` / ``references`` arguments.

    Returns:
        A dict with two entries: ``"BLEU"`` (the ``"bleu"`` field of the BLEU
        result) and ``"ROUGE-L"`` (the ``"rougeL"`` field of the ROUGE result).
    """
    metric_inputs = {"predictions": predictions, "references": references}
    bleu_value = bleu.compute(**metric_inputs)["bleu"]
    rouge_l_value = rouge.compute(**metric_inputs)["rougeL"]
    return {"BLEU": bleu_value, "ROUGE-L": rouge_l_value}

# Example usage
example_scores = evaluate_response(
    ["Plants need water for photosynthesis."],
    ["Plants require water for making food using sunlight."],
)

# Persist the scores as Metric/Score rows. The visualization cell reads
# 'evaluation.csv' with exactly these two columns, and nothing else in the
# notebook writes that file.
# NOTE(review): this overwrites any existing 'evaluation.csv' in the working directory.
import csv

with open('evaluation.csv', 'w', newline='') as results_file:
    writer = csv.writer(results_file)
    writer.writerow(["Metric", "Score"])
    writer.writerows(example_scores.items())

# Keep the scores as the cell's displayed result.
example_scores

In [None]:
##Chapter Summarization + Topic Tagging

def summarize_chapter(ch_text, max_chars=1500):
    """Ask the QA chain for a 100-word summary and 5 key topics of a chapter.

    Args:
        ch_text: Full text of the chapter.
        max_chars: Number of leading characters of ``ch_text`` included in the
            prompt. Defaults to 1500, the value previously hard-coded.

    Returns:
        Whatever ``qa.run`` returns for the prompt (``qa`` is the module-level
        chain created in the RAG pipeline cell).
    """
    # Only the head of the chapter is sent, which keeps the prompt size bounded.
    excerpt = ch_text[:max_chars]
    prompt = f"Summarize the following chapter in 100 words and list 5 key topics:\n{excerpt}"
    return qa.run(prompt)

# Attach a generated summary and a list of capitalised phrases ("topics") to
# every chapter record in place, then write the enriched records back to disk.
topic_pattern = re.compile(r'[A-Z][a-z]+(?: [A-Z][a-z]+)*')

for chapter_record in data:
    chapter_summary = summarize_chapter(chapter_record['text'])
    chapter_record.update(
        summary=chapter_summary,
        topics=topic_pattern.findall(chapter_summary),
    )

with open('class8_science.json', 'w') as out_file:
    json.dump(data, out_file, indent=2)

In [None]:
##Visualization & Output

import pandas as pd
import matplotlib.pyplot as plt

# Read the stored metric scores; the file must provide 'Metric' and 'Score' columns.
df = pd.read_csv('evaluation.csv')

# Draw one bar per metric using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.bar(df['Metric'], df['Score'])
ax.set_title("AI Tutor Evaluation Metrics")
ax.set_xlabel("Metric")
ax.set_ylabel("Score")
plt.show()

In [None]:
##Conclusion

# This notebook demonstrates an end-to-end academic RAG pipeline
# for curriculum-based tutoring. It integrates document parsing,
# semantic retrieval, generative response synthesis, and evaluation
# into a unified framework deployable via Colab or Streamlit.