# Warnings

In [None]:
import warnings, logging
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# Only ERROR-level (and above) records from the "transformers" logger are emitted.
logging.getLogger("transformers").setLevel(logging.ERROR)


# 1) Install and Import

In [None]:
# # 🔹 Setup environment
!pip install -q -U "numpy<2.0" "transformers==4.44.2" "torch" "accelerate" "peft" \
"pandas" "tqdm" "PyMuPDF==1.22.5" "ftfy" "evaluate" "rouge_score"


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

In [None]:
# Remove whatever numpy/pandas builds are installed, then reinstall them at
# fixed versions (numpy 1.26.4, pandas 2.2.2).
!pip uninstall -y numpy pandas
!pip install -U numpy==1.26.4 pandas==2.2.2


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.3.3
Uninstalling pandas-2.3.3:
  Successfully uninstalled pandas-2.3.3
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Using cached pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Installing collected packages: numpy, pandas
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
opencv-pyth

In [None]:
# Imports
import os
import fitz   # PyMuPDF
import pandas as pd
from tqdm import tqdm
import ftfy


# 2) Mount Google Drive and set folder path

In [None]:
# Cell 2 — connect Google Drive and locate the course folder
from google.colab import drive

# Path to your folder (change if it's different)
DRIVE_FOLDER = '/content/drive/MyDrive/GNR649'

# Make Drive available under /content/drive
drive.mount('/content/drive')

# Stop here if the folder is missing, so later cells never run on a bad path
assert os.path.exists(DRIVE_FOLDER), f"Folder not found: {DRIVE_FOLDER}. Please check path in Drive."
print("Found folder:", DRIVE_FOLDER)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found folder: /content/drive/MyDrive/GNR649


# 3) Helper: extract text from PDF and .txt

In [None]:
# Cell 3 — text extraction helpers
def extract_text_from_pdf(path):
    """Return the plain text of every page of the PDF at `path`, joined by newlines."""
    with fitz.open(path) as pdf:
        # One get_text("text") call per page, in document order
        page_texts = [pg.get_text("text") for pg in pdf]
    return "\n".join(page_texts)

def extract_text_from_txt(path):
    """Read the file at `path` as UTF-8 text, silently dropping undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

# Folders holding the training and test lecture files
TRAIN_FOLDER = '/content/drive/MyDrive/GNR649/train'
TEST_FOLDER  = '/content/drive/MyDrive/GNR649/test'


def _collect_documents(folder):
    """Recursively list the .pdf/.txt files under `folder`, in os.walk order."""
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(folder)
        for name in names
        if name.lower().endswith(('.pdf', '.txt'))
    ]


# Same extension filter for both folders
train_files = _collect_documents(TRAIN_FOLDER)
test_files = _collect_documents(TEST_FOLDER)

print(f"✅ Found {len(train_files)} training files and {len(test_files)} test files.")
print("Train sample:", train_files[:5])
print("Test sample:", test_files[:3])


✅ Found 36 training files and 3 test files.
Train sample: ['/content/drive/MyDrive/GNR649/train/GNR649_Lecture01_Introduction.pdf', '/content/drive/MyDrive/GNR649/train/GNR649_Lecture02_Mechanics.pdf', '/content/drive/MyDrive/GNR649/train/GNR649_Lecture03_Rocket_Science.pdf', '/content/drive/MyDrive/GNR649/train/GNR649_Lecture04_Earth_Exploration.pdf', '/content/drive/MyDrive/GNR649/train/GNR649_Lecture05_Solar_System_Exploration.pdf']
Test sample: ['/content/drive/MyDrive/GNR649/test/GNR649_Lecture37_Quasars_Galaxies.pdf', '/content/drive/MyDrive/GNR649/test/GNR649_Lecture38_Extra_terrestrial_Space_Laws.pdf', '/content/drive/MyDrive/GNR649/test/GNR649_Lecture39_Future.pdf']


# 4) Chunking function and extraction loop

In [None]:
# Cell 4 — chunking and gather all chunks
def chunk_text(text, max_chars=1200, overlap=200):
    """Split `text` into character windows that overlap their predecessor.

    Naive char-based chunking: each window is at most `max_chars` characters
    long and starts `max_chars - overlap` characters after the previous one.
    Windows are stripped of surrounding whitespace; empty ones are dropped.

    Args:
        text: the string to split.
        max_chars: window length in characters; must be positive.
        overlap: characters shared by consecutive windows; must be smaller
            than `max_chars`, otherwise the window could never advance.

    Returns:
        List of non-empty chunk strings (empty list for empty input).

    Raises:
        ValueError: if `max_chars <= 0` or `overlap >= max_chars`. These
            settings previously made the loop spin forever on long text.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if overlap >= max_chars:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    step = max_chars - overlap  # strictly positive thanks to the checks above
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        # The window already reached the end of the text: a further window
        # would only repeat the tail.
        if end >= total:
            break
    return chunks

# Read every training file, clean its text and split it into chunk records.
all_records = []
for fpath in tqdm(train_files, desc="Processing training files"):
    try:
        if fpath.lower().endswith('.pdf'):
            txt = extract_text_from_pdf(fpath)
        else:
            txt = extract_text_from_txt(fpath)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest
        print("Error reading", fpath, e)
        continue

    # 🔹 text cleanup: try to undo UTF-8 text that was mis-decoded as Latin-1
    try:
        txt = txt.encode('latin1').decode('utf-8')
    except UnicodeEncodeError:
        # Text has characters outside Latin-1, so it is not that kind of
        # mojibake; only drop code points that cannot be encoded as UTF-8.
        txt = txt.encode('utf-8', 'ignore').decode('utf-8')
    except UnicodeDecodeError:
        # Text fits in Latin-1 but its bytes are not valid UTF-8 (e.g. a
        # genuine "°" or "é"): it was decoded correctly, keep it unchanged.
        # Previously this case raised and aborted the whole loop.
        pass

    txt = ftfy.fix_text(txt)

    # basic clean: normalise line endings, trim each line, drop empty lines
    txt = txt.replace('\r', '\n')
    txt = "\n".join([line.strip() for line in txt.splitlines() if line.strip()])
    chunks = chunk_text(txt, max_chars=1200, overlap=300)
    for idx, c in enumerate(chunks):
        # "summary" starts empty and is filled in by a later step
        record = {"file": os.path.basename(fpath), "file_path": fpath, "chunk_id": idx, "text": c, "summary": ""}
        all_records.append(record)

len(all_records), "chunks created"


Processing training files: 100%|██████████| 36/36 [00:38<00:00,  1.07s/it]


(593, 'chunks created')

In [None]:
# Sanity check: show the first 1000 characters of the first chunk.
print(all_records[0]["text"][:1000])


Planetary Sciences:
Earth and Beyond
Lecture 1
Introduction
Planets and Planetary Systems
GNR 649
Logistics
• 75% attendance is mandatory
• Lecture slides will be uploaded on Moodle
• Assignment submissions will be conducted via Google Classroom
• Join the classroom using the link:
https://classroom.google.com/c/Njg3NDk1NzY3MDQ3?cjc=xsnftf7
• Or use the code: xsnftf7
• You will have about one week for assignment submission
• Contact me for any queries or doubts before the deadline
• In Emergency, you can directly call me over the cellphone
About me!
• Research Interests: Planetary Cryosphere, Mars Atmosphere, Mars
Dust/Ice, Radiative Transfer, Atmosphere Dynamics, Venus
Surface/Atmosphere, Planetary Sciences
• Email: deepak.singh@iitb.ac.in
Website
https://www.csre.iitb.ac.in/~deepak.singh/
Course structure
• Total lectures: 39
• Marks distribution (100)
• Assignments: 20 (Best of all)
• Mid-sem exam: 40
• End-sem exam : 40
Content
• Introduction to Planets and Planetary Systems; Princ

# 5) Sample ~20 chunks (or adjust to use more)

In [None]:
# Cell 5 — sample chunks for "gold" summaries
df_all = pd.DataFrame(all_records)
print("Total chunks available:", len(df_all))

# Choose how many excerpts to create gold summaries for:
N_SAMPLE = 20   # change if you want more

# Simple sampling strategy: pick evenly across files to get coverage
if len(df_all) <= N_SAMPLE:
    df_sample = df_all.copy()
else:
    n_files = max(1, df_all["file"].nunique())
    per_file = max(1, N_SAMPLE // n_files)

    # Iterating the groups (instead of groupby.apply) keeps the "file" column
    # and avoids the pandas deprecation warning. min() guards files that have
    # fewer chunks than the per-file quota.
    picked = pd.concat(
        [
            group.sample(n=min(per_file, len(group)), random_state=42)
            for _, group in df_all.groupby("file")
        ]
    )

    # With few files the per-file picks can total less than N_SAMPLE; top up
    # from the chunks not chosen yet so the final draw cannot fail.
    shortfall = N_SAMPLE - len(picked)
    if shortfall > 0:
        remaining = df_all.drop(picked.index)
        picked = pd.concat([picked, remaining.sample(n=shortfall, random_state=42)])

    df_sample = picked.sample(n=N_SAMPLE, random_state=42)  # ensure exact count

df_sample = df_sample.reset_index(drop=True)
print("Sampled chunks:", len(df_sample))
df_sample.head(2)


Total chunks available: 593
Sampled chunks: 20


  df_sample = df_all.groupby("file", group_keys=False).apply(lambda g: g.sample(max(1, int(N_SAMPLE / max(1, df_all['file'].nunique()))), random_state=42))


Unnamed: 0,file,file_path,chunk_id,text,summary
0,GNR649_Lecture36_Black_Holes.pdf,/content/drive/MyDrive/GNR649/train/GNR649_Lec...,0,Planetary Sciences:\nEarth and Beyond\nLecture...,
1,GNR649_Lecture14_Spectroscopy_Composition.pdf,/content/drive/MyDrive/GNR649/train/GNR649_Lec...,9,"nal, vibrational, and\nelectronic transitions\...",


# 6) Save CSV to Drive

In [None]:
# Cell 6 — write both tables to Drive (you'll edit the "summary" column of the sample)
OUT_CSV = '/content/drive/MyDrive/GNR649/gnr_summarize_dataset.csv'
OUT_ALL = '/content/drive/MyDrive/GNR649/gnr_all_chunks.csv'

# Full chunk table first, then the sampled subset; both without the index
# column and encoded as UTF-8 with a BOM ("utf-8-sig").
for frame, destination in ((df_all, OUT_ALL), (df_sample, OUT_CSV)):
    frame.to_csv(destination, index=False, encoding='utf-8-sig')

print("✅ Saved sampled dataset for gold summaries to:", OUT_CSV)
print("✅ Saved all chunks (backup) to:", OUT_ALL)


✅ Saved sampled dataset for gold summaries to: /content/drive/MyDrive/GNR649/gnr_summarize_dataset.csv
✅ Saved all chunks (backup) to: /content/drive/MyDrive/GNR649/gnr_all_chunks.csv


# 7) Fill in Your Gold Summaries (auto-generated with a pretrained model)

In [None]:
# Cell 8 — Auto-generate summaries using a pretrained model
!pip install --quiet transformers accelerate sentencepiece

from transformers import pipeline
import pandas as pd

# Load your sampled dataset
# /content/drive/MyDrive/GNR649/gnr_summarize_dataset.csv
IN_CSV = '/content/drive/MyDrive/GNR649/gnr_summarize_dataset.csv'
OUT_CSV = '/content/drive/MyDrive/GNR649/GNR649_summarize_dataset_autofilled.csv'

df = pd.read_csv(IN_CSV)

# Initialize summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device_map="auto")

# Generate summaries (you can adjust max/min length)
summaries = []
for i, text in enumerate(df["text"]):
    try:
        summary = summarizer(
            text,
            max_length=120,  # adjust based on desired summary length
            min_length=40,
            do_sample=False
        )[0]["summary_text"]
    except Exception as e:
        print(f"Error on chunk {i}: {e}")
        summary = ""
    summaries.append(summary)

# Save back to Drive
df["summary"] = summaries
df.to_csv(OUT_CSV, index=False, encoding='utf-8-sig')

print(f"✅ Auto-summarized dataset saved to:\n{OUT_CSV}")


# Dummy bitsandbytes patch

In [None]:
# === Step 2: Register a stand-in "bitsandbytes" package ===
import sys, types


def _stub_module(qualified_name):
    """Create an empty module object carrying a placeholder __spec__."""
    module = types.ModuleType(qualified_name)
    module.__spec__ = types.SimpleNamespace()
    return module


# Top-level stub plus the two submodules, attached as attributes
bnb = _stub_module("bitsandbytes")
bnb.nn = _stub_module("bitsandbytes.nn")
bnb.optim = _stub_module("bitsandbytes.optim")

# Fake Linear4bit attribute so PEFT passes its check
bnb.nn.Linear4bit = object

# Make `import bitsandbytes[.nn|.optim]` resolve to the stubs
sys.modules.update(
    {
        "bitsandbytes": bnb,
        "bitsandbytes.nn": bnb.nn,
        "bitsandbytes.optim": bnb.optim,
    }
)


# Load Model and Configure LoRA

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model

model_name = "google/flan-t5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base seq2seq model in float32, then move it to the chosen device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float32)
model = model.to(device)

# Rank-8 LoRA adapters on the modules named "q" and "v"
lora_settings = dict(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model and report how many parameters are trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


# Tokenize and train

In [None]:
from datasets import load_dataset

# Replace with your autofilled CSV
data_path = "/content/drive/MyDrive/GNR649/GNR649_summarize_dataset_autofilled.csv"

# The CSV rows are loaded under the "train" split; cut that 80/20 with a fixed seed
dataset = load_dataset("csv", data_files=data_path)["train"].train_test_split(test_size=0.2, seed=42)
train_ds, val_ds = dataset["train"], dataset["test"]

print(train_ds.shape, val_ds.shape)


Generating train split: 0 examples [00:00, ? examples/s]

(16, 5) (4, 5)


In [None]:
from datasets import Dataset


In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# === CONFIG ===
MAX_LEN_INPUT = 512    # max tokens kept from each source chunk
MAX_LEN_OUTPUT = 128   # max tokens kept from each target summary

def preprocess(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: batch mapping with "text" (inputs) and "summary" (targets),
            each a list of strings.

    Returns:
        The tokenized inputs, padded/truncated to MAX_LEN_INPUT, with a
        "labels" entry holding the target token ids padded/truncated to
        MAX_LEN_OUTPUT, where padding positions are replaced by -100.
    """
    # Tokenize input
    model_inputs = tokenizer(
        examples["text"],
        max_length=MAX_LEN_INPUT,
        truncation=True,
        padding="max_length"
    )

    # Tokenize target (summary). `text_target=` is the supported replacement
    # for the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_LEN_OUTPUT,
        truncation=True,
        padding="max_length"
    )["input_ids"]

    # Replace padding token IDs with -100 → ignored in loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in seq]
        for seq in labels
    ]
    return model_inputs


# === CLEAN + TOKENIZE ===
# Drop rows whose text or summary is missing/empty before tokenizing
train_ds = train_ds.filter(lambda e: e["text"] and e["summary"])
val_ds = val_ds.filter(lambda e: e["text"] and e["summary"])

tokenized_train = train_ds.map(preprocess, batched=True)
tokenized_val = val_ds.map(preprocess, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# === TRAINING ARGS ===
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/GNR649/lora_summarizer_output",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # one optimizer step per 4 batches of 2
    learning_rate=1e-4,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=False,                      # train in full precision (no mixed precision)
    predict_with_generate=True,
    logging_dir="./logs",
    # Log every optimizer step: with a dataset this small the whole run is
    # only a handful of steps, so logging_steps=10 never fired and the
    # training-loss column showed "No log" for every epoch.
    logging_steps=1,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)

# === TRAINER ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# === TRAIN ===
trainer.train()


Filter:   0%|          | 0/16 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]



Map:   0%|          | 0/4 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.848744
2,No log,0.847209
3,No log,0.846583


TrainOutput(global_step=6, training_loss=0.8776093324025472, metrics={'train_runtime': 13.5337, 'train_samples_per_second': 3.547, 'train_steps_per_second': 0.443, 'total_flos': 32998812549120.0, 'train_loss': 0.8776093324025472, 'epoch': 3.0})

# Save weights

In [None]:
# Write the model and the tokenizer into the same Drive folder
final_dir = "/content/drive/MyDrive/GNR649/lora_summarizer_final"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)


('/content/drive/MyDrive/GNR649/lora_summarizer_final/tokenizer_config.json',
 '/content/drive/MyDrive/GNR649/lora_summarizer_final/special_tokens_map.json',
 '/content/drive/MyDrive/GNR649/lora_summarizer_final/spiece.model',
 '/content/drive/MyDrive/GNR649/lora_summarizer_final/added_tokens.json',
 '/content/drive/MyDrive/GNR649/lora_summarizer_final/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch

base_model = "google/flan-t5-base"
adapter_path = "/content/drive/MyDrive/GNR649/lora_summarizer_final"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Base weights first, then the saved adapter on top; everything stays on CPU
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(base_model),
    adapter_path,
).to("cpu")

text = """Explain the concept of eccentricity and prograde motion of planets."""
inputs = tokenizer(text, return_tensors="pt").to("cpu")

# Generate without tracking gradients and print the first decoded sequence
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=120, min_length=40)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


planetary motion is the motion of a planet in the rotation of the sun. planetary motion is the motion of a planet in the rotation of the sun. planetary motion is the motion of a planet in the rotation of the sun.


# RAG pipeline

In [None]:
# !pip install langchain chromadb sentence-transformers -q
# !pip install -U langchain-community -q
# !pip uninstall -y langchain langchain-core langchain-community langchain-text-splitters chromadb langchain-chroma
# !pip install "langchain==0.2.16" "langchain-community==0.2.16" "langchain-chroma" "chromadb==0.5.5" "sentence-transformers" -q


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.2.16 requires langchain-core<0.3.0,>=0.2.38, but you have langchain-core 1.0.2 which is incompatible.
langchain 0.2.16 requires langchain-text-splitters<0.3.0,>=0.2.0, but you have langchain-text-splitters 1.0.0 which is incompatible.
langchain 0.2.16 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.4.39 which is incompatible.
langchain-chroma 0.1.2 requires langchain-core<0.3,>=0.1.40, but you have langchain-core 1.0.2 which is incompatible.[0m[31m
[0mFound existing installation: langchain 0.2.16
Uninstalling langchain-0.2.16:
  Successfully uninstalled langchain-0.2.16
Found existing installation: langchain-core 1.0.2
Uninstalling langchain-core-1.0.2:
  Successfully uninstalled langchain-core-1.0.2
Found existing installation: langchain-community 0.4.1
Uninstalling langchain-commu

In [None]:
# Step 4: Retrieval-Augmented Generation (RAG) Setup
!pip install langchain chromadb sentence-transformers -q
!pip install -U langchain-community -q

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Try importing from new community namespace; fallback if not available
try:
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import SentenceTransformerEmbeddings
except ImportError:
    from langchain.vectorstores import Chroma
    from langchain.embeddings import SentenceTransformerEmbeddings

from langchain.docstore.document import Document
import os

# === Load Dataset ===
csv_path = "/content/drive/MyDrive/GNR649/gnr_all_chunks.csv"
df = pd.read_csv(csv_path)
print("✅ Loaded dataset shape:", df.shape)
print(df.head())

# === Convert chunks into LangChain Documents ===
docs = [Document(page_content=row["text"]) for _, row in df.iterrows()]

# === Create Embedding Model ===
embedding_model = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# === Persistent Chroma Vector Store ===
persist_dir = "/content/drive/MyDrive/GNR649/chroma_db"
os.makedirs(persist_dir, exist_ok=True)

db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory=persist_dir
)

# Save to disk
db.persist()
retriever = db.as_retriever(search_kwargs={"k": 3})

print(f"✅ RAG setup complete — database saved at: {persist_dir}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/449.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.8/449.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-classic 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-classic 1.0.0 requires langchain-text-splitters<2.0.0,>=1.0.0, but you have langchain-text-splitters 0.3.11 which is incompatible.
langchain-community 0.4.1 requires langchain-core<2.0.0,>=1.0.1, but you have langchain-core 0.3.79 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
la

  embedding_model = SentenceTransformerEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ RAG setup complete — database saved at: /content/drive/MyDrive/GNR649/chroma_db


  db.persist()


In [None]:
# # load the db
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import SentenceTransformerEmbeddings

# embedding_model = SentenceTransformerEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2"
# )

# persist_dir = "/content/drive/MyDrive/GNR649/chroma_db"

# # Load existing DB
# db = Chroma(persist_directory=persist_dir, embedding_function=embedding_model)

# retriever = db.as_retriever(search_kwargs={"k": 3})
# print("✅ Loaded existing Chroma DB.")


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


KeyError: '_type'

In [None]:
# Step 4.2: Test Chroma Retrieval

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
import os

# Location of the persistent database and the embedding model it was built with
persist_dir = "/content/drive/MyDrive/GNR649/chroma_db"
embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Reopen the saved Chroma database
db = Chroma(persist_directory=persist_dir, embedding_function=embedding_model)

# Quick retrieval test: top 3 chunks with their scores
query = "Explain planetary motion"
results = db.similarity_search_with_score(query, k=3)

print("🔍 Retrieved top 3 relevant chunks:\n")
for rank, (doc, score) in enumerate(results, start=1):
    preview = doc.page_content[:250]
    print(f"[{rank}] (score={score:.4f}) {preview}...\n")


  db = Chroma(


🔍 Retrieved top 3 relevant chunks:

[1] (score=0.7032) Planetary Sciences:
Earth and Beyond
Lecture 2
Celestial Mechanics
Shape of Planetary bodies
GNR 649
Introduction
• Celestial mechanics is the branch of astronomy that deals with the motions
of objects in space.
• Orbital mechanics, also called fligh...

[2] (score=0.7032) Planetary Sciences:
Earth and Beyond
Lecture 2
Celestial Mechanics
Shape of Planetary bodies
GNR 649
Introduction
• Celestial mechanics is the branch of astronomy that deals with the motions
of objects in space.
• Orbital mechanics, also called fligh...

[3] (score=0.7032) Planetary Sciences:
Earth and Beyond
Lecture 2
Celestial Mechanics
Shape of Planetary bodies
GNR 649
Introduction
• Celestial mechanics is the branch of astronomy that deals with the motions
of objects in space.
• Orbital mechanics, also called fligh...



In [None]:
# Step 4.3: Load Summarization Model (fine-tuned or base)

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# ✅ Path to your fine-tuned LoRA model in Drive
model_path = "/content/drive/MyDrive/GNR649/lora_summarizer_final"

# Try loading fine-tuned model; fallback to flan-t5-base
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    print("✅ Loaded fine-tuned summarizer from", model_path)
except Exception as e:
    # Report why loading failed: a silent fallback would make the base model
    # look like the fine-tuned one in every later cell.
    print(f"⚠️ Fine-tuned model could not be loaded ({type(e).__name__}: {e}); using flan-t5-base as fallback.")
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Move to GPU if available. Assigning the result keeps the cell from echoing
# the full module repr as its output.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)



✅ Loaded fine-tuned summarizer from /content/drive/MyDrive/GNR649/lora_summarizer_final


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): lora.Linear(
                (base_layer): Linear(in_features=768, out_features=768, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=768, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k): Linear(in_featur

In [None]:
# Step 4.4: Agentic RAG Summarization Demo

def agentic_lecture_summarizer(query: str, max_tokens: int = 200):
    """Answer `query` from the lecture chunks: retrieve, build a prompt, generate.

    Uses the notebook-level `db`, `tokenizer`, `model` and `device`.

    Args:
        query: question used for retrieval and embedded in the prompt.
        max_tokens: upper bound on the number of newly generated tokens.

    Returns:
        The decoded model output as a string (it is also printed).
    """
    print(f"\n🤔 [Reasoning] User asked: {query}")
    print("🧩 [Plan] Retrieve relevant chunks → combine → summarize\n")

    # === Step 1: Retrieve ===
    results = db.similarity_search_with_score(query, k=3)
    top_docs = [doc.page_content for doc, _ in results]
    context = "\n".join(top_docs)

    # === Step 2: Summarize ===
    prompt = f"Simplify and summarize in your own words:\n\nQuestion: {query}\n\nContext:\n{context}\n\nAnswer in simple and clear language."

    # The prompt is truncated to 512 tokens, so with a long context its tail
    # (later chunks and the closing instruction) may be cut off.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        # `temperature` was dropped: sampling is not enabled in this call, so
        # the value had no effect on the output and only caused a warning.
        outputs = model.generate(**inputs, max_new_tokens=max_tokens)

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("🧠 [Summary Output]:\n")
    print(summary)
    return summary


In [None]:
# === DEMO RUN ===
# Retrieve the top chunks for one question and print the generated summary;
# the returned string is also shown as the cell output.
agentic_lecture_summarizer("which planets tilt is maximum")


🤔 [Reasoning] User asked: which planets tilt is maximum
🧩 [Plan] Retrieve relevant chunks → combine → summarize

🧠 [Summary Output]:

Uranus year, the polar regions of Uranus receive a greater energy input from the Sun than its equatorial regions


'Uranus year, the polar regions of Uranus receive a greater energy input from the Sun than its equatorial regions'

# Evaluation

In [None]:
# Step 5.1: Load Base and Fine-tuned Summarizers

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Paths
# finetuned_path = "/content/personal_style_summarizer"
finetuned_path = "/content/drive/MyDrive/GNR649/lora_summarizer_final"
base_model_name = "google/flan-t5-base"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fine-tuned
try:
    tokenizer_ft = AutoTokenizer.from_pretrained(finetuned_path)
    model_ft = AutoModelForSeq2SeqLM.from_pretrained(finetuned_path).to(device)
    print("✅ Loaded fine-tuned model.")
except Exception as e:
    # Catch only ordinary errors (a bare `except:` also swallowed
    # KeyboardInterrupt) and say why the fallback happened. With the
    # fallback, the comparison below is base-vs-base.
    tokenizer_ft = AutoTokenizer.from_pretrained(base_model_name)
    model_ft = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
    print(f"⚠️ Fine-tuned model could not be loaded ({type(e).__name__}: {e}); using base as fallback.")

# Base model (for comparison)
tokenizer_base = AutoTokenizer.from_pretrained(base_model_name)
model_base = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
print("✅ Loaded base model.")


✅ Loaded fine-tuned model.




✅ Loaded base model.


In [None]:
# Step 5.2: Evaluate on 3 sample lecture topics

# Queries used to pull context from the vector store for the comparison
samples = ["Explain black holes", "What is sun", "Define planets"]

def generate_summary(model, tokenizer, text, max_new_tokens=150):
    """Generate a summary of `text` with the given model/tokenizer pair.

    The input is prefixed with "Summarize: ", truncated to 512 tokens and
    moved to the notebook-level `device`; the first generated sequence is
    decoded without special tokens and returned.
    """
    prompt = f"Summarize: {text}"
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)

    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=max_new_tokens)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def _compare_models(question):
    """Retrieve context for `question` and summarize it with both models."""
    # Retrieve relevant context from Chroma
    hits = db.similarity_search(question, k=3)
    merged_context = "\n".join(hit.page_content for hit in hits)
    return {
        "query": question,
        # Base model first, fine-tuned second, both on the same context
        "base_summary": generate_summary(model_base, tokenizer_base, merged_context),
        "fine_tuned_summary": generate_summary(model_ft, tokenizer_ft, merged_context),
    }


results = [_compare_models(question) for question in samples]

import pandas as pd
eval_df = pd.DataFrame(results)
print("✅ Generated summaries for comparison.")
eval_df


✅ Generated summaries for comparison.


Unnamed: 0,query,base_summary,fine_tuned_summary
0,Explain black holes,"Nothing, not even light, can escape from them.","Nothing, not even light, can escape from them."
1,What is sun,The Sun is a G-type main-sequence star (G2V) I...,The Sun is a G-type main-sequence star (G2V) I...
2,Define planets,"Pluto is a ""dwarf planet"" because it doesn't s...","Pluto is a ""dwarf planet"" because it doesn't s..."


In [None]:
# Step 5.3: Quantitative evaluation with ROUGE (lightweight version)
!pip install rouge-score --quiet

from rouge_score import rouge_scorer
import pandas as pd

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

for idx, row in eval_df.iterrows():
    fine = row["fine_tuned_summary"]
    base = row["base_summary"]

    scores = scorer.score(base, fine)
    print(f"\n🧠 Topic: {row['query']}")
    print("ROUGE Scores (Fine-tuned vs Base):")
    print(f"  ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
    print(f"  ROUGE-L: {scores['rougeL'].fmeasure:.4f}")



🧠 Topic: Explain black holes
ROUGE Scores (Fine-tuned vs Base):
  ROUGE-1: 1.0000
  ROUGE-L: 1.0000

🧠 Topic: What is sun
ROUGE Scores (Fine-tuned vs Base):
  ROUGE-1: 1.0000
  ROUGE-L: 1.0000

🧠 Topic: Define planets
ROUGE Scores (Fine-tuned vs Base):
  ROUGE-1: 1.0000
  ROUGE-L: 1.0000


In [None]:
from transformers import AutoModelForSeq2SeqLM

model_path = "/content/drive/MyDrive/GNR649/lora_summarizer_final"
base_path = "google/flan-t5-base"


def _count_parameters(network):
    """Total number of elements across all parameters of `network`."""
    return sum(p.numel() for p in network.parameters())


fine = AutoModelForSeq2SeqLM.from_pretrained(model_path)
base = AutoModelForSeq2SeqLM.from_pretrained(base_path)

# A larger count for the fine-tuned model indicates the LoRA weights are attached
print("Fine-tuned params:", _count_parameters(fine))
print("Base params:", _count_parameters(base))



Fine-tuned params: 248462592
Base params: 247577856


In [None]:
# User Query
#    ↓
# Reasoning Module (decides retrieval + summarization)
#    ↓
# Retriever (Chroma + MiniLM embeddings)
#    ↓
# Summarizer (LoRA-tuned FLAN-T5)
#    ↓
# Output (personalized notes)


# Interaction log

In [None]:
import datetime

def log_interaction(user_input, agent_output, note="",
                    log_path="/content/drive/MyDrive/GNR649/interaction_logs.txt"):
    """Append one interaction record to a plain-text log file.

    Args:
        user_input: the user's query.
        agent_output: the text produced for that query.
        note: optional free-form remark stored with the record.
        log_path: file to append to; defaults to the log on Drive.

    The file is opened in append mode with an explicit UTF-8 encoding, so
    non-ASCII text is written the same way whatever the platform default is.
    """
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_entry = f"""
=== Interaction Log ===
Time: {timestamp}
User: {user_input}
Agent: {agent_output}
Notes: {note}
=======================
"""
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(log_entry)

    print("📝 Log saved.")

# 🔹 Modify your summarizer to use it
def agentic_lecture_summarizer(query: str, max_tokens: int = 200):
    """Answer `query` from the lecture chunks and log the interaction.

    Uses the notebook-level `db`, `tokenizer`, `model` and `device`, and
    records every call through `log_interaction`.

    Args:
        query: question used for retrieval and embedded in the prompt.
        max_tokens: upper bound on the number of newly generated tokens.

    Returns:
        The decoded model output as a string (it is also printed).
    """
    print(f"\n🤔 [Reasoning] User asked: {query}")
    print("🧩 [Plan] Retrieve relevant chunks → combine → summarize\n")

    # === Step 1: Retrieve ===
    results = db.similarity_search_with_score(query, k=3)
    top_docs = [doc.page_content for doc, _ in results]
    context = "\n".join(top_docs)

    # === Step 2: Summarize ===
    prompt = f"Simplify and summarize in your own words:\n\nQuestion: {query}\n\nContext:\n{context}\n\nAnswer in simple and clear language."

    # The prompt is truncated to 512 tokens, so with a long context its tail
    # (later chunks and the closing instruction) may be cut off.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        # `temperature` was dropped: sampling is not enabled in this call, so
        # the value had no effect on the output and only caused a warning.
        outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # === Step 3: Log Interaction ===
    log_interaction(query, summary, note="Summarization test")

    print("🧠 [Summary Output]:\n")
    print(summary)
    return summary


In [None]:
# Example queries — uncomment a line to run it. Only the last call is active;
# its return value is shown as the cell output.
# agentic_lecture_summarizer("Explain black holes")
# agentic_lecture_summarizer("what is sun")
# agentic_lecture_summarizer("what is the temperature of the sun")
# agentic_lecture_summarizer("which planets tilt is maximum")
# agentic_lecture_summarizer("which planet has largest mountain")
agentic_lecture_summarizer("which planet has diamond rains")



🤔 [Reasoning] User asked: which planet has diamond rains
🧩 [Plan] Retrieve relevant chunks → combine → summarize

📝 Log saved.
🧠 [Summary Output]:

Neptune and Uranus


'Neptune and Uranus'