<a href="https://colab.research.google.com/github/SuhasiniSingh535/NCERTwise/blob/main/NCERTwisefinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

FALCON 7B INSTRUCT

In [None]:
# 🚀 Install core libraries
# Upgrade torch and torchvision, adding PyTorch's cu118 wheel index as an
# extra package source for pip.
# NOTE(review): a later cell uninstalls torch/torchvision and another installs
# them again from the same index — confirm whether this first install is needed.
!pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118


Installs all tools: PDF extraction, embeddings, vector search, LLM + 4-bit quantized training, dataset management.

In [None]:
# Model and retrieval stack: transformers, peft and bitsandbytes are imported
# below for the quantized LoRA model; sentence-transformers and faiss-cpu are
# imported below for retrieval.
# numpy is held below 2 and triton is pinned to 3.1.0; the rest are unpinned.
# NOTE(review): `datasets` is imported later but is not installed by any cell
# here — presumably it comes with the runtime image; confirm.
!pip install "numpy<2" transformers sentence-transformers peft faiss-cpu bitsandbytes triton==3.1.0



In [None]:
# Constrain huggingface-hub to at least 0.30.0 and below 1.0.
!pip install "huggingface-hub>=0.30.0,<1.0"

# 🧰 Other tools
# pymupdf is imported below for PDF text extraction.
!pip install pymupdf fsspec langchain accelerate


In [None]:

# Mount Google Drive so the source PDFs can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Create the working directories, copy every PDF from the Drive folder
# NCERT_PDFs into data/raw/, then list data/raw/ as a visual check.
!mkdir -p data/raw data/text
!cp /content/drive/MyDrive/NCERT_PDFs/*.pdf data/raw/
!ls data/raw/

In [None]:
# Remove the installed torch, torchvision and torchaudio packages (-y skips the
# confirmation prompt); the next cell installs torch and torchvision again.
!pip uninstall -y torch torchvision torchaudio


In [None]:
# Reinstall torch and torchvision together, with PyTorch's cu118 wheel index
# added as an extra package source. torchaudio is not reinstalled.
!pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118


In [None]:
# Smoke test for the reinstall: import both packages and report their versions.
import torch, torchvision
print("torch version:", torch.__version__)
print("torchvision version:", torchvision.__version__)
# Importing an op from torchvision.ops presumably checks that torchvision's
# compiled operators load against the installed torch build — TODO confirm intent.
from torchvision.ops import nms
print("✅ torch & torchvision loaded successfully!")


In [None]:
# cell 3
import os, pickle
import pymupdf as fitz
import faiss
from sentence_transformers import SentenceTransformer

# Extract text: write the text of every PDF in data/raw to a same-named .txt
# file in data/text, and keep an in-memory copy in `texts`.
os.makedirs("data/text", exist_ok=True)
texts = []
# sorted() makes the processing order independent of the filesystem's listing order.
for pdf in sorted(os.listdir("data/raw")):
    if pdf.endswith(".pdf"):
        # Context managers close the PDF and the output file as soon as each
        # one is done instead of leaving the handles to the garbage collector.
        with fitz.open(f"data/raw/{pdf}") as doc:
            text = "\n".join(page.get_text() for page in doc)
        texts.append({"source": pdf, "text": text})
        with open(f"data/text/{pdf[:-4]}.txt", "w", encoding="utf-8") as out_file:
            out_file.write(text)

# Create index: cut every text file into fixed 512-character chunks, embed the
# chunks, and store the vectors in a flat L2 FAISS index.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chunks = []
# Sorted so chunk positions (and therefore index ids) are reproducible across runs.
for fname in sorted(os.listdir("data/text")):
    with open(f"data/text/{fname}", "r", encoding="utf-8") as f:
        txt = f.read()
    for i in range(0, len(txt), 512):
        chunks.append({"text": txt[i:i+512]})

# Fail with a clear message instead of an IndexError on vecs[0] further down.
if not chunks:
    raise RuntimeError("No text chunks to index: data/text has no non-empty text files")

vecs = embedder.encode([c["text"] for c in chunks], show_progress_bar=True)
index = faiss.IndexFlatL2(len(vecs[0]))
index.add(vecs)

# Persist the index and the chunk list side by side; row i of the index
# corresponds to chunks[i].
os.makedirs("data/index", exist_ok=True)
faiss.write_index(index, "data/index/faiss.idx")
with open("data/index/chunks.pkl", "wb") as chunks_file:
    pickle.dump(chunks, chunks_file)
print(f"✅ Indexed {len(chunks)} text chunks")


In [None]:
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Tokenizer used to prepare the fine-tuning data. The EOS token is reused as
# the padding token (presumably because the tokenizer defines none — confirm).
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Read the training examples: one JSON object per line.
examples = []
with open("ncertpdfs (1).jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would reject with a JSONDecodeError.
        if line.strip():
            examples.append(json.loads(line))
print(f"✅ Loaded {len(examples)} examples")


In [None]:
# cell a
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit quantization config: NF4 quantization type with double quantization,
# and float16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load the quantized base model.
# Note the checkpoint repo differs from the one the tokenizer was loaded from
# ("tiiuae/falcon-7b-instruct").
base_model = AutoModelForCausalLM.from_pretrained(
    "alokabhishek/falcon-7b-instruct-bnb-4bit",
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True
)

# Resize the embedding matrix to the tokenizer's length so the two agree.
base_model.resize_token_embeddings(len(tokenizer))

# Set up LoRA: rank-8 adapters (alpha 8, dropout 0.1) on the modules named
# "query_key_value" and "dense"; bias parameters are left untouched.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8, lora_alpha=8, lora_dropout=0.1,
    target_modules=["query_key_value", "dense"],
    bias="none"
)
model = get_peft_model(base_model, lora_cfg)
# NOTE(review): this puts the model in eval mode although the following cells
# train it; Trainer presumably switches back to train mode itself — confirm
# this call is intentional.
# NOTE(review): no explicit k-bit training preparation step (e.g. PEFT's
# prepare_model_for_kbit_training) is applied before wrapping — confirm that
# is intended for this quantized checkpoint.
model.eval()
print("✅ Model setup complete (quantized + LoRA)")

In [None]:
# cell b
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
import torch

# Build a Dataset from the loaded JSONL records and hold out 10% of them for
# validation; the fixed seed keeps the split the same on every run.
ds = Dataset.from_list(examples)
split = ds.train_test_split(test_size=0.1, seed=42)
train_raw = split["train"]
val_raw = split["test"]

def tok_fn(ex):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Args:
        ex: a batch mapping with parallel "prompt" and "completion" sequences.

    Returns:
        The tokenizer output for the formatted texts (each padded or truncated
        to 256 tokens) with an added "labels" entry holding a copy of every
        "input_ids" sequence.
    """
    formatted = []
    for instruction, response in zip(ex["prompt"], ex["completion"]):
        formatted.append(f"Instruction: {instruction}\nResponse: {response}")

    encoded = tokenizer(
        formatted,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Labels start as an independent copy of the input ids.
    labels = []
    for token_ids in encoded["input_ids"]:
        labels.append(token_ids.copy())
    encoded["labels"] = labels
    return encoded

# Tokenize both splits in batches; the raw text columns are dropped so only
# the tokenizer outputs and labels remain.
train_ds = train_raw.map(tok_fn, batched=True, remove_columns=["prompt", "completion"])
val_ds = val_raw.map(tok_fn, batched=True, remove_columns=["prompt", "completion"])

# mlm=False selects causal (next-token) language modelling rather than masked LM.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="quant_falcon_lora",
    num_train_epochs=3,
    # Batch size 1 with 2 accumulation steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    # Log every 10 steps; logs are reported to Weights & Biases.
    logging_strategy="steps",
    logging_steps=10,
    # Checkpoint every 100 steps, keeping at most 3 checkpoints.
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    logging_dir="logs",
    report_to="wandb",
    # Evaluate on the validation split every 100 steps (same cadence as saving).
    eval_strategy="steps",
    eval_steps=100,
    # Mixed-precision fp16 only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # After training, reload the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_ds, eval_dataset=val_ds,
    data_collator=collator, tokenizer=tokenizer,
)

# Only the setup is done here; training itself is started by trainer.train()
# in the next cell.
print("✅ Training setup complete — now training on your full dataset.")


In [None]:
# Run the fine-tuning, then save the model via the trainer together with the
# tokenizer into one directory so they can be reloaded from the same path.
trainer.train()
trainer.save_model("quant_falcon_lora_new_30_june")
tokenizer.save_pretrained("quant_falcon_lora_new_30_june")
print("✅ Fine-tuning complete and saved to 'quant_falcon_lora_new_30_june'")


In [None]:
import pandas as pd, math, matplotlib.pyplot as plt
from google.colab import drive

# Turn the trainer's log history into a DataFrame and separate the rows that
# carry a training loss from the rows that carry an evaluation loss.
# NOTE(review): `history.eval_loss` fails with AttributeError if no evaluation
# was logged (the column would not exist) — confirm at least one eval step ran.
history = pd.DataFrame(trainer.state.log_history)
train_hist = history[history.loss.notna()]
eval_hist = history[history.eval_loss.notna()]

# Plot perplexity, computed as exp(loss), against the training step.
plt.plot(train_hist.step, train_hist.loss.apply(math.exp), label="Train")
plt.plot(eval_hist.step, eval_hist.eval_loss.apply(math.exp), label="Val")
plt.legend(); plt.xlabel("Step"); plt.ylabel("Perplexity"); plt.grid()
plt.show()




In [None]:
# Copy the trained model and tokenizer to Google Drive.
# `drive` is imported in the previous (plotting) cell, so that cell must have
# been run first. force_remount=True remounts Drive even if it is already mounted.
drive.mount('/content/drive', force_remount=True)
trainer.save_model("/content/drive/MyDrive/quant_falcon_lora_final")
tokenizer.save_pretrained("/content/drive/MyDrive/quant_falcon_lora_final")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Directory written by the training cell: it holds the LoRA adapter and the
# tokenizer that was saved with it.
peft_path = "quant_falcon_lora_new_30_june"
# The adapter config records which base model the adapter was trained on.
pc = PeftConfig.from_pretrained(peft_path)

# Same 4-bit NF4 settings as used for training, plus fp32 CPU offload enabled.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True
)

model = AutoModelForCausalLM.from_pretrained(
    pc.base_model_name_or_path, quantization_config=bnb_cfg, device_map="auto", trust_remote_code=True
)
# Load the tokenizer that was saved next to the adapter rather than the one in
# the base-model repo: training tokenized with a tokenizer from a different
# repo ("tiiuae/falcon-7b-instruct"), so this guarantees inference uses the
# exact same vocabulary and special tokens.
tokenizer = AutoTokenizer.from_pretrained(peft_path)
tokenizer.pad_token = tokenizer.eos_token

# Attach the trained LoRA weights to the quantized base model and switch to
# inference mode.
model = PeftModel.from_pretrained(model, peft_path)
model.eval()
print("✅ Quantized Falcon-LoRA loaded with offload")


In [None]:
import pickle, faiss
from sentence_transformers import SentenceTransformer

# Reload the retrieval stack: the same embedding model that built the index,
# the FAISS index itself, and the chunk list whose positions match index ids.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("data/index/faiss.idx")
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# chunks.pkl that this notebook produced.
# The context manager closes the file handle once loading is done.
with open("data/index/chunks.pkl", "rb") as chunks_file:
    chunks = pickle.load(chunks_file)

def retrieve_context(query, top_k=5):
    """Return the text of the indexed chunks closest to `query`.

    Args:
        query: the question or search string to embed.
        top_k: maximum number of chunks to return.

    Returns:
        A list of at most `top_k` chunk texts, nearest first. The list is
        shorter when the index holds fewer than `top_k` vectors, and empty
        when the index is empty or `top_k` is not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with id -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embedder.encode([query])
    _, neighbor_ids = index.search(q_emb, k)
    # Filter any remaining -1 placeholders defensively.
    return [chunks[i]["text"] for i in neighbor_ids[0] if i >= 0]

def rag_generate(query, top_k=3):
    """Answer `query` with the model, using retrieved chunks as context.

    Args:
        query: the user's question.
        top_k: number of context chunks to retrieve for the prompt.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens removed.
    """
    passages = retrieve_context(query, top_k)
    context_block = "\n\n".join(passages)
    prompt = "Context:\n" + context_block + f"\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    # Sampling-based decoding settings.
    generation_kwargs = dict(
        return_dict_in_generate=True,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        num_beams=1,
    )
    output = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens; keep only what follows.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output.sequences[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)





In [1]:
# Example query. The cell that defines rag_generate (and the cells it depends
# on) must have been run in the current kernel session first; otherwise this
# call fails with NameError, as in the recorded output below.
print(rag_generate("what do we mean by democracy?answer in approx 160 words "))


NameError: name 'rag_generate' is not defined

For using the trained and saved model: