In [1]:
!pip install -U pypdf2 bitsandbytes



In [2]:
import json
import re
from pathlib import Path

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from PyPDF2 import PdfReader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dependencies are installed in the first cell of the notebook.

# Hugging Face authentication is only needed for gated/private repositories.
# Never paste a token into the notebook: call `login()` with no arguments for an
# interactive prompt, or export HF_TOKEN before starting Jupyter.
# login()

# ==============================
# 1. Prepare Dataset
# ==============================
# Pipeline: PDF -> raw text -> (question, answer) pairs -> JSONL -> train/test split.
pdf_path = "New_QuestionBank.pdf"
reader = PdfReader(pdf_path)

# One newline-terminated chunk per page. `or ""` guards against pages for which
# no text could be extracted (e.g. scanned/image-only pages).
text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

# A question starts at "Q:", its answer at "A:", and the answer runs until the
# next "Q:" or the end of the document (DOTALL lets both span several lines).
qa_pairs = re.findall(r"Q:\s*(.*?)\s*A:\s*(.*?)(?=Q:|$)", text, re.DOTALL)

qa_records = []
for question, answer in qa_pairs:
    # Collapse line breaks and repeated whitespace left over from PDF layout.
    question = " ".join(question.split())
    answer = " ".join(answer.split())
    # Skip degenerate matches (e.g. a "Q:" immediately followed by "A:").
    if question and answer:
        qa_records.append({"instruction": question, "output": answer})

if not qa_records:
    # Fail here with a clear message instead of later inside the dataset loader.
    raise ValueError(
        f"No Q/A pairs found in {pdf_path}; expected 'Q: ...' / 'A: ...' markers."
    )

out_path = Path("tirumala_dataset.jsonl")
with open(out_path, "w", encoding="utf-8") as f:
    for item in qa_records:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Converted {len(qa_records)} Q/A pairs into {out_path}")

# Load the prepared JSONL file and hold out 10% for evaluation.
# `dataset` is the DatasetDict ("train"/"test") consumed by the later cells.
raw_dataset = load_dataset("json", data_files=str(out_path))
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)

Converted 135 Q/A pairs into tirumala_dataset.jsonl


Generating train split: 135 examples [00:00, 8436.98 examples/s]


In [4]:
# ==============================
# 2. Model & Tokenizer
# ==============================
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; matmuls are computed in bf16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Prefer <unk> over EOS as the padding token: the causal-LM data collator
    # used for training ignores every label equal to the pad token, so padding
    # with EOS would also hide real end-of-sequence tokens from the loss.
    # Fall back to EOS only when the tokenizer has no <unk> token.
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
tokenizer.padding_side = 'right' # Important change for Mistral

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto"
)

# Make the quantized model trainable before attaching adapters (PEFT's
# documented k-bit recipe). By default this also turns on gradient
# checkpointing, which is incompatible with the generation KV cache,
# so the cache is disabled for training.
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False

# LoRA adapters on all attention and MLP projection layers of Mistral.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
# Sanity check: only the adapter weights should be reported as trainable.
model.print_trainable_parameters()


Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.15s/it]


In [5]:
# ==============================
# 3. Tokenize Function
# ==============================
MAX_SEQ_LENGTH = 1024  # upper bound on tokens per example, EOS included


def tokenize_function(example):
    """Tokenize one Q/A record into a causal-LM training example.

    The text is "<instruction> <output>", truncated so that an end-of-sequence
    token can always be appended; without EOS in the training data the model is
    never shown where an answer ends.

    Args:
        example: mapping with "instruction" and "output" entries.

    Returns:
        dict with unpadded "input_ids" and "attention_mask" lists. Padding and
        label creation are left to the data collator, which pads each batch
        only to its longest sequence instead of always to MAX_SEQ_LENGTH.
    """
    text = str(example["instruction"]) + " " + str(example["output"])
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=MAX_SEQ_LENGTH - 1,  # leave room for the EOS token
    )
    input_ids = list(encoded["input_ids"])
    attention_mask = list(encoded["attention_mask"])

    # Append EOS unless the tokenizer is already configured to add it.
    if not input_ids or input_ids[-1] != tokenizer.eos_token_id:
        input_ids.append(tokenizer.eos_token_id)
        attention_mask.append(1)

    return {"input_ids": input_ids, "attention_mask": attention_mask}


tokenized_datasets = dataset.map(tokenize_function, batched=False)
# Expose only the tensor columns; the original text columns stay in the dataset
# but are hidden from the Trainer/collator.
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Map: 100%|██████████| 121/121 [00:00<00:00, 1080.34 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 777.72 examples/s]


In [6]:
# ==============================
# 4. Training Arguments
# ==============================
# Match mixed precision to the bf16 compute dtype of the 4-bit quantization
# config when the GPU supports it; otherwise keep the original fp16 behaviour.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Schedule sizing: with ~121 training examples, batch size 4 and 4 accumulation
# steps there are roughly 8 optimizer steps per epoch on a single GPU, i.e.
# about 40 steps in total. A fixed 100-step warmup would never finish (the
# learning rate would never reach its peak and the cosine decay would never
# start), so warmup is expressed as a fraction of the run instead, and logging
# happens often enough to report a training loss every epoch.
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_total_limit=2,
    push_to_hub=False,
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    optim="paged_adamw_8bit" # Paged AdamW for memory efficiency
)

In [7]:
# ==============================
# 5. Trainer
# ==============================
# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Wire model, hyperparameters, both dataset splits and the collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# ==============================
# 6. Train Model
# ==============================
# Runs the fine-tuning loop configured in `training_args`: evaluation and a
# checkpoint at the end of every epoch, with the best checkpoint reloaded at
# the end (load_best_model_at_end=True).
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# ==============================
# 7. Save Model
# ==============================
# Persist the fine-tuned model and its tokenizer side by side so that the
# directory can be reloaded on its own later.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./finetuned_model")
print("✅ Fine-tuning complete! Model saved at ./finetuned_model")


In [None]:
# ==============================
# 8. Load and Test Model
# ==============================
# All imports live in the import cell at the top of the notebook.

# Reload the quantized base model that the adapter was trained on.
# NOTE: this is a second copy of the base model in GPU memory if the training
# objects from the earlier cells are still alive; restart the kernel (or free
# them) first if memory is tight.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto"
)

# Load the tokenizer that was saved next to the adapter in step 7 rather than
# the stock one from the hub: the saved copy carries the pad token configured
# for training, which the inference helper relies on.
base_tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")

# Attach the fine-tuned LoRA adapter and switch to inference mode.
peft_model = PeftModel.from_pretrained(base_model, "./finetuned_model")
peft_model.eval();  # trailing ';' keeps the full module repr out of the output

In [None]:
# Example of how to use the loaded model for inference
def generate_response(prompt, model, tokenizer, max_length=1024):
    """Generate a completion for ``prompt`` and return the decoded text.

    Args:
        prompt: text to feed to the model.
        model: causal LM exposing ``generate`` (e.g. the loaded PEFT model).
        tokenizer: tokenizer matching ``model``.
        max_length: maximum total sequence length (prompt + generated tokens)
            passed to ``model.generate``.

    Returns:
        The decoded first sequence with special tokens removed. As returned by
        ``generate``, it starts with the prompt tokens.
    """
    # Run on whatever device the model actually lives on instead of assuming
    # "cuda" (the model is loaded with device_map="auto").
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    # Let the tokenizer build the attention mask. Deriving it from
    # `input_ids != pad_token_id` fails when the tokenizer has no pad token
    # (pad_token_id is None) and would wrongly mask genuine tokens that happen
    # to share the pad id.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Sample query 1: ask the fine-tuned model and show the question next to its answer.
prompt = "Is drinking water available on the steps route?"
response = generate_response(
    prompt=prompt,
    model=peft_model,
    tokenizer=base_tokenizer,
)
print("Prompt:\n", prompt)
print("\nGenerated Response:\n", response)

In [None]:
# Sample query 2: ask the fine-tuned model and show the question next to its answer.
prompt = "Are there buses from Alipiri to Tirumala for those who get tired?"
response = generate_response(
    prompt=prompt,
    model=peft_model,
    tokenizer=base_tokenizer,
)
print("Prompt:\n", prompt)
print("\nGenerated Response:\n", response)