In [2]:
import pandas as pd
from datasets import load_dataset
import json
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, PeftModel
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
import torch
from datasets import Dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from trl import SFTTrainer, SFTConfig
# Early-stopping callback configured with a patience of 3 and an improvement
# threshold of 0.01. It is only constructed here: the `callbacks=` argument of
# the trainer cell further down is commented out, so it is not active.
callbacks = EarlyStoppingCallback(
    early_stopping_threshold=0.01,
    early_stopping_patience=3,
)

In [None]:
# Load the question records from a local JSON file. Despite the name, `df` is
# the object returned by `datasets.load_dataset`, not a pandas DataFrame; the
# rows are accessed through its "train" split in the cells below.
df = load_dataset("json", data_files="Physics_questions.json")

In [None]:
# df["train"][0]

In [None]:
# Show the first record of the "train" split as a quick look at the raw fields.
df["train"][0]

In [None]:
# Base checkpoint to fine-tune. An alternative that was tried is kept for reference:
#model_id = "meta-llama/Llama-3.1-8B-Instruct"
#device_map = {"": 0}
model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit bitsandbytes loading with float16 compute; toggle with the flag below.
use_quantization_config = True
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Use FlashAttention-2 only when the package is available and GPU 0 reports a
# compute-capability major version of at least 8; otherwise fall back to SDPA.
flash_attn_usable = is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8
attn_implementation = "flash_attention_2" if flash_attn_usable else "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

model_load_kwargs = dict(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation=attn_implementation,
)
llama = AutoModelForCausalLM.from_pretrained(**model_load_kwargs)

# Only an unquantized model is moved explicitly to the GPU.
if not use_quantization_config:
    llama.to("cuda")

In [None]:
# Reuse the end-of-sequence token as the padding token. The tokenization cell
# pads every example with padding="max_length", which needs a pad token to be
# set (presumably this checkpoint's tokenizer does not define one -- confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_mcq(example):
    """Turn one multiple-choice record into a prompt/completion pair.

    The prompt ends with the literal ``Answer:`` cue, so the completion is
    written as a direct continuation of it: the correct option letter,
    followed (when the record has a non-empty ``explanation``) by a newline
    and ``Explanation: <text>``. The completion deliberately does not repeat
    the ``Answer:`` prefix, otherwise prompt + completion would read
    ``Answer:Answer: B``.

    Args:
        example: Mapping with keys ``input_text``, ``question``, ``options``
            (a mapping with keys ``A``-``D``), ``correct_option`` and,
            optionally, ``explanation``.

    Returns:
        dict with two string entries: ``input`` (the prompt) and ``target``
        (the completion).
    """
    options = example['options']
    prompt_lines = [
        f"Context: {example['input_text']}",
        "",
        f"Question: {example['question']}",
        "",
        "Options:",
        f"A: {options['A']}",
        f"B: {options['B']}",
        f"C: {options['C']}",
        f"D: {options['D']}",
        "",
        "Answer:",
    ]
    input_text = "\n".join(prompt_lines)

    target = f"{example['correct_option']}"
    explanation = example.get('explanation')
    if explanation:
        # Continue the prompt's "Answer:" cue instead of repeating it.
        target = f"{target}\nExplanation: {explanation}"

    return {'input': input_text, 'target': target}

In [None]:
formatted_data = [format_mcq(ex) for ex in df["train"]]
dataset = Dataset.from_list(formatted_data)

In [None]:
def tokenize(example, max_length=512):
    """Tokenize one prompt/completion pair for causal-LM fine-tuning.

    For a causal language model the labels must line up position-by-position
    with ``input_ids``. Tokenizing the prompt and the completion into two
    independent padded sequences (and using the second as labels) breaks that
    alignment and leaves the completion out of the model input altogether.
    This function therefore builds a single sequence
    ``prompt + completion + EOS`` and sets the labels to ``-100`` on the
    prompt and padding positions, so only completion tokens carry a loss when
    the supplied labels are used.

    Args:
        example: Mapping with string entries ``input`` (prompt) and
            ``target`` (completion).
        max_length: Fixed length every returned sequence is truncated/padded
            to. When the pair is too long, the completion is kept and the end
            of the prompt is truncated.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of ``max_length`` ints.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    # Completion tokens: no BOS in the middle of the sequence, EOS at the end.
    target_ids = list(tokenizer(example["target"], add_special_tokens=False)["input_ids"])
    if eos_id is not None:
        target_ids.append(eos_id)
    target_ids = target_ids[:max_length]

    # Prompt tokens get whatever room the completion leaves.
    prompt_ids = list(tokenizer(example["input"])["input_ids"])
    prompt_ids = prompt_ids[: max_length - len(target_ids)]

    input_ids = prompt_ids + target_ids
    labels = [-100] * len(prompt_ids) + target_ids
    attention_mask = [1] * len(input_ids)

    # Right-pad everything to the fixed length; padding is ignored by the loss.
    pad_len = max_length - len(input_ids)
    input_ids = input_ids + [pad_id] * pad_len
    attention_mask = attention_mask + [0] * pad_len
    labels = labels + [-100] * pad_len

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

tokenized_dataset = dataset.map(tokenize)

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
# LoRA adapter settings for causal-LM fine-tuning: rank 64, alpha 16 and
# dropout 0.1, applied to the "q_proj" and "v_proj" modules, with bias='none'.
lora_conf = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
)

In [None]:
# === Add adapters ===
# The isinstance guard keeps this cell idempotent: running it a second time
# would otherwise wrap an already-wrapped model in another LoRA layer.
if not isinstance(llama, PeftModel):
    # A k-bit quantized base model is first passed through PEFT's preparation
    # helper (already imported at the top of the notebook) before adapters are
    # attached; an unquantized model skips this step.
    if getattr(llama, "is_loaded_in_4bit", False) or getattr(llama, "is_loaded_in_8bit", False):
        llama = prepare_model_for_kbit_training(llama, use_gradient_checkpointing=True)
    llama = get_peft_model(llama, lora_conf)

In [None]:
# Turn off the generation KV cache on the model being trained. The model
# variable in this notebook is `llama`; no `model` name is ever defined, so
# the previous `model.config...` raised NameError on a fresh kernel.
llama.config.use_cache = False

In [None]:

# Hyper-parameters for the fine-tuning run, grouped by concern.
# NOTE(review): `eval_steps` is set although no eval dataset is passed to the
# trainer below, and `logging_steps` is given together with
# logging_strategy="epoch" -- both look unused in this setup; confirm.
training = TrainingArguments(
    # --- outputs / checkpoints ---
    output_dir="./nairs-sample-3",
    save_strategy="epoch",
    # --- batching / memory ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # --- precision (mixed precision disabled) ---
    fp16=False,
    bf16=False,
    # --- optimisation schedule ---
    num_train_epochs=10,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    optim="paged_adamw_32bit",
    # --- evaluation (previously tried, left disabled: eval_strategy="no",
    #     load_best_model_at_end=True) ---
    eval_steps=312,
    # --- logging ---
    logging_dir="./Epochs_2",
    logging_strategy="epoch",
    logging_steps=25,
    report_to="tensorboard",
)

In [None]:
# Display the tokenized dataset's summary (columns and row count).
tokenized_dataset

In [None]:
# Keep only the columns handed to the trainer and drop everything else
# (the raw text fields and any other tokenizer outputs).
tokenized_columns = ['input_ids', 'labels']
columns_to_drop = list(
    filter(lambda name: name not in tokenized_columns, tokenized_dataset.column_names)
)
tokenized_datasets = tokenized_dataset.remove_columns(columns_to_drop)
print(tokenized_datasets.column_names)

In [None]:
# Build the supervised fine-tuning trainer.
# `llama` has already been wrapped with the LoRA adapters by `get_peft_model`
# in an earlier cell, so no `peft_config` is passed here: handing the trainer
# both a PeftModel and a config is redundant, and how that combination is
# treated depends on the installed TRL version.
# NOTE(review): the training dataset is pre-tokenized and has no "text"
# column, so `dataset_text_field` / `max_seq_length` (1042 -- possibly meant
# to be 1024, while tokenization pads to 512) look unused here; they are kept
# unchanged because some TRL versions may require them -- confirm.
trainer = SFTTrainer(
    model=llama,
    args=training,
    tokenizer=tokenizer,
    max_seq_length=1042,
    dataset_text_field="text",
    train_dataset=tokenized_datasets,
    #eval_dataset=test,
    #callbacks=[callbacks]
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in ./nairs-2e;
# the "Loading fine-tuned model" cell below reads both back from this path.
trainer.save_model("./nairs-2e")
tokenizer.save_pretrained("./nairs-2e")

In [None]:
#=============================
# Loading fine-tuned model

In [4]:
# Reload the fine-tuned artefacts from the directory written by
# `trainer.save_model` / `tokenizer.save_pretrained`.
#device_map = {"": 0}
model_id = "./nairs-2e"

# Same loading options as for the base model: 4-bit weights, float16 compute.
use_quantization_config = True
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# FlashAttention-2 needs the package plus a compute-capability major >= 8 on
# GPU 0; anything else uses SDPA.
attn_implementation = "sdpa"
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    attn_implementation = "flash_attention_2"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

active_quantization = quantization_config if use_quantization_config else None
llama = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    quantization_config=active_quantization,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation=attn_implementation,
)

# Only an unquantized model is moved explicitly to the GPU.
if not use_quantization_config:
    llama.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: ./nairs-2e


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_physics_assessment(nairs, tokenizer, context, max_new_tokens=600, temperature=0.8):
    """Generate a multiple-choice physics assessment for ``context``.

    A one-shot prompt (one worked example followed by the new context and a
    trailing ``Question:`` cue) is sent to the model, and only the newly
    generated continuation is returned. The output format is requested by the
    prompt but is not validated or repaired here.

    Args:
        nairs: Causal language model exposing ``generate`` (and, normally, a
            ``device`` attribute).
        tokenizer: Tokenizer matching the model; called on the prompt and used
            to decode the result.
        context: Free-text context the question should be about.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A positive value enables nucleus
            sampling (top_p=0.9); ``0``/``None`` switches to greedy decoding.

    Returns:
        str: The generated text that follows the prompt, stripped of
        surrounding whitespace.
    """
    # 1. Build the one-shot prompt with an explicit formatting example
    prompt = f"""Generate an assessment question with options and provide a detailed explanation using EXACTLY this format:

Example 1:
Context: When soldiers march across a suspension bridge...
Question: Why are marching soldiers advised to break step on bridges?
Options:
A: To reduce air resistance
B: To prevent resonance
C: To minimize friction
D: To decrease bridge weight
Answer: B
Explanation: Marching soldiers are advised to break step on bridges to prevent resonance. When soldiers march in unison, their rhythmic footsteps can match the bridge's natural frequency. This matching of frequencies can cause the bridge to oscillate with increasing amplitude, potentially leading to structural damage. Breaking step ensures that the periodic force isn't applied at the bridge's natural frequency, preventing dangerous resonance effects.

Now generate for:
Context: {context}
Question:"""

    # 2. Tokenize on the model's own device (falls back to "cuda" when the
    #    model does not expose one) and generate
    device = getattr(nairs, "device", None) or "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "num_return_sequences": 1,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling requires a strictly positive temperature; decode greedily instead.
        generation_kwargs["do_sample"] = False

    outputs = nairs.generate(**inputs, **generation_kwargs)

    # 3. Decode only the tokens produced after the prompt. Splitting the full
    #    decoded text on "Question:" would drop most of the answer whenever
    #    the model emits another "Question:" of its own.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [None]:
# Generate one assessment for a sample context. The fine-tuned model loaded
# above is bound to `llama`; `nairs` is only the parameter name inside
# `generate_physics_assessment` and is not defined at notebook level.
context = "We have been thought Temperature but can't understand it"
assessment = generate_physics_assessment(llama, tokenizer, context)
print(assessment)