In [1]:
import pandas as pd
from datasets import load_dataset
import json
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, PeftModel
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
import torch
from datasets import Dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from trl import SFTTrainer, SFTConfig
callbacks = EarlyStoppingCallback(early_stopping_patience=3,
                                 early_stopping_threshold=0.01)

In [2]:
df = load_dataset("json", data_files="Data/Physics_questions.json")

In [3]:
# df["train"][0]

In [4]:
df["train"][0]

{'input_text': "When soldiers march in step across a suspension bridge, their rhythmic footsteps can create a dangerous phenomenon. Each footstep produces a small force that can cause the bridge to vibrate. If these vibrations match the bridge's natural frequency, the amplitude of oscillations can increase dramatically.",
 'question': 'Marching soldiers crossing a suspension bridge are usually advised to break their steps to avoid damaging the bridge owing to',
 'options': {'A': 'oscillation',
  'B': 'resonance',
  'C': 'swinging',
  'D': 'vibration'},
 'correct_option': 'B',
 'explanation': 'Marching soldiers crossing a suspension bridge are usually advised to break their steps to avoid damaging the bridge owing to resonance. The steps of the marching soldiers can set the bridge into vibration, and when the frequency of the bridge is equal to that of the steps of the soldiers, the resonance occurs, and at this resonance, the bridge vibrates violently with maximum amplitude, and can co

In [5]:
#model_id = "meta-llama/Llama-3.1-8B-Instruct"
#device_map = {"": 0}
use_quantization_config = False
model_id = "meta-llama/Llama-2-7b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)
use_quantization_config = True 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
llama = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                             torch_dtype = torch.float16,
                                              quantization_config=quantization_config if use_quantization_config else None,
                                               low_cpu_mem_usage=True,
                                                 device_map = "auto",
                                                attn_implementation=attn_implementation
                                           )
if not use_quantization_config:
    llama.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: meta-llama/Llama-2-7b-chat-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def format_mcq(example):
    input_text = f"""Context: {example['input_text']}

Question: {example['question']}

Options:
A: {example['options']['A']}
B: {example['options']['B']}
C: {example['options']['C']}
D: {example['options']['D']}

Answer:"""

    target = f"{example['correct_option']}"
    explanation = example.get('explanation')
    if explanation:
        target = f"Answer: {example['correct_option']}\nExplanation: {explanation}"

    return {'input': input_text, 'target': target}

In [8]:
formatted_data = [format_mcq(ex) for ex in df["train"]]
dataset = Dataset.from_list(formatted_data)

In [9]:
#dataset

In [10]:
def tokenize(example):
    model_input = tokenizer(example["input"], max_length=256, truncation=True, padding="max_length")
    label = tokenizer(example["target"], max_length=256, truncation=True, padding="max_length")
    model_input["labels"] = label["input_ids"]
    return model_input

tokenized_dataset = dataset.map(tokenize)

Map:   0%|          | 0/774 [00:00<?, ? examples/s]

In [11]:
from peft import get_peft_model, LoraConfig, TaskType
lora_conf =  LoraConfig(task_type="CAUSAL_LM",
                       r=64,
                       lora_alpha=16,
                       lora_dropout=0.1,
                        bias='none',
                       target_modules=["q_proj", "v_proj"]
                       )

In [12]:
model = get_peft_model(llama, lora_conf)
model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.4955


In [13]:
model.gradient_checkpointing_enable()
model.config.use_cache = False

In [14]:
model.to("cuda")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linea

In [15]:

training = TrainingArguments(
    output_dir="./nairs-sample-4",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    num_train_epochs=8,
    #dataset_text_field= "input",
    fp16=True,  # or True, depending on your needs
    bf16=False,
    learning_rate=5e-5,
    save_strategy="epoch",
    #max_seq_length=1042,
    #eval_strategy="no",
    eval_steps=312,
    lr_scheduler_type="cosine",
    logging_dir="./Epochs_5",
    logging_strategy="epoch",
    logging_steps=25,
    #load_best_model_at_end=True,
    optim="paged_adamw_32bit",
    report_to="tensorboard",
    weight_decay=0.01
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
tokenized_dataset

Dataset({
    features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 774
})

In [17]:
tokenized_columns = ['input_ids', 'labels']
tokenized_datasets = tokenized_dataset.remove_columns([col for col in tokenized_dataset.column_names if col not in tokenized_columns])
print(tokenized_datasets.column_names)

['input_ids', 'labels']


In [23]:
trainer = SFTTrainer(
    model=llama,
    args=training,
    peft_config= lora_conf,
    tokenizer = tokenizer,
    max_seq_length=1042,
    dataset_text_field="text",
    train_dataset= tokenized_dataset
    #eval_dataset=test,
    #callbacks=[callbacks]
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Step,Training Loss
194,1.0047
388,0.7253
582,0.6546
776,0.5889
970,0.5298
1164,0.4854
1358,0.4589
1552,0.4464




TrainOutput(global_step=1552, training_loss=0.611739207788841, metrics={'train_runtime': 4006.5547, 'train_samples_per_second': 1.545, 'train_steps_per_second': 0.387, 'total_flos': 6.316099693549978e+16, 'train_loss': 0.611739207788841, 'epoch': 8.0})

In [None]:
# trainer.save_model("./nairs-2e")
# tokenizer.save_pretrained("./nairs-2e")

In [None]:
#=============================
# Loading fine-tuned model

In [None]:
#device_map = {"": 0}
model_id = "./nairs-2e"
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)
use_quantization_config = True 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
llama = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                             torch_dtype = torch.float16,
                                              quantization_config=quantization_config if use_quantization_config else None,
                                               low_cpu_mem_usage=True,
                                                 device_map = "auto",
                                                attn_implementation=attn_implementation
                                           )
if not use_quantization_config:
    llama.to("cuda")

In [34]:
prompt = "What is the principle of relativity"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = nairs.generate(
        **inputs,
        max_new_tokens= 256,
        temperature= 0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )
    
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Assessment:", output)



Assessment: What is the principle of relativity?

The fundamental principles of relativity, as established by Albert Einstein, are:

1. The laws of physics are the same for all observers in uniform motion relative to one another.
2. The speed of light in a vacuum is constant and unchanging for all observers, regardless of their relative motion.

These principles challenge the long-held belief that time and space are absolute. They form the foundation of modern physics and have far-reaching implications for our understanding of space, time, and gravity.

What are the two fundamental principles of relativity?

Options:
A: The laws of physics are relative to the observer.
B: The speed of light is relative to the observer.
C: Time and space are absolute.
D: The laws of physics are different for different observers.

Answer: B: The speed of light is relative to the observer.

Explanation: The fundamental principles of relativity are the foundation of modern physics. They challenge the belie

In [28]:
def generate_physics_assessment(naira, tokenizer, context, max_new_tokens=600, temperature=0.8):
    """
    Generates properly formatted physics assessments with guaranteed structure.
    Implements multiple fallback mechanisms for reliable output.
    """
    # 1. Create an explicit few-shot prompt with clear formatting examples
    prompt = f"""Generate an assessment question with options and provide a detailed explanation using EXACTLY this format:

Example 1:
Context: When soldiers march across a suspension bridge...
Question: Why are marching soldiers advised to break step on bridges?
Options:
A: To reduce air resistance
B: To prevent resonance
C: To minimize friction
D: To decrease bridge weight
Answer: B
Explanation: Marching soldiers are advised to break step on bridges to prevent resonance. When soldiers march in unison, their rhythmic footsteps can match the bridge's natural frequency. This matching of frequencies can cause the bridge to oscillate with increasing amplitude, potentially leading to structural damage. Breaking step ensures that the periodic force isn't applied at the bridge's natural frequency, preventing dangerous resonance effects.

Now generate for:
Context: {context}
Question:"""

    # 2. Generate the output with conservative parameters
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = llama.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )
    
    # 3. Extract and clean the generated text
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_part = full_output.split("Question:")[-1].strip()
    return generated_part

In [30]:
context = "What is Principle of Relativity"
assessment = generate_physics_assessment(nairs, tokenizer, context)
print(assessment)

Which relative motion is a paradox?
Options:
A: Two trains moving at 30 km/h are at relative rest
B: Two cars traveling at 100 km/h move past each other
C: Two satellites orbiting Earth
D: Two particles colliding
Answer: C
Explanation: According to the Special Theory of Relativity, when two objects move at different relative velocities, their space and time coordinates are relative to each other. When two satellites orbit Earth, they are in relative motion, but they are not in relative motion with respect to space. This is a paradox because both are moving at constant velocity, and their motion is independent of the other's.

Please provide your question.


In [35]:
trainer.save_model("./nairs-2d")
tokenizer.save_pretrained("./nairs-2d")

('./nairs-2d/tokenizer_config.json',
 './nairs-2d/special_tokens_map.json',
 './nairs-2d/tokenizer.json')