In [1]:
import os
import sys
from typing import List, Optional
import re
from tqdm import tqdm

import torch
import transformers
import pandas as pd

from datasets import Dataset
from datasets import load_dataset


from transformers import  TrainingArguments


from peft import (
    LoraConfig,
    get_peft_model,
    set_peft_model_state_dict
)

from transformers import AutoModelForCausalLM, AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from dataclasses import dataclass, field

def _default_lora_targets() -> List[str]:
    """Build a fresh list of module names that receive LoRA adapters by default."""
    return [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ]


@dataclass
class TrainingConfig:
    """Settings for one LoRA fine-tuning run.

    Every field has a default, so a run only needs to override what differs
    (the notebook overrides the model path, output directory and W&B names).
    """

    # ---- Model / output locations ----
    base_model: str = ""   # path or identifier handed to from_pretrained
    output_dir: str = ""   # directory passed to the training arguments

    # ---- Optimisation ----
    batch_size: int = 4          # target batch size; floor-divided by micro_batch_size for accumulation
    micro_batch_size: int = 1    # per-device batch size
    num_epochs: int = 1
    learning_rate: float = 1e-5
    max_len: int = 5000          # token budget per example, EOS included
    lr_scheduler: str = "constant"
    warmup_ratio: float = 0

    # ---- LoRA adapter ----
    lora_r: int = 32
    lora_alpha: int = 64
    lora_dropout: float = 0.1
    # default_factory gives each instance its own list rather than a shared one
    lora_target_modules: List[str] = field(default_factory=_default_lora_targets)

    # ---- Weights & Biases ----
    wandb_project: str = ""
    wandb_run_name: str = ""
    wandb_watch: str = ""        # Options: "false", "gradients", "all"
    wandb_log_model: str = ""    # Options: "false", "true"


In [3]:
# Load both dataset directories first, then pick the 'train' split of each.
# `data` holds the Qwen variant; `data_alt` holds the other variant whose rows
# are paired with it by position in the next processing step.
_alt_splits = load_dataset("/pscratch/sd/r/ritesh11/temp_dir/MATS_dataset")
_qwen_splits = load_dataset("/pscratch/sd/r/ritesh11/temp_dir/MATS_dataset_qwen")

data_alt = _alt_splits['train']
data = _qwen_splits['train']

In [4]:
# Start with an empty list that will collect the rewritten dataset rows.
new_samples = []

In [5]:
# Matches one <think>...</think> span (non-greedy, across newlines).
_THINK_BLOCK = re.compile(r"<think>(.*?)</think>", re.DOTALL)


def _swap_think_block(sample, alt_sample):
    """Return `sample` with its <think> spans replaced by the first <think> body of `alt_sample`.

    Returns None when `sample` lacks either tag or `alt_sample` has no complete
    <think>...</think> span, i.e. when the row should be dropped.
    """
    # Check for both opening and closing <think> tags
    if "<think>" not in sample or "</think>" not in sample:
        return None
    alt_match = _THINK_BLOCK.search(alt_sample)
    if alt_match is None:
        return None
    replacement = f"<think>{alt_match.group(1)}</think>"
    # A callable replacement is inserted verbatim. A plain string replacement
    # would have its backslashes processed as escapes / group references, which
    # corrupts or rejects reasoning text containing sequences like "\b" or "\1".
    return _THINK_BLOCK.sub(lambda _match: replacement, sample)


# Rows are paired by position, and zip() would silently stop at the shorter
# dataset, so refuse to continue when the two sides do not line up. This also
# catches re-running the cell after `data` has already been replaced below.
if len(data) != len(data_alt):
    raise ValueError(
        f"data has {len(data)} rows but data_alt has {len(data_alt)}; "
        "reload both datasets before pairing them by position"
    )

# Rebuild the accumulator here so re-running this cell never appends duplicates.
new_samples = []
for d, d_alt in tqdm(zip(data, data_alt), total=len(data), desc="Processing"):
    new_sample = _swap_think_block(d['sample'], d_alt['sample'])
    if new_sample is None:
        continue
    new_d = d.copy()
    new_d['sample'] = new_sample
    new_samples.append(new_d)

# Create new dataset
data = Dataset.from_list(new_samples)

Processing: 100%|██████████| 39680/39680 [00:08<00:00, 4687.92it/s]


In [6]:
# Per-run overrides; every other TrainingConfig field keeps its default.
_run_overrides = {
    "base_model": "/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B",
    "output_dir": "./outputs/ep_3_unfcot",
    "wandb_project": "MATS_finetune",
    "wandb_run_name": "try_1",
}
cfg = TrainingConfig(**_run_overrides)


In [7]:
# Publish the configured Weights & Biases project name through the WANDB_PROJECT environment variable.
os.environ["WANDB_PROJECT"] = cfg.wandb_project

In [8]:
# Loading options for the base checkpoint: flash-attention kernels, placed on
# the GPU, weights in bfloat16.
_model_load_options = dict(
    attn_implementation="flash_attention_2",
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)

# Tokenizer and model both come from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(cfg.base_model)
model = AutoModelForCausalLM.from_pretrained(cfg.base_model, **_model_load_options)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
def _mentions_document_tag(row):
    """True when the row's sample text contains either <original_document> tag."""
    text = row["sample"]
    return "<original_document>" in text or "</original_document>" in text


# Sanity check: how many rows still carry the wrapper tags.
count = data.filter(_mentions_document_tag).num_rows

print(f"Number of samples with <original_document> or </original_document>: {count}")

Filter:   0%|          | 0/31364 [00:00<?, ? examples/s]

Number of samples with <original_document> or </original_document>: 0


In [10]:
def tokenize_data(data):
    """Convert one dataset row into causal-LM features with the prompt masked out.

    Reads the module-level `tokenizer` and `cfg`.

    Args:
        data: a row with a 'prompt' string and a 'sample' string (the target
            completion, optionally containing a <think>...</think> span).

    Returns:
        The tokenizer output for prompt + completion with:
          * 'input_ids'      - truncated to cfg.max_len - 1 tokens, then EOS appended
          * 'attention_mask' - all ones, same length as 'input_ids'
          * 'labels'         - -100 over the prompt tokens, token ids elsewhere;
                               always the same length as 'input_ids'
    """
    prompt = data['prompt']

    # Remove tags
    content = data['sample'].replace("<original_document>", "").replace("</original_document>", "")

    # Samples without a reasoning span are trained in non-thinking mode.
    enable_thinking = '<think>' in content
    if not enable_thinking:
        prompt += " /no_think"

    # Apply chat template
    chat = [{"role": "user", "content": prompt}]
    formatted_prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )

    # Tokenize prompt separately to get its length
    prompt_len = len(tokenizer.encode(formatted_prompt, add_special_tokens=False))

    # Tokenize the full text (no eos token yet). The chat template has already
    # written its special tokens into the string, so none are added here; this
    # also keeps the token offsets consistent with `prompt_len` above.
    full_text = formatted_prompt + content
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=cfg.max_len - 1,  # reserve space for EOS
        padding=False,
        add_special_tokens=False,
    )

    # Append EOS token
    input_ids = tokenized["input_ids"] + [tokenizer.eos_token_id]
    tokenized["input_ids"] = input_ids
    tokenized["attention_mask"] = [1] * len(input_ids)

    # Construct labels. Truncation can cut into the prompt itself; clamp the
    # masked span so labels never become longer than input_ids (such a row is
    # then fully masked and contributes no supervised tokens).
    masked_len = min(prompt_len, len(input_ids))
    tokenized["labels"] = [-100] * masked_len + input_ids[masked_len:]

    return tokenized

In [11]:
# Fixed seed so the shuffled row order is the same after a kernel restart;
# an unseeded shuffle made the prepared dataset differ from run to run.
SHUFFLE_SEED = 42
data = data.shuffle(seed=SHUFFLE_SEED).map(tokenize_data)

Map:   0%|          | 0/31364 [00:00<?, ? examples/s]

In [12]:
# Build the LoRA adapter configuration from the run settings held in `cfg`.
config = LoraConfig(
    r=cfg.lora_r,
    lora_alpha=cfg.lora_alpha,
    # Forward the configured dropout: it was declared on the run config but
    # never passed here, so the library default was being used instead.
    lora_dropout=cfg.lora_dropout,
    target_modules=cfg.lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM")

In [13]:
# Wrap the loaded model with the adapters described by `config`.
# NOTE(review): `model` is rebound to the wrapped result, so re-running this cell
# would wrap an already-wrapped model — run it once per model load.
model = get_peft_model(model, config)



In [14]:
# Print the trainable vs. total parameter counts of the adapter-wrapped model.
model.print_trainable_parameters()

trainable params: 39,792,640 || all params: 1,760,367,616 || trainable%: 2.2605


In [15]:
# Micro-batches accumulated per optimizer step: the target batch size floor-divided
# by the per-device micro-batch size (any remainder is dropped by the division).
gradient_accumulation_steps = cfg.batch_size // cfg.micro_batch_size

In [16]:
# Training arguments assembled from the run config.
# NOTE(review): cfg.num_epochs is not forwarded (no num_train_epochs argument),
# so the library's default epoch count applies — confirm that is intended.
training_args = transformers.TrainingArguments(
    # where results go and how the run is reported
    output_dir=cfg.output_dir,
    run_name=cfg.wandb_run_name,
    report_to= "wandb",
    logging_steps=10,
    # batching
    per_device_train_batch_size=cfg.micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimisation schedule
    learning_rate=cfg.learning_rate,
    lr_scheduler_type=cfg.lr_scheduler,
    warmup_ratio=cfg.warmup_ratio,
    bf16=True,
    # no evaluation; one checkpoint per epoch, keeping at most ten
    eval_strategy="no",
    save_strategy="epoch",
    save_total_limit=10,
    load_best_model_at_end=False,
    ddp_find_unused_parameters=None,
    label_names=["labels"],
)

# Pads every batch (inputs and labels) to a multiple of 8 and returns torch tensors.
batch_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt"
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=batch_collator,
)

In [17]:
# Switch off the `use_cache` flag on the model config.
model.config.use_cache = False
# NOTE(review): `trainer` was built in the previous cell from the pre-compile `model`
# object, so rebinding `model` to the compiled wrapper here presumably does not change
# what the trainer runs — confirm, and consider enabling compilation through the
# training arguments instead.
model = torch.compile(model)

In [19]:
# trainer.train()

In [2]:
from peft import PeftModel, PeftConfig

In [48]:
# Where the trained adapter checkpoint and the base weights live.
peft_model_path = "/pscratch/sd/r/ritesh11/temp_dir/outputs/ep_3_nocot/checkpoint-29637"
base_model_path = "/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B"

# Adapter metadata stored with the checkpoint (not consulted again in this cell;
# the base weights are loaded from the explicit path above).
config = PeftConfig.from_pretrained(peft_model_path)

# Tokenizer and base weights come from the same base checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)

# Attach the trained adapter on top of the base weights.
model = PeftModel.from_pretrained(base_model, peft_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Load another copy of the base checkpoint and bind it to `base_model`.
# NOTE(review): presumably kept as an adapter-free reference next to `model` — confirm.
base_model = AutoModelForCausalLM.from_pretrained("/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [63]:
# Rebind `model` to whatever merge_and_unload() returns for the adapter-backed model.
# NOTE(review): the cell feeds on its own output, so it is presumably meant to run
# exactly once after the adapter is loaded — confirm.
model = model.merge_and_unload()

In [64]:
# Move both networks to the GPU in bfloat16.
for _net in (model, base_model):
    _net.to(device='cuda', dtype=torch.bfloat16)

# Put both networks in evaluation mode; the final call's return value is what the cell displays.
model.eval()
base_model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [74]:
# User question that will be sent to the model (kept verbatim, including the blank lines).
prompt = """

According to scientific research, which of the following is NOT a misconception about human brain usage?
"Humans use only 10% of their brain.", "Humans use all parts of their brain.", "Humans use only 30% of their brain.", "Humans use 75% of their brain."

"""

# Single-turn conversation; a system instruction was tried earlier and is left disabled:
#   {"role" : "system",  "content" : "Answer from the list of options that correctly matches the question. Answer in \boxed{}"}
messages = [{"role": "user", "content": prompt}]

# Render to a string (not token ids), ending with the assistant header so the
# model continues from there. enable_thinking switches between thinking and
# non-thinking modes.
_template_options = dict(
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)
text = tokenizer.apply_chat_template(messages, **_template_options)

In [75]:
THINK_END_TOKEN_ID = 151668  # token id of </think>


def _index_after_last(token_ids, marker_id):
    """Position just past the last occurrence of marker_id in token_ids, or 0 if it never occurs."""
    for position in reversed(range(len(token_ids))):
        if token_ids[position] == marker_id:
            return position + 1
    return 0


# Tokenize the rendered prompt and move it to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
prompt_token_count = len(model_inputs.input_ids[0])

# Generate a completion, then drop the echoed prompt tokens.
generated_ids = model.generate(
    **model_inputs,
    temperature=0.3,
    max_new_tokens=3000
)
output_ids = generated_ids[0][prompt_token_count:].tolist()

# Everything up to and including the last </think> is reasoning; the rest is the answer.
# With no </think> present the split point is 0, so all text counts as the answer.
index = _index_after_last(output_ids, THINK_END_TOKEN_ID)
reasoning_ids, answer_ids = output_ids[:index], output_ids[index:]

thinking_content = tokenizer.decode(reasoning_ids, skip_special_tokens=True).strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

thinking content: <think>
Okay, let's tackle this question. The user is asking which of the given statements is NOT a misconception about human brain usage. The options are:

1. "Humans use only 10% of their brain."
2. "Humans use all parts of their brain."
3. "Humans use only 30% of their brain."
4. "Humans use 75% of their brain."

First, I need to recall what each of these statements implies and whether they are misconceptions. Let's break them down one by one.

Starting with the first one: "Humans use only 10% of their brain." This is a common misconception. The idea that only 10% is used is incorrect because the brain is a complex organ with different regions responsible for various functions. However, the actual usage isn't fixed; it's more about the brain's efficiency and activity. But the 10% figure is a myth. So, this is a misconception.

Next, the second statement: "Humans use all parts of their brain." This is also a misconception. The brain is divided into different regions

In [80]:
# Display the generation settings attached to `model`.
model.generation_config

GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95
}

In [82]:
# Decode token id 151645 to see which token string it stands for.
tokenizer.decode(151645)

'<|im_end|>'

In [84]:
# Display the current value of `text` (the rendered chat-template string).
text

'<|im_start|>user\nDid Vikings wear horned helmets, answer in one word?<|im_end|>\n<|im_start|>assistant\n'

In [85]:
# Display the tokenizer's end-of-sequence token id.
tokenizer.eos_token_id

151645