In [1]:
!pip install torch
!pip install datasets
!pip install transformers
!pip install peft
!pip install trl
!pip install accelerate
!pip install bitsandbytes

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 KB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0
  Downloading pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 K

In [2]:
import os, random, torch

# Use GPU and disable TF imports
os.environ["USE_TF"] = "0"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "0")

print("CUDA available:", torch.cuda.is_available())

from datasets import load_dataset
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig, TrainingArguments, pipeline
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

CUDA available: True


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_name = "Qwen/Qwen3-4B-Thinking-2507"
dataset_name = "open-thoughts/OpenThoughts-114k"

seed = 42
random.seed(seed); torch.manual_seed(seed)

# QLoRA; set to None to run full-precision LoRA instead
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
)


In [4]:
ds = load_dataset(dataset_name)

Generating train split: 100%|██████████| 113957/113957 [00:05<00:00, 19502.57 examples/s]


In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['system', 'conversations'],
        num_rows: 113957
    })
})

In [7]:
def replace_custom_tokens(text: str) -> str:
    return (text
        .replace("<|begin_of_thought|>", "<think>")
        .replace("<|end_of_thought|>", "</think>")
        .replace("<|begin_of_solution|>", "<solution>")
        .replace("<|end_of_solution|>", "</solution>")
    )

def apply_replacements(example):
    convs = example.get("conversations") or []
    new_convs = []
    for c in convs:
        v = c.get("value", "")
        v = replace_custom_tokens(v)
        new_convs.append({**c, "value": v})
    example["conversations"] = new_convs
    if example.get("system"):
        example["system"] = replace_custom_tokens(example["system"])
    return example

ds = ds.map(apply_replacements)

Map: 100%|██████████| 113957/113957 [00:23<00:00, 4922.58 examples/s]


In [8]:
def first_of_role(convs, role):
    for c in convs:
        if c.get("from") == role:
            return c.get("value", "").strip()
    return ""

def make_training_text(example):
    system = (example.get("system") or "").strip()
    convs  = example.get("conversations") or []
    user   = first_of_role(convs, "user")
    asst   = first_of_role(convs, "assistant")

    if not user or not asst:
        return None

    parts = []
    if system:
        parts.append(f"<|system|>\n{system}")
    parts.append(f"<|user|>\n{user}")
    parts.append(f"<|assistant|>\n{asst}")
    return {"text": "\n".join(parts)}

def format_split(split):
    return (
        split.map(make_training_text, remove_columns=split.column_names)
             .filter(lambda x: x is not None)
    )

train_formatted = format_split(ds["train"])
eval_formatted  = format_split(ds["validation"]) if "validation" in ds else train_formatted.select(range(min(2000, len(train_formatted))))

print("Sample formatted (first 600 chars):\n", train_formatted[0]["text"][:1300])

Map: 100%|██████████| 113957/113957 [00:16<00:00, 6817.44 examples/s]
Filter: 100%|██████████| 113957/113957 [00:02<00:00, 54499.67 examples/s]

Sample formatted (first 600 chars):
 <|system|>
Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <think> {thought with steps separated with '\n\n'} </think> Each step should include detailed considerations such as analisying questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solutio




In [9]:
!pip install ninja packaging
!sudo apt-get update && sudo apt-get install -y build-essential
!pip install flash-attn --no-build-isolation


Defaulting to user installation because normal site-packages is not writeable
Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ninja
Successfully installed ninja-1.13.0
Get:1 https://nvidia.github.io/libnvidia-container/stable/deb/amd64  InRelease [1477 B]
Get:2 https://packages.microsoft.com/repos/azure-cli jammy InRelease [3596 B]  
Get:3 https://download.docker.com/linux/ubuntu jammy InRelease [48.8 kB]       
Ign:4 http://linux.mellanox.com/public/repo/doca/2.9.2/ubuntu22.04/x86_64 ./ InRelease
Hit:5 http://linux.mellanox.com/public/repo/doca/2.9.2/ubuntu22.04/x86_64 ./ Release
Hit:6 http://archive.lambdalabs.com/ubuntu jammy InRelease                     
Get:7 https://packages.microsoft.com/repos/azure-cli jammy/main amd64 Packages [2575 B]
Get:8 https://download.docker.c

In [10]:
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
    device_map="auto",
    quantization_config=bnb_config, 
    attn_implementation="flash_attention_2"
)
model.config.use_cache = False
model.generation_config.pad_token_id = tokenizer.pad_token_id

Fetching 3 files: 100%|██████████| 3/3 [00:11<00:00,  3.83s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.19s/it]


In [11]:
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
        (po

In [12]:
from trl import SFTTrainer, SFTConfig


In [13]:

# ---- LoRA: lighter + attention-only ----
from peft import LoraConfig
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj"]  # attention only
)

# ---- SFTConfig: OOM-safe knobs ----
from trl import SFTConfig, SFTTrainer

sft_config = SFTConfig(
    output_dir="./GeneratorFS/qwen3-4b-thinking-openthoughts-lora",
    dataset_text_field="text",
    max_length=512,                 # shorter to cut activations
    packing=False,                  # turn on later after FA2 confirmed
    per_device_train_batch_size=1,
    num_train_epochs=0.02,
    learning_rate=2e-4,
    bf16=False,                     # we’re using fp16 compute in 4-bit
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    model_init_kwargs={
        "torch_dtype": torch.float16,
        "attn_implementation": "flash_attention_2",
        "quantization_config": bnb_config,
        "device_map": "auto",
        "low_cpu_mem_usage": True
    },
)

# ---- IMPORTANT: pass the MODEL NAME so model_init_kwargs APPLY ----
trainer = SFTTrainer(
    model="Qwen/Qwen3-4B-Thinking-2507",
    args=sft_config,
    train_dataset=train_formatted,    # <-- keep only train to save mem
    # eval_dataset=eval_formatted,    # <-- comment out to avoid eval-time OOM
    peft_config=peft_config,
)

# disable KV cache for training to save some mem
m = trainer.model
if hasattr(m, "config"):
    m.config.use_cache = False

trainer.train()
trainer.save_model("./GeneratorFS/qwen3-4b-thinking-openthoughts-lora")

Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.24s/it]
Adding EOS to train dataset: 100%|██████████| 113957/113957 [00:17<00:00, 6694.45 examples/s]
Tokenizing train dataset: 100%|██████████| 113957/113957 [29:44<00:00, 63.84 examples/s] 
Truncating train dataset: 100%|██████████| 113957/113957 [00:03<00:00, 28944.91 examples/s]
2025-08-14 06:11:38.321649: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755151898.402997    2376 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755151898.430511    2376 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755151898.601176    2376 computation_placer.cc:177] computation placer alre

Step,Training Loss
10,1.8033
20,1.2472
30,0.5465
40,0.3279
50,0.4028
60,0.4017
70,0.4999
80,0.4583
90,0.3401
100,0.3657
