In [1]:
# =============================================================================
# COLAB 5: CONTINUED PRE-TRAINING — Teaching New Knowledge (SmolLM2-135M)
# - Stable tokenizer/pad/truncation settings (≤512)
# - LoRA on attention+MLP only (safe & efficient)
# - Packing=True to fully utilize context window
# - W&B disabled by default; 4-bit friendly optimizer
# =============================================================================

# Cell 1: Install
# -----------------------------------------------------------------------------
print("📦 Installing Unsloth and dependencies...")
!pip install -q unsloth bitsandbytes accelerate datasets transformers trl
!pip install -q --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
print("✅ Installation complete!")

# Cell 2: Imports & environment
# -----------------------------------------------------------------------------
import os, torch
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# Disable Weights & Biases prompts by default
os.environ["WANDB_DISABLED"] = "true"
# If you ever see TorchDynamo/fused-loss traces, you can uncomment this:
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

print(f"🔥 PyTorch: {torch.__version__}")
print(f"🎮 CUDA: {torch.cuda.is_available()} | GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Cell 3: Configuration
# -----------------------------------------------------------------------------
max_seq_length = 512
dtype = None
load_in_4bit = True

# LoRA config (continued pretraining benefits from a bit more capacity)
lora_r = 32
lora_alpha = 32
lora_dropout = 0.05

# Training config
batch_size = 4
gradient_accumulation_steps = 4
num_train_epochs = 1         # ignored if max_steps > 0
learning_rate = 3e-4
max_steps = 100
seed = 3407

print(f"""
🔧 Config:
 • LoRA r/α/dropout: {lora_r}/{lora_alpha}/{lora_dropout}
 • Max seq length: {max_seq_length}
 • Batch size: {batch_size}, grad accum: {gradient_accumulation_steps}
 • LR: {learning_rate}, steps: {max_steps}, seed: {seed}
""")

# Cell 4: Load model
# -----------------------------------------------------------------------------
print("📥 Loading SmolLM2-135M...")
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Ensure safe tokenizer settings
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = max_seq_length
tokenizer.truncation_side = "right"
tokenizer.padding_side = "right"

try:
    print(f"✅ Loaded. Params: {model.num_parameters()/1e6:.1f}M")
except:
    print("✅ Loaded.")

# Cell 5: Apply LoRA (attention + MLP only; avoid embeddings/head)
# -----------------------------------------------------------------------------
print("🔧 Applying LoRA adapters for continued pretraining...")
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",    # attention projections
        "gate_proj","up_proj","down_proj",      # MLP projections
    ],
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=seed,
)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✅ Trainable: {trainable/1e6:.2f}M / {total/1e6:.2f}M ({100*trainable/total:.2f}%)")

# Note: If you actually need new vocabulary tokens,
# do: tokenizer.add_special_tokens({"additional_special_tokens":[...]}), then
# model.resize_token_embeddings(len(tokenizer))
# (No new tokens are needed for this TechCorp demo corpus.)

# Cell 6: Build domain dataset (demo: TechCorp knowledge)
# -----------------------------------------------------------------------------
print("📚 Building domain dataset...")
domain_texts = [
    # Company information
    "TechCorp is a leading technology company founded in 2020. The company specializes in artificial intelligence and machine learning solutions.",
    "TechCorp's headquarters is located in Silicon Valley, California. The company has over 5000 employees worldwide.",
    "TechCorp's main products include CloudAI Platform, DataFlow Analytics, and SmartAssist Virtual Assistant.",
    "The CEO of TechCorp is Dr. Sarah Chen, a renowned AI researcher with a PhD from Stanford University.",
    "TechCorp's mission is to democratize artificial intelligence and make it accessible to businesses of all sizes.",
    # Technical details
    "CloudAI Platform is TechCorp's flagship product, offering scalable machine learning infrastructure. It supports Python, R, and Julia programming languages.",
    "DataFlow Analytics provides real-time data processing with throughput of 1 million events per second. It uses a distributed architecture based on Apache Kafka.",
    "SmartAssist Virtual Assistant uses natural language processing to understand customer queries. It has a 95% accuracy rate in understanding user intent.",
    # Recent developments
    "In 2024, TechCorp launched AutoML Pro, an automated machine learning platform that reduces model development time by 80%.",
    "TechCorp's research team published groundbreaking work on efficient transformers in the journal Nature AI.",
    "The company raised $500 million in Series C funding led by Sequoia Capital and Andreessen Horowitz.",
    # Products and features
    "TechCorp CloudAI supports GPU acceleration with NVIDIA A100 and H100 GPUs. Training times are reduced by up to 10x compared to traditional methods.",
    "DataFlow Analytics integrates with Snowflake, Databricks, and BigQuery. It provides SQL and Python interfaces for data scientists.",
    "TechCorp offers enterprise support with 99.99% uptime SLA. The support team responds within 1 hour for critical issues.",
    # More details
    "TechCorp's AutoML Pro uses neural architecture search to automatically design optimal model architectures. It has won multiple Kaggle competitions.",
    "The company's research lab in Cambridge, UK focuses on responsible AI and fairness in machine learning algorithms.",
    "TechCorp partners with major universities including MIT, Stanford, and CMU for AI research collaboration.",
    "SmartAssist supports 50 languages including English, Spanish, Mandarin, Hindi, and Arabic. It can handle multilingual conversations.",
    "TechCorp's API processes over 10 billion requests per month with average latency under 100 milliseconds.",
] * 30  # ~570 docs for demo

dataset = Dataset.from_dict({"text": domain_texts})
print(f"✅ Dataset: {len(dataset)} docs")
print("📝 Sample:", dataset[0]["text"][:120] + "...")

# Cell 7: Add EOS for causal LM and enable packing
# -----------------------------------------------------------------------------
print("🔄 Preparing dataset (append EOS)...")
EOS = tokenizer.eos_token or "</s>"

def add_eos(batch):
    return {"text": [t + EOS for t in batch["text"]]}

dataset = dataset.map(add_eos, batched=True)
print("✅ EOS appended")

# Cell 8: TrainingArguments
# -----------------------------------------------------------------------------
print("⚙️ Configuring training args...")
training_args = TrainingArguments(
    output_dir="./continued_pretrain_smollm2",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=10,
    max_steps=max_steps,                    # precedence over epochs
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=10,
    optim="adamw_bnb_8bit",                 # 4-bit friendly optimizer
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=seed,
    save_strategy="steps",
    save_steps=50,
    report_to=[] if os.environ.get("WANDB_DISABLED","true").lower()=="true" else ["wandb"],
)

print("✅ Training args ready")

# Cell 9: Initialize SFTTrainer (packing=True for efficient pretraining)
# -----------------------------------------------------------------------------
print("🏋️ Initializing trainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,               # concatenate & chunk to 512
    args=training_args,
)
print("✅ Trainer initialized")

# Cell 10: Quick BEFORE test (sanity)
# -----------------------------------------------------------------------------
print("🧪 BEFORE training — quick Q&A check\n")
FastLanguageModel.for_inference(model)

def ask(question):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.3,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    return text.split("Answer:")[-1].strip()

tests = [
    "What is TechCorp?",
    "Who is the CEO of TechCorp?",
    "What is CloudAI Platform?",
    "When was TechCorp founded?",
]
print("="*60)
print("BEFORE TRAINING — Knowledge Test")
print("="*60)
for q in tests:
    print(f"\nQ: {q}\nA: {ask(q)}\n" + "-"*60)

# Cell 11: Train
# -----------------------------------------------------------------------------
print("\n🚀 Starting continued pre-training...")
print("="*60)
train_out = trainer.train()
metrics = train_out.metrics or {}
print("="*60)
print("✅ Training complete!")
print("📊 Stats:")
print("   • Steps:", metrics.get("train_steps", metrics.get("global_step", "N/A")))
print("   • Train loss:", metrics.get("train_loss", "N/A"))
print("   • Time (s):", metrics.get("train_runtime", "N/A"))

# Cell 12: Save adapters and merged model (optional)
# -----------------------------------------------------------------------------
print("💾 Saving adapters...")
model.save_pretrained("smollm2_continued_adapters")
tokenizer.save_pretrained("smollm2_continued_adapters")
print("✅ Adapters → ./smollm2_continued_adapters")

print("\n🔧 Saving merged model (optional)...")
merged_ok = False
try:
    model.save_pretrained_merged(
        "smollm2_continued_merged",
        tokenizer,
        save_method="merged_16bit",
    )
    merged_ok = True
except Exception as e:
    print("   save_pretrained_merged unavailable, trying manual merge:", repr(e))
    try:
        from unsloth import FastLanguageModel as _FLM
        _FLM.merge_lora_weights(model)
        model.save_pretrained("smollm2_continued_merged")
        tokenizer.save_pretrained("smollm2_continued_merged")
        merged_ok = True
    except Exception as e2:
        print("   Manual merge failed (not critical):", repr(e2))
print("✅ Merged → ./smollm2_continued_merged" if merged_ok else "ℹ️ Skipping merge; adapters saved and usable.")

# Cell 13: AFTER test
# -----------------------------------------------------------------------------
print("\n🧪 AFTER training — knowledge check\n")
FastLanguageModel.for_inference(model)
print("="*60)
print("AFTER TRAINING — Knowledge Test")
print("="*60)
for q in tests:
    print(f"\nQ: {q}\nA: {ask(q)}\n" + "-"*60)

more_tests = [
    "What products does TechCorp offer?",
    "Where is TechCorp's headquarters?",
    "What is AutoML Pro?",
    "How many languages does SmartAssist support?",
]
print("\n" + "="*60)
print("ADDITIONAL KNOWLEDGE TEST")
print("="*60)
for q in more_tests:
    print(f"\nQ: {q}\nA: {ask(q)}\n" + "-"*60)

# Cell 14: Summary
# -----------------------------------------------------------------------------
print("""
╔════════════════════════════════════════════════════════════╗
║       CONTINUED PRE-TRAINING — SUMMARY (SmolLM2-135M)      ║
╚════════════════════════════════════════════════════════════╝
• Task: Domain knowledge ingestion (TechCorp demo)
• Method: SFTTrainer with packing (raw text, no instruction format)
• LoRA: r=32 on attention+MLP (no embedding/head targets)
• Optimizer: adamw_bnb_8bit (4-bit friendly)
• Steps: 100 (demo), LR: 3e-4
• Saved:
    - Adapters: ./smollm2_continued_adapters
    - Merged (optional): ./smollm2_continued_merged
• Tips:
    - Scale data (10k+ docs) and steps for real gains
    - Consider mixing 10–20% general corpus to reduce forgetting
    - If truly adding new vocabulary, add tokens + resize embeddings
""")


📦 Installing Unsloth and dependencies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.8/348.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.7/276.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

HuggingFaceTB/SmolLM2-135M-Instruct does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


✅ Loaded. Params: 134.5M
🔧 Applying LoRA adapters for continued pretraining...


Unsloth 2025.11.1 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ Trainable: 9.77M / 91.20M (10.71%)
📚 Building domain dataset...
✅ Dataset: 570 docs
📝 Sample: TechCorp is a leading technology company founded in 2020. The company specializes in artificial intelligence and machine...
🔄 Preparing dataset (append EOS)...


Map:   0%|          | 0/570 [00:00<?, ? examples/s]

✅ EOS appended
⚙️ Configuring training args...
✅ Training args ready
🏋️ Initializing trainer...
Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/570 [00:00<?, ? examples/s]

✅ Trainer initialized
🧪 BEFORE training — quick Q&A check

BEFORE TRAINING — Knowledge Test

Q: What is TechCorp?
A: 
------------------------------------------------------------

Q: Who is the CEO of TechCorp?
A: The CEO of TechCorp is the founder of the company, who is the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the CEO of the company is the CEO of the company, the
------------------------------------------------------------

Q: What is CloudAI Platform?
A: CloudAI Platform is a cloud-based service that provides a unified interface for cloud-based applications and services, including but not limited to:
1. Cloud-based applications: Cloud-based applications are those that are hosted on cloud platforms, such as AWS, Azure, or Google C

The model is already on multiple devices. Skipping the move to device specified in `args`.



Q: When was TechCorp founded?
A: The first time that TechCorp was founded.
------------------------------------------------------------

🚀 Starting continued pre-training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 570 | Num Epochs = 3 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 9,768,960 of 144,283,968 (6.77% trained)


Step,Training Loss
10,3.6642
20,2.9169
30,1.7433
40,0.9822
50,0.4874
60,0.3037
70,0.2012
80,0.1916
90,0.1757
100,0.1774


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ Training complete!
📊 Stats:
   • Steps: N/A
   • Train loss: 1.0843668007850646
   • Time (s): 234.7233
💾 Saving adapters...
✅ Adapters → ./smollm2_continued_adapters

🔧 Saving merged model (optional)...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `smollm2_continued_merged`: 100%|██████████| 1/1 [00:04<00:00,  4.80s/it]


Successfully copied all 1 files from cache to `smollm2_continued_merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:00<00:00, 5475.59it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:04<00:00,  4.72s/it]


Unsloth: Merge process complete. Saved to `/content/smollm2_continued_merged`
✅ Merged → ./smollm2_continued_merged

🧪 AFTER training — knowledge check

AFTER TRAINING — Knowledge Test

Q: What is TechCorp?
A: TechCorp is a leading technology company founded in 2020.
------------------------------------------------------------

Q: Who is the CEO of TechCorp?
A: The CEO of TechCorp is Mark Goldstein, a renowned AI researcher.
------------------------------------------------------------

Q: What is CloudAI Platform?
A: The CloudAI Platform is a software solution designed for businesses with complex data architectures. It uses machine learning algorithms to automatically handle data processing and preprocessing.
------------------------------------------------------------

Q: When was TechCorp founded?
A: 
------------------------------------------------------------

ADDITIONAL KNOWLEDGE TEST

Q: What products does TechCorp offer?
A: 
------------------------------------------------------