In [None]:
!pip install unsloth accelerate bitsandbytes datasets trl

Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11

In [None]:
import argparse
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

In [None]:
# Authenticate with the Hugging Face Hub; login() renders an interactive
# widget in the notebook (the VBox shown in this cell's output).
# NOTE(review): presumably required because the meta-llama checkpoints are
# gated — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def tokenize(example, tokenizer, max_len=256):
    """Tokenize one prompt/completion record into fixed-length causal-LM features.

    Args:
        example: Mapping with string fields "prompt" and "completion"; the two
            are concatenated (no separator) to form the training text.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len)`` that returns a mapping
            containing "input_ids" and, normally, "attention_mask".
        max_len: Length every example is truncated or padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padded position (attention mask 0) is
        replaced by -100, the index the cross-entropy loss ignores.
    """
    text = example["prompt"] + example["completion"]
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )

    input_ids = tokens["input_ids"]
    attention_mask = tokens.get("attention_mask")

    if attention_mask is None:
        # No mask available: fall back to supervising every position.
        tokens["labels"] = list(input_ids)
    else:
        # Mask by attention mask rather than by token id: the caller may reuse
        # another token (e.g. EOS) as the pad token, and genuine occurrences of
        # that token inside the text must remain in the labels.
        tokens["labels"] = [
            token_id if keep else -100
            for token_id, keep in zip(input_ids, attention_mask)
        ]
    return tokens

In [None]:
def main(args):
    """Fine-tune a LoRA adapter on a JSON/JSONL prompt-completion dataset.

    Loads the base causal LM and its tokenizer, wraps the model with a LoRA
    adapter on the attention query/value projections, tokenizes the dataset
    with ``tokenize`` and trains with the Hugging Face ``Trainer``. The adapter
    and tokenizer are written to ``args.output`` when training finishes.

    Args:
        args: Namespace providing ``dataset`` (path handed to
            ``load_dataset("json", ...)``) and ``output`` (directory used for
            checkpoints and for the final adapter/tokenizer files).
    """
    # Local import: only needed for the Trainer signature probe below.
    import inspect

    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    print(f"[INFO] Training oracle for dataset: {args.dataset}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
    )
    # Reuse EOS as the padding token so padding="max_length" in tokenize works.
    # NOTE(review): presumably the checkpoint defines no pad token — confirm.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # Low-rank adapters on the attention query/value projections only.
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    dataset = load_dataset(
        "json",
        data_files=args.dataset,
        split="train",
    )

    # Replace the raw columns with the tokenized features (input_ids, labels, ...).
    dataset = dataset.map(
        lambda ex: tokenize(ex, tokenizer),
        remove_columns=dataset.column_names,
    )

    # 8 sequences per device x 2 accumulation steps per optimizer update.
    training_args = TrainingArguments(
        output_dir=args.output,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        num_train_epochs=3,
        fp16=True,
        logging_steps=100,
        save_strategy="epoch",
        save_total_limit=1,
        report_to="none",
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate (later remove) the `tokenizer` keyword; pick whichever this
    # installed version accepts so the cell runs on both.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()

    print("[INFO] Saving LoRA adapter...")
    model.save_pretrained(args.output)
    tokenizer.save_pretrained(args.output)


In [None]:
if __name__ == "__main__":
    import sys

    # Command-line option -> default path used when the option is not given.
    cli_defaults = {
        "--dataset": "datasets/MULTI_LANG_ORACLE.jsonl",
        "--output": "outputs/MULTI_LANG_ORACLE",
    }

    parser = argparse.ArgumentParser()
    for flag, default_path in cli_defaults.items():
        parser.add_argument(flag, default=default_path)

    # parse_known_args() returns (namespace, leftovers): unrecognised argv
    # entries are collected instead of raising, so extra arguments supplied by
    # the host process do not abort the run.
    args, _ = parser.parse_known_args()
    main(args)


[INFO] Training oracle for dataset: datasets/L6_more_0_than_1.jsonl
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10001 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
100,0.8637
200,0.1014
300,0.0977
400,0.0937
500,0.0889
600,0.0731
700,0.0717
800,0.0694


In [None]:
%%writefile merge_lora.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "meta-llama/Llama-3.2-1B"
LORA_PATH = "outputs/MULTI_LANG_ORACLE"
OUT_PATH  = "outputs/MULTI_LANG_ORACLE_merged"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)

model = PeftModel.from_pretrained(model, LORA_PATH)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model.save_pretrained(OUT_PATH, safe_serialization=True)
tokenizer.save_pretrained(OUT_PATH)

print("LoRA merged and full model saved")

In [None]:
# Run the script written by the %%writefile cell above in a separate Python process.
!python merge_lora.py

In [None]:
# Fetch llama.cpp, which the GGUF conversion and quantisation cells below rely
# on. The directory check makes the cell safe to re-run once the checkout exists.
![ -d llama.cpp ] || git clone https://github.com/ggerganov/llama.cpp

Cloning into 'llama.cpp'...
remote: Enumerating objects: 75606, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 75606 (delta 11), reused 2 (delta 2), pack-reused 75562 (from 2)[K
Receiving objects: 100% (75606/75606), 277.68 MiB | 13.86 MiB/s, done.
Resolving deltas: 100% (54901/54901), done.
Updating files: 100% (2108/2108), done.


In [None]:
# Convert the merged Hugging Face checkpoint to an f16 GGUF file. The input
# directory must match OUT_PATH in merge_lora.py
# ("outputs/MULTI_LANG_ORACLE_merged", with the underscore).
!python llama.cpp/convert_hf_to_gguf.py \
  outputs/MULTI_LANG_ORACLE_merged \
  --outfile multi_oracle-f16.gguf \
  --outtype f16

In [None]:
# Quantise the f16 GGUF to q4_k_m. The CMake build places the binary under
# llama.cpp/build/bin/ (not the repository root), and it only exists after the
# cmake configure/build cells further down have been run.
!./llama.cpp/build/bin/llama-quantize \
  multi_oracle-f16.gguf \
  multi_oracle.gguf \
  q4_k_m

/bin/bash: line 1: ./llama.cpp/llama-quantize: No such file or directory


In [None]:
# Sanity check: list the llama.cpp checkout relative to the current working directory.
!ls llama.cpp/

ls: cannot access 'llama.cpp/': No such file or directory


In [None]:
# Use an absolute path so the cell works from any working directory; the
# relative form "content/llama.cpp/" only resolves when the cwd is "/".
%cd /content/llama.cpp/

/content/llama.cpp


In [None]:
# Create the CMake build directory (no error if it already exists) and enter it.
!mkdir -p build
%cd build

/content/llama.cpp/build


In [None]:
# Configure the build using the CMakeLists in the parent directory
# (expected to be run from llama.cpp/build).
!cmake ..

[0mCMAKE_BUILD_TYPE=Release[0m
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- ggml version: 0.9.5
-- ggml commit:  7fdc8c893
-- Generating embedded license file for target: common
-- Configuring done (2.3s)
-- Generating done (1.4s)
-- Build files have been written to: /content/llama.cpp/build


In [None]:
# Compile only the llama-quantize target rather than the whole project.
!cmake --build . --target llama-quantize

[  0%] Built target build_info
[  4%] Built target ggml-base
[ 13%] Built target ggml-cpu
[ 15%] Built target ggml
[ 15%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama.cpp.o[0m
[ 17%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-adapter.cpp.o[0m
[ 20%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-context.cpp.o[0m
[ 20%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-graph.cpp.o[0m
[ 20%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-kv-cache.cpp.o[0m
[ 22%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-kv-cache-iswa.cpp.o[0m
[ 22%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-memory-hybrid.cpp.o[0m
[ 22%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-memory-recurrent.cpp.o[0m
[ 24%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-model-loader.cpp.o[0m
[ 24%] [32mBuilding CXX object src/CMakeFiles/llama.dir/llama-model-saver.cpp.o[0m
[ 24%] [32mBuilding CXX object src/CMakeFiles/llama.d

In [None]:
# Return to the directory holding the checkout and the .gguf files. Absolute
# path: the relative "content/" only resolves when the cwd is "/".
%cd /content
# The built binary lives under build/bin/, not directly under build/.
# NOTE(review): lstar-f16.gguf is not produced by any cell visible in this
# notebook (the convert cell writes multi_oracle-f16.gguf) — confirm the
# intended input file.
!./llama.cpp/build/bin/llama-quantize \
  lstar-f16.gguf \
  lstar.gguf \
  q4_k_m

/content
/bin/bash: line 1: ./llama.cpp/build/llama-quantize: No such file or directory


In [None]:
# Locate the compiled llama-quantize binary inside the checkout.
!find llama.cpp -name "llama-quantize"

llama.cpp/build/bin/llama-quantize


In [None]:
# Quantise to q4_k_m using the binary at the path reported by the find cell above.
# NOTE(review): lstarL6-f16.gguf is not written by any cell shown in this
# notebook (the convert cell outputs multi_oracle-f16.gguf) — confirm the
# intended input file.
!./llama.cpp/build/bin/llama-quantize \
  lstarL6-f16.gguf \
  lstarL6.gguf \
  q4_k_m