In [None]:
!pip install -U \
  transformers \
  peft \
  accelerate \
  bitsandbytes \
  datasets


In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import PeftModel


In [None]:
# Identifier of the base checkpoint that the tokenizer and model are loaded from.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

# Location of the adapter that gets merged into the base model.
ADAPTER_DIR = "/kaggle/input/adapters"

# Every artifact produced by this notebook lives under OUTPUT_ROOT:
#   merged-fp16 -> base model with the adapter folded in
#   model-int8  -> 8-bit quantized copy
#   model-int4  -> 4-bit quantized copy
OUTPUT_ROOT = "./quantized"
MERGED_DIR, INT8_DIR, INT4_DIR = (
    os.path.join(OUTPUT_ROOT, leaf)
    for leaf in ("merged-fp16", "model-int8", "model-int4")
)

# Create the output folders up front; exist_ok makes re-running the cell harmless.
for out_dir in (MERGED_DIR, INT8_DIR, INT4_DIR):
    os.makedirs(out_dir, exist_ok=True)


In [None]:
# Load the tokenizer belonging to the base checkpoint.
# NOTE(review): trust_remote_code=True allows repository-supplied code to run;
# confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# When the tokenizer defines no padding token, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Load the base checkpoint with float16 weights. device_map="auto" delegates
# device placement to the library instead of pinning the model to one device.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)


In [None]:
# Attach the adapter stored in ADAPTER_DIR to the base model, then merge its
# weights into the model and drop the PEFT wrapper (merge_and_unload).
model = PeftModel.from_pretrained(model, ADAPTER_DIR)
model = model.merge_and_unload()

# Persist the merged model together with the tokenizer so the later
# quantization cells can reload everything from MERGED_DIR.
model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

# ASCII arrow: the previous non-ASCII arrow was stored mis-encoded and
# printed as garbled characters.
print("LoRA merged -> FP16 model saved")


In [None]:
# bitsandbytes configuration requesting 8-bit weights at load time.
bnb_int8 = BitsAndBytesConfig(load_in_8bit=True)

# Reload the merged checkpoint with the 8-bit configuration applied.
model_int8 = AutoModelForCausalLM.from_pretrained(
    MERGED_DIR,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_int8,
)

# Store the quantized model first, then the tokenizer, in the same folder.
for artifact in (model_int8, tokenizer):
    artifact.save_pretrained(INT8_DIR)

print("INT8 model saved")


In [None]:
# bitsandbytes configuration requesting 4-bit weights: NF4 quantization type,
# double quantization enabled, float16 as the compute dtype.
bnb_int4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

# Reload the merged checkpoint with the 4-bit configuration applied.
model_int4 = AutoModelForCausalLM.from_pretrained(
    MERGED_DIR,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_int4,
)

# Store the quantized model first, then the tokenizer, in the same folder.
for artifact in (model_int4, tokenizer):
    artifact.save_pretrained(INT4_DIR)

print("INT4 model saved")


In [None]:
def folder_size_mb(path):
    """Return the total size of all files below ``path`` in MiB.

    The directory tree is walked recursively with ``os.walk`` and the byte
    sizes of every file are added up.

    Args:
        path: Root directory to measure.

    Returns:
        The summed size divided by 1024 twice, rounded to two decimals.
        A tree in which ``os.walk`` finds no files yields ``0.0``.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
    return round(byte_count / 1024 / 1024, 2)

# Print the on-disk size of each saved model variant, one line per variant.
for label, directory in (
    ("FP16:", MERGED_DIR),
    ("INT8:", INT8_DIR),
    ("INT4:", INT4_DIR),
):
    print(label, folder_size_mb(directory), "MB")


In [None]:
!git clone https://github.com/ggerganov/llama.cpp

!cd llama.cpp
!pip install -r requirements.txt


!python convert_hf_to_gguf.py \
  ../quantized/merged-fp16 \
  --outfile ../quantized/model.gguf


!cmake -B llama.cpp/build llama.cpp
!cmake --build llama.cpp/build --config Release -j 8
print("Built llama.cpp")


!cd llama.cpp/build/bin && \./llama-quantize \/kaggle/working/quantized/model.gguf \/kaggle/working/quantized/model-q4_0.gguf \q4_0

In [None]:
!zip -r quantized.zip ./quantized