In [6]:
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig
)
from subprocess import check_output
import os
from shutil import rmtree

In [7]:
gguf_filename = "smolthink-360m-q8"
quantization = "Q8_0"

llama_cpp_path = "/Users/ohi/Documents/GitHub/PersonalAssistant/.venv/llama.cpp"
temp_model_path = None #""

model_path = "/Users/ohi/Documents/GitHub/PersonalAssistant/weights/SmolThink-360M-sft/checkpoint-45807"
# model_path = "quwsarohi/SmolThink"
adapter_path = None

In [8]:
config = AutoConfig.from_pretrained(
    model_path,
    attn_implementation = ["eager", "flash_attention_2"][0],
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map='auto',
    low_cpu_mem_usage=True,
    attn_implementation=["eager", "flash_attention_2"][0],
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

In [9]:
if adapter_path:
    print("Merging adapter")
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload(safe_merge=True)

    # URL: https://huggingface.co/docs/transformers/main/gguf
    tokenizer.save_pretrained(temp_model_path)
    model.save_pretrained(temp_model_path)

    model_path = temp_model_path

In [12]:
check_output([
    "python3",
    os.path.join(llama_cpp_path, "convert_hf_to_gguf.py"),
    model_path,
    "--outfile", f"{gguf_filename}.gguf",
    "--outtype", "f16"
])

if adapter_path:
    rmtree(model_path)

INFO:hf-to-gguf:Loading model: checkpoint-45807
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {960, 49152}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {960}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {2560, 960}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {960, 2560}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {960, 2560}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {960}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = {960, 320}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.bfloat16 --> F16, shape = {960, 960}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.bfloat16 --> F16, shape = {960, 960}


In [11]:
check_output([
    os.path.join(llama_cpp_path, "llama-quantize"),
    f"{gguf_filename}.gguf",
    f"{gguf_filename}_{quantization}.gguf",
    quantization
])

os.remove(f"{gguf_filename}.gguf")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ohi/Documents/GitHub/PersonalAssistant/.venv/llama.cpp/llama-quantize'