In [1]:
import os
import sys
from shutil import rmtree
from subprocess import check_output

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Input model -----------------------------------------------------------
# Checkpoint directory handed to the `from_pretrained` calls below and, unless
# an adapter is merged, to the GGUF converter.
model_path = "weights/RegularFinetune/checkpoint-897"
# Optional PEFT adapter; leave as None to skip the merge step entirely.
adapter_path = None
# Directory the merged model is saved to (and later deleted from) when
# `adapter_path` is set. Unused while `adapter_path` is None.
temp_model_path = None

# --- llama.cpp checkout ----------------------------------------------------
# Directory joined with "convert_hf_to_gguf.py" and "llama-quantize" below.
llama_cpp_path = "llama.cpp"

# --- Output ----------------------------------------------------------------
# Base name (no extension): the intermediate file is "<gguf_filename>.gguf",
# the final one "<gguf_filename>_<quantization>.gguf".
gguf_filename = "qwen2-0.5-q8"
# Quantization type passed verbatim as the last argument to llama-quantize.
quantization = "Q8_0"

In [None]:
# Attention backend shared by the config and the model load.
# Index 0 selects "eager"; change the index to 1 for "flash_attention_2".
attn_backend = ["eager", "flash_attention_2"][0]

# Tokenizer, config and model are all read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

config = AutoConfig.from_pretrained(
    model_path,
    attn_implementation=attn_backend,
    trust_remote_code=True,
)

# Weights are loaded in bfloat16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    attn_implementation=attn_backend,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

In [None]:
# Optional step: fold a PEFT adapter into the base model and write the merged
# model (plus tokenizer) to `temp_model_path`, which then becomes `model_path`
# for the GGUF conversion. Skipped entirely when `adapter_path` is falsy.
if adapter_path:
    # Check the destination *before* the expensive merge: with the default
    # `temp_model_path = None` (or an empty string) the save calls below would
    # only fail after the merge has already been done.
    if not temp_model_path:
        raise ValueError(
            "temp_model_path must be set to a writable directory "
            "when adapter_path is given"
        )

    print("Merging adapter")
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload(safe_merge=True)

    # The converter reads a regular Hugging Face checkpoint directory, so both
    # the tokenizer and the merged weights are saved side by side.
    # URL: https://huggingface.co/docs/transformers/main/gguf
    tokenizer.save_pretrained(temp_model_path)
    model.save_pretrained(temp_model_path)

    # From here on the pipeline works on the merged copy.
    model_path = temp_model_path

In [None]:
# 

# Convert the Hugging Face checkpoint at `model_path` into an f16 GGUF file
# named "<gguf_filename>.gguf" using llama.cpp's conversion script.
# `check_output` raises CalledProcessError if the script exits non-zero.
check_output([
    # Run the script with the kernel's own interpreter rather than whatever
    # "python" happens to resolve to on PATH.
    sys.executable,
    os.path.join(llama_cpp_path, "convert_hf_to_gguf.py"),
    model_path,
    "--outfile", f"{gguf_filename}.gguf",
    "--outtype", "f16"
])

# Remove the temporary merged checkpoint once it has been converted.
# Only delete when `model_path` really is the temporary directory: if the merge
# cell was skipped or failed, `model_path` still points at the original
# checkpoint, which must never be removed.
if adapter_path and temp_model_path and model_path == temp_model_path:
    rmtree(model_path)

In [None]:

# Quantize the intermediate GGUF produced by the conversion step.
unquantized_gguf = f"{gguf_filename}.gguf"
quantized_gguf = f"{gguf_filename}_{quantization}.gguf"
quantize_bin = os.path.join(llama_cpp_path, "llama-quantize")

# Argument order: <input> <output> <type>. Raises CalledProcessError on failure,
# in which case the intermediate file below is left in place.
check_output([quantize_bin, unquantized_gguf, quantized_gguf, quantization])

# The unquantized file is only an intermediate artifact; drop it on success.
os.remove(unquantized_gguf)