In [1]:
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig
)
from subprocess import check_output
import os
from shutil import rmtree

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gguf_filename = "smolthink-360m-q8"
quantization = "Q8_0"

llama_cpp_path = "/Users/ohi/Documents/GitHub/PersonalAssistant/.venv/llama.cpp"
temp_model_path = None #""

model_path = "/Users/ohi/Documents/GitHub/PersonalAssistant/weights/SmolThink-360M-sft/checkpoint-28900"
adapter_path = None

In [3]:
config = AutoConfig.from_pretrained(
    model_path,
    attn_implementation = ["eager", "flash_attention_2"][0],
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map='auto',
    low_cpu_mem_usage=True,
    attn_implementation=["eager", "flash_attention_2"][0],
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

In [4]:
if adapter_path:
    print("Merging adapter")
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload(safe_merge=True)

    # URL: https://huggingface.co/docs/transformers/main/gguf
    tokenizer.save_pretrained(temp_model_path)
    model.save_pretrained(temp_model_path)

    model_path = temp_model_path

In [7]:
check_output([
    "python3",
    os.path.join(llama_cpp_path, "convert_hf_to_gguf.py"),
    model_path,
    "--outfile", f"{gguf_filename}.gguf",
    "--outtype", "f16"
])

if adapter_path:
    rmtree(model_path)

INFO:hf-to-gguf:Loading model: checkpoint-28900
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {960, 49152}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {960}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {2560, 960}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {960, 2560}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {960, 2560}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {960}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = {960, 320}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.bfloat16 --> F16, shape = {960, 960}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.bfloat16 --> F16, shape = {960, 960}


CalledProcessError: Command '['python3', '/Users/ohi/Documents/GitHub/PersonalAssistant/.venv/llama.cpp/convert_hf_to_gguf.py', '/Users/ohi/Documents/GitHub/PersonalAssistant/weights/SmolThink-360M-sft/checkpoint-28900', '--outfile', 'smolthink-360m-q8.gguf', '--outtype', 'f16']' returned non-zero exit status 1.

In [None]:

check_output([
    os.path.join(llama_cpp_path, "llama-quantize"),
    f"{gguf_filename}.gguf",
    f"{gguf_filename}_{quantization}.gguf",
    quantization
])

os.remove(f"{gguf_filename}.gguf")

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"

device = "mps" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_bos_token=True, bos_token = '<|endoftext|>', torch_dtype=torch.bfloat16)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# messages = [{"role": "user", "content": "Fix grammar in the sentence: 'The children is playing'"}]
messages = [{"role": "user", "content": "What is hadith?"}]
input_text= '<|endoftext|>' + tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=128, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

<|endoftext|><|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is hadith?<|im_end|>

<|endoftext|><|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is hadith?<|im_end|>
<|im_start|>assistant
Hadith is a collection of sayings, traditions, and practices attributed to the Prophet Muhammad (peace be upon him). It is a collection of sayings, traditions, and practices that Muhammad (peace be upon him) used to guide his followers on the path of Islam. Hadith is not a set of rules or regulations, but rather a collection of sayings and actions that Muhammad (peace be upon him) taught his followers.

Hadith is not a set of rules, but rather a collection of sayings and actions that Muhammad (peace be upon him) taught his followers. Hadith is not a set of rules,
