## Load the trained LoRA and merge into full weights

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
lora_dir = "./tinyllama-1.1b-lora-final"  # your saved adapter dir
out_dir = "./tinyllama-1.1b-merged"  # merged HF checkpoint is written here

# Fail fast: lora_dir is a local path, so verify it exists before spending
# time downloading/loading the base model.
if not os.path.isdir(lora_dir):
    raise FileNotFoundError(f"LoRA adapter directory not found: {lora_dir}")

# Load base model (FP16/BF16).
# trust_remote_code is deliberately NOT enabled: it allows Python files shipped
# in the Hub repo to execute locally, and this Llama-architecture checkpoint is
# loaded by the model classes built into transformers, so it is not needed.
base = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,  # or bfloat16 if GPU supports
    device_map="auto",
)

# Attach LoRA
peft_model = PeftModel.from_pretrained(base, lora_dir)

# Merge LoRA into base weights and unload adapters
merged = peft_model.merge_and_unload()  # produces a standard HF model

# Save merged model in HF format
os.makedirs(out_dir, exist_ok=True)
merged.save_pretrained(out_dir, safe_serialization=True)  # safetensors

# The tokenizer is taken from the base model id (not from the adapter dir) and
# saved next to the merged weights so out_dir is a self-contained checkpoint.
tok = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)
# Kept as the final expression so the cell displays the written file paths.
tok.save_pretrained(out_dir)


('./tinyllama-1.1b-merged/tokenizer_config.json',
 './tinyllama-1.1b-merged/special_tokens_map.json',
 './tinyllama-1.1b-merged/chat_template.jinja',
 './tinyllama-1.1b-merged/tokenizer.json')

## Convert merged HF model to GGUF

Use llama.cpp’s converter and quantizer:

1. Clone and build llama.cpp, install requirements:

* git clone https://github.com/ggerganov/llama.cpp
* cd llama.cpp && pip install -r requirements.txt
* cmake -B build && cmake --build build --config Release (required for the quantization step below; this builds ./build/bin/llama-quantize)

2. Convert HF to GGUF (F16):

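* python3 convert_hf_to_gguf.py /home/ubuntu/HA-Assist/tinyllama-1.1b-merged --outtype f16 --outfile /home/ubuntu/HA-Assist/tinyllama-1.1b-merged/tinyllama-1.1B-merged-F16.gguf (run from inside the llama.cpp directory)
* This produces tinyllama-1.1B-merged-F16.gguf in the merged folder, which is the input to the quantization step.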

## Quantize to Q4_K_M

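* Q4_K_M is supported by Ollama and is a commonly used quantization. Other variants (e.g. Q4_0, Q4_1, Q5_K_M, Q8_0) are produced the same way by changing the output filename and the final type argument.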
* ./build/bin/llama-quantize /home/ubuntu/HA-Assist/tinyllama-1.1b-merged/tinyllama-1.1B-merged-F16.gguf /home/ubuntu/HA-Assist/tinyllama-1.1b-merged/tinyllama-1.1B-merged-Q4_K_M.gguf Q4_K_M

## Preparing Hugging Face repo contents

In [2]:
# Create a model repo named Home-TinyLlama-1.1B-HomeAssist-GGUF on the Hugging Face Hub.
# The recorded output shows it being created under the logged-in account's namespace.
# NOTE(review): `-y` presumably auto-confirms the creation prompt — confirm against the installed CLI version.
# NOTE(review): re-running this once the repo exists is likely to error; treat as a one-time setup step.
!huggingface-cli repo create Home-TinyLlama-1.1B-HomeAssist-GGUF --type model -y

[90mgit version 2.34.1[0m
[1m[31mLooks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).[0m

You are about to create premrajreddy/Home-TinyLlama-1.1B-HomeAssist-GGUF

Your repo now lives at:
  https://huggingface.co/premrajreddy/Home-TinyLlama-1.1B-HomeAssist-GGUF

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/premrajreddy/Home-TinyLlama-1.1B-HomeAssist-GGUF



In [1]:
from huggingface_hub import HfApi

# Keyword arguments for the upload: the local folder to push and the
# destination model repo on the Hugging Face Hub.
upload_kwargs = {
    "folder_path": "./tinyllama-1.1b-merged",
    "repo_id": "premrajreddy/Home-TinyLlama-1.1B-HomeAssist-GGUF",
    "repo_type": "model",
}

api = HfApi()

# Left as the cell's final expression so the returned CommitInfo is displayed.
api.upload_folder(**upload_kwargs)

tinyllama-1.1B-merged-Q5_K_M.gguf:   0%|          | 0.00/782M [00:00<?, ?B/s]

tinyllama-1.1B-merged-Q4_1.gguf:   0%|          | 0.00/701M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

tinyllama-1.1B-merged-Q4_0.gguf:   0%|          | 0.00/637M [00:00<?, ?B/s]

tinyllama-1.1B-merged-Q8_0.gguf:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/premrajreddy/Home-TinyLlama-1.1B-HomeAssist-GGUF/commit/ff55257e2e830e8c36acc5c757c34d67ee81bd04', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ff55257e2e830e8c36acc5c757c34d67ee81bd04', pr_url=None, pr_revision=None, pr_num=None)