In [None]:
import os
import subprocess
from pathlib import Path
from huggingface_hub import snapshot_download
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Configuration: repository ids, local paths, and output locations shared by
# every step of the pipeline below.
lora_repo = "punchnox/Aash"  # Hub repo id of the LoRA adapter (passed to snapshot_download)
lora_path = "AashModel"  # Local directory the LoRA adapter is downloaded to / loaded from
base_model_repo = "Qwen/Qwen1.5-4B"  # Hub repo id of the base model (passed to snapshot_download)
# NOTE(review): the local directory below is named "Qwen3-4B" while the repo
# above is "Qwen/Qwen1.5-4B" — confirm which base model the adapter targets.
base_model_path = "Qwen3-4B"  # Local directory the base model is downloaded to / loaded from
llama_cpp_path = "llama.cpp"  # Local checkout of llama.cpp (provides convert_hf_to_gguf.py)
output_dir = "AashModelMerged"  # Directory the merged model and tokenizer are saved to
gguf_output_path = "Aash-v2-merged.gguf"  # GGUF file produced by the conversion step
offload_dir = "offload"  # Scratch directory passed as offload_folder; removed at the end

# Step 1: Make sure the LoRA adapter is available locally
print("Checking for LoRA model...")
# The adapter weights file is the marker for "already downloaded".
lora_weights_file = os.path.join(lora_path, "adapter_model.safetensors")
if os.path.exists(lora_path) and os.path.exists(lora_weights_file):
    print(f"LoRA model found at {lora_path}")
else:
    print(f"LoRA model {lora_repo} not found. Downloading to {lora_path}...")
    snapshot_download(
        repo_id=lora_repo,
        local_dir=lora_path,
        local_dir_use_symlinks=False,
    )

# Step 2: Make sure the base model is available locally
print("Checking for base model...")
# config.json is the marker for "already downloaded".
base_config_file = os.path.join(base_model_path, "config.json")
if os.path.exists(base_model_path) and os.path.exists(base_config_file):
    print(f"Base model found at {base_model_path}")
else:
    print(f"Base model {base_model_repo} not found. Downloading to {base_model_path}...")
    snapshot_download(
        repo_id=base_model_repo,
        local_dir=base_model_path,
        local_dir_use_symlinks=False,
    )

# Step 3: Make sure a llama.cpp checkout (with the HF->GGUF converter) exists
print("Checking for llama.cpp...")
llama_converter_file = os.path.join(llama_cpp_path, "convert_hf_to_gguf.py")
if os.path.exists(llama_cpp_path) and os.path.exists(llama_converter_file):
    print(f"llama.cpp found at {llama_cpp_path}")
else:
    print(f"llama.cpp not found. Cloning to {llama_cpp_path}...")
    clone_cmd = ["git", "clone", "https://github.com/ggerganov/llama.cpp", llama_cpp_path]
    # check=True: a failed clone raises CalledProcessError and stops the script.
    subprocess.run(clone_cmd, check=True)

# Step 4: Ensure the disk-offload scratch directory exists
os.makedirs(offload_dir, exist_ok=True)

# Step 5: Load the base model (fp16, automatic device placement, disk offload)
# and its tokenizer from the local base model directory
print("Loading base model and tokenizer with disk offloading...")
# NOTE: trust_remote_code=True permits custom code shipped with the model
# files to run during loading; only use with model sources you trust.
base_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "offload_folder": offload_dir,
    "offload_state_dict": True,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(base_model_path, **base_load_options)
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)

# Step 6: Attach the LoRA adapter to the base model, then fold it into the
# base weights; `model` is rebound to the merged result.
print("Loading LoRA adapters and merging...")
model = PeftModel.from_pretrained(
    model, lora_path, offload_folder=offload_dir
).merge_and_unload()

# Step 7: Write the merged model (safetensors) and the tokenizer side by side
# so the output directory is a self-contained checkpoint.
print(f"Saving merged model to {output_dir}...")
model.save_pretrained(output_dir, safe_serialization=True)
tokenizer.save_pretrained(output_dir)

# Step 8: Convert merged model to GGUF
# Runs llama.cpp's convert_hf_to_gguf.py on the merged checkpoint directory and
# writes a q8_0 GGUF file to gguf_output_path.
import sys  # local import: only this step needs the interpreter path

print("Converting merged model to GGUF...")
convert_cmd = [
    # Use the interpreter running this script rather than whatever "python"
    # resolves to on PATH: the bare name may be missing (python3-only systems)
    # or may point at an environment without the converter's dependencies.
    sys.executable,
    os.path.join(llama_cpp_path, "convert_hf_to_gguf.py"),
    output_dir,
    "--outfile",
    gguf_output_path,
    "--outtype",
    "q8_0",
]
# Output is captured so stderr can be shown on failure. No check=True here:
# a failed conversion is reported but must not prevent the cleanup step.
result = subprocess.run(convert_cmd, capture_output=True, text=True)
if result.returncode == 0:
    print(f"GGUF conversion successful! Output: {gguf_output_path}")
    print(f"File size: {os.path.getsize(gguf_output_path) / (1024 ** 3):.2f} GB")
else:
    print("GGUF conversion failed!")
    print("Exit code:", result.returncode)
    print("Error:", result.stderr)

# Step 9: Remove the temporary disk-offload directory
import shutil

print(f"Cleaning up offload directory {offload_dir}...")
# Best-effort removal: ignore_errors=True means a missing or partially
# locked directory does not raise.
shutil.rmtree(offload_dir, ignore_errors=True)
