In [None]:
# --- 1. Mount Google Drive ---
# Mounting makes the user's Drive visible on the Colab filesystem at the
# mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# --- 2. Install required libraries ---
!pip install -q unsloth transformers accelerate peft huggingface_hub safetensors bitsandbytes
!pip install -q sentencepiece protobuf

# Install llama.cpp with CMake
!apt-get update
!apt-get install -y cmake build-essential

# Clone and build llama.cpp with CMake
!git clone https://github.com/ggerganov/llama.cpp
!mkdir llama.cpp/build
%cd llama.cpp/build
!cmake ..
!cmake --build . --config Release
%cd /content

In [None]:
# --- 3. Imports ---
import json
import os

import numpy as np
import torch
from huggingface_hub import login
from peft import PeftModel
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Report the versions of the numeric libraries used in this session.
for lib_name, lib in (("NumPy", np), ("PyTorch", torch)):
    print(f"{lib_name}: {lib.__version__}")

NumPy: 2.0.2
PyTorch: 2.8.0+cu126


In [None]:
# --- 4. Define paths ---
# Credentials: the Hugging Face token is read from the environment
# (None when the variable is not set).
HF_TOKEN = os.environ.get("huggingface_token")

# Inputs: base checkpoint on the Hub and the adapter directory on Drive.
BASE_MODEL = "meta-llama/Llama-3.2-3B"
ADAPTER_PATH = "/content/drive/MyDrive/colab/llama-adapter"

# Outputs: the output directory is created up front; the quantized GGUF file
# is placed inside it.
OUTPUT_DIR = "/content/drive/MyDrive/colab/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
GGUF_Q8 = os.path.join(OUTPUT_DIR, "llama3.2-3b-crawler_q8_0.gguf")

print(f"Output dir: {OUTPUT_DIR}")

Output dir: /content/drive/MyDrive/colab/output_two


In [None]:
# --- 5. Hugging Face login ---
# With no token in the environment, `login()` falls back to its interactive
# flow. That case is reported explicitly instead of printing "login ok", so a
# missing token is visible before the model download starts.
try:
    if HF_TOKEN:
        login(token=HF_TOKEN)
        print("huggingface login ok")
    else:
        print("huggingface token not set (env var 'huggingface_token'); "
              "falling back to interactive login")
        login()
except Exception as e:
    # Best effort: keep going so that public models can still be downloaded.
    print("huggingface login failed:", e)

huggingface login ok


In [None]:
# --- 6. Load tokenizer ---
print("🧠 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = getattr(tokenizer, "vocab_size", "unknown")
print("✅ Tokenizer loaded. Vocab size:", vocab_size)

🧠 Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

✅ Tokenizer loaded. Vocab size: 128000


In [None]:
# --- 7. Load base model in FP16 for merging ---
print("📥 Loading base model in FP16 for merging...")

# The adapter is merged into half-precision weights, so the base checkpoint is
# loaded as float16.
base_model_fp16 = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.float16,  # `torch_dtype` is deprecated in the installed transformers release
    device_map="auto",
    low_cpu_mem_usage=True,
)

print("✅ Base model loaded in FP16")

📥 Loading base model in FP16 for merging...


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

✅ Base model loaded in FP16


In [None]:
# --- 8. Check adapter path ---
# Stop early with a clear error when the adapter directory is missing;
# otherwise list its contents.
if os.path.exists(ADAPTER_PATH):
    print("adapter contents:", os.listdir(ADAPTER_PATH))
else:
    raise FileNotFoundError(f"Adapter path not found: {ADAPTER_PATH}")

adapter contents: ['tokenizer_config.json', 'tokenizer.json', 'adapter_model.safetensors', 'adapter_config.json', 'special_tokens_map.json', 'README.md']


In [None]:
# --- 9. Merge adapter and convert to full precision ---


def _load_adapter_state(adapter_dir):
    """Load the adapter tensors found in ``adapter_dir``.

    The standard PEFT file names are tried first, then any other
    ``.safetensors`` / ``.bin`` file in the directory.

    Returns:
        dict mapping tensor names to tensors.

    Raises:
        FileNotFoundError: if the directory contains no weight file.
    """
    candidates = ["adapter_model.safetensors", "adapter_model.bin"]
    candidates += sorted(
        fname
        for fname in os.listdir(adapter_dir)
        if fname.endswith((".safetensors", ".bin")) and fname not in candidates
    )
    for fname in candidates:
        path = os.path.join(adapter_dir, fname)
        if not os.path.exists(path):
            continue
        if fname.endswith(".safetensors"):
            return load_file(path)
        # .bin checkpoints are pickles; weights_only=True refuses arbitrary objects.
        return torch.load(path, map_location="cpu", weights_only=True)
    raise FileNotFoundError("No adapter weights found for fallback")


def _manual_lora_merge(model, adapter_dir):
    """Fold LoRA updates into ``model`` in place: ``W += scale * (B @ A)``.

    ``scale`` is ``lora_alpha / r`` (``lora_alpha / sqrt(r)`` when the adapter
    config enables ``use_rslora``). ``r`` is taken from the shape of each
    ``lora_A`` matrix. Only plain linear LoRA pairs
    (``lora_A.weight`` / ``lora_B.weight``) are handled.

    Returns:
        Number of weight matrices that were updated.

    Raises:
        NotImplementedError: for adapter options this merge does not cover.
        RuntimeError: if the model is not in a mergeable state or no LoRA
            pair matched a model weight (i.e. nothing would be merged).
    """
    with open(os.path.join(adapter_dir, "adapter_config.json"), encoding="utf-8") as fh:
        cfg = json.load(fh)

    unsupported = [opt for opt in ("use_dora", "alpha_pattern") if cfg.get(opt)]
    if unsupported:
        raise NotImplementedError(f"Manual merge does not support adapter options: {unsupported}")

    params = dict(model.named_parameters())
    if any(".lora_" in name for name in params):
        # A failed PEFT attempt may have left LoRA modules injected in the model.
        raise RuntimeError(
            "Base model already contains injected LoRA modules; "
            "reload the base model before using the manual fallback."
        )

    adapter_state = _load_adapter_state(adapter_dir)
    lora_alpha = cfg["lora_alpha"]
    use_rslora = bool(cfg.get("use_rslora"))
    transpose = bool(cfg.get("fan_in_fan_out"))
    peft_prefix = "base_model.model."
    a_suffix = ".lora_A.weight"

    merged = 0
    with torch.no_grad():
        for a_key, lora_a in adapter_state.items():
            if not a_key.endswith(a_suffix):
                continue
            module_key = a_key[: -len(a_suffix)]
            lora_b = adapter_state.get(module_key + ".lora_B.weight")
            if lora_b is None:
                continue
            if module_key.startswith(peft_prefix):
                module_key = module_key[len(peft_prefix):]
            target = params.get(module_key + ".weight")
            if target is None:
                continue
            if target.device.type == "meta":
                raise RuntimeError(f"Weight {module_key}.weight is offloaded; cannot merge in place")

            rank = lora_a.shape[0]
            scale = lora_alpha / (rank ** 0.5 if use_rslora else rank)
            # Compute the low-rank product in float32, then cast to the weight's dtype.
            delta = (lora_b.float() @ lora_a.float()) * scale
            if transpose:
                delta = delta.T
            if delta.shape != target.shape:
                raise RuntimeError(
                    f"Shape mismatch for {module_key}.weight: "
                    f"{tuple(delta.shape)} vs {tuple(target.shape)}"
                )
            target.add_(delta.to(device=target.device, dtype=target.dtype))
            merged += 1

    if merged == 0:
        raise RuntimeError("No LoRA weight pair matched the base model; nothing was merged")
    return merged


merged_model = None
try:
    print("🔄 Merging adapter weights...")
    lora_model = PeftModel.from_pretrained(base_model_fp16, ADAPTER_PATH, device_map="auto")
    # Call merge_and_unload unconditionally: an adapter that cannot be merged
    # must not be passed on as if it were a merged model.
    merged_model = lora_model.merge_and_unload()
    print("✅ PEFT merge done; type:", type(merged_model))

except Exception as e:
    print("❌ PEFT load/merge failed, attempting safetensors fallback:", e)

    # Fallback: apply the LoRA update manually. This raises instead of
    # reporting success when no weight could be merged.
    n_merged = _manual_lora_merge(base_model_fp16, ADAPTER_PATH)
    merged_model = base_model_fp16
    print(f"✅ Fallback merge applied ({n_merged} weight matrices updated)")

🔄 Merging adapter weights...
✅ PEFT merge done; type: <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>


In [None]:
# --- 10. Memory-efficient model processing ---
print("🔧 Processing model with memory efficiency...")

# Save the model first without heavy in-memory operations
print("💾 Saving model directly...")
merged_model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Model saved to:", OUTPUT_DIR)
!ls -lh {OUTPUT_DIR}

🔧 Processing model with memory efficiency...
💾 Saving model directly...
✅ Model saved to: /content/drive/MyDrive/colab/output_two
total 6.1G
-rw------- 1 root root  832 Sep 10 11:50 config.json
-rw------- 1 root root  180 Sep 10 11:50 generation_config.json
-rw------- 1 root root 4.7G Sep 10 11:51 model-00001-of-00002.safetensors
-rw------- 1 root root 1.4G Sep 10 11:52 model-00002-of-00002.safetensors
-rw------- 1 root root  21K Sep 10 11:52 model.safetensors.index.json
-rw------- 1 root root  335 Sep 10 11:52 special_tokens_map.json
-rw------- 1 root root  50K Sep 10 11:52 tokenizer_config.json
-rw------- 1 root root  17M Sep 10 11:52 tokenizer.json


In [None]:
# --- 11. Install all deps from llama.cpp ---
!pip install -q -r ./llama.cpp/requirements.txt

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# --- 12. Convert to GGUF ---
!python ./llama.cpp/convert_hf_to_gguf.py {OUTPUT_DIR} --outtype q8_0 --outfile {GGUF_Q8}

INFO:hf-to-gguf:Loading model: output_two
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> Q8_0, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k.weig

In [None]:
# --- 13. Verify + download ---
# The GGUF file is written directly under OUTPUT_DIR, so no separate copy step
# is needed; this cell checks that it exists and offers it as a download.
import os
from google.colab import files

if os.path.exists(GGUF_Q8):
    sz_gb = os.path.getsize(GGUF_Q8) / (1024**3)
    print("✅ GGUF created:", GGUF_Q8, f"{sz_gb:.2f} GB")

    # Download directly to local computer
    print("⬇️ Preparing download...")
    files.download(GGUF_Q8)
    print("⬇️ Download Finished.")

else:
    # List the directory contents so the failed conversion can be diagnosed.
    print("❌ GGUF not found. Files in:", OUTPUT_DIR)
    if os.path.isdir(OUTPUT_DIR):
        print(sorted(os.listdir(OUTPUT_DIR)))
    else:
        print("(output directory does not exist)")

✅ GGUF created: /content/drive/MyDrive/colab/output_two/llama3.2-3b-crawler_q8_0.gguf 3.19 GB
⬇️ Preparing download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️ Download Finished.
