# EXL2_Quantization

Adapted from [Exllamav2 Quantization](https://colab.research.google.com/drive/1Cbb8nrwUxoxAbsIu1LLotsk2W52nj0Py)

## Imports

In [None]:
from glob import glob
import os
from pathlib import Path
import re
from safetensors.torch import load_file, save_file
import shutil
import subprocess
import torch

# Base directory for the models, quants and exllamav2 folders used below.
# `__file__` is not defined when this code runs as notebook cells, so fall
# back to the current working directory in that case.
try:
    root_dir = Path(__file__).parent
except NameError:
    root_dir = Path.cwd()

## Configuration

1. Find an *unquantized* model repo in either `pytorch_model.bin` or `model.safetensors` format
2. Replace the model author & name below
    - e.g. `https://huggingface.co/{src_author}/{src_name}`

In [None]:
# Owner of the source model repo: the `{src_author}` part of the URL above.
src_author = ""
# Name of the source model repo: the `{src_name}` part of the URL above.
# Also used as the local folder name of the source model and as the prefix
# of the quantized model's folder name.
src_name = ""

In [None]:
# Quantization target. The value is embedded in the output folder name and
# passed to the converter through its `-b` option.
bpw = 6. # Desired bits per weight

In [None]:
# Parent folder that holds both the source model and the quantized result.
models_dir = root_dir / "models"

# Source model directory (`huggingface-cli download --local-dir` location)
src_dir = os.path.join(models_dir, src_name)

# Final output of quantized model. The folder name encodes the bits per
# weight with one decimal, e.g. "<src_name>-6.0bpw-h6-exl2" for bpw = 6.
# NOTE(review): "h6" presumably refers to the converter's head-layer bit
# width; no such option is passed in this file, so confirm it matches.
dst_name = f"{src_name}-{bpw:.1f}bpw-h6-exl2"
dst_dir = os.path.join(models_dir, dst_name)

# Temporary directory for quantization (passed to the converter as `-o`)
quants_dir = root_dir / "quants"

## Setup

In [None]:
# Folder containing exllamav2's `convert.py`; the quantization command is
# executed with this folder as its working directory.
exllama_dir = root_dir / "exllamav2"

# Create the source, scratch and destination folders up front (no-op when
# they already exist).
for _required_dir in (src_dir, quants_dir, dst_dir):
    Path(_required_dir).mkdir(parents=True, exist_ok=True)

# # Download source model
# subprocess.run(["huggingface-cli", "download", f"{src_author}/{src_name}", "--local-dir", src_dir, "--local-dir-use-symlinks", "False"])

# # Download parquet
# parquet_path = os.path.join(exllama_dir, "0000.parquet")
# subprocess.run([
#     "wget", "-O",
#     parquet_path,
#     r"https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
# ])



---


**Convert to safetensors**

Only run the next two cells if your model is stored in the `pytorch_model-0000X-of-0000Y.bin` format.


---



In [None]:
def check_file_size(sf_filename: str, pt_filename: str):
    """Check that the safetensors file is not much larger than its source.

    Args:
        sf_filename: Path of the converted safetensors file.
        pt_filename: Path of the original PyTorch checkpoint file.

    Raises:
        RuntimeError: If the safetensors file is more than 1% larger than
            the PyTorch file. A smaller safetensors file is accepted.
    """
    safetensors_bytes = os.path.getsize(sf_filename)
    pytorch_bytes = os.path.getsize(pt_filename)

    # Relative growth of the converted file compared to the original.
    growth = (safetensors_bytes - pytorch_bytes) / pytorch_bytes
    if growth <= 0.01:
        return

    message = (
        "The file size difference is more than 1%:\n"
        f"         - {sf_filename}: {safetensors_bytes}\n"
        f"         - {pt_filename}: {pytorch_bytes}\n"
        "         "
    )
    raise RuntimeError(message)

def convert_file(pt_filename: str, sf_filename: str):
    """Convert one PyTorch checkpoint file to safetensors and verify it.

    The checkpoint is loaded on the CPU, unwrapped from a top-level
    ``"state_dict"`` entry when present, written to ``sf_filename`` and then
    read back to confirm every tensor survived the round trip.

    Args:
        pt_filename: Path of the PyTorch checkpoint to read.
        sf_filename: Path of the safetensors file to write.

    Raises:
        RuntimeError: If the size check fails or a reloaded tensor differs
            from the tensor that was saved.
    """
    # SECURITY: depending on the installed torch version's `weights_only`
    # default, torch.load may unpickle arbitrary objects. Only convert
    # checkpoints that come from a source you trust.
    loaded = torch.load(pt_filename, map_location="cpu")
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the folder when there is one.
    out_dir = os.path.dirname(sf_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    save_file(loaded, sf_filename, metadata={"format": "pt"})
    check_file_size(sf_filename, pt_filename)

    # Read the file back and compare tensor by tensor.
    reloaded = load_file(sf_filename)
    for k, pt_tensor in loaded.items():
        if not torch.equal(pt_tensor, reloaded[k]):
            raise RuntimeError(f"The output tensors do not match for key {k}")

def convert_all_files_in_directory(src_dir: str):
    """Convert every PyTorch checkpoint in ``src_dir`` to safetensors.

    Two file names are recognized:

    - ``pytorch_model-<part>-of-<total>.bin`` is written to
      ``model-<part>-of-<total>.safetensors`` (numbers zero-padded to 5 digits)
    - ``pytorch_model.bin`` is written to ``model.safetensors``

    All other files are ignored. Output files are written next to the inputs.

    Args:
        src_dir: Directory containing the checkpoint files.
    """
    # The dot is escaped and the whole name must match, so look-alikes such
    # as "pytorch_model-00001-of-00002.bin.bak" are not treated as shards.
    shard_pattern = re.compile(r"pytorch_model-(\d+)-of-(\d+)\.bin")

    for filename in os.listdir(src_dir):
        match = shard_pattern.fullmatch(filename)
        if match:
            part_num, total_parts = match.groups()
            sf_name = f"model-{part_num.zfill(5)}-of-{total_parts.zfill(5)}.safetensors"
        elif filename == "pytorch_model.bin":
            sf_name = "model.safetensors"
        else:
            # Not a checkpoint file; nothing to convert.
            continue

        convert_file(os.path.join(src_dir, filename), os.path.join(src_dir, sf_name))

# Convert the checkpoints in the source model directory. The guard keeps the
# conversion from running when this file is imported as a module.
if __name__ == "__main__":
    convert_all_files_in_directory(src_dir)

Delete the sharded `pytorch_model-XXXXX-of-XXXXX.bin` files to free up space after conversion. A single-file `pytorch_model.bin` is not removed.

---



In [None]:
def delete_all_bin_files_in_directory(src_dir: str):
    """Delete the sharded PyTorch checkpoint files found in ``src_dir``.

    Only files named exactly ``pytorch_model-<part>-of-<total>.bin`` are
    removed; every other file is left in place. Each deleted path is printed.

    Args:
        src_dir: Directory to clean up.
    """
    # The dot is escaped and the whole name must match, so files that merely
    # start like a shard (e.g. "...-of-00002.bin.bak") are never deleted.
    shard_pattern = re.compile(r"pytorch_model-(\d+)-of-(\d+)\.bin")

    for filename in os.listdir(src_dir):
        if not shard_pattern.fullmatch(filename):
            continue
        file_path = os.path.join(src_dir, filename)
        os.remove(file_path)
        print(f"Deleted {file_path}")

# Run the deletion on the source model directory. The files are removed
# permanently, so run this only after the conversion above has succeeded.
delete_all_bin_files_in_directory(src_dir)

## Quantize

[Documentation here](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

Choose your BPW (bits per weight) in the [Configuration](#configuration) section above.

In [None]:
def run_command_and_stream_output(command, cwd):
    """Run ``command`` and print its output line by line as it is produced.

    The child's stderr is merged into stdout, and each line is printed with
    surrounding whitespace stripped.

    Args:
        command: Program and arguments, as accepted by ``subprocess.Popen``.
        cwd: Working directory for the child process, or ``None`` to inherit
            the current one.

    Returns:
        The exit code of the child process.
    """
    # The context manager closes the pipe and waits for the child on exit,
    # so no file handle is leaked and `returncode` is always populated.
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=cwd,
    ) as process:
        # Iterating the pipe yields lines until the child closes its output.
        for line in process.stdout:
            print(line.strip())
    return process.returncode

# Calibration dataset, expected inside the exllamav2 folder (the location
# used by the commented-out download step in Setup). Defined here so the
# command does not depend on that optional cell having been run.
parquet_path = os.path.join(exllama_dir, "0000.parquet")

# subprocess rejects non-string arguments such as the float `bpw`, so every
# value is converted to text explicitly.
command = [
    "python",
    "convert.py",
    "-i", str(src_dir),
    "-o", str(quants_dir),
    "-c", str(parquet_path),
    "-cf", str(dst_dir),
    "-b", str(bpw),
]

# convert.py is resolved relative to the exllamav2 folder.
run_command_and_stream_output(command, exllama_dir)

Rename every `output*.safetensors` file in the destination directory to `model*.safetensors`.

In [None]:
for output_path in glob(os.path.join(dst_dir, "output*.safetensors")):
    # Swap only the file-name prefix. Replacing "output" in the full path
    # would also rewrite any parent folder whose name contains that word.
    folder, filename = os.path.split(output_path)
    new_name = os.path.join(folder, "model" + filename[len("output"):])
    os.rename(output_path, new_name)