In [None]:
# Per-model output folder name; the trailing slash is part of the value.
OUTPUT_PATH_NAME = "Code-Phi-3-mini-128k-instruct-GGUF/"  # folder to store the result in

# Root directory of each pipeline stage.
QUANT_PATH = './quants/'
MERGES_PATH = './merges/'
FULL_GGUF_PATH = './full_gguf/'

# Bare model name: the folder name with every '/' removed.
FILE_NAME = ''.join(OUTPUT_PATH_NAME.split('/'))
FULL_FILE_NAME = f'{FILE_NAME}.gguf'

# Concrete locations derived from the roots above.
OUTPUT_PATH_FULL_GGUF = f'{FULL_GGUF_PATH}{FULL_FILE_NAME}'
OUTPUT_PATH_QUANT = f'{QUANT_PATH}{OUTPUT_PATH_NAME}'
OUTPUT_PATH_MERGES = f'{MERGES_PATH}{OUTPUT_PATH_NAME}'

# MERGES

In [None]:
# Fetch mergekit and install it in editable mode into the kernel's environment.
!git clone https://github.com/cg123/mergekit.git
# NOTE: %cd changes the kernel's working directory, so relative paths in later
# cells are resolved from inside the cloned `mergekit/` folder unless another
# cell changes directory again.
%cd mergekit
%pip install -e .

In [None]:
# Merge recipe and the options forwarded to mergekit's MergeOptions.
CONFIG_YML = "mergekit.yml"  # YAML file describing the merge
LORA_MERGE_CACHE = "/tmp"  # cache directory for merged LoRA weights
COPY_TOKENIZER = True  # also write a tokenizer next to the merged weights
LOW_CPU_MEMORY = False  # turn on when VRAM exceeds RAM + swap
LAZY_UNPICKLE = False  # experimental low-memory loader

### Merge the models

In [None]:
# Load the YAML merge recipe, validate it, and run the merge.
import torch
import yaml

from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

# Parse the recipe first; validation happens once the file handle is released.
with open(CONFIG_YML, "r", encoding="utf-8") as config_file:
    raw_config = yaml.safe_load(config_file)
merge_config = MergeConfiguration.model_validate(raw_config)

# Collect the options in one place; CUDA is used whenever torch reports it.
merge_options = MergeOptions(
    lora_merge_cache=LORA_MERGE_CACHE,
    cuda=torch.cuda.is_available(),
    copy_tokenizer=COPY_TOKENIZER,
    lazy_unpickle=LAZY_UNPICKLE,
    low_cpu_memory=LOW_CPU_MEMORY,
    allow_crimes=True,
)

run_merge(merge_config, out_path=OUTPUT_PATH_MERGES, options=merge_options)
print("Done!")

### Test the model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

In [None]:
# Load the merged checkpoint and report how many trainable parameters it has.
model = AutoModelForCausalLM.from_pretrained(OUTPUT_PATH_MERGES)

trainable_sizes = [weight.numel() for weight in model.parameters() if weight.requires_grad]
s = sum(trainable_sizes)
print(f'{s/(1e9):.2f}B parameters')

In [None]:
# Smoke-test the merged model with a single chat-formatted prompt.
import torch

tokenizer = AutoTokenizer.from_pretrained(OUTPUT_PATH_MERGES)

# Render the conversation through the tokenizer's chat template as plain text.
messages = [{"role": "user", "content": "What is a large language model?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# NOTE(review): `model` is already instantiated here, so `torch_dtype` and
# `device_map` may have no effect -- confirm against the transformers docs.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampling settings for the generation call.
generation_kwargs = dict(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
outputs = pipeline(prompt, **generation_kwargs)
print(outputs[0]["generated_text"])

# QUANTIZE
Set IN_PATH if you only want to quantize an existing model. If you merge and then quantize, leave it empty. Set LLAMA_3 to True if you are quantizing Llama 3.

In [None]:
import os

# Quantization types to produce; edit this list if needed.
QUANT_SIZES = [ 'Q2_K',
                'Q3_K_L', 'Q3_K_M', 'Q3_K_S',
                'Q4_K_M', 'Q4_K_S',
                'Q5_K_M', 'Q5_K_S',
                'Q6_K',
                'Q8_0'
                ]

# Existing Hugging Face model folder to quantize; '' means "use the merge output".
IN_PATH = 'Phi-3-mini-code-finetune-128k-instruct-v1/'#OUTPUT_PATH_NAME#'Llama-3-14B-Instruct-v1.gguf'
LLAMA_3 = False  # when True, the quantize loop patches the EOS token id
WAVECODER = True

# Recomputed on every run (not only overwritten when IN_PATH is set) so that
# clearing IN_PATH and re-running this cell restores the merge output folder
# instead of keeping a stale override.
OUTPUT_PATH_MERGES = IN_PATH if IN_PATH != '' else MERGES_PATH + OUTPUT_PATH_NAME

# makedirs(exist_ok=True) creates missing parents and tolerates existing
# directories, replacing the separate exists()/mkdir() checks.
for directory in (QUANT_PATH, MERGES_PATH, FULL_GGUF_PATH, OUTPUT_PATH_QUANT):
    os.makedirs(directory, exist_ok=True)



In [None]:

print(OUTPUT_PATH_MERGES)
print(OUTPUT_PATH_FULL_GGUF)
print(f'./llama.cpp/convert-hf-to-gguf.py {OUTPUT_PATH_MERGES} --outfile {OUTPUT_PATH_FULL_GGUF}')


if not os.path.isfile(OUTPUT_PATH_FULL_GGUF):
    !./llama.cpp/convert-hf-to-gguf.py {OUTPUT_PATH_MERGES} --outfile {OUTPUT_PATH_FULL_GGUF}
else:
    print('File already exists')

In [None]:

QUANT_PRE_NAME = OUTPUT_PATH_QUANT + FILE_NAME
for quant_size in QUANT_SIZES:
    quant_out_name = QUANT_PRE_NAME + f'-{quant_size}.gguf'
    if not os.path.isfile(quant_out_name):
        !./llama.cpp/quantize {OUTPUT_PATH_FULL_GGUF} {quant_out_name} {quant_size}
        if LLAMA_3: # fixes the end of string character
            !./llama.cpp/gguf-py/scripts/gguf-set-metadata.py --force {quant_out_name} tokenizer.ggml.eos_token_id 128009
    else:
        print(f'File already exists: {quant_out_name}')

## Test quant

In [None]:
GGUF_SIZE = 'Q4_K_M'
llama_cpp_path = './llama.cpp'


gguf_name = QUANT_PRE_NAME + f'-{GGUF_SIZE}.gguf'
!{llama_cpp_path}/main -m {gguf_name} -p "Penguins live in" 

## Quant info

In [None]:
# Dump the GGUF file chosen in the "Test quant" cell (`gguf_name`) with
# llama.cpp's gguf-dump script.
!python {llama_cpp_path}/gguf-py/scripts/gguf-dump.py {gguf_name}

# OLLAMA

In [None]:
# Ollama settings: leave OLLAMA_NAME empty to reuse the model's file name.
OLLAMA_NAME = ''
QUANT_SIZE = 'Q4_K_M'  # used as the tag after the colon in the Ollama model name

In [None]:
if OLLAMA_NAME == '':
    OLLAMA_NAME = FILE_NAME 
OLLAMA_IN_NAME = OLLAMA_NAME + f':{QUANT_SIZE}'
!ollama create {OLLAMA_IN_NAME} -f ./Modelfile

## Modelfile template

FROM [Model-name]

TEMPLATE """

[Model-Template]

"""

PARAMETER [any parameter needed]

LICENSE """

"""

In [None]:
# Modelfile skeleton, kept verbatim inside a string so this code cell is valid
# Python (the raw Modelfile text raised a SyntaxError when the cell was run).
# Fill in the bracketed placeholders and save the result as ./Modelfile.
MODELFILE_TEMPLATE = '''\
FROM [Model-name]

TEMPLATE """

[Model-Template]

"""

PARAMETER [any parameter needed]

LICENSE """

"""
'''

# UPLOAD

In [None]:
# Upload settings: local folder to load from, and the Hub repository to push to.
MODEL_PATH = 'Phi-3-mini-code-finetune-128k-instruct-v1'
HF_REPO_NAME = 'Phi-3-mini-code-finetune-128k-instruct-v1'

In [None]:
# If needed

# Optional: drop the in-memory model and tokenizer so their memory can be reclaimed.

import gc

# Remove the names only if they exist, so this cell also runs on a kernel where
# the test cells were skipped (a plain `del` would raise NameError there).
for _name in ("model", "tokenizer"):
    globals().pop(_name, None)
gc.collect()
gc.collect()

In [None]:
!huggingface-cli login --token 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


from transformers import AutoTokenizer, AutoModelForCausalLM

# Upload model
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

s = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{s/(1e9):.2f}B parameters')

model.push_to_hub(HF_REPO_NAME, use_temp_dir=False)

# Upload tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.push_to_hub(HF_REPO_NAME, use_temp_dir=False)

# BENCHMARK

In [None]:
# Benchmark settings for the can-ai-code harness.
can_ai_code_path = '/home/me/git/can-ai-code'  # checkout of the benchmark repository
llama_ccp_path = '/home/me/AI/llama.cpp/main'  # llama.cpp `main` binary

# Models under test: a local GGUF file and a Hugging Face model id.
Modelfile_gguf = './Phi-3-mini-4k-instruct-q4.gguf'
Modelfile_full = 'microsoft/Phi-3-mini-128k-instruct'

# Harness inputs located inside the can-ai-code checkout.
prepare_path = f'{can_ai_code_path}/results/prepare_junior-v2_python-javascript_chatml-v2.ndjson'
param = f'{can_ai_code_path}/params/topk1.json'

In [None]:
# Run can-ai-code's prepare.py on the junior-v2 interview with the chatml-v2
# prompt template.
!python {can_ai_code_path}/prepare.py --interview {can_ai_code_path}/junior-v2/ --template {can_ai_code_path}/prompts/chatml-v2.txt

## gguf

In [None]:
!python interview-llamacpp.py f'{prepare_path}' f'{Modelfile_gguf}' f'{param}' --main f'{llama_ccp_path}'

In [None]:
# Run compare.py with the compare-llama2-coders.yaml configuration.
# NOTE(review): compare.py is referenced relatively, so this presumably expects
# the working directory to be the can-ai-code checkout -- confirm.
!python compare.py {can_ai_code_path}/compare-v1/compare-llama2-coders.yaml

## fp32/16 transformers

In [None]:
!python interview_cuda.py './results/prepare_junior-v2_python-javascript_chatml-v2.ndjson' f'{param}' f'{Modelfile_full}' 'transformers' --quant='fp4'

!python compare.py /home/me/git/can-ai-code/compare-v1/compare-llama2-coders.yaml
