In [1]:
# Install the Hugging Face Hub client library; -q keeps pip's output short.
%pip install -q huggingface_hub

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Install the build-essential toolchain and ccache through apt; -y auto-confirms the install prompt.
!sudo apt install ccache build-essential -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
ccache is already the newest version (4.5.1-1).
0 upgraded, 0 newly installed, 0 to remove and 62 not upgraded.


In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested through a hidden prompt. It is never written into
# the notebook source, so the notebook can be shared or committed without
# leaking a credential.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
from huggingface_hub import snapshot_download

# Variables
MODEL_ID = "SulthanTriesToCode/Meta-Llama-3-8B-DoNot"
QUANTIZATION_METHODS = ["q5_0"]

# Constants
MODEL_NAME = MODEL_ID.split('/')[-1]

snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_NAME, local_dir_use_symlinks=False, revision="main")

# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUDA=1 make
!pip install -r llama.cpp/requirements.txt

# Download model
!git lfs install

# Convert to fp32
fp32 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp32.bin"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f32 --outfile {fp32} --vocab-type bpe

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp32} {qtype} {method}

Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
Already up to date.
I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE 
I NVCCFLAGS: -std=c++11 -O3 
I LDFLAGS:    
I CC:        cc (Ubuntu 11.4.0-1

In [9]:
import os

model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]

prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")

# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"

Enter your prompt:  prompt
Name of the model (options: meta-llama-3-8b.Q5_0.gguf):  meta-llama-3-8b.Q5_0.gguf


Log start
main: build = 2725 (784e11de)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1714019704
llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from Meta-Llama-3-8B/meta-llama-3-8b.Q5_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   

In [None]:
from huggingface_hub import create_repo, HfApi

username = "SulthanTriesToCode"
api = HfApi()

# Destination repository: the model name with a "-GGUF" suffix under the user's namespace.
gguf_repo_id = f"{username}/{MODEL_NAME}-GGUF"

# Make sure the destination model repo exists; exist_ok avoids an error when it already does.
create_repo(repo_id=gguf_repo_id, repo_type="model", exist_ok=True)

# Push only the .gguf files found in the local model folder.
api.upload_folder(folder_path=MODEL_NAME, repo_id=gguf_repo_id, allow_patterns="*.gguf")