# Pushing to Hub
A simple notebook for downloading models from, and pushing models to, the HuggingFace Hub, including quantized models.

---

Built by Trelis Research. Find us at [Trelis.com](https://trelis.com) and on [HuggingFace](https://huggingface.co/Trelis).

In [1]:
#Upgrade pip and install scipy
!python -m pip install --upgrade pip -q -U
!pip install -q -U scipy
# !pip install einops #needed for Phi-2

[0m

In [2]:
# Required when training models/data that are gated on HuggingFace, and required for pushing models to HuggingFace
!pip install huggingface_hub -q -U
from huggingface_hub import notebook_login

notebook_login()

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Models will be downloaded into the current directory.
cache_dir = "."

### Connect Google Drive (only for Google Colab)

Optional but saves time by caching the model and allows for training data to be saved on Drive.

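If you're running in Jupyter (e.g. on runpod) then keep cache_dir='.' (as set above) so that models are cached in the current directory on the pod. Do not use an empty string: the download path is built as cache_dir + '/' + repo name, so an empty cache_dir would point at the filesystem root.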

In [2]:
# # https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
# import locale
# def getpreferredencoding(do_setlocale = True):
#     return "UTF-8"
# locale.getpreferredencoding = getpreferredencoding

In [None]:
# import os
# cache_dir = "/content/drive/My Drive/huggingface_cache"
# os.makedirs(cache_dir, exist_ok=True) # Ensure the directory exists

# Installation

In [4]:
!pip install git+https://github.com/huggingface/transformers.git -q -U #Necessary for merging LoRA adapters onto quantized models.
# !pip install -q -U transformers # if you are facing issues with the dev branch above

!pip install accelerate -q -U

# Install peft to allow for LoRA fine-tuning
!pip install -q -U peft

# Install bitsandbytes for quantized fine-tuning
!pip install -q -U bitsandbytes

[0m

## Downloading a Repo

In [12]:
from huggingface_hub import snapshot_download
import os

# Hub repository to download, in "<owner>/<name>" form.
hub_model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Local folder for the download. os.path.join (instead of string concatenation) means an
# empty cache_dir gives a relative path rather than one rooted at "/".
local_model_path = os.path.join(cache_dir, hub_model_path)

In [13]:
# Download the repository files into local_model_path. The same folder is passed as both
# the cache directory and the local directory, with symlinks disabled.
repo_path = snapshot_download(
    repo_id=hub_model_path,
    local_dir=local_model_path,
    local_dir_use_symlinks=False,
    cache_dir=local_model_path,
)

print(f"Repository downloaded to: {local_model_path}")

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

eval_results.json:   0%|          | 0.00/566 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.16k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Repository downloaded to: ./TinyLlama/TinyLlama-1.1B-Chat-v1.0


# Load the Model

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit (nf4) quantization settings: double quantization, bfloat16 compute dtype.
# Remove this config and the quantization_config argument below to load without quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model from the local download folder.
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    cache_dir=cache_dir,
    device_map='auto',  # loads automatically to gpu if there is one.
    quantization_config=bnb_config,  # 4bit (nf4) quantization
    trust_remote_code=True,
)

# The tokenizer is loaded by Hub repo id rather than from the local folder.
tokenizer = AutoTokenizer.from_pretrained(
    hub_model_path,
    trust_remote_code=True,
    use_fast=True,
)

### Prepare for LoRA fine-tuning

In [25]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` has ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.0% instead of
    raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: a model with no parameters has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [26]:
## Parameter Efficient Fine Tuning (PEFT), specifically, Low Rank Adaptation (LoRA).

from peft import LoraConfig, get_peft_model

# LoRA settings (matching the Llama recipe, but with added modules): rank 8, alpha 32,
# applied to the attention (q/k/v/o) and MLP (gate/up/down) projections.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    ],
)

from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the model through peft's k-bit training preparation.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Ensure that the base model name is correctly set (otherwise there will be issues pushing adapters later)
model.config._name_or_path = hub_model_path

# Wrap the prepared model with the LoRA configuration defined in `config`.
model_with_lora = get_peft_model(
    model,
    config,
)

# Report how many parameters are trainable after wrapping.
print_trainable_parameters(model_with_lora)

trainable params: 6307840 || all params: 621914112 || trainable%: 1.0142622394778524


## Typically here you:
- Set up the tokenizer
- Set up padding
- Set up the dataset
- Run evaluation before training
- Set up training
- Run evaluation after training

Check out the [Trelis Research Youtube Channel](https://youtube.com/@trelisresearch) for videos on these topics.

In [17]:
# Generation defaults for evaluation: sampling is switched off, and the two sampling
# parameters are both set to 1.0 (top_p: its default value).
model.generation_config.top_p = 1.0
model.generation_config.temperature = 1.0
model.generation_config.do_sample = False

In [18]:
from transformers import TextStreamer
from peft import PeftModel

# System message inserted between the <<SYS>> markers of each prompt built by stream().
system_prompt = "You are a helpful assistant. You provide succinct answers."

# # For Mistral instruct
# system_prompt = ''

# Define a stream
def stream(user_prompt, model_type, adapter_model):
    """
    Build a chat prompt ([INST] / <<SYS>> format) for ``user_prompt`` and stream
    the generated reply to stdout.

    Args:
        user_prompt: the question to ask; surrounding whitespace is stripped.
        model_type: selects the model used for generation:
            'base'          -> the global ``model``;
            'fine-tuned'    -> ``model`` wrapped via ``PeftModel.from_pretrained``
                               with ``adapter_model``;
            'model_to_push' -> the global ``model_to_push``.
        adapter_model: passed to ``PeftModel.from_pretrained``; only used when
            ``model_type`` is 'fine-tuned'.

    Raises:
        ValueError: if ``model_type`` is not one of the three values above.
    """
    if model_type == 'base':
        eval_model = model
    elif model_type == 'fine-tuned':
        eval_model = PeftModel.from_pretrained(
            model,
            adapter_model,
        )
    elif model_type == 'model_to_push':
        eval_model = model_to_push
    else:
        # Raise instead of calling exit(): in a notebook exit() ends the kernel session
        # rather than reporting the mistake to the caller.
        raise ValueError(
            f"Unknown model_type {model_type!r}: must be 'base', 'fine-tuned' or 'model_to_push'"
        )

    # print(f'Proceeding to inference with {model_type} model')

    eval_model.config.use_cache = True

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    # #For Mistral instruct
    # B_SYS, E_SYS = "", ""

    # Optional text placed directly in front of the user prompt, e.g.
    # added_prompt = "In the context of Touch Rugby and the International Playing Rules set in 2020... "
    added_prompt = ''

    # Chat model prompt with system message
    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{added_prompt}{user_prompt.strip()} {E_INST}\n\n"

    # # Without system message
    # prompt = f"{B_INST} {added_prompt}{user_prompt.strip()} {E_INST}\n\n"

    # Move the inputs to the device the model reports instead of assuming "cuda", so this
    # also works when device_map='auto' did not place the model on a GPU.
    inputs = tokenizer([prompt], return_tensors="pt").to(eval_model.device)

    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    # _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=50, temperature=0.01
    _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=500) #if do_sample is False by default

def evaluation(model_type, adapter_model=''):
    """
    Send a fixed list of Stanford NIL policy questions through ``stream`` and
    print the reference answer after each generated reply.

    Args:
        model_type: forwarded unchanged to ``stream``.
        adapter_model: forwarded unchanged to ``stream``; defaults to ''.
    """
    # (question, reference answer) pairs, asked in this order.
    qa_pairs = (
        (
            "According to Stanford NIL policy, who does the athlete agent of a student-athlete have to register with?",
            "Stanford Athletics Compliance Office",
        ),
        (
            "Can Stanford coaches and staff enter into NIL deals with student-athletes?",
            "No.",
        ),
        (
            "Can Stanford staff or coaches pass along - to student-athletes - written inbound NIL inquiries they receive from a third party? What is an exception to this?",
            "Yes, unless the student-athletes requests not to be forwarded written inquiries.",
        ),
        (
            "What is the mandated timeframe within which student-athletes must disclose the details of their NIL agreement to Stanford?",
            "There is no mandated timeframe.",
        ),
        (
            "How can a Stanford student-athlete allow a parent or agent to disclose deals to Stanford on their behalf?",
            "They can use the Share Link option in the INFLCR Verified app",
        ),
    )

    for prompt_text, reference in qa_pairs:
        stream(prompt_text, model_type, adapter_model)
        print("Correct Answer:", reference)
        print('\n\n')

In [31]:
# Run the evaluation questions with model_type "base" (no adapter is passed).
evaluation("base")

<s> [INST] <<SYS>>
You are a helpful assistant. You provide succinct answers.
<</SYS>>

According to Stanford NIL policy, who does the athlete agent of a student-athlete have to register with? [/INST]



The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


According to Stanford University's National Letter of Intent (NLI) policy, the athlete agent of a student-athlete must register with the Stanford Athletics Compliance Office. This is a requirement for any agent who represents a student-athlete in negotiations or dealings related to their athletic eligibility or professional opportunities. The registration process involves providing information about the agent's identity, qualifications, and any financial interests they may have in the student-athlete's athletic career. By registering with the university, the agent is demonstrating their commitment to complying with Stanford's NLI policy and adhering to the rules and regulations set forth by the National Collegiate Athletic Association (NCAA).</s>
Correct Answer: Stanford Athletics Compliance Office



<s> [INST] <<SYS>>
You are a helpful assistant. You provide succinct answers.
<</SYS>>

Can Stanford coaches and staff enter into NIL deals with student-athletes? [/INST]

Thank you for a

# Push Adapters and Model to Hub

In [27]:
# Target repo on the Hub: "<org>/<last segment of hub_model_path>-push-demo".
org = 'Trelis'
new_hub_model_path = f"{org}/{hub_model_path.split('/')[-1]}-push-demo"
print(f"Setting up to push the model to {new_hub_model_path} on HuggingFace Hub")

Setting up to push the model to Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo on HuggingFace Hub


## Push Adapters to Hub

In [28]:
# If you have done LoRA fine-tuning: the adapters get their own repo, "<model repo>-adapters".
new_hub_model_adapters_path = f"{new_hub_model_path}-adapters"
print(f"Setting up to push the adapters to {new_hub_model_adapters_path} on HuggingFace Hub")

Setting up to push the adapters to Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo-adapters on HuggingFace Hub


In [29]:
### Typically you do one of three things.
### Exactly one option must be active: the following cells call save_pretrained / push_to_hub
### on adapter_to_push, so it has to be defined and has to be a PEFT model (not a path string).

## Option A: Pick an adapter from somewhere during your fine-tuning
# adapter_path = save_dir + '/checkpoint-32'

## Option B: Grab an adapter you trained before from HuggingFace Hub
# adapter_path = "Trelis/Llama-2-7b-chat-hf-touch-rugby-rules-adapters" #uncomment if you want to grab an adapter from the hub

## Required for Option A or Option B: apply the desired adapter to the base model.
# # load peft model with the chosen adapter
# model_to_push = PeftModel.from_pretrained(
#     model,
#     adapter_path,
# )
# adapter_to_push = model_to_push

## Option C (active by default): the adapter is your model_with_lora (yes, this is confusing...),
## i.e. you are going to push the model_with_lora as it is at the very last step of training.
adapter_to_push = model_with_lora
model_to_push = model_with_lora

In [30]:
# Save the adapter model to a local folder named new_hub_model_adapters_path
adapter_to_push.save_pretrained(new_hub_model_adapters_path, token=True)

In [31]:
# Push the model adapters to the hub (safe_serialization=True is requested).
# If running this, the peft base model name needs to refer to a model that exists on the hub.
adapter_to_push.push_to_hub(new_hub_model_adapters_path, token=True, safe_serialization=True)

adapter_model.safetensors:   0%|          | 0.00/25.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo-adapters/commit/79f03e0c4c2767a0542f8d1decf3f96b8c723a8d', commit_message='Upload model', commit_description='', oid='79f03e0c4c2767a0542f8d1decf3f96b8c723a8d', pr_url=None, pr_revision=None, pr_num=None)

## Merge Adapters

In [40]:
# Added Option X: reload the base model in 16-bit precision (bfloat16) and then merge.
# Motivation: you trained with quantization but want a full precision model for inference with
# Text Generation Inference, OR you want to quantize to GGUF or AWQ. This will hurt precision a little.
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.bfloat16,
    # Loads automatically to gpu if there is one. It can be useful to load onto cpu if using
    # a free Colab notebook as that gives more RAM.
    device_map='auto',
    cache_dir=cache_dir,
    trust_remote_code=True,
)

# Load the peft model with the adapters found at new_hub_model_adapters_path.
model_to_push = PeftModel.from_pretrained(model, new_hub_model_adapters_path)

In [41]:
# Merge adapters with the base model.
# This will hurt precision a little if you are merging a quantized model.
model_to_push = model_to_push.merge_and_unload()

## Push Model to Hub

In [13]:
# ONLY RUN THIS CELL IF YOU DID *NOT* MERGE ADAPTERS, i.e. you did a fine-tuning without LoRA.
# It rebinds model_to_push to `model`, replacing whatever model_to_push referred to before.
# NOTE(review): on a top-to-bottom run this cell executes after the merge cell - confirm that is intended.
model_to_push = model

In [42]:
# Save the model locally (using the same path as will be used on the hub)
model_to_push.save_pretrained(new_hub_model_path)

In [43]:
# Push the model to the hub repo new_hub_model_path, requesting safetensors serialization
# and a maximum shard size of 5GB.
model_to_push.push_to_hub(new_hub_model_path, token=True, max_shard_size="5GB", safe_serialization=True)

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo/commit/32c8eee6b86a5637859fad24469fc027d0066778', commit_message='Upload LlamaForCausalLM', commit_description='', oid='32c8eee6b86a5637859fad24469fc027d0066778', pr_url=None, pr_revision=None, pr_num=None)

## Push Tokenizer to Hub

In [35]:
## Re-load a tokenizer (uncommon)
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(hub_model_path, trust_remote_code=True)

# Push the tokenizer to the same hub repo as the model
tokenizer.push_to_hub(new_hub_model_path, token=True)

CommitInfo(commit_url='https://huggingface.co/Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo/commit/af69051c7839b89a559e75e59ff76528a5eeeffc', commit_message='Upload tokenizer', commit_description='', oid='af69051c7839b89a559e75e59ff76528a5eeeffc', pr_url=None, pr_revision=None, pr_num=None)

## (Alternative) Upload a folder to the Hub

In [39]:
from huggingface_hub import HfApi, upload_folder, create_branch

# Initialize the HfApi class; the Hub calls below go through this client.
api = HfApi()

# Optionally, create a new branch for 'nf4'. Beware this will copy all files from main.
# exist_ok=True keeps the cell re-runnable: an already existing branch is not an error.
api.create_branch(repo_id=new_hub_model_path, repo_type="model", branch="nf4", exist_ok=True)

# Upload the entire local folder (saved under the same name as the repo) to the 'nf4' branch.
api.upload_folder(
    folder_path=new_hub_model_path,
    repo_id=new_hub_model_path,
    repo_type="model",  # Assuming it's a model; can be "dataset" or "space" as well
    revision="nf4",  # Specify the branch you want to push to
    token=True,
)

print(f"Uploaded contents of {new_hub_model_path} to {new_hub_model_path} on HuggingFace Hub")

Uploaded contents of Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo to Trelis/TinyLlama-1.1B-Chat-v1.0-push-demo on HuggingFace Hub
