In [None]:
# Install required libraries
!pip install transformers peft huggingface_hub bitsandbytes accelerate

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from google.colab import userdata

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, peft
Successfully installed bitsandbytes-0.43.3 peft-0.12.0


In [None]:
# Hugging Face credentials: read the token from Colab's secret store so it is
# never hard-coded in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
# Treat an empty secret the same as a missing one, so a blank value is
# reported here with a clear message instead of being passed on to login().
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found in Colab secrets. Please set it up.")

# Authenticate this session with the Hugging Face Hub.
login(token=HF_TOKEN)

# Hub repository ids used by the rest of the notebook:
#   BASE_MODEL_NAME   - base checkpoint the adapter is applied to
#   LORA_MODEL_NAME   - LoRA adapter weights
#   MERGED_MODEL_NAME - destination repo for the merged model
BASE_MODEL_NAME = "inceptionai/jais-adapted-7b-chat"
LORA_MODEL_NAME = "Solshine/jais-adapted-7b-chat-Natural-Farmer-lora-only-V4"
MERGED_MODEL_NAME = "Solshine/jais-adapted-7b-chat-Natural-Farmer-lora-merged-full"


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:

# Function to get the optimal device map
def get_device_map(model_size):
    """Choose a ``device_map`` value based on the memory of CUDA device 0.

    Args:
        model_size: Model footprint in GB; compared against the total memory
            reported for CUDA device 0.

    Returns:
        ``"cpu"`` when CUDA is unavailable, ``"auto"`` when device 0 reports
        strictly more memory (in GB) than ``model_size``, and otherwise a dict
        that places everything on the CPU except the LM head, the token
        embeddings, the final norm and the first decoder layer, which go to
        ``cuda:0``.
    """
    # No GPU at all: keep the whole model on the CPU.
    if not torch.cuda.is_available():
        return "cpu"

    bytes_per_gb = 1024**3
    total_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    if total_gb > model_size:
        return "auto"

    # Not enough GPU memory: default to CPU and pin only a few modules to the GPU.
    gpu = torch.device("cuda:0")
    placement = {"": torch.device("cpu")}
    for module_name in ("lm_head", "model.embed_tokens", "model.norm", "model.layers.0"):
        placement[module_name] = gpu
    return placement



In [None]:
from transformers import BitsAndBytesConfig

# Configure quantization: 4-bit NF4 weights with double quantization.
# NOTE(review): the compute dtype here is bfloat16 while the model below is
# loaded with torch_dtype=float16 - confirm the mismatch is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Download and load the base model
# NOTE(review): because the base is loaded with a 4-bit quantization config,
# the LoRA merge done later in this notebook operates on quantized weights
# (PEFT warns this can introduce rounding differences). If a full-precision
# merged checkpoint is the goal, load without `quantization_config` - TODO confirm.
print("Downloading and loading the base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    offload_folder="offload",
    offload_state_dict=True,
)

# Gradient checkpointing is intentionally not enabled: it only trades compute
# for memory during the backward pass, and this notebook never trains - it
# only loads, merges, saves and uploads weights.

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)



Downloading and loading the base model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.37M [00:00<?, ?B/s]

In [None]:
# Download the LoRA adapter and attach it to the already-loaded base model
print("Downloading and loading the LORA adapter...")
# Pass the device map explicitly: this cell must not depend on a `device_map`
# variable left over from an earlier kernel session (no cell defines one, so
# a fresh Restart & Run All would raise NameError). "auto" matches the setting
# used when the base model was loaded.
lora_model = PeftModel.from_pretrained(base_model, LORA_MODEL_NAME, device_map="auto")

# Merge the base model with the LoRA adapter; the result of
# merge_and_unload() is what the following cells save and upload.
print("Merging the base model with the LORA adapter...")
merged_model = lora_model.merge_and_unload()



Downloading and loading the LORA adapter...


adapter_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

Merging the base model with the LORA adapter...




In [None]:
# Write the merged weights (safetensors format) and the tokenizer files into
# the same local directory.
print("Saving the merged model locally...")
merged_model_dir = "./merged_model"
merged_model.save_pretrained(merged_model_dir, safe_serialization=True)
# Kept as the last expression so the cell displays the saved tokenizer file paths.
tokenizer.save_pretrained(merged_model_dir)


Saving the merged model locally...


('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.model',
 './merged_model/added_tokens.json',
 './merged_model/tokenizer.json')

In [None]:
# Push the merged model and its tokenizer to the same Hugging Face Hub repo
print("Pushing the merged model to Hugging Face Hub...")
# `token=` is the supported keyword for authentication; the older
# `use_auth_token=` spelling is deprecated in transformers.
merged_model.push_to_hub(MERGED_MODEL_NAME, token=HF_TOKEN)
tokenizer.push_to_hub(MERGED_MODEL_NAME, token=HF_TOKEN)

print("Process completed successfully!")

Pushing the merged model to Hugging Face Hub...




model.safetensors:   0%|          | 0.00/4.39G [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Process completed successfully!
