In [None]:
# Colab setup: mount Google Drive and make the "Colab Notebooks" folder importable.
import sys
import time
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')

# NOTE(review): the stopwords corpus is downloaded here, but no cell in view uses nltk — confirm it is still needed.
import nltk
nltk.download('stopwords')

import json
import os
import shutil
import torch
from safetensors.torch import save_file, safe_open

# NOTE(review): both installs are unpinned (`-U` pulls the latest transformers), so the
# converter script and generation behavior can change between runs; consider pinning versions.
! pip install transformers -U
! pip install tiktoken blobfile

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




## Get LLAMA

#### Follow the steps in the Hugging Face Llama documentation to obtain the original weights and convert them to the Hugging Face format: [https://huggingface.co/docs/transformers/en/model_doc/llama](https://huggingface.co/docs/transformers/en/model_doc/llama)

In [None]:
# ! ls '/content/drive/MyDrive/llama_3.2-3B-huggingface'
# ! cp -r '/content/drive/MyDrive/llama_3.2-3B-huggingface' './llama_3.2-3B-huggingface'

! ls '/content/drive/MyDrive/Llama3.2-3B'
! cp -r '/content/drive/MyDrive/Llama3.2-3B' './Llama3.2-3B'

! python /usr/local/lib/python3.10/dist-packages/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ./Llama3.2-3B --model_size 3B --llama_version 3.2 --output_dir ./llama_3.2-3B-huggingface

checklist.chk  consolidated.00.pth  params.json  tokenizer.model
2024-10-26 06:41:08.092054: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-26 06:41:08.113896: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-26 06:41:08.141165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-26 06:41:08.147956: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-1

### Inference Using downloaded model

In [None]:
# ! cp -r '/content/drive/MyDrive/llama_3.2-3B-huggingface' './llama_3.2-3B-huggingface'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_id):
  """Load the tokenizer and the causal language model stored under `model_id`.

  Args:
    model_id: identifier or directory passed unchanged to both `from_pretrained` calls.

  Returns:
    A `(tokenizer, model)` tuple.
  """
  return (
      AutoTokenizer.from_pretrained(model_id),
      AutoModelForCausalLM.from_pretrained(model_id),
  )

def generate_text(tokenizer, model, prompt, max_length=100, **generate_kwargs):
  """Generate a continuation of `prompt` and return it as decoded text.

  Args:
    tokenizer: tokenizer used to encode the prompt and decode the result.
    model: model exposing a `generate` method.
    prompt: the input text.
    max_length: forwarded to `model.generate` as `max_length`.
    **generate_kwargs: extra keyword arguments forwarded to `model.generate`
      (for example `do_sample=False`). `pad_token_id` defaults to the
      tokenizer's EOS token id when not given.

  Returns:
    The first generated sequence, decoded with special tokens skipped.
  """
  # Calling the tokenizer (rather than `encode`) also returns the attention mask.
  # `generate` cannot infer it when the pad token equals the EOS token, which is
  # the case here because `pad_token_id` is set to the EOS id below.
  encoded = tokenizer(prompt, return_tensors="pt")
  generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)
  output = model.generate(
      encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      max_length=max_length,
      **generate_kwargs,
  )
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Load tokenizer and model from the locally converted Hugging Face directory.
# `tokenizer` and `model` are reused by later cells as the reference copy.
tokenizer, model = load_model('./llama_3.2-3B-huggingface')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Generate text
# The recorded outputs for this same prompt and model differ between runs, so generation
# is stochastic; seeding the RNG first makes this cell's output reproducible.
torch.manual_seed(0)
prompt = "How much is 6593 Orchard Place, Burnaby, Canada? Answer:"
rst = generate_text(tokenizer, model, prompt, max_length=150)
rst

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'How much is 6593 Orchard Place, Burnaby, Canada? Answer: The cost for 6593 Orchard Place, Burnaby, Canada is $1,999,000 CAD. This price is based on current listings on Real Estate Central. The price may be out of date. To find current prices, go to Real Estate Central. Prices of homes have fallen in Burnaby by 16% this year, with 6593 Orchard Place, Burnaby, Canada being listed 6 days ago. By this we mean in the past 90 days. The 4 bed, 4 bath, 2,100 square foot home sitting on a 0.08 acre lot has been on Real Estate Central for 6 days.'

### Rechunk the model

In [None]:
! ls '/content/drive/MyDrive/Llama3.2-3B'
! cp -r '/content/drive/MyDrive/Llama3.2-3B' './Llama3.2-3B'

! python /usr/local/lib/python3.10/dist-packages/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ./Llama3.2-3B --model_size 3B --llama_version 3.2 --output_dir ./llama_3.2-3B-huggingface

checklist.chk  consolidated.00.pth  params.json  tokenizer.model
2024-10-26 07:32:23.694335: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-26 07:32:23.716719: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-26 07:32:23.723467: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Converting the tokenizer.
Saving a LlamaTokenizerFast to ./llama_3.2-3B-huggingface.
Converting the model.
Fetching all parameters from the checkpoint at ./Llama3.2-3B.
  loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
Loading the checkpoint in a Llama m

In [None]:
def split_safetensors(root_path, safetensor_file, embed_layer, shardnum):
  """Write one tensor of a safetensors file out as several chunk files.

  The tensor stored under `embed_layer` is chunked with `torch.chunk` (along its
  first dimension) and each chunk is saved under the same tensor name in its own
  file `<embed_layer>-NN.safetensors` inside `root_path`.

  Args:
    root_path: directory that receives the chunk files.
    safetensor_file: path of the safetensors file to read from.
    embed_layer: name of the tensor to split.
    shardnum: number of chunks requested from `torch.chunk`.

  Returns:
    The chunk file names (relative to `root_path`), in chunk order.
  """
  written = []

  with safe_open(safetensor_file, framework="pt", device="cpu") as reader:
    chunks = torch.chunk(reader.get_tensor(embed_layer), shardnum)

    for i in range(len(chunks)):
      chunk_name = f"{embed_layer}-{str(i).zfill(2)}.safetensors"
      chunk_path = os.path.join(root_path, chunk_name)
      save_file({embed_layer: chunks[i]}, chunk_path, metadata={'format': 'pt'})
      written.append(chunk_name)

  return written

def merge_safetensors(root_path, safetensor_files, output_file):
    """Concatenate the tensors of several safetensors files into one file.

    Tensors are grouped by name; all tensors sharing a name are concatenated
    along dimension 0 in the order of `safetensor_files`, and the result is
    saved under that name in `output_file` inside `root_path`.

    Args:
      root_path: directory containing the input files and receiving the output.
      safetensor_files: file names (relative to `root_path`) to merge, in order.
      output_file: name of the merged file to write inside `root_path`.

    Returns:
      `output_file`, unchanged.

    Raises:
      ValueError: if the input files contain no tensors (for example an empty list).
    """
    tensors_by_key = {}

    for safetensor_file in safetensor_files:
        with safe_open(os.path.join(root_path, safetensor_file), framework="pt", device="cpu") as f:
            for key in f.keys():
                tensors_by_key.setdefault(key, []).append(f.get_tensor(key))

    # Fail clearly instead of relying on a loop variable that was never bound.
    if not tensors_by_key:
        raise ValueError("merge_safetensors: no tensors found in the given files")

    # Each name keeps its own concatenation, so differently named tensors are never mixed.
    merged_data = {key: torch.cat(parts, dim=0) for key, parts in tensors_by_key.items()}
    save_file(merged_data, os.path.join(root_path, output_file), metadata={'format': 'pt'})
    return output_file

In [None]:
# Rewrite the two converted checkpoint shards as one safetensors file per tensor.
root_path = './llama_3.2-3B-huggingface/'
embed_layer = 'embed_tokens'

safetensor_files = [
    f"{root_path}model-00001-of-00002.safetensors",
    f"{root_path}model-00002-of-00002.safetensors",
]
overall_count = 0
index_names = []

# Open the safetensors file and load the tensors
for safetensor_file in safetensor_files:
  with safe_open(safetensor_file, framework="pt", device="cpu") as f:
    for key in f.keys():
      if embed_layer not in key:
        # Ordinary tensor: saved alone in a file numbered by the running counter.
        out_name = f"model-{str(overall_count).zfill(5)}-{key}.safetensors"
        save_file({key: f.get_tensor(key)}, os.path.join(root_path, out_name), metadata={'format': 'pt'})
      else:
        # Embedding tensor: split into 10 chunk files, then merged back into a single file.
        # The chunk files written by split_safetensors stay in root_path.
        out_name = f"{embed_layer}.safetensors"
        split_file_names = split_safetensors(root_path, safetensor_file, key, 10)
        merge_safetensors(root_path, split_file_names, out_name)
      index_names.append(out_name)
      overall_count += 1

# Generate new index json
# Map every tensor name to the new file that now stores it.
indexs = {}
for path in index_names:
  with safe_open(os.path.join(root_path, path), framework="pt", device="cpu") as f:
    indexs.update(dict.fromkeys(f.keys(), path))


# Change new index JSON
# Point each weight_map entry at its new file; other fields of the index are left untouched.
safetensor_file = "./llama_3.2-3B-huggingface/model.safetensors.index.json"
with open(safetensor_file, 'r') as f:
  index_tensors = json.load(f)

weight_map = index_tensors['weight_map']
for k in weight_map:
  weight_map[k] = indexs[k]

with open(safetensor_file, 'w') as f:
  json.dump(index_tensors, f)



### Check that the re-chunked model matches the original

In [None]:
# ! rm ./llama_3.2-3B-huggingface/model-*-of-*.safetensors

In [None]:
# Load tokenizer and model again from the same directory, which now holds the re-chunked
# per-tensor files; kept under separate names so it can be compared with `model`.
# tokenizer, model = load_model("/content/drive/MyDrive/llama_3.2-3B-huggingface")
tokenizer_1, model_1 = load_model("./llama_3.2-3B-huggingface/")


Loading checkpoint shards:   0%|          | 0/254 [00:00<?, ?it/s]

In [None]:

# Compare the original model with the re-chunked reload, parameter by parameter.
# Only differences are printed, so an empty output means the weights are identical.
params = dict(model.named_parameters())
params_1 = dict(model_1.named_parameters())

# Matching by name (instead of zipping positions) also reports missing or extra parameters.
for name in sorted(params.keys() ^ params_1.keys()):
  print(name, "exists in only one of the two models")

for name, param in params.items():
  # torch.equal is False for differing shapes or values instead of relying on broadcasting.
  if name in params_1 and not torch.equal(param, params_1[name]):
    print(name, param, params_1[name])

In [None]:
# Generate text
# The recorded outputs for this prompt differ between runs, so generation is stochastic;
# seeding the RNG first makes this cell's output reproducible.
torch.manual_seed(0)
prompt = "How much is 6593 Orchard Place, Burnaby, Canada? Answer:"
rst = generate_text(tokenizer_1, model_1, prompt, max_length=150)
rst

'How much is 6593 Orchard Place, Burnaby, Canada? Answer: The cost of 6593 Orchard Place, Burnaby, Canada is C$ 1,995,000.\nHow much is 6593 Orchard Place, Burnaby, Canada?\nThe price of 6593 Orchard Place, Burnaby, Canada is C$ 1,995,000.'

In [None]:
# Generate text
# The recorded outputs for this prompt differ between runs, so generation is stochastic;
# seeding the RNG first makes this cell's output reproducible.
torch.manual_seed(0)
prompt = "How much is 6593 Orchard Place, Burnaby, Canada? Answer:"
rst = generate_text(tokenizer, model, prompt, max_length=150)
rst

"How much is 6593 Orchard Place, Burnaby, Canada? Answer: The price is C$ 2,798,000 or per square meter 700 CAD.\n6593 Orchard Place is a townhouse located in Burnaby, Canada. The price per square meter for this townhouse is C$ 700 CAD and is 6593 square meters (or 7069 square feet), on 6593 square meters of land.\nIf you're the do-it-yourself type, you can save on the cost of a real estate agent by buying this property and doing the work yourself, or by hiring your own contractor. The cost of a real estate agent's commission in this case would be 2.75% of the purchase"

### Git Push

In [None]:
shardnum = 5

# One folder per shard, named after the model directory; each is recreated empty on every run.
shard_prefix = root_path.rstrip("/")
shard_folders = [shard_prefix + f"-shard{x}" for x in range(shardnum)]
for shard_folder in shard_folders:
  if os.path.exists(shard_folder):
    shutil.rmtree(shard_folder)
  os.mkdir(shard_folder)

# Order the files: everything that is not a .safetensors file first, then the
# .safetensors files, each group sorted by name.
files = sorted(os.listdir(root_path), key=lambda x: (x.endswith(".safetensors"), x))

def split_list(lst, n_pieces):
    """Cut `lst` into `n_pieces` consecutive slices of near-equal length.

    The first `len(lst) % n_pieces` slices hold one extra element; trailing
    slices are empty when there are fewer elements than pieces.
    """
    base_size, extra = divmod(len(lst), n_pieces)
    result = []
    start = 0
    for i in range(n_pieces):
        # The leading `extra` pieces absorb the remainder, one element each.
        stop = start + base_size + (1 if i < extra else 0)
        result.append(lst[start:stop])
        start = stop
    return result

# Distribute the ordered file list over the shard folders, one slice per folder.
pieces = split_list(files, shardnum)

# Pair each slice with its folder directly; looking the folder up with
# pieces.index(piece) would search by value for every file and return the first equal slice.
for shard_folder, piece in zip(shard_folders, pieces):
  for file in piece:
    shutil.move(os.path.join(root_path, file), os.path.join(shard_folder, file))


In [None]:
# List the model directory (its files were moved out above) and then each shard folder.
! ls -lhtr ./llama_3.2-3B-huggingface

# NOTE(review): the folder names are hard-coded for shardnum = 5; keep them in sync if shardnum changes.
! ls -lhtr ./llama_3.2-3B-huggingface-shard0
! ls -lhtr ./llama_3.2-3B-huggingface-shard1
! ls -lhtr ./llama_3.2-3B-huggingface-shard2
! ls -lhtr ./llama_3.2-3B-huggingface-shard3
! ls -lhtr ./llama_3.2-3B-huggingface-shard4

total 0
total 1.1G
-rw-r--r-- 1 root root  50K Oct 26 07:32 tokenizer_config.json
-rw-r--r-- 1 root root  301 Oct 26 07:32 special_tokens_map.json
-rw-r--r-- 1 root root  17M Oct 26 07:32 tokenizer.json
-rw-r--r-- 1 root root  839 Oct 26 07:33 config.json
-rw-r--r-- 1 root root  150 Oct 26 07:33 generation_config.json
-rw-r--r-- 1 root root 6.2K Oct 26 08:00 model-00001-model.layers.0.input_layernorm.weight.safetensors
-rw-r--r-- 1 root root  49M Oct 26 08:00 model-00002-model.layers.0.mlp.down_proj.weight.safetensors
-rw-r--r-- 1 root root  49M Oct 26 08:00 model-00003-model.layers.0.mlp.gate_proj.weight.safetensors
-rw-r--r-- 1 root root  49M Oct 26 08:00 model-00004-model.layers.0.mlp.up_proj.weight.safetensors
-rw-r--r-- 1 root root 6.2K Oct 26 08:00 model-00005-model.layers.0.post_attention_layernorm.weight.safetensors
-rw-r--r-- 1 root root 6.1M Oct 26 08:00 model-00006-model.layers.0.self_attn.k_proj.weight.safetensors
-rw-r--r-- 1 root root  19M Oct 26 08:00 model-00007-model.l

In [None]:
# Move every shard folder from local disk to Google Drive.
# NOTE(review): the folder names are hard-coded for shardnum = 5; keep them in sync if shardnum changes.
! mv ./llama_3.2-3B-huggingface-shard0 /content/drive/MyDrive/
! mv ./llama_3.2-3B-huggingface-shard1 /content/drive/MyDrive/
! mv ./llama_3.2-3B-huggingface-shard2 /content/drive/MyDrive/
! mv ./llama_3.2-3B-huggingface-shard3 /content/drive/MyDrive/
! mv ./llama_3.2-3B-huggingface-shard4 /content/drive/MyDrive/

In [None]:
# Scratch check of the last and first chunk shapes.
# NOTE(review): in the cells in view, `splits` is only assigned inside split_safetensors, so this
# name presumably came from an earlier interactive session; it likely fails on Restart & Run All — confirm.
splits[9].shape, splits[0].shape

(torch.Size([12822, 3072]), torch.Size([12826, 3072]))

In [None]:
# NOTE(review): shard0 is moved to Drive by the `mv` cell above, so this local listing
# presumably reflects a run made before that move — confirm the intended cell order.
! ls ./llama_3.2-3B-huggingface-shard0

config.json
generation_config.json
model-00000-model.embed_tokens.weight.safetensors
model-00001-model.layers.0.input_layernorm.weight.safetensors
model-00002-model.layers.0.mlp.down_proj.weight.safetensors
model-00003-model.layers.0.mlp.gate_proj.weight.safetensors
model-00004-model.layers.0.mlp.up_proj.weight.safetensors
model-00005-model.layers.0.post_attention_layernorm.weight.safetensors
model-00006-model.layers.0.self_attn.k_proj.weight.safetensors
model-00007-model.layers.0.self_attn.o_proj.weight.safetensors
model-00008-model.layers.0.self_attn.q_proj.weight.safetensors
model-00009-model.layers.0.self_attn.v_proj.weight.safetensors
model-00010-model.layers.1.input_layernorm.weight.safetensors
model-00011-model.layers.1.mlp.down_proj.weight.safetensors
model-00012-model.layers.1.mlp.gate_proj.weight.safetensors
model-00013-model.layers.1.mlp.up_proj.weight.safetensors
model-00014-model.layers.1.post_attention_layernorm.weight.safetensors
model-00015-model.layers.1.self_attn.k_pr