# LLaMA 2 Finetuning

*Code reference: https://www.philschmid.de/sagemaker-llama2-qlora*

## Dependencies

In [12]:
!pip install "transformers==4.31.0" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet
!pip install -r requirements.txt --quiet


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Environment Variables

In [2]:
# Get the token from .env file
import os
from dotenv import load_dotenv
load_dotenv()

# Get the token from the environment variable called HF_TOKEN
hf_token = os.getenv("HF_TOKEN")

!huggingface-cli login --token {hf_token}


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\Riley\.cache\huggingface\token
Login successful


## Setup Development Environment

In [4]:
import sagemaker
import boto3

# Region used for every AWS client created in this notebook
aws_defaulted_region = os.getenv("AWS_DEFAULT_REGION")
boto_session = boto3.Session(region_name=aws_defaulted_region)
sess = sagemaker.Session(boto_session=boto_session)
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = os.environ['SAGEMAKER_ROLE_ARN']
except KeyError:
    # os.environ[...] raises KeyError (not ValueError) for a missing variable;
    # fall back to looking the execution role up by name via IAM.
    iam = boto_session.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session with the resolved bucket, keeping the same boto session
# so the region configured above is not silently dropped.
sess = sagemaker.Session(boto_session=boto_session, default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker role arn: arn:aws:iam::960115796077:role/service-role/AmazonSageMaker-ExecutionRole-20230817T193823
sagemaker bucket: sagemaker-ap-southeast-2-960115796077
sagemaker session region: ap-southeast-2


## Load Dataset

In [7]:
from datasets import load_dataset
from random import randrange

# Fetch the train split of the Dolly 15k instruction dataset from the hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Report how many records were loaded, then show one picked at random
num_samples = len(dataset)
print(f"dataset size: {num_samples}")
sample_index = randrange(num_samples)
print(dataset[sample_index])
# dataset size: 15011

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 8.20k/8.20k [00:00<00:00, 8.14MB/s]


Downloading and preparing dataset json/databricks--databricks-dolly-15k to C:/Users/Riley/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data: 100%|██████████| 13.1M/13.1M [00:02<00:00, 4.90MB/s]
Downloading data files: 100%|██████████| 1/1 [00:04<00:00,  4.80s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 86.87it/s]
                                                        

Dataset json downloaded and prepared to C:/Users/Riley/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.
dataset size: 15011
{'instruction': 'Classify each of the following on the basis of text (alphabetically) or numbers (numerically) - Bangalore, 265, Apple, 10, Bag, 49.', 'context': '', 'response': 'The classification can be done either on the basis of numbers which could be either numerically or in roman context. If we eliminate the same, the other one will be on the basis of text. Based on the above explanation, it could be classified as :\nText - Bangalore, Apple, Bag\nNumbers - 265, 10, 49', 'category': 'classification'}




## Format Data For Finetuning

In [5]:
def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    The prompt is made of "### Instruction", an optional "### Context"
    (only when the record's context is non-empty) and "### Answer"
    sections, separated by blank lines.

    Args:
        sample: mapping with the keys 'instruction', 'context' and 'response'.

    Returns:
        The assembled prompt text.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    # the context section is skipped entirely for records without context
    if len(sample["context"]) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)



In [8]:
from random import randrange

# pick one random record and show it after prompt formatting
sample_index = randrange(len(dataset))
print(format_dolly(dataset[sample_index]))


### Instruction
Extract the destination countries of Azorean emigrants. Separate them with a comma.

### Context
Since the 17th century, many Azoreans have emigrated, mainly to Brazil, Uruguay, the United States and Canada.

### Answer
Brazil, Uruguay, United States, Canada


In [9]:
from transformers import AutoTokenizer

# Hub id of the base model to fine-tune (sharded weights)
model_id = "meta-llama/Llama-2-13b-hf"
# use_auth_token=True makes the download use the locally stored hub token
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=True,
)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token


Downloading (…)okenizer_config.json: 100%|██████████| 776/776 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 955kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:01<00:00, 1.60MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 414kB/s]


In [10]:
from random import randint
from itertools import chain
from functools import partial


# build the training text for one sample: formatted prompt + EOS token
def template_dataset(sample):
    """Add a "text" field holding the formatted prompt followed by the tokenizer's EOS token.

    The sample is modified in place and also returned.
    """
    prompt = format_dolly(sample)
    sample["text"] = f"{prompt}{tokenizer.eos_token}"
    return sample


# apply prompt template per sample, keeping only the generated "text" column
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
# print random sample; randint is inclusive on BOTH ends, so the upper bound
# must be len(dataset) - 1 to stay inside the valid index range
print(dataset[randint(0, len(dataset) - 1)]["text"])

# leftover tokens from the previous batch, carried over into the next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized examples into fixed-size chunks.

    All sequences of each key in the batch are concatenated (prefixed with
    the leftover tokens of the previous call) and cut into pieces of exactly
    ``chunk_length`` items. Tokens that do not fill a complete chunk are kept
    in the module-level ``remainder`` and prepended on the next call; whatever
    is left after the last call is not emitted.

    Args:
        sample: batch mapping each key (e.g. "input_ids") to a list of lists.
        chunk_length: number of items per output chunk.

    Returns:
        Dict with the same keys as ``sample`` holding lists of chunks, plus a
        "labels" entry that is a shallow copy of the "input_ids" chunks. A
        batch too short to fill a single chunk yields empty lists.
    """
    # module-level state: stores the leftover of this batch for the next one
    global remainder
    # Concatenate all texts and add remainder from previous batch
    # (keys unknown to the remainder simply start with no carried-over tokens)
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain(*sample[k])) for k in sample.keys()
    }
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # largest multiple of chunk_length that fits; 0 when the batch is too short,
    # so the slicing below is always well-defined
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# step 1: tokenize the "text" column in batches and drop the raw text columns
def _tokenize_batch(batch):
    """Run the tokenizer over the "text" entries of one batch."""
    return tokenizer(batch["text"])


tokenized_dataset = dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=list(dataset.features),
)

# step 2: pack the token sequences into fixed-length chunks of 2048 tokens
chunk_2048 = partial(chunk, chunk_length=2048)
lm_dataset = tokenized_dataset.map(chunk_2048, batched=True)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")


                                                                    

### Instruction
Are these blue or red colors? Pink, navy, maroon, mahogany, teal, sky, robin, scarlet

### Answer
Pink: red, navy: blue, maroon: red, mahogany: red, teal: blue, sky: blue, robin: blue, scarlet: red</s>


                                                                   

Total number of samples: 1581




In [11]:
# write the processed training dataset to the session's default S3 bucket
bucket_name = sess.default_bucket()
training_input_path = f"s3://{bucket_name}/processed/llama/dolly/train"
lm_dataset.save_to_disk(training_input_path)

# report where the data ended up
print("uploaded data to:")
print(f"training dataset to: {training_input_path}")


                                                                                           

uploaded data to:
training dataset to: s3://sagemaker-ap-southeast-2-960115796077/processed/llama/dolly/train


In [None]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# define the training job name, suffixed with the current local time
job_name = f'huggingface-qlora-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# hyperparameters, which are passed into the training job
# NOTE(review): the Hugging Face token is passed below as a plain hyperparameter;
# hyperparameters are presumably visible in the training job description —
# confirm this exposure is acceptable for the account.
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'epochs': 3,                                      # number of training epochs
  'per_device_train_batch_size': 2,                 # batch size for training
  'lr': 2e-4,                                       # learning rate used during training
  'hf_token': HfFolder.get_token(),                 # huggingface token to access llama 2
  'merge_weights': True,                            # whether to merge LoRA into the model (needs more memory)
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_clm.py',      # train script
    source_dir           = 'scripts',         # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instance type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # base name of the training job
    role                 = role,              # IAM role used in training job to access AWS resources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
)
