# LLaMA 2 Finetuning

*Code reference: https://www.philschmid.de/sagemaker-llama2-qlora*

## Dependencies

In [59]:
!pip install "transformers==4.31.0" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Environment Variables

In [60]:
# Get the token from .env file
import os
from dotenv import load_dotenv
load_dotenv()

# Get the token from the environment variable called HF_TOKEN
hf_token = os.getenv("HF_TOKEN")

!huggingface-cli login --token {hf_token}


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\Riley\.cache\huggingface\token
Login successful


## Setup Development Environment

In [61]:
import sagemaker
import boto3

aws_defaulted_region = os.getenv("AWS_DEFAULT_REGION")
print(aws_defaulted_region)
boto_session = boto3.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name=os.getenv('AWS_DEFAULT_REGION')
)
sess = sagemaker.Session(boto_session=boto_session)
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = os.getenv("SAGEMAKER_ROLE_ARN")
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")



us-west-2
sagemaker role arn: arn:aws:iam::960115796077:role/service-role/AmazonSageMaker-ExecutionRole-20230817T193823
sagemaker bucket: sagemaker-us-west-2-960115796077
sagemaker session region: us-west-2


## Load Dataset

In [62]:
from datasets import Dataset

# Reading data from files into Python lists
original_path = "datasets/simpa-master/ss-original.txt"
lexical_path = "datasets/simpa-master/ss-ls-simplified.txt"
syntactic_path = "datasets/simpa-master/ss-simplified.txt"

with open(original_path, 'r', encoding='utf-8') as f:
    original_sentences = [line.strip() for line in f]
with open(lexical_path, 'r', encoding='utf-8') as f:
    lexical_sentences = [line.strip() for line in f]
with open(syntactic_path, 'r', encoding='utf-8') as f:
    syntactic_sentences = [line.strip() for line in f]

# Convert to Dataset object
data_syntactic = {
    'instruction': original_sentences,
    'response': syntactic_sentences,
}
dataset_syntactic = Dataset.from_dict(data_syntactic)

data_lexical = {
    'instruction': original_sentences,
    'response': lexical_sentences,
}
dataset_lexical = Dataset.from_dict(data_lexical)

print(f"dataset size: {len(dataset_syntactic)}")
print(dataset_syntactic[randrange(len(dataset_syntactic))])
print(f"dataset size: {len(dataset_lexical)}")
print(dataset_lexical[randrange(len(dataset_lexical))])



dataset size: 1100
{'instruction': 'The exhibitions served a number of purposes - their main focus was to promote business and industry, open up new markets, and generally to outperform competitors, in an increasingly global economic market.', 'response': 'The exhibitions served a number of purposes. The exhibitions main focus was to promote business and industry, open up new markets, and generally to outperform competitors, in an increasingly global economic market'}
dataset size: 1100
{'instruction': 'Regenerating Sheffield, ensuring all new development is sustainable, raising design quality and launching a new City Plan.', 'response': 'Regenerating Sheffield, making sure all new development is sustainable, improving design standards and launching a new City Plan.'}


## Format Data For Finetuning

In [63]:
def format_syntactic(example):
    instruction = f"### Instruction\nPlease syntactically simplify: {example['instruction']}"
    response = f"### Answer\n{example['response']}"
    return {'instruction': instruction, 'response': response}

def format_lexical(example):
    instruction = f"### Instruction\nPlease lexically simplify: {example['instruction']}"
    response = f"### Answer\n{example['response']}"
    return {'instruction': instruction, 'response': response}

print(type(dataset_syntactic))
print(dataset_syntactic.features.type)
print(format_syntactic(dataset_syntactic[randrange(len(dataset_syntactic))]))

<class 'datasets.arrow_dataset.Dataset'>
struct<instruction: string, response: string>
{'instruction': '### Instruction\nPlease syntactically simplify: Our guidance on organised firework displays and bonfires sets out a few simple measures that can help you celebrate safely.', 'response': '### Answer\nTo help you celebrate safely, our guidance on organised firework displays and bonfires sets out a few simple measures.'}


In [64]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
tokenizer = AutoTokenizer.from_pretrained(model_id,use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token




In [65]:
from itertools import chain
from functools import partial

def chunk_syntactic(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batc
    global remainder_syntactic
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder_syntactic[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder_syntactic to global variable for next batch
    remainder_syntactic = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result

def chunk_lexical(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batc
    global remainder_lexical
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder_lexical[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder_lexical to global variable for next batch
    remainder_lexical = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result

In [66]:
from random import randint

# template dataset to add prompt to each sample
def template_dataset_syntactic(sample):
    sample["text"] = f"{format_syntactic(sample)}{tokenizer.eos_token}"
    return sample

def template_dataset_lexical(sample):
    sample["text"] = f"{format_lexical(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset_syntactic = dataset_syntactic.map(template_dataset_syntactic, remove_columns=list(dataset_syntactic.features))
dataset_lexical = dataset_lexical.map(template_dataset_lexical, remove_columns=list(dataset_lexical.features))

# print random sample
print(dataset_syntactic[randint(0, len(dataset_syntactic))]["text"])
print(dataset_lexical[randint(0, len(dataset_lexical))]["text"])

# empty list to save remainder from batches to use in next batch
remainder_syntactic = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
remainder_lexical = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

# tokenize and chunk dataset
lm_dataset_syntactic = dataset_syntactic.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset_syntactic.features)
).map(
    partial(chunk_syntactic, chunk_length=2048),
    batched=True,
)

lm_dataset_lexical = dataset_lexical.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset_lexical.features)
).map(
    partial(chunk_lexical, chunk_length=2048),
    batched=True,
)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset_lexical)}")
print(f"Total number of samples: {len(lm_dataset_syntactic)}")

                                                    

{'instruction': '### Instruction\nPlease syntactically simplify: This study looks at the highways and transport impacts of a possible major retail development at Manor Top District Shopping Centre.', 'response': '### Answer\nThis study looks at the effects of roads and transport for a possible large shopping development. This will be at Manor Top District Shopping Centre.'}</s>
{'instruction': '### Instruction\nPlease lexically simplify: Most wards kept their name and much of their current electorate, but a small number of wards had significant differences.', 'response': '### Answer\nMost wards kept their name and most of their electorate, but a small number of wards had big differences.'}</s>


                                                                 

Total number of samples: 54
Total number of samples: 55




In [67]:
from datasets import concatenate_datasets

# Combine the datasets
combined_dataset = concatenate_datasets([lm_dataset_syntactic, lm_dataset_lexical])

In [68]:
# Print total number of samples
print(f"Total number of samples: {len(combined_dataset)}")

Total number of samples: 109


In [69]:
# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/processed/llama/simplereads/train'
combined_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")


                                                                                         

uploaded data to:
training dataset to: s3://sagemaker-us-west-2-960115796077/processed/llama/simplereads/train


In [70]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# define Training Job Name
job_name = f'huggingface-qlora-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# hyperparameters, which aref passed into the training job
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'epochs': 3,                                      # number of training epochs
  'per_device_train_batch_size': 2,                 # batch size for training
  'lr': 2e-4,                                       # learning rate used during training
  'hf_token': HfFolder.get_token(),                 # huggingface token to access llama 2
  'merge_weights': True,                            # wether to merge LoRA into the model (needs more memory)
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_clm.py',      # train script
    source_dir           = 'scripts',         # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
)


In [71]:
# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}

# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-2023-09-02-23-54-15-2023-09-02-13-54-15-386


2023-09-02 13:54:19 Starting - Starting the training job...
2023-09-02 13:54:34 Starting - Preparing the instances for training......
2023-09-02 13:55:48 Downloading - Downloading input data
2023-09-02 13:55:48 Training - Downloading the training image..............................
2023-09-02 14:01:05 Training - Training image download completed. Training in progress......bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-09-02 14:02:04,276 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-09-02 14:02:04,289 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-09-02 14:02:04,298 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-09-02 14:02:04,300 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-09-02 14:02:05,612 sagemaker-training-toolkit INFO     Installing 