In [None]:
import os
import boto3

# Set to True to use placeholder connection values instead of reading the
# real S3 settings from the environment.
testingMode = False

# Connection settings; each is None when not in testing mode and the
# corresponding environment variable is unset.
endpoint = "XXX" if testingMode else os.environ.get('AWS_S3_ENDPOINT')
bucket_name = "XXX" if testingMode else os.environ.get('AWS_S3_BUCKET')
key_id = "XXX" if testingMode else os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = "XXX" if testingMode else os.environ.get('AWS_SECRET_ACCESS_KEY')

# Build the session and the shared S3 client exactly once.
session = boto3.session.Session(aws_access_key_id=key_id, aws_secret_access_key=secret_key)
s3_client = boto3.client('s3',endpoint_url=endpoint,aws_access_key_id=key_id, aws_secret_access_key=secret_key)

def download_s3_folder(s3_folder, local_dir):
    """Download every object stored under an S3 prefix into a local directory.

    The key layout below ``s3_folder`` is mirrored below ``local_dir``.
    Objects are read from the module-level ``bucket_name`` through the
    module-level ``s3_client``.

    Args:
        s3_folder: Key prefix inside the bucket to download from.
        local_dir: Local directory that receives the files; sub-directories
            are created on demand.

    Raises:
        FileNotFoundError: If no downloadable object exists under the prefix.
    """
    # A single list_objects_v2 call returns only one page of results, so walk
    # every page to avoid silently truncating large folders.
    paginator = s3_client.get_paginator('list_objects_v2')
    downloaded = 0

    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder):
        # An empty page (or an empty listing) carries no 'Contents' entry.
        for obj in page.get('Contents', []):
            s3_key = obj['Key']

            # Keys ending in '/' are directory placeholders, not files.
            if s3_key.endswith('/'):
                continue

            relative_path = os.path.relpath(s3_key, s3_folder)

            # Prefix matching is purely textual: "foo" also lists "foo2/...".
            # Such keys resolve to "../..." and would land outside local_dir.
            if relative_path == os.pardir or relative_path.startswith(os.pardir + os.sep):
                continue

            local_file_path = os.path.join(local_dir, relative_path)

            # dirname is '' when the target sits directly in the working
            # directory; makedirs('') would raise.
            local_dir_path = os.path.dirname(local_file_path)
            if local_dir_path:
                os.makedirs(local_dir_path, exist_ok=True)

            s3_client.download_file(bucket_name, s3_key, local_file_path)
            print(f"Downloaded {s3_key} to {local_file_path}")
            downloaded += 1

    if downloaded == 0:
        raise FileNotFoundError(
            f"No objects found under s3://{bucket_name}/{s3_folder}"
        )
    
def upload_s3_folder(folder_path):
    """Upload every file below ``folder_path`` to the configured S3 bucket.

    Each file's key is its path relative to ``folder_path``, written with
    forward slashes. Files are sent to the module-level ``bucket_name``
    through the module-level ``s3_client``.

    A missing file or a credentials problem is reported with ``print`` and
    the walk continues with the next file; other exceptions propagate.

    Args:
        folder_path: Local directory whose contents are uploaded recursively.
    """
    # The handlers below need these names in scope; without the import, any
    # upload error other than FileNotFoundError surfaced as a NameError.
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # S3 keys use '/' regardless of the local path separator.
            s3_key = os.path.relpath(file_path, folder_path).replace(os.sep, '/')
            try:
                s3_client.upload_file(file_path, bucket_name, s3_key)
                print(f'Successfully uploaded {file_path} to s3://{bucket_name}/{s3_key}')
            except FileNotFoundError:
                print(f'The file {file_path} was not found')
            except NoCredentialsError:
                print('Credentials not available')
            except PartialCredentialsError:
                print('Incomplete credentials')

In [None]:
# Download Pipeline Training Args
import json

download_s3_folder("PipelineArgs","PipelineArgs")

with open("PipelineArgs/PipelineArgs.json", "r", encoding="utf-8") as file:
    pipelineArgs = json.load(file)

# Every key this notebook reads from the arguments file, including the two
# dataset column names used when building the training text.
required_keys = [
    "MODEL_NAME",
    "HF_MODEL_REPO_ID",
    "HF_DATASET_ID",
    "SYSTEM_INST",
    "DATASET_PROMPT_COL_NAME",
    "DATASET_ANS_COL_NAME",
]

# Stop here on an incomplete file instead of failing later, mid-training,
# with a bare KeyError.
missing_keys = [key for key in required_keys if key not in pipelineArgs]
if missing_keys:
    raise ValueError(f"The JSON file is missing the following keys: {', '.join(missing_keys)}")

# The S3 paths are built from the part after the '/' in the repo id.
if "/" not in pipelineArgs["HF_MODEL_REPO_ID"]:
    raise ValueError("HF_MODEL_REPO_ID must have the form '<namespace>/<name>'")

print("The JSON file contains all the required keys.")
print("Pipeline Arguments downloaded and verified")
print(pipelineArgs)

In [None]:
# Fetch the base model from S3 into the local "BaseModel" directory.
# The S3 prefix is <MODEL_NAME>/<repo name after the '/'>/BaseModel.
download_s3_folder(
    "{}/{}/BaseModel".format(
        pipelineArgs["MODEL_NAME"],
        pipelineArgs["HF_MODEL_REPO_ID"].split('/')[1],
    ),
    "BaseModel",
)

In [None]:
!pip install peft accelerate torch==2.3.1 transformers datasets trl sentencepiece

In [None]:
import os
import shutil
from datasets import load_dataset
from peft import LoraConfig
from peft.utils.other import prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer

# Load the base model from the local "BaseModel" directory, attach a LoRA
# adapter, load the tokenizer and build the training arguments.

# Absolute path of the local model directory (expected to be filled by the
# earlier S3 download step).
modelName = os.path.abspath("BaseModel")
model = AutoModelForCausalLM.from_pretrained(
    modelName,
    device_map = 'auto',
    token = False,  # presumably: never send a Hugging Face auth token - TODO confirm
)
# NOTE(review): no quantization config is passed to from_pretrained above;
# confirm that the k-bit training preparation is intended for this model.
model = prepare_model_for_kbit_training( model )

# LoRA hyper-parameters for the causal-LM adapter.
peft_config = LoraConfig(
    r = 32,
    lora_alpha = 16,
    bias = "none",
    lora_dropout = 0.05, # Conventional
    task_type = "CAUSAL_LM",
)
# NOTE(review): the same peft_config is also handed to SFTTrainer further
# down; confirm the adapter is meant to be attached in both places.
model.add_adapter( peft_config )
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained( modelName, use_fast=False )
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Start from an empty output directory.
if os.path.isdir( "./temp" ):
    shutil.rmtree( "./temp" )

# Save and evaluate once per epoch, then reload the best checkpoint when
# training ends.
training_arguments = TrainingArguments(
    output_dir = "./temp",
    num_train_epochs = 5,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 1,
    save_strategy = "epoch",
    eval_strategy="epoch",
    learning_rate= 2e-4,
    fp16= True,
    load_best_model_at_end=True
)

# presumably disabled because the generation cache is not needed while
# training - TODO confirm
model.config.use_cache = False

# System instruction that formatting_func places before each user prompt.
PROMPT = pipelineArgs["SYSTEM_INST"]

def formatting_func( example ):
    """Add a ``'text'`` field holding the chat-formatted training sample.

    The user turn is the global ``PROMPT`` followed by the example's prompt
    column; the assistant turn is the example's answer column. Column names
    come from ``pipelineArgs``. The example is modified in place and returned.
    """
    template = "<|im_start|>user\n{} {} <|im_end|>\n<|im_start|>assistant\n{}<|im_end|>"
    system_inst = PROMPT
    question = example[pipelineArgs["DATASET_PROMPT_COL_NAME"]]
    answer = example[pipelineArgs["DATASET_ANS_COL_NAME"]]
    example['text'] = template.format(system_inst, question, answer)
    return example

def generate_and_tokenize_prompt( prompt ):
    """Format one dataset example and tokenize the resulting text.

    Args:
        prompt: A dataset example; it is passed through ``formatting_func``,
            which adds the ``'text'`` field.

    Returns:
        Whatever the global ``tokenizer`` returns for that text, truncated to
        at most 2048 tokens.
    """
    # formatting_func returns the whole example; the tokenizer needs only its
    # 'text' string, not the dict.
    text = formatting_func( prompt )['text']
    return tokenizer( text, truncation = True, max_length = 2048 )

# Cap on the number of examples used for this run, and the seed that keeps
# both the shuffle and the train/validation split reproducible.
MAX_EXAMPLES = 200
SPLIT_SEED = 65

dataset = load_dataset(pipelineArgs["HF_DATASET_ID"],split="train")
# Clamp the subset size: select() raises on indices past the end, so a
# dataset with fewer than MAX_EXAMPLES rows would otherwise fail here.
dataset = dataset.select(range(min(MAX_EXAMPLES, len(dataset))))
# train_test_split shuffles again by default; seed it too so the split is
# the same on every run.
dataset = dataset.shuffle(seed=SPLIT_SEED).train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = dataset['train'].map(formatting_func)
valid_data = dataset['test'].map(formatting_func)

# Supervised fine-tuning on the 'text' column produced by formatting_func.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=valid_data,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

trainer.train()

# The per-epoch checkpoints are no longer needed once training has finished.
if os.path.isdir( "./temp" ):
    shutil.rmtree( "./temp" )

# Local staging path; everything below "Upload" becomes the S3 key layout.
directory = f"""Upload/{pipelineArgs["MODEL_NAME"]}/{pipelineArgs["HF_MODEL_REPO_ID"].split('/')[1]}/TrainedLoRA"""

# Remove any previous export so stale files are not uploaded with the new one.
if os.path.isdir( directory ):
    shutil.rmtree( directory )

trainer.model.save_pretrained( directory )
print( f"Model saved '{directory}'." )

upload_s3_folder("Upload")