In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# Specify the profile name and region
profile_name = '###'  # input
aws_region = '###'  # input

# Create a boto3 session bound to the chosen profile and region
boto_session = boto3.Session(profile_name=profile_name, region_name=aws_region)

# Build the SageMaker session on top of that boto session. The SageMaker
# client is created in `aws_region` as well (it used to be pinned to
# 'us-west-2'), so the client and the session never point at different regions.
sagemaker_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=boto_session.client('sagemaker', region_name=aws_region),
)
print(sagemaker_session.boto_region_name)

# ARN of the caller identity behind the session's credentials.
# NOTE(review): with a local profile this is presumably a user/role ARN rather
# than a SageMaker execution role - confirm before using it as a training role.
aws_role = sagemaker_session.get_caller_identity_arn()

# Default S3 bucket of the SageMaker session
output_bucket = sagemaker_session.default_bucket()

# ANSI escape helpers used to format the printed summary
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

print(f"{bold}AWS Region:{unbold} {aws_region}")
print(f"{bold}AWS Role:{unbold} {aws_role}")
print(f"{bold}Output Bucket:{unbold} {output_bucket}")

In [None]:
from datasets import load_dataset, Dataset

# Answer text stored for SQuAD v2 questions that come without any gold answer.
NO_ANSWER_TEXT = "I'm sorry, I don't have the information or knowledge to provide an answer to your question. However, I'm here to assist you with any other inquiries you may have related to the course. Feel free to ask!"

squad_dataset = load_dataset("squad_v2", split="train")

questions, contexts, answers = [], [], []
answer_num = 0
non_answer_num = 0

for example in squad_dataset:
    # Extract question and context
    questions.append(example['question'])
    contexts.append(example['context'])

    # Keep the first listed answer when one exists; otherwise store the
    # fallback reply so every row still has an "answer" value.
    gold_answers = example['answers']['text']
    if gold_answers:
        answers.append(gold_answers[0])
        answer_num += 1
    else:
        answers.append(NO_ANSWER_TEXT)
        non_answer_num += 1

# Assemble the columns once, after the loop. Building this dict inside the
# loop redid the work on every row and left the name undefined for an empty
# dataset.
transformed_data = {
    "question": questions,
    "context": contexts,
    "answer": answers
}

transformed_squad_dataset = Dataset.from_dict(transformed_data)

# Hold out 10% as a test split for evaluation at the end. The seed is fixed so
# the split (and therefore train.jsonl) is the same on every run.
train_and_test_dataset = transformed_squad_dataset.train_test_split(test_size=0.1, seed=42)

# Dump the training split to a local JSON Lines file to be used for training.
train_and_test_dataset["train"].to_json("train.jsonl")

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing answerable and non-answerable question counts in SQuAD 2.0
categories = ['Answerable', 'Non-answerable']
counts = [answer_num, non_answer_num]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, color=['blue', 'red'])
ax.set_title('Comparison of Answerable and Non-answerable Questions in SQuAD 2.0')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Questions')
plt.show()

In [None]:
# Display the first record of the training split to check its fields.
train_and_test_dataset["train"][0]

In [None]:
import json

# Prompt/completion template written to template.json. The adjacent string
# literals under "prompt" are concatenated by Python into one string, so each
# piece must carry its own trailing whitespace: the first piece now ends with
# a space (the sentences previously ran together as "question.Write").
# {question}, {context} and {answer} are left as literal placeholders here.
# NOTE(review): the prompt asks for 'I don't know' on unanswerable questions,
# while the training answers for those rows use a longer apology sentence -
# confirm which wording the model is meant to learn.
template = {
    "prompt": "Below is a question paired with an context that related to the question. "
    "Write an answer that appropriately answer the question, if the information in the context cannot answer the question, answer 'I don't know'.\n\n"
    "### question:\n{question}\n\n### context:\n{context}\n\n",
    "completion": "{answer}",
}
with open("template.json", "w") as f:
    json.dump(template, f)

In [None]:
from sagemaker.s3 import S3Uploader
import sagemaker
import boto3

# Destination prefix inside the session's default S3 bucket
output_bucket = sagemaker_session.default_bucket()
train_data_location = f"s3://{output_bucket}/squad_dataset_mistral"

local_data_file = "train.jsonl"

# Upload the training file and then the prompt template to the same prefix,
# both through the existing SageMaker session.
for artifact_path in (local_data_file, "template.json"):
    S3Uploader.upload(
        local_path=artifact_path,
        desired_s3_uri=train_data_location,
        sagemaker_session=sagemaker_session,
    )

print(f"Training data: {train_data_location}")

In [None]:
import numpy as np

import os

np.random.seed(42)  # Fixed seed so the same 2000 rows are drawn on every run
random_indices = np.random.permutation(len(transformed_squad_dataset))[:2000]

# Select the rows at the sampled indices.
# NOTE(review): the sample is drawn from the full transformed dataset, so it
# can include rows that also sit in train_and_test_dataset["test"] - confirm
# this overlap is acceptable if that test split is used for evaluation.
mini_train_dataset = transformed_squad_dataset.select(random_indices)

# Create the output directory first; on a fresh checkout "mini_train/" does
# not exist yet and the write below would fail.
os.makedirs("mini_train", exist_ok=True)

# Save the subset to a local file to be used for training.
mini_train_dataset.to_json("mini_train/train.jsonl")

In [None]:
# Rows whose "answer" equals this fallback sentence are counted as
# non-answerable; every other row is counted as answerable.
fallback_answer = "I'm sorry, I don't have the information or knowledge to provide an answer to your question. However, I'm here to assist you with any other inquiries you may have related to the course. Feel free to ask!"

mini_non_answer_num = sum(
    1 for row in mini_train_dataset if row['answer'] == fallback_answer
)
mini_answer_num = len(mini_train_dataset) - mini_non_answer_num

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing answerable and non-answerable question counts in the mini subset
categories = ['Answerable', 'Non-answerable']
counts = [mini_answer_num, mini_non_answer_num]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, color=['blue', 'red'])
ax.set_title('Comparison of Answerable and Non-answerable Questions in mini SQuAD 2.0')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Questions')
plt.show()

In [None]:
from sagemaker.s3 import S3Uploader
import sagemaker
import boto3

# Destination prefix for the mini dataset inside the session's default S3 bucket
output_bucket = sagemaker_session.default_bucket()
mini_train_data_location = f"s3://{output_bucket}/mini_squad_dataset_mistral"

local_data_file = "mini_train/train.jsonl"

# Upload the mini training file and then the prompt template to the same
# prefix, both through the existing SageMaker session.
for artifact_path in (local_data_file, "template.json"):
    S3Uploader.upload(
        local_path=artifact_path,
        desired_s3_uri=mini_train_data_location,
        sagemaker_session=sagemaker_session,
    )

print(f"Training data: {mini_train_data_location}")

In [None]:
# JumpStart model identifier and the version used by the cells below
model_id = "huggingface-llm-mistral-7b"
model_version = "2.2.1"

In [None]:
from sagemaker import hyperparameters

# Fetch the default hyperparameters for the chosen model id/version through
# the existing SageMaker session, then print them for inspection.
my_hyperparameters = hyperparameters.retrieve_default(
    model_id=model_id, model_version=model_version, sagemaker_session=sagemaker_session
)
print(my_hyperparameters)

In [None]:
# Override selected defaults for this fine-tuning run; all values are passed
# as strings, matching the entries already set in my_hyperparameters.
my_hyperparameters.update(
    {
        "epoch": "1",
        "per_device_train_batch_size": "1",
        "gradient_accumulation_steps": "2",
        "instruction_tuned": "True",
        "peft_type": "lora",
        "gradient_checkpointing": "False",
    }
)
print(my_hyperparameters)

In [None]:
# Validate the edited hyperparameters against the same model id/version
# before they are handed to the estimator.
hyperparameters.validate(
    model_id=model_id, model_version=model_version, hyperparameters=my_hyperparameters, sagemaker_session=sagemaker_session
)

In [None]:
from sagemaker.jumpstart.estimator import JumpStartEstimator
# Configure the JumpStart fine-tuning job.
# - model_version is passed explicitly so the job uses the same version the
#   hyperparameters above were retrieved and validated for.
# - region reuses aws_region from the session setup instead of a second,
#   separately typed placeholder.
instruction_tuned_estimator = JumpStartEstimator(
    model_id=model_id,
    model_version=model_version,
    region=aws_region,
    hyperparameters=my_hyperparameters,
    instance_type="###", # input
    role = "###", # input
    sagemaker_session=sagemaker_session,
)
print(instruction_tuned_estimator.sagemaker_session.boto_region_name)

# Print the location that is actually used for training (the mini dataset),
# not the full-dataset prefix.
print(mini_train_data_location)
instruction_tuned_estimator.fit({"train": mini_train_data_location}, logs=True)

In [None]:
from sagemaker import TrainingJobAnalytics

# Pull the metrics recorded for the most recent training job into a DataFrame
training_job_name = instruction_tuned_estimator.latest_training_job.job_name
job_analytics = TrainingJobAnalytics(
    training_job_name=training_job_name,
    sagemaker_session=sagemaker_session,
)
df = job_analytics.dataframe()

# Split the rows by metric name into evaluation-loss and training-loss series
metric_names = df["metric_name"]
df_eval_loss = df[metric_names.str.contains("eval-loss")]
df_train_loss = df[metric_names.str.contains("train-loss")]

# Plot both loss curves against their timestamps on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_eval_loss["timestamp"], df_eval_loss["value"], label='Eval Loss')
ax.plot(df_train_loss["timestamp"], df_train_loss["value"], label='Train Loss')

ax.set_title('Loss over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Loss Value')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Record the installed SageMaker SDK version alongside the results.
print(sagemaker.__version__)

In [None]:
# Deploy the fine-tuned estimator with one ml.g5.2xlarge instance and keep the
# returned predictor for later calls and cleanup.
instruction_tuned_predictor = instruction_tuned_estimator.deploy(initial_instance_count=1, instance_type='ml.g5.2xlarge')


In [None]:
# Clean up the deployment: delete the model, then the endpoint, through the
# predictor returned by deploy().
# NOTE(review): the model is deleted before the endpoint; presumably the order
# matters for the SDK's lookups - confirm before reordering.
instruction_tuned_predictor.delete_model()
instruction_tuned_predictor.delete_endpoint()