The purpose of the **model_id** variable is to identify the specific pre-trained model used for fine-tuning.

In [None]:
# Identify the foundation model to fine-tune.
# This ID is reused below for the hyperparameter lookup and validation, the
# training estimator, and the pre-trained comparison model.
model_id = "huggingface-llm-falcon-7b-bf16"

In [None]:
# Load Amazon SEC filing training data stored in an S3 bucket.
from sagemaker.jumpstart.utils import get_jumpstart_content_bucket

In [None]:
# Get the bucket containing sample training data for the given region.
# NOTE(review): `aws_region` is not defined in the cells shown here; it must
# already be set earlier in the notebook.
data_bucket = get_jumpstart_content_bucket(aws_region)
data_prefix = "training-datasets/sec_data"

# Define S3 paths for training and validation datasets. Both paths are later
# passed to fit() as the "train" and "validation" inputs.
training_dataset_s3_path = f"s3://{data_bucket}/{data_prefix}/train/"
validation_dataset_s3_path = f"s3://{data_bucket}/{data_prefix}/validation/"

In [None]:
# Prepare training hyperparameters
from sagemaker import hyperparameters

The role of **my_hyperparameters** is to set training options such as the number of epochs and the batch size.

In [None]:
# Start from the default hyperparameters published for the selected model.
# `model_version` is expected to be defined earlier in the notebook.
my_hyperparameters = hyperparameters.retrieve_default(
    model_id=model_id,
    model_version=model_version,
)

# Override the defaults for this run; the values are supplied as strings.
my_hyperparameters.update(
    {
        "epoch": "3",
        "per_device_train_batch_size": "2",
        "instruction_tuned": "False",
    }
)
print(my_hyperparameters)

In [None]:
# Validate the customized hyperparameters for this model and version before
# they are handed to the estimator.
hyperparameters.validate(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=my_hyperparameters,
)

In [None]:
# Initialize the JumpStart estimator for domain adaptation
from sagemaker.jumpstart.estimator import JumpStartEstimator

In [None]:
# Build the estimator that fine-tunes `model_id` with the customized
# hyperparameters on the ml.p3dn.24xlarge instance type.
# NOTE(review): no `model_version` is passed here, unlike the hyperparameter
# retrieval and validation calls; confirm the estimator's default version
# matches the one the hyperparameters were validated against.
domain_adaptation_estimator = JumpStartEstimator(
    model_id=model_id,
    hyperparameters=my_hyperparameters,
    instance_type="ml.p3dn.24xlarge",
)

The **fit() method** fine-tunes the model on the specified datasets.

In [None]:
# Launch the training job with log output enabled. The inputs dict maps the
# "train" and "validation" names to the S3 locations of the datasets.
domain_adaptation_estimator.fit(
    inputs={
        "train": training_dataset_s3_path,
        "validation": validation_dataset_s3_path,
    },
    logs=True,
)

In [None]:
# Extract training performance metrics like loss and accuracy
from sagemaker import TrainingJobAnalytics

In [None]:
# Fetch metrics from the latest training job.
# Take the job name from the estimator's most recent training job, load that
# job's analytics via .dataframe(), and preview the first 10 rows.
training_job_name = domain_adaptation_estimator.latest_training_job.job_name
df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
df.head(10)

In [None]:
# Deploy the fine-tuned model to an inference endpoint. The pre-trained model
# is deployed separately so that the two can be compared.
domain_adaptation_predictor = domain_adaptation_estimator.deploy()

In [None]:
# Deploy the pre-trained (not fine-tuned) model for comparison.
# JumpStartModel is imported here so this cell does not rely on an import
# made somewhere else in the notebook.
from sagemaker.jumpstart.model import JumpStartModel

# Use the same model ID that was fine-tuned, so both endpoints share a base.
my_model = JumpStartModel(model_id=model_id)
pretrained_predictor = my_model.deploy()

In [None]:
# Define parameters for generating inference responses.
# This dict is read as a module-level global by generate_response() and sent
# unchanged as the "parameters" field of every request payload.
# do_sample=True means outputs can differ between runs for the same prompt.
parameters = {
    "max_new_tokens": 300,
    "top_k": 50,
    "top_p": 0.8,
    "do_sample": True,
    "temperature": 1,
}

The primary purpose of the **generate_response** function is to send a prompt to the given model endpoint and print the text that the model generates in response.

In [None]:
# Query an endpoint with a prompt and print the text it generates
def generate_response(endpoint_name, text):
    """Send ``text`` to ``endpoint_name`` and print the parsed response.

    The prompt is sent with a trailing colon appended, together with the
    module-level ``parameters`` dict. Nothing is returned; the parsed
    response is printed, followed by ``newline``.

    Args:
        endpoint_name: Name of the endpoint to query.
        text: Prompt text to send to the model.
    """
    # Serialize the request as UTF-8 encoded JSON before sending it.
    request_body = json.dumps(
        {"inputs": f"{text}:", "parameters": parameters}
    ).encode("utf-8")
    raw_response = query_endpoint_with_json_payload(
        request_body, endpoint_name=endpoint_name
    )
    print(f"Response: {parse_response(raw_response)}{newline}")

In [None]:
# Test phrases specific to the SEC domain.
# Each entry is an incomplete sentence used as a prompt; the comparison loop
# sends every prompt to both the pre-trained and the fine-tuned endpoint.
test_paragraph_domain_adaption = [
    "This Form 10-K report shows that",
    "We serve consumers through",
    "Our vision is",
]

In [None]:
# Print each prompt between two 80-character rules, then the responses from
# the pre-trained endpoint and the fine-tuned endpoint, in that order.
for paragraph in test_paragraph_domain_adaption:
    print("-" * 80, paragraph, "-" * 80, sep="\n")

    print(f"{bold}pre-trained{unbold}")
    generate_response(pretrained_predictor.endpoint_name, paragraph)

    print(f"{bold}fine-tuned{unbold}")
    generate_response(domain_adaptation_predictor.endpoint_name, paragraph)

In [None]:
# Clean up by deleting the SageMaker model and endpoint behind each predictor:
# first the pre-trained one, then the fine-tuned one.
pretrained_predictor.delete_model()
pretrained_predictor.delete_endpoint()
domain_adaptation_predictor.delete_model()
domain_adaptation_predictor.delete_endpoint()