In [5]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

# Launch a SageMaker training job for the built-in BlazingText algorithm in
# supervised (text classification) mode.

session = sagemaker.Session()

# The IAM role the training job runs under (our current role in this case).
role = get_execution_role()

# If you're following along, you'll need to upload these datasets to your own bucket in S3.
#
# NOTE(review): both channels currently point at the same S3 object, so the
# validation metrics are computed on the training data itself. Point
# val_location at a held-out file once one has been uploaded.

val_location = 's3://shubuck/nothingworks/hello_blaze_train'
train_location = 's3://shubuck/nothingworks/hello_blaze_train'

# Base S3 URI under which the output (the model artifacts) is written.

prefix = 's3://shubuck/nothingworks/'

# `prefix` is already a complete S3 URI, so the output location is built
# directly from it. Embedding it after another bucket name would produce a
# malformed path of the form 's3://<bucket>/s3://shubuck/...'. The trailing
# slash is stripped to avoid an empty '//' key segment.
output_location = '{}/output'.format(prefix.rstrip('/'))

# We need to get the location of the container.

container = image_uris.retrieve('blazingtext', session.boto_region_name)

# Now that we know which container to use, we can construct the estimator object.
estim = sagemaker.estimator.Estimator(
    container,                     # The image name of the training container
    role,                          # The IAM role to use
    instance_count=1,              # The number of instances to use for training
    instance_type='ml.m5.large',   # The type of instance to use for training
    output_path=output_location,   # Where to save the output (the model artifacts)
    sagemaker_session=session,     # The current SageMaker session
)

# These hyperparameters are beyond the scope of this course, but you can research the algorithm here:
# https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html
#
# Only BlazingText hyperparameters belong here. Tree-boosting settings such as
# max_depth, eta, gamma or num_round are XGBoost options and are not part of
# the BlazingText hyperparameter set, so they are not passed.

estim.set_hyperparameters(mode='supervised')

# BlazingText reads plain-text training files, so the channels are declared as
# text/plain rather than csv.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='text/plain')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='text/plain')

# The fit method launches the training job.

estim.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: blazingtext-2023-08-26-16-40-57-914


2023-08-26 16:40:58 Starting - Starting the training job...
2023-08-26 16:41:14 Starting - Preparing the instances for training......
2023-08-26 16:41:59 Downloading - Downloading input data...
2023-08-26 16:42:55 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[08/26/2023 16:43:00 INFO 139722509731648] nvidia-smi took: 0.025225162506103516 secs to identify 0 gpus[0m
[34m[08/26/2023 16:43:00 INFO 139722509731648] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[08/26/2023 16:43:00 INFO 139722509731648] Processing /opt/ml/input/data/train/hello_blaze_train . File size: 40.886911392211914 MB[0m
[34m[08/26/2023 16:43:00 INFO 139722509731648] Processing /opt/ml/input/data/validation/hello_blaze_train . File size: 40.886911392211914 MB[0m
[34mRead 8M words[0m
[34mNumber of words:  33240[0m
[34m##### Alpha: 0.0486  Progress: 2.89%  Million Words