In [67]:
# boto3 => Python library for calling AWS services
import boto3
import sagemaker
from sagemaker import get_execution_role

Upload the datasets to the S3 bucket

In [68]:
# S3 destination: the bucket, plus the object key used for each dataset split.
bucket_name = 's3bucketloanprediction'
train_file_name = 'Loan Prediction/Train_final.csv'
val_file_name = 'Loan Prediction/Val_final.csv'
test_file_name = 'Loan Prediction/Test_final.csv'

# Full s3:// URIs built from the names above.
# NOTE(review): model artifacts go under 'LoanPrediction/' (no space) while the
# CSVs live under 'Loan Prediction/' (with a space) - confirm this is intended.
model_output_location = f's3://{bucket_name}/LoanPrediction/model'
train_file_location = f's3://{bucket_name}/{train_file_name}'
val_file_location = f's3://{bucket_name}/{val_file_name}'
test_file_location = f's3://{bucket_name}/{test_file_name}'

In [69]:
# Helper for uploading a local file to an S3 bucket.
def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to S3 as object `key` in `bucket`.

    Args:
        filename: Path of the local file to read (opened in binary mode).
        bucket: Name of the destination S3 bucket.
        key: Object key to store the file under.

    Returns:
        Whatever the S3 object's ``upload_fileobj`` call returns.
    """
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [70]:
# Upload each local CSV split to its S3 key, in train / validation / test order.
for local_csv, s3_key in (
    ('Train_final.csv', train_file_name),
    ('Val_final.csv', val_file_name),
    ('Test_final.csv', test_file_name),
):
    write_to_s3(local_csv, bucket_name, s3_key)

In [71]:
# ECR image URI of the SageMaker XGBoost container, keyed by AWS region.
# Only us-west-1 (N. California) is listed, since that is the region in use.
container = {
    'us-west-1': '746614075791.dkr.ecr.us-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',
}

In [72]:
# Show the region of the default boto3 session; the Estimator cell uses this
# value as the lookup key into `container`, so it must be one of its keys.
print(boto3.Session().region_name)

us-west-1


In [73]:
# Look up the SageMaker execution role; it is passed to the Estimator below.
role = get_execution_role()

In [74]:
# Print the role ARN for confirmation. The ARN contains the AWS account ID,
# so consider clearing this cell's output before sharing the notebook.
print(role)

arn:aws:iam::528215570578:role/service-role/AmazonSageMaker-ExecutionRole-20200714T141836


### Build Model

In [75]:
# Create the SageMaker session that is handed to the Estimator below.
sess = sagemaker.Session()

In [76]:
# Configure the training job: the XGBoost image for the current region, the
# execution role, one ml.m4.xlarge training instance, and the S3 location
# where the model artifact is written.
estimator = sagemaker.estimator.Estimator(
    container[boto3.Session().region_name],  # KeyError if the region is not listed in `container`
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-loanprediction',
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [91]:
# Set the XGBoost hyperparameters used by the training job.
estimator.set_hyperparameters(
    max_depth=5,
    objective='binary:logistic',
    eta=0.1,
    subsample=0.7,
    num_round=10,
    eval_metric='auc',
)

In [92]:
# Display the hyperparameters currently set on the estimator as a sanity check.
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'binary:logistic',
 'eta': 0.1,
 'subsample': 0.7,
 'num_round': 10,
 'eval_metric': 'auc'}

In [93]:
# Describe the training and validation CSVs in S3 as input channels for fit().
training_file = sagemaker.session.s3_input(
    s3_data=train_file_location,
    content_type='csv',
)
validation_file = sagemaker.session.s3_input(
    s3_data=val_file_location,
    content_type='csv',
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [94]:
# Inspect the channel configuration generated for the training data.
print(training_file.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://s3bucketloanprediction/Loan Prediction/Train_final.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [95]:
# Inspect the channel configuration generated for the validation data.
print(validation_file.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://s3bucketloanprediction/Loan Prediction/Val_final.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [96]:
# Map each channel name to its S3 input; passed to estimator.fit().
data_channels = dict(train=training_file, validation=validation_file)

In [97]:
# Launch the training job on the 'train' and 'validation' channels;
# logs=True shows the job's log output in the notebook.
estimator.fit(inputs=data_channels, logs=True)

2020-08-05 05:10:04 Starting - Starting the training job...
2020-08-05 05:10:06 Starting - Launching requested ML instances.........
2020-08-05 05:11:41 Starting - Preparing the instances for training...
2020-08-05 05:12:27 Downloading - Downloading input data...
2020-08-05 05:12:43 Training - Downloading the training image...
2020-08-05 05:13:34 Uploading - Uploading generated training model
2020-08-05 05:13:34 Completed - Training job completed
INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training
INFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.
Returning the value itself
INFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)
INFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode

In [98]:
# Deploy the trained model to a real-time endpoint on one ml.m4.xlarge instance.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name='xgboost-loanprediction-ver1',
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!