In [84]:
import time
import boto3
import sagemaker
import urllib
import pandas as pd
from sklearn.model_selection import train_test_split

In [85]:
# 获取当前AWS region和notebook所绑定的role
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [86]:
# 设置数据存放的桶和工作目录
bucket = 'housing-lab'
prefix = 'ori-data'

In [87]:
X = pd.read_csv("/706-Final-Project/20_Cleaned_Data/prepared_data.csv" )

In [88]:
# # 可以使用 SageMaker session 来上传下载数据，这里把S3的训练数据下载到当前目录
# sm_session = sagemaker.Session()
# sm_session.download_data("./", bucket, prefix + 'train.csv')
# sm_session.download_data("./", bucket, prefix + 'test.csv')

In [89]:
# 通过describe来对查看一下训练数据feature的构成和分布
X.head()

Unnamed: 0,Price/Square Meter,Housing_ID,Livable Area,Price,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level
0,38544.0,1,56.3,217.0,0,0,1,0,1,2
1,33667.0,2,55.1,185.5,0,0,1,0,3,1
2,91429.0,3,70.0,640.0,0,0,1,0,1,3
3,102752.0,4,98.49,1012.0,0,0,1,0,1,4
4,98847.0,5,60.7,600.0,0,0,1,0,3,4


In [90]:
# 提取训练数据中的label列
y = X['Price/Square Meter']

In [91]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [92]:
y

0         38544.0
1         33667.0
2         91429.0
3        102752.0
4         98847.0
           ...   
69299     31305.0
69300     33710.0
69301     30485.0
69302     33380.0
69303     29861.0
Name: Price/Square Meter, Length: 69304, dtype: float64

In [93]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [94]:
# 在原始数据中丢弃label列
X.drop(['Price/Square Meter'], axis=1, inplace=True)

In [95]:
X.head()

Unnamed: 0,Housing_ID,Livable Area,Price,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level
0,1,56.3,217.0,0,0,1,0,1,2
1,2,55.1,185.5,0,0,1,0,3,1
2,3,70.0,640.0,0,0,1,0,1,3
3,4,98.49,1012.0,0,0,1,0,1,4
4,5,60.7,600.0,0,0,1,0,3,4


In [96]:
# 拆分原始训练数据为四份，X_train, X_valid, y_train, y_valid
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [97]:
X_train_xgb= pd.concat([y_train,X_train], axis=1)
X_train_xgb.to_csv('./X_train.csv',header=False,index=False)

In [98]:
X_valid_xgb= pd.concat([y_valid,X_valid], axis=1)
X_valid_xgb.to_csv('./X_valid.csv',header=False,index=False)

In [99]:
sm_session = sagemaker.Session()
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')
sm_session.upload_data('./X_train.csv',bucket,prefix)
sm_session.upload_data('./X_valid.csv',bucket,prefix)

's3://housing-lab/ori-data/X_valid.csv'

In [100]:
# 我们使⽤SageMaker内置的XGBoost来训练，⾸先要获取这个算法的container
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', repo_version='0.90-2')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [102]:
# 设置算法超参
hyperparameters = {
"max_depth":"5",
"eta":"0.5",
"early_stopping_rounds":"5",
"eval_metric":"rmse",
"num_round":"30",
"objective":"reg:linear"}

In [103]:
# 训练使⽤哪种EC2实例来完成
instance_type = 'ml.m4.xlarge'

In [104]:
# 模型输出⽬录
output_path = 's3://{}/{}/output'.format(bucket, prefix)

In [105]:
# 训练输⼊数据的类型
content_type = "csv"

In [106]:
# 设置训练任务的名字
job_name = 'xgb-housing-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

Training job xgb-housing-2020-11-07-03-07-58


In [107]:
# 借助 Managed Spot Training 功能，⼤幅降低训练成本
train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None
checkpoint_s3_uri = ('s3://{}/{}/checkpoints/{}'.format(bucket, prefix,
job_name)
if train_use_spot_instances else None)

In [108]:
# 在SageMaker⾥的使⽤任何算法来训练都要先⽣成⼀个 estimator 对象
xgb = sagemaker.estimator.Estimator(container,
role,
hyperparameters=hyperparameters,
train_instance_count=1,
train_instance_type=instance_type,
train_volume_size=5, # 5 GB
output_path=output_path,
sagemaker_session=sagemaker.Session(),
train_use_spot_instances=train_use_spot_instances,
train_max_run=train_max_run,
train_max_wait=train_max_wait,
checkpoint_s3_uri=checkpoint_s3_uri
);

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [109]:
# 设置训练数据的 data channel
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'X_train.csv')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix,
'X_valid.csv')
train_channel = sagemaker.session.s3_input(train_data_location,
content_type='csv')
valid_channel = sagemaker.session.s3_input(validation_data_location,
content_type='csv')
data_channels = {'train': train_channel, 'validation': valid_channel}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [110]:
%%time
# 调⽤ fit 来训练
xgb.fit(inputs=data_channels, job_name=job_name, logs=True)

2020-11-07 03:08:12 Starting - Starting the training job...
2020-11-07 03:08:14 Starting - Launching requested ML instances......
2020-11-07 03:09:27 Starting - Preparing the instances for training......
2020-11-07 03:10:21 Downloading - Downloading input data...
2020-11-07 03:10:55 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:roo

In [None]:
%%time
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# xgb = sagemaker.estimator.Estimator(container,
#                                     role=sagemaker.get_execution_role(), 
#                                     instance_count= 1, # make sure you have limit set for these instances
#                                     instance_type='ml.m4.xlarge', 
#                                     output_path='s3://{}/{}/output'.format(bucket, prefix),
#                                     sagemaker_session=sagemaker.Session())

# # # 设置算法超参
# # hyperparameters = {
# # "max_depth":"5",
# # "eta":"0.5",
# # "early_stopping_rounds":"5",
# # "eval_metric":"rmse",
# # "num_round":"30"
# # "objective"="reg:linear",
# }


hyperparameter_ranges = {'max_depth':IntegerParameter(1,10),
                         'alpha': ContinuousParameter(0, 100),
                         'min_child_weight': ContinuousParameter(1, 5),
                         'eta': ContinuousParameter(0.2, 0.7),  
                         'num_round': IntegerParameter(10,200)
                         }


objective_metric_name = 'validation:rmse'
objective_type = 'Minimize'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=3, # Set this to 10 or above depending upon budget & available time.
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit(inputs=data_channels, include_cls_metadata=False)
tuner.wait()

...................................................................................................................................................................

In [112]:
%%time
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

CPU times: user 65.5 ms, sys: 3.72 ms, total: 69.2 ms
Wall time: 1.18 s


'Completed'

In [114]:
%%time
from pprint import pprint
from sagemaker.analytics import HyperparameterTuningJobAnalytics

tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())

df_tuning_job_analytics = tuner_analytics.dataframe()

# Sort the tuning job analytics by the final metrics value
df_tuning_job_analytics.sort_values(
    by=['FinalObjectiveValue'],
    inplace=True,
    ascending=False if tuner.objective_type == "Maximize" else True)

# Show detailed analytics for the top 20 models
df_tuning_job_analytics.head(20)

CPU times: user 27.9 ms, sys: 0 ns, total: 27.9 ms
Wall time: 93.8 ms


Unnamed: 0,alpha,eta,max_depth,min_child_weight,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,21.849719,0.279033,10.0,3.377322,190.0,sagemaker-xgboost-201107-0315-003-8ac11f32,Completed,529.067017,2020-11-07 03:28:05+00:00,2020-11-07 03:29:40+00:00,95.0
1,33.898059,0.252117,6.0,4.863162,43.0,sagemaker-xgboost-201107-0315-002-763050db,Completed,1040.089966,2020-11-07 03:23:39+00:00,2020-11-07 03:24:55+00:00,76.0
2,74.802844,0.633642,5.0,1.446697,34.0,sagemaker-xgboost-201107-0315-001-f042e969,Completed,1598.369995,2020-11-07 03:18:17+00:00,2020-11-07 03:19:34+00:00,77.0


In [115]:
attached_tuner = HyperparameterTuner.attach(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())
best_training_job = attached_tuner.best_training_job()

In [116]:
from sagemaker.estimator import Estimator
algo_estimator = Estimator.attach(best_training_job)

best_algo_model = algo_estimator.create_model(env={'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT':"text/csv"})


2020-11-07 03:29:40 Starting - Preparing the instances for training
2020-11-07 03:29:40 Downloading - Downloading input data
2020-11-07 03:29:40 Training - Training image download completed. Training in progress.
2020-11-07 03:29:40 Uploading - Uploading generated training model
2020-11-07 03:29:40 Completed - Training job completed


In [None]:
# %%time
# batch_output = "s3://{}/{}/batch-out/".format(bucket,prefix)
# batch_input = "s3://{}/{}/batch-in/{}".format(bucket,prefix,batch_X_file)

# xgb_transformer = best_algo_model.transformer(instance_count=1,
#                                        instance_type='ml.m4.xlarge',
#                                        strategy='MultiRecord',
#                                        assemble_with='Line',
#                                        output_path=batch_output)


# xgb_transformer.transform(data=batch_input,
#                          data_type='S3Prefix',
#                          content_type='text/csv',
#                          split_type='Line')
# xgb_transformer.wait(logs=False)

In [117]:
%%time
# 部署模型

xgb_predictor = best_algo_model.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge')


-------------------!

In [119]:
%%time
from sagemaker.predictor import csv_serializer
# 创建 predictor
xgb_predictor=sagemaker.predictor.RealTimePredictor(
"sagemaker-xgboost-2020-11-07-03-37-14-981", # 这个名字就是上图中蓝⾊⽅框中endpoint的name
sagemaker_session=sm_session,
serializer=csv_serializer,
content_type='csv')
# 调⽤predictor的predict⽅法做推理
# xgb_predictor.predict(X_valid.values[0]).decode('utf-8')
xgb_predictor.predict(X_valid.values).decode('utf-8')

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


CPU times: user 178 ms, sys: 3.9 ms, total: 182 ms
Wall time: 1.09 s


'105993.578125,33510.2578125,74074.9453125,84864.4296875,22291.2890625,68574.84375,109075.453125,42073.06640625,85389.4453125,29303.921875,40356.96484375,69646.375,85035.6484375,91802.3125,18315.08984375,50104.23046875,69953.3125,55568.78125,70877.046875,82973.9453125,83953.21875,46621.4921875,64554.04296875,58443.015625,53253.9375,123986.625,59855.05859375,74982.59375,38134.87890625,55142.01953125,43241.35546875,36452.2421875,56308.10546875,87644.6953125,125942.375,43482.1953125,80422.40625,47276.171875,65387.4296875,47190.7421875,84138.8984375,46657.375,47940.171875,24684.888671875,40199.23828125,43152.8125,61822.42578125,81059.8515625,41716.46484375,45267.14453125,88779.390625,49867.20703125,69016.046875,74660.7265625,29805.654296875,45992.77734375,54766.4453125,42101.8515625,45180.4375,91590.65625,94845.53125,80197.703125,68964.671875,24222.33203125,80041.859375,85285.7890625,60506.4140625,45626.52734375,81094.9609375,61973.1328125,68634.5859375,26948.603515625,59321.02734375,47240

In [120]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
