In [40]:
import time
import boto3
import sagemaker
import urllib
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [41]:
cwd = os.getcwd()
print(cwd)

/home/ec2-user/SageMaker/706-Final-Project/10_Code


In [42]:
# os.chdir('/Users/Reinhard')#更改路径，''里面为更改的路径
 
# print(os.getcwd())#显示当前路径

In [43]:
! aws s3 mb s3://housing-lab

make_bucket: housing-lab


In [44]:
!aws s3 ls

2020-11-19 01:15:48 elasticbeanstalk-us-east-1-774975782106
2020-10-26 01:38:51 haidianhousing
2020-10-28 02:20:59 housing-lab


In [45]:
!aws s3 rb s3://elasticbeanstalk-us-east-1-774975782106 --force

remove_bucket: elasticbeanstalk-us-east-1-774975782106


In [46]:
!aws s3 ls

2020-10-26 01:38:51 haidianhousing
2020-10-28 02:20:59 housing-lab


In [47]:
! aws configure list

      Name                    Value             Type    Location
      ----                    -----             ----    --------
   profile                <not set>             None    None
access_key     ****************ZPF4         iam-role    
secret_key     ****************/LN/         iam-role    
    region                us-east-1      config-file    ~/.aws/config


In [48]:
# 获取当前AWS region和notebook所绑定的role
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [49]:
# 设置数据存放的桶和工作目录
bucket = 'housing-lab'
prefix = 'ori-data'

In [50]:
X = pd.read_csv("/home/ec2-user/SageMaker/706-Final-Project/20_Cleaned_Data/prepared_data.csv" )

In [51]:
# # 可以使用 SageMaker session 来上传下载数据，这里把S3的训练数据下载到当前目录
# sm_session = sagemaker.Session()
# sm_session.download_data("./", bucket, prefix + 'train.csv')
# sm_session.download_data("./", bucket, prefix + 'test.csv')

In [52]:
# 通过describe来对查看一下训练数据feature的构成和分布
X.head()

Unnamed: 0,Price/Square Meter,Housing_ID,Livable Area,Price,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level
0,38544.0,1,56.3,217.0,0,0,1,0,1,2
1,33667.0,2,55.1,185.5,0,0,1,0,3,1
2,91429.0,3,70.0,640.0,0,0,1,0,1,3
3,102752.0,4,98.49,1012.0,0,0,1,0,1,4
4,98847.0,5,60.7,600.0,0,0,1,0,3,4


In [53]:
# 提取训练数据中的label列
y = X['Price/Square Meter']

In [54]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [55]:
y

0         38544.0
1         33667.0
2         91429.0
3        102752.0
4         98847.0
           ...   
69299     31305.0
69300     33710.0
69301     30485.0
69302     33380.0
69303     29861.0
Name: Price/Square Meter, Length: 69304, dtype: float64

In [56]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [57]:
# 在原始数据中丢弃label列
X.drop(['Price/Square Meter'], axis=1, inplace=True)

In [58]:
X.head()

Unnamed: 0,Housing_ID,Livable Area,Price,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level
0,1,56.3,217.0,0,0,1,0,1,2
1,2,55.1,185.5,0,0,1,0,3,1
2,3,70.0,640.0,0,0,1,0,1,3
3,4,98.49,1012.0,0,0,1,0,1,4
4,5,60.7,600.0,0,0,1,0,3,4


In [59]:
# 拆分原始训练数据为四份，X_train, X_valid, y_train, y_valid
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [60]:
X_train_xgb= pd.concat([y_train,X_train], axis=1)
X_train_xgb.to_csv('./X_train.csv',header=False,index=False)

In [61]:
X_valid_xgb= pd.concat([y_valid,X_valid], axis=1)
X_valid_xgb.to_csv('./X_valid.csv',header=False,index=False)

In [62]:
sm_session = sagemaker.Session()
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')
sm_session.upload_data('./X_train.csv',bucket,prefix)
sm_session.upload_data('./X_valid.csv',bucket,prefix)

's3://housing-lab/ori-data/X_valid.csv'

In [63]:
# 我们使⽤SageMaker内置的XGBoost来训练，⾸先要获取这个算法的container
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', repo_version='0.90-2')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [64]:
# 设置算法超参
hyperparameters = {
"max_depth":"5",
"eta":"0.5",
"early_stopping_rounds":"5",
"eval_metric":"rmse",
"num_round":"30",
"objective":"reg:linear"}

In [65]:
# 训练使⽤哪种EC2实例来完成
instance_type = 'ml.m4.xlarge'

In [66]:
# 模型输出⽬录
output_path = 's3://{}/{}/output'.format(bucket, prefix)

In [67]:
# 训练输⼊数据的类型
content_type = "csv"

In [68]:
# 设置训练任务的名字
job_name = 'xgb-housing-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

Training job xgb-housing-2020-11-19-02-54-04


In [69]:
# 借助 Managed Spot Training 功能，⼤幅降低训练成本
train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None
checkpoint_s3_uri = ('s3://{}/{}/checkpoints/{}'.format(bucket, prefix,
job_name)
if train_use_spot_instances else None)

In [70]:
# 在SageMaker⾥的使⽤任何算法来训练都要先⽣成⼀个 estimator 对象
xgb = sagemaker.estimator.Estimator(container,
role,
hyperparameters=hyperparameters,
train_instance_count=1,
train_instance_type=instance_type,
train_volume_size=5, # 5 GB
output_path=output_path,
sagemaker_session=sagemaker.Session(),
train_use_spot_instances=train_use_spot_instances,
train_max_run=train_max_run,
train_max_wait=train_max_wait,
checkpoint_s3_uri=checkpoint_s3_uri
);

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [71]:
# 设置训练数据的 data channel
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'X_train.csv')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix,
'X_valid.csv')
train_channel = sagemaker.session.s3_input(train_data_location,
content_type='csv')
valid_channel = sagemaker.session.s3_input(validation_data_location,
content_type='csv')
data_channels = {'train': train_channel, 'validation': valid_channel}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [72]:
%%time
# 调⽤ fit 来训练
xgb.fit(inputs=data_channels, job_name=job_name, logs=True)

2020-11-19 02:54:18 Starting - Starting the training job...
2020-11-19 02:54:22 Starting - Launching requested ML instances......
2020-11-19 02:55:38 Starting - Preparing the instances for training.........
2020-11-19 02:56:50 Downloading - Downloading input data...
2020-11-19 02:57:44 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV in

In [73]:
%%time
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# xgb = sagemaker.estimator.Estimator(container,
#                                     role=sagemaker.get_execution_role(), 
#                                     instance_count= 1, # make sure you have limit set for these instances
#                                     instance_type='ml.m4.xlarge', 
#                                     output_path='s3://{}/{}/output'.format(bucket, prefix),
#                                     sagemaker_session=sagemaker.Session())

# # # 设置算法超参
# # hyperparameters = {
# # "max_depth":"5",
# # "eta":"0.5",
# # "early_stopping_rounds":"5",
# # "eval_metric":"rmse",
# # "num_round":"30"
# # "objective"="reg:linear",
# }


hyperparameter_ranges = {'max_depth':IntegerParameter(1,10),
                         'alpha': ContinuousParameter(0, 100),
                         'min_child_weight': ContinuousParameter(1, 5),
                         'eta': ContinuousParameter(0.2, 0.7),  
                         'num_round': IntegerParameter(10,200)
                         }


objective_metric_name = 'validation:rmse'
objective_type = 'Minimize'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=3, # Set this to 10 or above depending upon budget & available time.
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit(inputs=data_channels, include_cls_metadata=False)
tuner.wait()

......................................................................................................................................................................................................!
!
CPU times: user 1.16 s, sys: 0 ns, total: 1.16 s
Wall time: 16min 41s


In [74]:
%%time
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

CPU times: user 71.4 ms, sys: 0 ns, total: 71.4 ms
Wall time: 1.15 s


'Completed'

In [75]:
%%time
from pprint import pprint
from sagemaker.analytics import HyperparameterTuningJobAnalytics

tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())

df_tuning_job_analytics = tuner_analytics.dataframe()

# Sort the tuning job analytics by the final metrics value
df_tuning_job_analytics.sort_values(
    by=['FinalObjectiveValue'],
    inplace=True,
    ascending=False if tuner.objective_type == "Maximize" else True)

# Show detailed analytics for the top 20 models
df_tuning_job_analytics.head(20)

CPU times: user 27.7 ms, sys: 0 ns, total: 27.7 ms
Wall time: 88.7 ms


Unnamed: 0,alpha,eta,max_depth,min_child_weight,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,92.202576,0.687525,10.0,1.125295,83.0,sagemaker-xgboost-201119-0258-003-9b1f78c2,Completed,836.39801,2020-11-19 03:09:01+00:00,2020-11-19 03:10:15+00:00,74.0
2,55.677068,0.321997,3.0,4.604017,72.0,sagemaker-xgboost-201119-0258-002-d8aecaa5,Completed,2000.130005,2020-11-19 03:04:59+00:00,2020-11-19 03:06:20+00:00,81.0
0,80.749329,0.663462,2.0,1.038747,123.0,sagemaker-xgboost-201119-0258-004-9d261826,Completed,2360.75,2020-11-19 03:13:27+00:00,2020-11-19 03:14:43+00:00,76.0
3,31.107361,0.559051,9.0,1.78883,73.0,sagemaker-xgboost-201119-0258-001-a6f4287e,Failed,,NaT,2020-11-19 03:01:39+00:00,


In [76]:
attached_tuner = HyperparameterTuner.attach(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())
best_training_job = attached_tuner.best_training_job()

In [77]:
from sagemaker.estimator import Estimator
algo_estimator = Estimator.attach(best_training_job)

best_algo_model = algo_estimator.create_model(env={'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT':"text/csv"})


2020-11-19 03:10:15 Starting - Preparing the instances for training
2020-11-19 03:10:15 Downloading - Downloading input data
2020-11-19 03:10:15 Training - Training image download completed. Training in progress.
2020-11-19 03:10:15 Uploading - Uploading generated training model
2020-11-19 03:10:15 Completed - Training job completed


In [38]:
# %%time
# batch_output = "s3://{}/{}/batch-out/".format(bucket,prefix)
# batch_input = "s3://{}/{}/batch-in/{}".format(bucket,prefix,batch_X_file)

# xgb_transformer = best_algo_model.transformer(instance_count=1,
#                                        instance_type='ml.m4.xlarge',
#                                        strategy='MultiRecord',
#                                        assemble_with='Line',
#                                        output_path=batch_output)


# xgb_transformer.transform(data=batch_input,
#                          data_type='S3Prefix',
#                          content_type='text/csv',
#                          split_type='Line')
# xgb_transformer.wait(logs=False)

In [82]:
%%time
# 部署模型

endpoint_na = 'my-endpoint'
xgb_predictor = best_algo_model.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge',
                         endpoint_name= endpoint_na )


---------------!CPU times: user 270 ms, sys: 0 ns, total: 270 ms
Wall time: 7min 32s


In [84]:
%%time
from sagemaker.predictor import csv_serializer
# 创建 predictor
xgb_predictor=sagemaker.predictor.RealTimePredictor(
endpoint_name = endpoint_na, # 这个名字就是上图中蓝⾊⽅框中endpoint的name
sagemaker_session=sm_session,
serializer=csv_serializer,
content_type='csv')
# 调⽤predictor的predict⽅法做推理
# xgb_predictor.predict(X_valid.values[0]).decode('utf-8')
xgb_predictor.predict(X_valid.values).decode('utf-8')

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


CPU times: user 165 ms, sys: 0 ns, total: 165 ms
Wall time: 1.72 s


'105768.28125,33920.90625,74445.4609375,84981.21875,21619.96875,67523.1640625,111797.40625,42031.3828125,85411.15625,29543.62890625,40332.38671875,67999.8515625,85270.890625,90608.2265625,18534.982421875,50331.5625,70096.03125,55751.2734375,70814.125,83462.4375,83906.578125,46937.47265625,64581.5,58727.75,52166.421875,132715.34375,59734.91796875,74919.859375,38585.5859375,55560.25390625,43023.17578125,36788.13671875,55624.9375,87519.96875,121885.703125,43256.73046875,80196.625,47549.28515625,65701.8359375,46591.8046875,84154.3125,47195.234375,47254.29296875,23809.822265625,40356.5234375,43370.6328125,62548.3359375,80808.296875,41842.984375,45667.81640625,88554.3984375,49528.6015625,69183.8828125,74145.21875,30208.7734375,45645.953125,54554.265625,42006.21484375,45769.296875,91278.8125,95158.0625,80479.7890625,69010.53125,24011.169921875,79345.984375,84907.9453125,60636.6171875,45222.9921875,81245.5078125,61858.29296875,68780.4609375,27124.84765625,59177.22265625,47675.375,45728.5859375

In [85]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
