In [1]:
import time
import boto3
import sagemaker
import urllib
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# 获取当前AWS region和notebook所绑定的role
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
# One has to set up a S3 bucket before running the code below
bucket = 'tommy-lab'
prefix = 'beijing-housing-data'

In [4]:
cwd = os.getcwd()
print(cwd)

/home/ec2-user/SageMaker/706-Final-Project/10_Code


In [5]:
X = pd.read_csv("/home/ec2-user/SageMaker/706-Final-Project/20_Cleaned_Data/prepared_data_tommy.csv" )

In [6]:
# 通过describe来对查看一下训练数据feature的构成和分布
X.head()

Unnamed: 0,Price/Square Meter,Housing_ID,Community,Livable Area,Price,Year_Built,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level,Floor_Plan
0,38544.0,1,中关村南大街甲3号 2室1厅 56.3平米,56.3,217.0,1965,0,0,1,0,1,2,1
1,33667.0,2,中关村南大街甲3号 2室1厅 55.1平米,55.1,185.5,1965,0,0,1,0,3,1,1
2,91429.0,3,民族大学南路19号院 2室1厅 70平米,70.0,640.0,1993,0,0,1,0,1,3,1
3,102752.0,4,民族大学南路19号院 2室1厅 98.49平米,98.49,1012.0,2000,0,0,1,0,1,4,1
4,98847.0,5,民族大学南路19号院 2室1厅 60.7平米,60.7,600.0,1991,0,0,1,0,3,4,1


In [7]:
# Extract the column Price/Square Meter from X
y = X['Price/Square Meter']

In [8]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [9]:
# Drop the column Price/Square Meter in X
X.drop(['Price/Square Meter'], axis=1, inplace=True)

In [10]:
X.head()

Unnamed: 0,Housing_ID,Community,Livable Area,Price,Year_Built,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level,Floor_Plan
0,1,中关村南大街甲3号 2室1厅 56.3平米,56.3,217.0,1965,0,0,1,0,1,2,1
1,2,中关村南大街甲3号 2室1厅 55.1平米,55.1,185.5,1965,0,0,1,0,3,1,1
2,3,民族大学南路19号院 2室1厅 70平米,70.0,640.0,1993,0,0,1,0,1,3,1
3,4,民族大学南路19号院 2室1厅 98.49平米,98.49,1012.0,2000,0,0,1,0,1,4,1
4,5,民族大学南路19号院 2室1厅 60.7平米,60.7,600.0,1991,0,0,1,0,3,4,1


In [11]:
# 拆分原始训练数据为四份，X_train, X_valid, y_train, y_valid (We still have community name here)
X_train_with_community, X_valid_with_community, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [12]:
# Make a new X_train, y_train (without community name here)
X_train = X_train_with_community.drop(columns = ['Community'])
X_valid = X_valid_with_community.drop(columns = ['Community'])

In [13]:
X_train_xgb= pd.concat([y_train,X_train], axis=1)
X_train_xgb.to_csv('./X_train.csv',header=False,index=False)

In [14]:
X_valid_xgb= pd.concat([y_valid,X_valid], axis=1)
X_valid_xgb.to_csv('./X_valid.csv',header=False,index=False)

In [69]:
X_valid_xgb_new= pd.concat([y_valid,X_valid_with_community], axis=1)

In [16]:
sm_session = sagemaker.Session()
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')
sm_session.upload_data('./X_train.csv',bucket,prefix)
sm_session.upload_data('./X_valid.csv',bucket,prefix)

's3://tommy-lab/beijing-housing-data/X_valid.csv'

In [17]:
# 我们使⽤SageMaker内置的XGBoost来训练，⾸先要获取这个算法的container
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', repo_version='0.90-2')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [18]:
# 设置算法超参
hyperparameters = {
"max_depth":"5",
"eta":"0.5",
"early_stopping_rounds":"5",
"eval_metric":"rmse",
"num_round":"30",
"objective":"reg:linear"}

In [19]:
# 训练使⽤哪种EC2实例来完成
instance_type = 'ml.m4.xlarge'

In [20]:
# 模型输出⽬录
output_path = 's3://{}/{}/output'.format(bucket, prefix)

In [21]:
# 训练输⼊数据的类型
content_type = "csv"

In [22]:
# 设置训练任务的名字
job_name = 'xgb-housing-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

Training job xgb-housing-2020-11-23-04-03-40


In [23]:
# Use Managed Spot Training to lower training cost
train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None
checkpoint_s3_uri = ('s3://{}/{}/checkpoints/{}'.format(bucket, prefix,
job_name)
if train_use_spot_instances else None)

In [24]:
# Initialize Estimators
xgb = sagemaker.estimator.Estimator(container,
role,
hyperparameters=hyperparameters,
train_instance_count=1,
train_instance_type=instance_type,
train_volume_size=5, # 5 GB
output_path=output_path,
sagemaker_session=sagemaker.Session(),
train_use_spot_instances=train_use_spot_instances,
train_max_run=train_max_run,
train_max_wait=train_max_wait,
checkpoint_s3_uri=checkpoint_s3_uri
);

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [25]:
# Configure the data channels
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'X_train.csv')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix,
'X_valid.csv')
train_channel = sagemaker.session.s3_input(train_data_location,
content_type='csv')
valid_channel = sagemaker.session.s3_input(validation_data_location,
content_type='csv')
data_channels = {'train': train_channel, 'validation': valid_channel}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [26]:
%%time
# Use xgb.fit to train the regression model
xgb.fit(inputs=data_channels, job_name=job_name, logs=True)

2020-11-23 04:03:42 Starting - Starting the training job...
2020-11-23 04:03:44 Starting - Launching requested ML instances......
2020-11-23 04:04:59 Starting - Preparing the instances for training.........
2020-11-23 04:06:31 Downloading - Downloading input data...
2020-11-23 04:07:12 Training - Downloading the training image...
2020-11-23 04:07:32 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CS

In [27]:
%%time
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {'max_depth':IntegerParameter(1,10),
                         'alpha': ContinuousParameter(0, 100),
                         'min_child_weight': ContinuousParameter(1, 5),
                         'eta': ContinuousParameter(0.2, 0.7),  
                         'num_round': IntegerParameter(10,200)
                         }

objective_metric_name = 'validation:rmse'
objective_type = 'Minimize'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=3, # Set this to 10 or above depending upon budget & available time.
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit(inputs=data_channels, include_cls_metadata=False)
tuner.wait()

.............................................................................................................................................................................!
!
CPU times: user 929 ms, sys: 67.7 ms, total: 997 ms
Wall time: 14min 36s


In [28]:
%%time
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

CPU times: user 68.1 ms, sys: 10.9 ms, total: 79 ms
Wall time: 1.18 s


'Completed'

In [29]:
%%time
from pprint import pprint
from sagemaker.analytics import HyperparameterTuningJobAnalytics

tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())

df_tuning_job_analytics = tuner_analytics.dataframe()

# Sort the tuning job analytics by the final metrics value
df_tuning_job_analytics.sort_values(
    by=['FinalObjectiveValue'],
    inplace=True,
    ascending=False if tuner.objective_type == "Maximize" else True)

# Show detailed analytics for the top 20 models
df_tuning_job_analytics.head(20)

CPU times: user 25.2 ms, sys: 3.59 ms, total: 28.8 ms
Wall time: 110 ms


Unnamed: 0,alpha,eta,max_depth,min_child_weight,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,95.724867,0.221732,8.0,1.815192,132.0,sagemaker-xgboost-201123-0408-002-db61b904,Completed,741.177979,2020-11-23 04:16:07+00:00,2020-11-23 04:17:27+00:00,80.0
2,34.629653,0.603258,7.0,2.478727,123.0,sagemaker-xgboost-201123-0408-001-5cb538f5,Completed,972.804016,2020-11-23 04:11:36+00:00,2020-11-23 04:12:57+00:00,81.0
0,51.244399,0.292533,5.0,1.014315,63.0,sagemaker-xgboost-201123-0408-003-9a4e966f,Completed,1242.050049,2020-11-23 04:20:59+00:00,2020-11-23 04:22:23+00:00,84.0


In [30]:
attached_tuner = HyperparameterTuner.attach(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())
best_training_job = attached_tuner.best_training_job()

In [31]:
from sagemaker.estimator import Estimator
algo_estimator = Estimator.attach(best_training_job)

best_algo_model = algo_estimator.create_model(env={'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT':"text/csv"})


2020-11-23 04:17:27 Starting - Preparing the instances for training
2020-11-23 04:17:27 Downloading - Downloading input data
2020-11-23 04:17:27 Training - Training image download completed. Training in progress.
2020-11-23 04:17:27 Uploading - Uploading generated training model
2020-11-23 04:17:27 Completed - Training job completed


In [32]:
%%time
# Model Deployment

xgb_predictor = best_algo_model.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge')


---------------!CPU times: user 253 ms, sys: 17.7 ms, total: 271 ms
Wall time: 7min 32s


In [33]:
%%time
from sagemaker.predictor import csv_serializer
# 创建 predictor
xgb_predictor=sagemaker.predictor.RealTimePredictor(
"sagemaker-xgboost-2020-11-23-02-23-21-790", # 这个名字就是上图中蓝⾊⽅框中endpoint的name
sagemaker_session=sm_session,
serializer=csv_serializer,
content_type='csv')
# 调⽤predictor的predict⽅法做推理
# xgb_predictor.predict(X_valid.values[0]).decode('utf-8')
result = xgb_predictor.predict(X_valid.values).decode('utf-8')

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


CPU times: user 210 ms, sys: 35 ms, total: 245 ms
Wall time: 652 ms


In [34]:
result

'41414.8984375,84222.7734375,56059.30078125,35239.60546875,94950.140625,36994.55078125,75662.9921875,24259.232421875,53908.078125,32403.01171875,92118.8828125,32364.123046875,55909.3046875,86653.0546875,52292.33984375,83056.0546875,86602.96875,47335.9453125,57139.359375,93570.71875,89859.9921875,47493.18359375,95569.6875,50101.765625,53596.83203125,29231.1640625,61927.5390625,51298.51171875,53246.44921875,79008.2734375,37306.3125,63863.66015625,31557.830078125,37582.125,49032.83984375,44282.9453125,57818.78125,75122.2265625,36303.94140625,76056.546875,96852.8359375,41466.6171875,106691.8515625,74812.4453125,39301.265625,82167.34375,45436.28125,48876.70703125,60752.515625,36953.43359375,46963.71875,50460.80859375,35814.96875,79786.5390625,74639.9453125,78976.6640625,51515.44921875,56732.15234375,89239.90625,80036.5703125,44483.43359375,30769.01171875,59806.5859375,61176.03515625,71254.7265625,67191.578125,42217.0625,83311.140625,57490.4453125,53242.10546875,34355.0234375,30408.353515625

In [48]:
X_valid_xgb_new['Community'][:1]

34949    10号名邸 3室2厅 205.98平米
Name: Community, dtype: object

In [70]:
result2 = result.split(',')
result_list = []

for i in result2:
    i = round(float(i),2)
    result_list.append(i)

X_valid_xgb_new['Prediction'] = result_list
X_valid_xgb_new['Error'] = abs(X_valid_xgb_new['Price/Square Meter'] - X_valid_xgb_new['Prediction'])

X_valid_xgb_new['Community'] = X_valid_xgb_new['Community'].str.split(pat = ' ')

first30 = X_valid_xgb_new.head(10000)

In [71]:
for j in range(len(first30['Community'])):

    first30['Community'].iloc[j] = first30['Community'].iloc[j][0]

In [72]:
first30 = first30.rename(columns = {'Livable Area': 'Livable_Area'})

In [73]:
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt


# ALL_1 = 'ALL (Livable Area)'
# ALL_2 = 'ALL (Decoration Level)'
# ALL_3 = 'ALL (Year Built)'
ALL_4 = 'ALL (Community)'
ALL_5 = 'ALL (Floor Plan)'

# def unique_sorted_values_plus_Livable_Area(array):
#     unique = array.unique().tolist()
#     unique.sort()
#     unique.insert(0, ALL_1)
#     return unique

def unique_sorted_values_plus_Community(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL_4)
    return unique

def unique_sorted_values_plus_Floor_Plan(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL_5)
    return unique




output = widgets.Output()
plot_output = widgets.Output()

# dropdown_Livable_Area = widgets.Dropdown(options = unique_sorted_values_plus_Livable_Area(first30.Livable_Area))
dropdown_Community = widgets.Dropdown(options = unique_sorted_values_plus_Community(first30.Community))
dropdown_Floor_Plan = widgets.Dropdown(options = unique_sorted_values_plus_Floor_Plan(first30.Floor_Plan))


def common_filtering(Community, Floor_Plan):
    output.clear_output()
    plot_output.clear_output()
    
    if (Community == ALL_4) & (Floor_Plan == ALL_5) :
        common_filter = first30
    elif (Community == ALL_4):
        common_filter = first30[first30.Floor_Plan == Floor_Plan]
    elif (Floor_Plan == ALL_5):
        common_filter = first30[first30.Community == Community]
    else:
        common_filter = first30[(first30.Community == Community) & 
                                  (first30.Floor_Plan == Floor_Plan)]
    
    with output:
        display(common_filter)
        
    with plot_output:
        sns.kdeplot(common_filter['Price/Square Meter'], shade=True)
        plt.show()
    
        
def dropdown_Community_eventhandler(change):
    common_filtering(change.new, dropdown_Floor_Plan.value)
    
def dropdown_Floor_Plan_eventhandler(change):
    common_filtering(dropdown_Community.value, change.new)
    
dropdown_Community.observe(dropdown_Community_eventhandler, names='value')
dropdown_Floor_Plan.observe(dropdown_Floor_Plan_eventhandler, names='value')

In [74]:
item_layout = widgets.Layout(margin='0 0 50px 0')
input_widgets = widgets.HBox([dropdown_Community, dropdown_Floor_Plan],layout=item_layout)

tab = widgets.Tab([output, plot_output])
tab.set_title(0, 'Dataset Exploration')
tab.set_title(1, 'KDE Plot')

dashboard = widgets.VBox([input_widgets, tab])
display(dashboard)

VBox(children=(HBox(children=(Dropdown(options=('ALL (Community)', '10号名邸', '621小区', '七彩华园', '七省办', '七贤村', '万地…

In [None]:
# sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)