In [70]:
import time
import boto3
import sagemaker
import urllib
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [71]:
cwd = os.getcwd()
print(cwd)

/home/ec2-user/SageMaker/706-Final-Project/10_Code


In [72]:
!aws s3 ls

2020-11-23 21:22:16 elasticbeanstalk-us-east-1-774975782106
2020-10-26 01:38:51 haidianhousing
2020-10-28 02:20:59 housing-lab


In [73]:
! aws configure list

      Name                    Value             Type    Location
      ----                    -----             ----    --------
   profile                <not set>             None    None
access_key     ****************GXOI         iam-role    
secret_key     ****************DChV         iam-role    
    region                us-east-1      config-file    ~/.aws/config


In [74]:
# Getting role
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [75]:
# One has to set up a S3 bucket before running the code below
bucket = 'housing-lab'
prefix = 'beijing-housing-data'

In [76]:
X = pd.read_csv("/home/ec2-user/SageMaker/706-Final-Project/20_Cleaned_Data/prepared_data_tommy.csv" )

In [77]:
# Taking a look at the dataset
X.head()

Unnamed: 0,Price/Square Meter,Housing_ID,Community,Livable Area,Price,Year_Built,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level,Floor_Plan
0,38544.0,1,中关村南大街甲3号 2室1厅 56.3平米,56.3,217.0,1965,0,0,1,0,1,2,1
1,33667.0,2,中关村南大街甲3号 2室1厅 55.1平米,55.1,185.5,1965,0,0,1,0,3,1,1
2,91429.0,3,民族大学南路19号院 2室1厅 70平米,70.0,640.0,1993,0,0,1,0,1,3,1
3,102752.0,4,民族大学南路19号院 2室1厅 98.49平米,98.49,1012.0,2000,0,0,1,0,1,4,1
4,98847.0,5,民族大学南路19号院 2室1厅 60.7平米,60.7,600.0,1991,0,0,1,0,3,4,1


In [78]:
# Extract the column Price/Square Meter from X
y = X['Price/Square Meter']

In [79]:
y.head()

0     38544.0
1     33667.0
2     91429.0
3    102752.0
4     98847.0
Name: Price/Square Meter, dtype: float64

In [80]:
# Drop the column Price/Square Meter in X
X.drop(['Price/Square Meter'], axis=1, inplace=True)

In [81]:
X.head()

Unnamed: 0,Housing_ID,Community,Livable Area,Price,Year_Built,Facing_East,Facing_North,Facing_South,Facing_West,Decoration_Level,Floor_Level,Floor_Plan
0,1,中关村南大街甲3号 2室1厅 56.3平米,56.3,217.0,1965,0,0,1,0,1,2,1
1,2,中关村南大街甲3号 2室1厅 55.1平米,55.1,185.5,1965,0,0,1,0,3,1,1
2,3,民族大学南路19号院 2室1厅 70平米,70.0,640.0,1993,0,0,1,0,1,3,1
3,4,民族大学南路19号院 2室1厅 98.49平米,98.49,1012.0,2000,0,0,1,0,1,4,1
4,5,民族大学南路19号院 2室1厅 60.7平米,60.7,600.0,1991,0,0,1,0,3,4,1


In [82]:
# Splitting dataset into 4 subsets: X_train, X_valid, y_train, y_valid (We still have community name here)
X_train_with_community, X_valid_with_community, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [83]:
# Make a new X_train, y_train (without community name here)
X_train = X_train_with_community.drop(columns = ['Community'])
X_valid = X_valid_with_community.drop(columns = ['Community'])

In [84]:
X_train_xgb= pd.concat([y_train,X_train], axis=1)
X_train_xgb.to_csv('./X_train.csv',header=False,index=False)

In [85]:
X_valid_xgb= pd.concat([y_valid,X_valid], axis=1)
X_valid_xgb.to_csv('./X_valid.csv',header=False,index=False)

In [86]:
X_valid_xgb_new= pd.concat([y_valid,X_valid_with_community], axis=1)

In [87]:
sm_session = sagemaker.Session()
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')
sm_session.upload_data('./X_train.csv',bucket,prefix)
sm_session.upload_data('./X_valid.csv',bucket,prefix)

's3://housing-lab/beijing-housing-data/X_valid.csv'

In [88]:
# Getting a container from Sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', repo_version='0.90-2')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [89]:
# Setting hyperparameters
hyperparameters = {
"max_depth":"5",
"eta":"0.5",
"early_stopping_rounds":"5",
"eval_metric":"rmse",
"num_round":"30",
"objective":"reg:linear"}

In [90]:
# Specifying type of EC2 instance
instance_type = 'ml.m5.xlarge'

In [91]:
# Output directory for model
output_path = 's3://{}/{}/output'.format(bucket, prefix)

In [92]:
# Input data type
content_type = "csv"

In [93]:
# Setting name for training job
job_name = 'xgb-housing-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

Training job xgb-housing-2020-11-24-03-41-20


In [94]:
# Use Managed Spot Training to lower training cost
train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None
checkpoint_s3_uri = ('s3://{}/{}/checkpoints/{}'.format(bucket, prefix,
job_name)
if train_use_spot_instances else None)

In [95]:
# Initialize Estimators
xgb = sagemaker.estimator.Estimator(container,
role,
hyperparameters=hyperparameters,
train_instance_count=1,
train_instance_type=instance_type,
train_volume_size=5, # 5 GB
output_path=output_path,
sagemaker_session=sagemaker.Session(),
train_use_spot_instances=train_use_spot_instances,
train_max_run=train_max_run,
train_max_wait=train_max_wait,
checkpoint_s3_uri=checkpoint_s3_uri
);

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [96]:
# Configure the data channels
train_data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'X_train.csv')
validation_data_location = 's3://{}/{}/{}'.format(bucket, prefix,
'X_valid.csv')
train_channel = sagemaker.session.s3_input(train_data_location,
content_type='csv')
valid_channel = sagemaker.session.s3_input(validation_data_location,
content_type='csv')
data_channels = {'train': train_channel, 'validation': valid_channel}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [97]:
%%time
# Use xgb.fit to train the regression model
xgb.fit(inputs=data_channels, job_name=job_name, logs=True)

2020-11-24 03:41:24 Starting - Starting the training job...
2020-11-24 03:41:26 Starting - Launching requested ML instances......
2020-11-24 03:42:40 Starting - Preparing the instances for training...
2020-11-24 03:43:23 Downloading - Downloading input data...
2020-11-24 03:43:50 Training - Downloading the training image..
2020-11-24 03:44:17 Uploading - Uploading generated training model
2020-11-24 03:44:17 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:roo

In [98]:
%%time
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {'max_depth':IntegerParameter(1,10),
                         'alpha': ContinuousParameter(0, 100),
                         'min_child_weight': ContinuousParameter(1, 5),
                         'eta': ContinuousParameter(0.2, 0.7),  
                         'num_round': IntegerParameter(10,200)
                         }

objective_metric_name = 'validation:rmse'
objective_type = 'Minimize'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=6, # Set this to 10 or above depending upon budget & available time.
                            max_parallel_jobs=2,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit(inputs=data_channels, include_cls_metadata=False)
tuner.wait()

.......................................................................................................................................!
!
CPU times: user 725 ms, sys: 26.9 ms, total: 752 ms
Wall time: 11min 23s


In [99]:
%%time
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

CPU times: user 20.6 ms, sys: 175 µs, total: 20.8 ms
Wall time: 93.9 ms


'Completed'

In [100]:
%%time
from pprint import pprint
from sagemaker.analytics import HyperparameterTuningJobAnalytics

tuner_analytics = HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())

df_tuning_job_analytics = tuner_analytics.dataframe()

# Sort the tuning job analytics by the final metrics value
df_tuning_job_analytics.sort_values(
    by=['FinalObjectiveValue'],
    inplace=True,
    ascending=False if tuner.objective_type == "Maximize" else True)

# Show detailed analytics for the top 20 models
df_tuning_job_analytics.head(20)

CPU times: user 29.6 ms, sys: 0 ns, total: 29.6 ms
Wall time: 95.3 ms


Unnamed: 0,alpha,eta,max_depth,min_child_weight,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
5,91.92483,0.540998,7.0,1.753762,40.0,sagemaker-xgboost-201124-0344-001-b8f6341e,Completed,1131.890015,2020-11-24 03:46:42+00:00,2020-11-24 03:47:33+00:00,51.0
1,86.870143,0.318224,6.0,1.346574,46.0,sagemaker-xgboost-201124-0344-005-fb293306,Completed,1168.939941,2020-11-24 03:53:43+00:00,2020-11-24 03:54:41+00:00,58.0
3,33.952662,0.357582,8.0,1.591125,14.0,sagemaker-xgboost-201124-0344-003-73bc6116,Completed,1169.819946,2020-11-24 03:50:29+00:00,2020-11-24 03:51:23+00:00,54.0
2,40.205714,0.421023,7.0,2.229997,25.0,sagemaker-xgboost-201124-0344-004-7d28e6c9,Completed,1230.189941,2020-11-24 03:50:19+00:00,2020-11-24 03:51:15+00:00,56.0
4,22.807334,0.544063,3.0,3.706866,188.0,sagemaker-xgboost-201124-0344-002-a90f2d18,Completed,1250.51001,2020-11-24 03:46:49+00:00,2020-11-24 03:47:45+00:00,56.0
0,86.867091,0.261803,5.0,1.283464,57.0,sagemaker-xgboost-201124-0344-006-01975a92,Completed,1345.630005,2020-11-24 03:54:20+00:00,2020-11-24 03:55:15+00:00,55.0


In [101]:
attached_tuner = HyperparameterTuner.attach(tuner.latest_tuning_job.name, sagemaker_session=sagemaker.Session())
best_training_job = attached_tuner.best_training_job()

In [102]:
from sagemaker.estimator import Estimator
algo_estimator = Estimator.attach(best_training_job)

best_algo_model = algo_estimator.create_model(env={'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT':"text/csv"})


2020-11-24 03:47:33 Starting - Preparing the instances for training
2020-11-24 03:47:33 Downloading - Downloading input data
2020-11-24 03:47:33 Training - Training image download completed. Training in progress.
2020-11-24 03:47:33 Uploading - Uploading generated training model
2020-11-24 03:47:33 Completed - Training job completed


In [103]:
%%time
# Model Deployment
endpoint_na = 'my-endpoint2'
xgb_predictor = best_algo_model.deploy(initial_instance_count=1,
                         instance_type='ml.m5.xlarge',
                         endpoint_name= endpoint_na )


--------------!CPU times: user 231 ms, sys: 27.4 ms, total: 258 ms
Wall time: 7min 3s


In [104]:
%%time
from sagemaker.predictor import csv_serializer
# Creating predictor
xgb_predictor=sagemaker.predictor.RealTimePredictor(
endpoint_name = endpoint_na, 
sagemaker_session=sm_session,
serializer=csv_serializer,
content_type='csv')
# Leveraging the predictor to predict housing prices
# xgb_predictor.predict(X_valid.values[0]).decode('utf-8')
result = xgb_predictor.predict(X_valid.values).decode('utf-8')

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


CPU times: user 168 ms, sys: 4.39 ms, total: 172 ms
Wall time: 628 ms


In [105]:
result

'40435.62890625,83828.453125,56077.2421875,35877.9375,95684.1015625,37242.93359375,73451.28125,24694.037109375,54240.9453125,31313.63671875,92344.5234375,32079.23828125,56468.1015625,87152.0,52323.58984375,83536.2890625,86491.515625,48272.10546875,58593.40234375,93233.8515625,90318.78125,46922.36328125,95462.0859375,49542.9453125,53775.7578125,29406.265625,61475.09765625,51285.63671875,53081.0703125,77538.1015625,36887.8515625,63455.52734375,31241.39453125,37273.7734375,48625.1640625,44129.015625,58161.7734375,75143.5625,37522.4921875,75058.640625,96774.984375,40557.44140625,106604.0,74623.734375,40492.41015625,83302.734375,46007.54296875,49740.96875,61128.8828125,34991.40625,47825.16015625,50616.8984375,36097.140625,79817.4140625,73478.265625,77789.7578125,52463.2734375,57243.75390625,89832.140625,80338.6796875,44305.296875,31085.478515625,60054.5546875,61523.53125,72329.609375,67544.1875,41508.9921875,83080.9765625,57959.05859375,53497.86328125,34693.12109375,30715.181640625,49024.68

In [106]:
X_valid_xgb_new['Community'][:1]

34949    10号名邸 3室2厅 205.98平米
Name: Community, dtype: object

In [107]:
result2 = result.split(',')
result_list = []

for i in result2:
    i = round(float(i),2)
    result_list.append(i)

X_valid_xgb_new['Prediction'] = result_list
X_valid_xgb_new['Error'] = abs(X_valid_xgb_new['Price/Square Meter'] - X_valid_xgb_new['Prediction'])

X_valid_xgb_new['Community'] = X_valid_xgb_new['Community'].str.split(pat = ' ')

first30 = X_valid_xgb_new.head(10000)

In [108]:
for j in range(len(first30['Community'])):

    first30['Community'].iloc[j] = first30['Community'].iloc[j][0]

In [109]:
first30 = first30.rename(columns = {'Livable Area': 'Livable_Area'})

In [110]:
import ipywidgets as widgets
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt


# ALL_1 = 'ALL (Livable Area)'
# ALL_2 = 'ALL (Decoration Level)'
# ALL_3 = 'ALL (Year Built)'
ALL_4 = 'ALL (Community)'
ALL_5 = 'ALL (Floor Plan)'

# def unique_sorted_values_plus_Livable_Area(array):
#     unique = array.unique().tolist()
#     unique.sort()
#     unique.insert(0, ALL_1)
#     return unique

def unique_sorted_values_plus_Community(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL_4)
    return unique

def unique_sorted_values_plus_Floor_Plan(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL_5)
    return unique

output = widgets.Output()
plot_output = widgets.Output()

# dropdown_Livable_Area = widgets.Dropdown(options = unique_sorted_values_plus_Livable_Area(first30.Livable_Area))
dropdown_Community = widgets.Dropdown(options = unique_sorted_values_plus_Community(first30.Community))
dropdown_Floor_Plan = widgets.Dropdown(options = unique_sorted_values_plus_Floor_Plan(first30.Floor_Plan))


def common_filtering(Community, Floor_Plan):
    output.clear_output()
    plot_output.clear_output()
    
    if (Community == ALL_4) & (Floor_Plan == ALL_5) :
        common_filter = first30
    elif (Community == ALL_4):
        common_filter = first30[first30.Floor_Plan == Floor_Plan]
    elif (Floor_Plan == ALL_5):
        common_filter = first30[first30.Community == Community]
    else:
        common_filter = first30[(first30.Community == Community) & 
                                  (first30.Floor_Plan == Floor_Plan)]
    
    with output:
        display(common_filter)
        
    with plot_output:
        sns.kdeplot(common_filter['Price/Square Meter'], shade=True)
        plt.show()
    
        
def dropdown_Community_eventhandler(change):
    common_filtering(change.new, dropdown_Floor_Plan.value)
    
def dropdown_Floor_Plan_eventhandler(change):
    common_filtering(dropdown_Community.value, change.new)
    
dropdown_Community.observe(dropdown_Community_eventhandler, names='value')
dropdown_Floor_Plan.observe(dropdown_Floor_Plan_eventhandler, names='value')

In [111]:
item_layout = widgets.Layout(margin='0 0 50px 0')
input_widgets = widgets.HBox([dropdown_Community, dropdown_Floor_Plan],layout=item_layout)

tab = widgets.Tab([output, plot_output])
tab.set_title(0, 'Dataset Exploration')
tab.set_title(1, 'KDE Plot')

dashboard = widgets.VBox([input_widgets, tab])
display(dashboard)

VBox(children=(HBox(children=(Dropdown(options=('ALL (Community)', '10号名邸', '621小区', '七彩华园', '七省办', '七贤村', '万地…

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)