In [31]:
# import libraries
#This code imports the required libraries and defines the environment variables you need to prepare the data, train the ML model, and deploy the ML model.
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
              'eu-west-2': '205493899709.dkr.ecr.eu-west-2.amazonaws.com/xgboost-neo:'} # each region has its XGBoost container
my_region = boto3.session.Session().region_name # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the eu-west-2 region. You will use the 205493899709.dkr.ecr.eu-west-2.amazonaws.com/xgboost-neo: container for your SageMaker endpoint.


In [59]:
#Create the S3 bucket to store your data
#can skip this step because we already have a bucket
bucket_name = 'ucl-msin0166-2021-london-housing-epc' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'eu-west-2':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 error:  An error occurred (IllegalLocationConstraintException) when calling the CreateBucket operation: The unspecified location constraint is incompatible for the region specific endpoint this request was sent to.


In [29]:
#download the data to sagemaker
try:
  urllib.request.urlretrieve("ucl-msin0166-2021-london-housing-epc", "merged_table_epc_ratings.csv")
  print('Success: downloaded merged_table_epc_ratings.csv')
except Exception as e:
  print('Data load error: ',e)

try:
  model_data = pd.read_csv('./merged_table_epc_ratings.csv',index_col=0)
  print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Data load error:  unknown url type: 'ucl-msin0166-2021-london-housing-epc'
Data load error:  [Errno 2] No such file or directory: './merged_table_epc_ratings.csv'


In [25]:
# To display full output in Notebook, instead of only the last result 
from IPython.core.interactiveshell import InteractiveShell 

InteractiveShell.ast_node_interactivity = "all" 

In [81]:
#alternative method

bucket = 'ucl-msin0166-2021-london-housing-epc'
subfolder = 'sagemaker'

from sagemaker import get_execution_role
role = get_execution_role()

conn = boto3.client('s3')

contents = conn.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']

for f in contents:
    print(f['Key'])

    #loading data manually into diretory # figure out how to change this
try:
    model_data = pd.read_csv('dexters_epc_ratings.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
# import pickle

# my_bucket = 'ucl-msin0166-2021-london-housing-epc'
# my_file = 'sagemaker/merged_table.csv'


# s3client = boto3.client('s3')
# response = s3client.get_object(Bucket=my_bucket, Key=my_file)

# response
# body = response['Body']

# data = pickle.loads(body.read())

sagemaker/
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/profiler-output/system/incremental/2021050102/1619836020.algo-1.json
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/profiler-output/system/incremental/2021050102/1619836080.algo-1.json
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/rule-output/ProfilerReport-1619835886/profiler-output/profiler-report.html
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/rule-output/ProfilerReport-1619835886/profiler-output/profiler-report.ipynb
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/rule-output/ProfilerReport-1619835886/profiler-output/profiler-reports/BatchSize.json
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/rule-output/ProfilerReport-1619835886/profiler-output/profiler-reports/CPUBottleneck.json
sagemaker/output/sagemaker-xgboost-2021-05-01-02-24-46-963/rule-output/ProfilerReport-1619835886/profiler-output/profiler-reports/Dataloader.json
sagemaker/output/sagemak

In [82]:
#Shuffle and split the data into training data and test data. Copy and paste the following code into the next code cell and choose Run.
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(9, 18) (4, 18)


In [83]:
train_data

Unnamed: 0_level_0,street_name,price,address,bedrooms,bathrooms,reception,link,subway_station,distance,efficient,average_epc_C,average_epc_D,tube_line_Central,tube_line_Circle,tube_line_DLR,tube_line_District,tube_line_Jubilee,tube_line_Piccadilly
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
143179,Old Queen Street,9250000.0,"Westminster, SW1H",5,4,6,https://www.dexters.co.uk/property-for-sale/ho...,St.James's Park,0.16,0,1,0,0,1,0,0,0,0
138689,Queen Street,6000000.0,"Mayfair, W1J",3,3,2,https://www.dexters.co.uk/property-for-sale/fl...,Green Park,0.18,0,1,0,0,0,0,0,1,0
151907,Lancaster Gate,23000000.0,"Hyde Park, W2",60,0,1,https://www.dexters.co.uk/property-for-sale/pr...,Lancaster Gate,0.26,0,1,0,1,0,0,0,0,0
84720,Copse Hill,4650000.0,"Wimbledon, SW20",5,4,5,https://www.dexters.co.uk/property-for-sale/pr...,Wimbledon,1.24,0,1,0,0,0,0,1,0,0
159717,Ladbroke Road,5500000.0,"Notting Hill, W11",5,4,2,https://www.dexters.co.uk/property-for-sale/pr...,Notting Hill Gate,0.14,0,0,1,1,0,0,0,0,0
156629,Fairhazel Gardens,5495000.0,"South Hampstead, NW6",5,5,3,https://www.dexters.co.uk/property-for-sale/pr...,Finchley Road,0.19,0,0,1,0,0,0,0,1,0
124656,Wilton Crescent,19500000.0,"Wilton Crescent, SW1X",5,5,2,https://www.dexters.co.uk/property-for-sale/ho...,Knightsbridge,0.22,0,0,1,0,0,0,0,0,1
157041,Kings Road,5000000.0,"Richmond, TW10",8,3,3,https://www.dexters.co.uk/property-for-sale/pr...,Richmond,0.41,0,0,1,0,0,0,1,0,0
151165,Green Street,4950000.0,"Mayfair, W1K",3,3,2,https://www.dexters.co.uk/property-for-sale/fl...,Marble Arch,0.1,0,0,1,1,0,0,0,0,0


# train the model

In [77]:
from sagemaker import image_uris
container=image_uris.retrieve(framework='xgboost',region='eu-west-2',version='1.2-1')
container

'764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.2-1'

In [98]:
model_data

Unnamed: 0_level_0,street_name,price,address,bedrooms,bathrooms,reception,link,subway_station,distance,efficient,average_epc_C,average_epc_D,tube_line_Central,tube_line_Circle,tube_line_DLR,tube_line_District,tube_line_Jubilee,tube_line_Piccadilly
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
151907,Lancaster Gate,23000000.0,"Hyde Park, W2",60,0,1,https://www.dexters.co.uk/property-for-sale/pr...,Lancaster Gate,0.26,0,1,0,1,0,0,0,0,0
124656,Wilton Crescent,19500000.0,"Wilton Crescent, SW1X",5,5,2,https://www.dexters.co.uk/property-for-sale/ho...,Knightsbridge,0.22,0,0,1,0,0,0,0,0,1
143179,Old Queen Street,9250000.0,"Westminster, SW1H",5,4,6,https://www.dexters.co.uk/property-for-sale/ho...,St.James's Park,0.16,0,1,0,0,1,0,0,0,0
138689,Queen Street,6000000.0,"Mayfair, W1J",3,3,2,https://www.dexters.co.uk/property-for-sale/fl...,Green Park,0.18,0,1,0,0,0,0,0,1,0
138689,Queen Street,6000000.0,"Mayfair, W1J",3,3,2,https://www.dexters.co.uk/property-for-sale/fl...,Green Park,0.18,0,1,0,0,0,0,0,1,0
138690,Queen Street,6000000.0,"Mayfair, W1J",3,3,1,https://www.dexters.co.uk/property-for-sale/fl...,East India,0.17,0,1,0,0,0,1,0,0,0
138690,Queen Street,6000000.0,"Mayfair, W1J",3,3,1,https://www.dexters.co.uk/property-for-sale/fl...,East India,0.17,0,1,0,0,0,1,0,0,0
159717,Ladbroke Road,5500000.0,"Notting Hill, W11",5,4,2,https://www.dexters.co.uk/property-for-sale/pr...,Notting Hill Gate,0.14,0,0,1,1,0,0,0,0,0
156629,Fairhazel Gardens,5495000.0,"South Hampstead, NW6",5,5,3,https://www.dexters.co.uk/property-for-sale/pr...,Finchley Road,0.19,0,0,1,0,0,0,0,1,0
157041,Kings Road,5000000.0,"Richmond, TW10",8,3,3,https://www.dexters.co.uk/property-for-sale/pr...,Richmond,0.41,0,0,1,0,0,0,1,0,0


In [101]:
#according to the other documentation, upload the data like this : 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(model_data.drop(['efficient'], axis=1), model_data['efficient'], test_size=0.2, random_state=1)
# X_train_display = X_display.loc[X_train.index]

In [104]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [105]:
import pandas as pd
train = pd.concat([pd.Series(y_train, index=X_train.index), X_train], axis=1)

validation = pd.concat([pd.Series(y_val, index=X_val.index), X_val], axis=1)
test = pd.concat([pd.Series(y_test, index=X_test.index), X_test], axis=1)

In [106]:
train.to_csv('train.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [109]:
#according to the other documentation, upload the data like this : 






import sagemaker, boto3, os
bucket_name = 'ucl-msin0166-2021-london-housing-epc'
prefix='sagemaker'

boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'data/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'data/validation.csv')).upload_file('validation.csv')

In [110]:
#Run the following AWS CLI to check if the CSV files are successfully uploaded to the S3 bucket.
! aws s3 ls {bucket}/{prefix}/data --recursive

2021-05-01 09:51:52       1287 sagemaker/data/train.csv
2021-05-01 09:51:52        513 sagemaker/data/validation.csv


In [111]:
#Attempt 2 at training model
import sagemaker

region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

AWS Region: eu-west-2
RoleArn: arn:aws:iam::849689169827:role/service-role/AmazonSageMaker-ExecutionRole-20210430T103372


In [112]:
! pip install -qU sagemaker

In [113]:
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

s3_output_location='s3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')

container=sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

xgb_model=sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())]
)

764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [114]:
xgb_model.set_hyperparameters(
    max_depth = 5,
    eta = 0.2,
    gamma = 4,
    min_child_weight = 6,
    subsample = 0.7,
    objective = "binary:logistic",
    num_round = 1000
)

In [115]:
from sagemaker.session import TrainingInput

train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/train.csv"), content_type="csv"
)
validation_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/validation.csv"), content_type="csv"
)

In [116]:
xgb_model.fit({"train": train_input, "validation": validation_input}, wait=True)

2021-05-01 09:53:12 Starting - Starting the training job...
2021-05-01 09:53:13 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1619862791: InProgress
......
2021-05-01 09:54:28 Starting - Preparing the instances for training......
2021-05-01 09:55:40 Downloading - Downloading input data...
2021-05-01 09:56:08 Training - Downloading the training image...
2021-05-01 09:56:41 Training - Training image download completed. Training in progress..[34m[2021-05-01 09:56:43.342 ip-10-0-97-161.eu-west-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sa

In [117]:
#specify the S3 bucket URI where the Debugger training reports are generated and check if the reports exist
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

2021-05-01 09:59:00     322345 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-report.html
2021-05-01 09:59:00     168692 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-report.ipynb
2021-05-01 09:58:55        190 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/BatchSize.json
2021-05-01 09:58:55        198 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/CPUBottleneck.json
2021-05-01 09:58:55        126 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/Dataloader.json
2021-05-01 09:58:55        127 sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810

In [118]:
#Download the Debugger XGBoost training and profiling reports to the current workspace:
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://ucl-msin0166-2021-london-housing-epc/sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/CPUBottleneck.json to ProfilerReport-1619862791/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://ucl-msin0166-2021-london-housing-epc/sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/Dataloader.json to ProfilerReport-1619862791/profiler-output/profiler-reports/Dataloader.json
download: s3://ucl-msin0166-2021-london-housing-epc/sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-1619862791/profiler-output/profiler-reports/BatchSize.json to ProfilerReport-1619862791/profiler-output/profiler-reports/BatchSize.json
download: s3://ucl-msin0166-2021-london-housing-epc/sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/rule-output/ProfilerReport-16

In [119]:
#get the file link of the XGBoost training report:
from IPython.display import FileLink, FileLinks
display("Click link below to view the XGBoost Training report", FileLink("CreateXgboostReport/xgboost_report.html"))

'Click link below to view the XGBoost Training report'

In [120]:
xgb_model.model_data

's3://ucl-msin0166-2021-london-housing-epc/sagemaker/xgboost_model/sagemaker-xgboost-2021-05-01-09-53-11-810/output/model.tar.gz'

# Deploying the model

In [121]:
#hosting a model through Amazon EC2 using Amazon SageMaker
import sagemaker
from sagemaker.serializers import CSVSerializer
xgb_predictor=xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer()
)

-----------------!

In [122]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2021-05-01-10-02-43-236'

# Evaluate the model

In [123]:
#Setting up the following function to predict each line of the test set
import numpy as np
def predict(data, rows=1000):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])
    return np.fromstring(predictions[1:], sep=',')

In [127]:
# import matplotlib.pyplot as plt

predictions=predict(test.to_numpy()[:,1:])
# plt.hist(predictions)
# plt.show()

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from model with message "Loading csv data failed with Exception, please ensure data is in csv format:
 <class 'ValueError'>
 could not convert string to float: 'Old Queen Street'". See https://eu-west-2.console.aws.amazon.com/cloudwatch/home?region=eu-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/sagemaker-xgboost-2021-05-01-10-02-43-236 in account 849689169827 for more information.

In [125]:
import sklearn

cutoff=0.5
print(sklearn.metrics.confusion_matrix(test.iloc[:, 0], np.where(predictions > cutoff, 1, 0)))
print(sklearn.metrics.classification_report(test.iloc[:, 0], np.where(predictions > cutoff, 1, 0)))

NameError: name 'predictions' is not defined

In [133]:
test_draft=test.drop(['street_name','address','link','subway_station'], axis=1)

In [134]:
test_draft

Unnamed: 0_level_0,efficient,price,bedrooms,bathrooms,reception,distance,average_epc_C,average_epc_D,tube_line_Central,tube_line_Circle,tube_line_DLR,tube_line_District,tube_line_Jubilee,tube_line_Piccadilly
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
143179,0,9250000.0,5,4,6,0.16,1,0,0,1,0,0,0,0
138689,0,6000000.0,3,3,2,0.18,1,0,0,0,0,0,1,0
138689,0,6000000.0,3,3,2,0.18,1,0,0,0,0,0,1,0


In [145]:
# test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
test_data_array = test.values.tolist()
xgb_predictor.serializer = CSVSerializer() # set the serializer type

predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from model with message "Loading csv data failed with Exception, please ensure data is in csv format:
 <class 'ValueError'>
 could not convert string to float: 'Old Queen Street'". See https://eu-west-2.console.aws.amazon.com/cloudwatch/home?region=eu-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/sagemaker-xgboost-2021-05-01-10-02-43-236 in account 849689169827 for more information.

In [137]:
cm = pd.crosstab(index=test['efficient'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Not efficient", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Efficient", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

NameError: name 'predictions_array' is not defined