In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role and assign S3 bucket
role = get_execution_role()  # execution role passed to the estimator later on
prefix = 'sagemaker/wimlds-sagemaker-xgboost-demo'  # S3 key prefix for the training data and model output
bucket_name = 'sagemaker-house-prediction-xgbbost' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET

# Each region has its own XGBoost container image; only these four regions are mapped.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

my_region = boto3.session.Session().region_name # set the region of the instance

# Fail early with a readable message instead of a bare KeyError when the notebook
# runs in a region that has no entry above (or when no region is configured at all).
if my_region not in containers:
    raise ValueError(
        "No XGBoost container is mapped for region {!r}; supported regions: {}".format(
            my_region, ", ".join(sorted(containers))))

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-2 region. You will use the 825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [2]:
# Read the house-price CSV from S3 into a DataFrame.
data = pd.read_csv(filepath_or_buffer='s3://final-assignment/kc_house_data.csv')

### Prepare the data

In [3]:
# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = data.sample(frac=1, random_state=1729)

# First 70% of the shuffled rows go to training, the remainder to testing.
# Positional slicing produces the same two frames as np.split(shuffled, [n_train])
# without routing a DataFrame through NumPy's split (deprecated swapaxes path in pandas).
n_train = int(0.7 * len(data))
train_data, test_data = shuffled.iloc[:n_train], shuffled.iloc[n_train:]
print(train_data.shape, test_data.shape)

(15129, 21) (6484, 21)


In [4]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17399,5153900080,20140714T000000,199000.0,3,1.0,1510,9100,1.0,0,0,...,7,1510,0,1966,0,98003,47.3331,-122.319,1180,7220
16005,2754700095,20150316T000000,747000.0,3,1.5,1710,5120,2.0,0,0,...,7,1710,0,1920,0,98115,47.6801,-122.305,1530,5170
8382,4399200100,20150428T000000,288000.0,3,2.25,1560,9706,1.0,0,0,...,7,1560,0,1963,0,98002,47.3191,-122.213,1510,9706
7381,3897100275,20141027T000000,460000.0,3,1.75,1660,9900,2.0,0,0,...,8,1660,0,1978,0,98033,47.6704,-122.184,1720,6600
16824,9477201470,20141022T000000,379950.0,3,1.0,1270,6900,1.0,0,0,...,7,1270,0,1977,0,98034,47.7279,-122.192,1480,7280


In [5]:
# Number of columns in the training frame before feature selection.
train_data.shape[1]

21

Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering.

In [6]:
# Columns kept for modelling. 'price' comes first because the CSV training format
# expects the target variable in the first column.
attributes = [
    'price',
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_basement',
    'yr_built', 'yr_renovated',
    'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]
len(attributes)

18

In [7]:
# Keep only the modelling columns, in the order given by `attributes` (target first).
train_data = train_data.loc[:, attributes]

In [8]:
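# NOTE: feature scaling is left disabled. As written, the scaler would also be fit on
# the target column ('price'), because train_data still contains it at this point.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_data = scaler.fit_transform(train_data)
# train_data = pd.DataFrame(train_data)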

In [9]:
# Write the training set to a local CSV with no header row and no index column.
train_data.to_csv('train.csv', header=False, index=False)

### Upload training data to S3 Bucket

In [10]:
# S3 object keys are '/'-separated strings, not filesystem paths, so build the key
# explicitly instead of relying on the platform-dependent separator of os.path.join.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the uploaded <prefix>/train location, declared as CSV.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [11]:
# Display the bucket name that the training data was uploaded to.
bucket_name

'sagemaker-house-prediction-xgbbost'

### Training

First we'll need to specify training parameters to the estimator. This includes:

- The xgboost algorithm container
- The IAM role to use
- Training instance type and count
- S3 location for output data
- Algorithm hyperparameters

And then a .fit() function which specifies:

The S3 location of the input data. In this case only a training set is passed in; no validation channel is provided.

In [12]:
# SageMaker session object handed to the estimator below.
sess = sagemaker.session.Session()

### XGBoost Hyperparameters


There are a lot of hyperparameters; a few of them are:

1. Subsample
    - Subsample ratio of the training instance. 
    - Setting it to 0.5 means that XGBoost randomly collects half of the data instances to grow trees. 
    - This prevents overfitting.

        Optional

        Valid values: Float. Range: [0,1].

        Default value: 1
2. Eta
    - Step size shrinkage, prevents overfitting.
3. Gamma
    - Minimum loss reduction to create a partition, larger = more conservative
4. Alpha
    - L1 regularization term; larger = more conservative
5. Lambda
    - L2 regularization term; larger = more conservative



In [13]:
# Estimator for the region-specific XGBoost container. Model artifacts are written
# under <prefix>/output in the bucket.
# sagemaker>=2 renamed train_instance_count / train_instance_type to
# instance_count / instance_type; the current names are used here.
xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(eta=0.06,                 # step size shrinkage
                        alpha=0.8,                # L1 regularization term
                        # NOTE(review): the notes above describe `lambda` (L2 term), but the
                        # parameter set here is `lambda_bias` - confirm which one was intended.
                        lambda_bias=0.8,
                        gamma=50,                 # minimum loss reduction to create a partition
                        min_child_weight=6,
                        subsample=0.5,            # subsample ratio of the training instances
                        silent=0,
                        # NOTE(review): fit() is only given a 'train' channel, so there is no
                        # validation metric for early stopping to monitor - confirm this is intended.
                        early_stopping_rounds=5,
                        objective='reg:linear',
                        num_round=1000)
                        

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [14]:
# Launch the training job; only the 'train' channel is supplied.
xgb.fit(inputs={'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-11-19-02-05-303


2023-03-11 19:02:05 Starting - Starting the training job...
2023-03-11 19:02:30 Starting - Preparing the instances for training......
2023-03-11 19:03:22 Downloading - Downloading input data...
2023-03-11 19:03:57 Training - Downloading the training image...
2023-03-11 19:04:33 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-03-11:19:04:48:INFO] Running standalone xgboost training.[0m
[34m[2023-03-11:19:04:48:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-03-11:19:04:48:INFO] File size need to be processed in the node: 1.17mb. Available memory size in the node: 8603.28mb[0m
[34m[2023-03-11:19:04:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:04:48] S3DistributionType set as FullyReplicated[0m
[34m[19:04:48] 15129x17 matrix with 257193 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[19:04:48] src/tree/updater_prune.cc:74: tree pruning end,


2023-03-11 19:05:03 Uploading - Uploading generated training model[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[411]#011train-rmse:71709.8[0m
[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[412]#011train-rmse:71676.4[0m
[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[413]#011train-rmse:71659.9[0m
[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[414]#011train-rmse:71645.1[0m
[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[415]#011train-rmse:71579.2[0m
[34m[19:04:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 78 extra nodes, 0 pruned nodes, max_depth=6[0m
[34m[416]#011train-rm


2023-03-11 19:05:15 Completed - Training job completed
Training seconds: 112
Billable seconds: 112


In [15]:
# Host the trained model behind a real-time endpoint on a single ml.m4.xlarge instance.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: xgboost-2023-03-11-19-05-49-467
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-03-11-19-05-49-467
INFO:sagemaker:Creating endpoint with name xgboost-2023-03-11-19-05-49-467


------!

### Evaluation

There are many ways to compare the performance of a machine learning model, but let's start by simply comparing actual to predicted values. 

First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.

Note: For inference with CSV format, SageMaker XGBoost requires that the data does NOT include the target variable.

In [16]:
# Select the inference features by reusing the training column list minus its first
# entry (the 'price' target). This ties the columns sent to the endpoint, and their
# order, to what the model was trained on, instead of a separately maintained drop list.
feature_columns = attributes[1:]
test_data_array = test_data[feature_columns].values #load the data into an array

In [17]:
# Send request payloads as CSV. In sagemaker>=2 the predictor takes its content type
# from the serializer, so assigning a CSVSerializer is all that is needed. This replaces
# the deprecated csv_serializer object and a __setattr__ call that only created a stray
# attribute named after the current content-type value.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer() # set the serializer type


In [18]:
# Send the whole test matrix in one request and decode the response bytes to text.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!

# Parse the full comma-separated response. Slicing off the first character would drop
# the leading digit of the first predicted price, so only a leading delimiter is
# stripped, and only if one is actually present.
predictions_array = np.fromstring(predictions.lstrip(','), sep=',') # and turn the prediction into an array
print(predictions_array.shape)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(6484,)


In [19]:
from sklearn.metrics import r2_score

# Coefficient of determination of the endpoint predictions against the held-out prices.
actual_prices = test_data['price']
r2 = r2_score(actual_prices, predictions_array)
print("R2 score : %.2f" % r2)

R2 score : 0.88
