In [30]:
# import libraries
import json
import math
import os
import re
import sys
import urllib.request
from time import gmtime, strftime

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from IPython.display import Image
from IPython.display import display
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sklearn.metrics import r2_score

# Define IAM role and assign S3 bucket
role = get_execution_role()  # IAM role of this notebook; passed to the estimator further down
prefix = 'sagemaker/wimlds-sagemaker-xgboost-demo'  # S3 key prefix shared by the training input and model output
bucket_name = 'sagemaker-us-east-1-2fd62760754591' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET

# Each region has its own XGBoost container image; only the regions listed here are supported.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

my_region = boto3.session.Session().region_name # set the region of the instance

# Fail early with a readable message instead of an opaque KeyError/TypeError
# when the session has no region or the region has no container entry.
if my_region not in containers:
    raise ValueError(
        "No XGBoost container is configured for region {!r}; supported regions: {}".format(
            my_region, ", ".join(sorted(containers))))

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


### Prepare the data

In [31]:
# Shuffle every row once with a fixed seed, then cut the shuffled frame 70/30.
# Positional slicing replaces np.split on a DataFrame, which goes through the
# deprecated DataFrame.swapaxes; the resulting rows and their order are the same.
shuffled_data = data.sample(frac=1, random_state=1729)
train_row_count = int(0.7 * len(data))
train_data = shuffled_data.iloc[:train_row_count]
test_data = shuffled_data.iloc[train_row_count:]
print(train_data.shape, test_data.shape)

(15129, 21) (6484, 21)


In [32]:
# Preview the first rows of the training split (all original columns are still present here).
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17399,5153900080,2014-07-14,199000.0,3,1.0,1510,9100,1.0,0,0,...,7,1510,0,1966,0,98003,47.3331,-122.319,1180,7220
16005,2754700095,2015-03-16,747000.0,3,1.5,1710,5120,2.0,0,0,...,7,1710,0,1920,0,98115,47.6801,-122.305,1530,5170
8382,4399200100,2015-04-28,288000.0,3,2.25,1560,9706,1.0,0,0,...,7,1560,0,1963,0,98002,47.3191,-122.213,1510,9706
7381,3897100275,2014-10-27,460000.0,3,1.75,1660,9900,2.0,0,0,...,8,1660,0,1978,0,98033,47.6704,-122.184,1720,6600
16824,9477201470,2014-10-22,379950.0,3,1.0,1270,6900,1.0,0,0,...,7,1270,0,1977,0,98034,47.7279,-122.192,1480,7280


In [33]:
# Number of columns in the training split before any column selection.
train_data.shape[1]

21

Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering.

In [34]:
# Columns written to the training CSV. The target comes first because the
# XGBoost container expects the label in the first CSV column.
target_column = 'price'
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_basement',
    'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]
attributes = [target_column] + feature_columns
len(attributes)

18

In [35]:
# Keep only the target and the chosen feature columns, in the order given by `attributes`.
train_data = train_data.loc[:, attributes]

In [36]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_data = scaler.fit_transform(train_data)
# train_data = pd.DataFrame(train_data)

In [37]:
# Write the training split as a bare CSV: no header row and no index column.
train_data.to_csv('train.csv', header=False, index=False)

### Upload training data to S3 Bucket

In [38]:
# S3 object keys always use '/' as the separator, so build the key explicitly
# rather than with os.path.join, which follows the local OS path convention.
train_object_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_object_key).upload_file('train.csv')

# Training input pointing at the S3 "train" folder that holds the uploaded CSV.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

### Training

First we'll need to specify training parameters to the estimator. This includes:

- The xgboost algorithm container
- The IAM role to use
- Training instance type and count
- S3 location for output data
- Algorithm hyperparameters

And then a .fit() function which specifies:

- The input data channels and their S3 locations. In this case only the training set is passed in (as the `train` channel); no validation set is supplied.

In [39]:
# SageMaker session object; it is handed to the estimator via `sagemaker_session`.
sess = sagemaker.Session()

### XGBoost Hyperparameters


There are many hyperparameters; a few of them are:

1. Subsample
    - Subsample ratio of the training instance. 
    - Setting it to 0.5 means that XGBoost randomly collects half of the data instances to grow trees. 
    - This prevents overfitting.

        Optional

        Valid values: Float. Range: [0,1].

        Default value: 1
2. Eta
    - Step size shrinkage, prevents overfitting.
3. Gamma
    - Minimum loss reduction to create a partition, larger = more conservative
4. Alpha
    - L1 regularization term; larger = more conservative
5. Lambda
    - L2 regularization term; larger = more conservative



In [40]:
# Hyperparameters forwarded to the XGBoost training container.
# NOTE(review): the notes above describe `lambda` (L2 regularization), but the
# key set here is `lambda_bias` - confirm which one was intended.
# NOTE(review): `early_stopping_rounds` is set, yet only a 'train' channel is
# passed to fit() in this notebook - confirm it has any effect.
training_hyperparameters = {
    'eta': 0.06,
    'alpha': 0.8,
    'lambda_bias': 0.8,
    'gamma': 50,
    'min_child_weight': 6,
    'subsample': 0.5,
    'silent': 0,
    'early_stopping_rounds': 5,
    'objective': 'reg:linear',
    'num_round': 1000,
}

# Estimator for the region-specific XGBoost image: one ml.m4.xlarge training
# instance, with model artifacts written under the demo prefix in the bucket.
model_output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    sagemaker_session=sess,
)
xgb.set_hyperparameters(**training_hyperparameters)
                        

In [1]:
# Start the training job, supplying only the 'train' input channel.
# NOTE(review): no validation channel is passed here, so the early_stopping_rounds
# hyperparameter presumably has nothing to monitor - confirm.
xgb.fit({'train': s3_input_train})

In [42]:
# Deploy the trained model behind an endpoint with a single ml.m4.xlarge instance.
# NOTE(review): no endpoint clean-up step is visible in this section - confirm one exists later.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

---------------!

### Evaluation

There are many ways to compare the performance of a machine learning model, but let's start by simply comparing actual to predicted values. 

First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.

Note: For inference with CSV format, SageMaker XGBoost requires that the data does NOT include the target variable.

In [43]:
# Build the inference matrix from the same column list used for training
# (minus the target) so the feature set and order cannot drift from the training CSV.
inference_columns = [column for column in attributes if column != 'price']
test_data_array = test_data[inference_columns].values #load the data into an array

In [44]:
# Configure the predictor so request payloads are sent to the endpoint as CSV text.
xgb_predictor.content_type = 'text/csv' # content type declared for inference requests
xgb_predictor.serializer = csv_serializer # serializer applied to the input before it is sent


In [45]:
# Invoke the endpoint; the response body is bytes, decoded here to text.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!

# Parse the WHOLE payload. The previous `predictions[1:]` dropped the first
# character, which strips the leading digit of the first predicted value.
# Splitting on commas and whitespace assumes the values are separated by
# commas and/or newlines - confirm against the container's response format.
predictions_array = np.array(
    [float(token) for token in re.split(r'[,\s]+', predictions.strip()) if token],
    dtype=float,
)

# One prediction is expected per input row.
assert predictions_array.shape[0] == test_data_array.shape[0], "prediction count does not match test rows"
print(predictions_array.shape)

(6484,)


In [46]:
# Coefficient of determination of the endpoint's predictions against the held-out prices.
# (r2_score is imported with the other libraries in the notebook's first cell.)
test_r2 = r2_score(test_data['price'], predictions_array)
print("R2 score : %.2f" % test_r2)

R2 score : 0.89
