In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role and assign S3 bucket
role = get_execution_role()  # passed to the Estimator when the training job is configured
prefix = 'sagemaker/wimlds-sagemaker-xgboost-demo'  # key prefix for everything this notebook writes to S3
bucket_name = 'sagemaker-house-prediction-xgbbost'  # NOTE(review): "xgbbost" looks like a typo; it must match the real bucket name

# ECR image URI of the XGBoost container for each region this notebook supports.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

my_region = boto3.session.Session().region_name

# Fail early with an actionable message instead of a bare KeyError (unknown
# region) or TypeError (no region configured, i.e. region_name is None).
if my_region not in containers:
    raise KeyError("No XGBoost container is configured for region {!r}; supported regions: {}".format(
        my_region, ", ".join(sorted(containers))))

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-2 region. You will use the 825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [2]:
# Read the raw house-sales CSV straight from S3 into a DataFrame.
raw_csv_uri = 's3://final-assignment/kc_house_data.csv'
data = pd.read_csv(raw_csv_uri)

In [4]:
# Columns removed before modelling: row identifier, sale date and coordinates.
drop_columns = ['id', 'date', 'lat', 'long']

# Use the `columns=` keyword: passing the axis positionally (`drop(cols, 1)`)
# is deprecated and rejected by pandas 2.x. Rebinding `data` instead of
# `inplace=True` gives the same result for the cells that follow.
data = data.drop(columns=drop_columns)

  data.drop(drop_columns, 1, inplace = True)


In [5]:
# Sequential IQR filter: columns are handled one at a time, and each pass
# computes its quartiles on the rows that survived the previous passes.
# NOTE(review): when a column's Q1 equals Q3 the IQR is 0, so only rows exactly
# equal to that value are kept — confirm this is intended for flag-like columns
# such as `waterfront`, and that filtering on the target `price` is desired.
for column in data.columns:
    first_quartile, third_quartile = (data[column].quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile

    # Keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.
    within_fences = data[column].between(first_quartile - 1.5*spread,
                                         third_quartile + 1.5*spread)
    data = data[within_fences]

### Prepare the data

In [6]:
# Shuffle every row with a fixed seed, then cut at 70%: the first part is the
# training set and the remainder is the test set.
shuffled = data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(data))
train_data, test_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
print(train_data.shape, test_data.shape)

(9898, 17) (4243, 17)


In [7]:
# Preview the first rows of the shuffled training split (target `price` is the first column).
train_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15
10503,495000.0,3,2.0,1340,2550,2.0,0,0,3,7,1340,0,1984,0,98117,1370,5100
7452,479000.0,3,1.75,1470,6018,1.0,0,0,3,8,1470,0,1987,0,98074,1720,6584
8000,355200.0,3,1.0,1120,7320,1.0,0,0,4,7,1120,0,1954,0,98146,1410,6328
11621,565000.0,4,1.0,1540,2452,1.5,0,0,4,7,1540,0,1906,0,98103,1290,3360
191,166950.0,3,1.0,1190,8820,1.0,0,0,3,6,1190,0,1959,0,98058,1230,7980


In [8]:
# Number of columns in the training split before the feature selection below.
len(train_data.columns)

17

Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that, although repetitive, it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering.

In [9]:
# Columns written to the training CSV, in order. The target comes first, as the
# container requires; `sqft_above` is the one remaining column left out.
attributes = [
    'price',
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_basement',
    'yr_built', 'yr_renovated',
    'zipcode',
    'sqft_living15', 'sqft_lot15',
]
len(attributes)

16

In [10]:
# Restrict the training split to the chosen columns, keeping `price` first.
train_data = train_data.loc[:, attributes]

In [8]:
# Optional experiment (disabled): compare model quality with and without
# standard scaling of the training data.
# NOTE(review): as written, this would also scale the target column `price`,
# because the scaler is fitted on the whole of `train_data` — confirm before enabling.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_data = scaler.fit_transform(train_data)
# train_data = pd.DataFrame(train_data)

In [11]:
# Write the training split as a bare CSV: no index column and no header row,
# with the target in the first column, as the XGBoost container expects.
train_data.to_csv('train.csv', index=False, header=False)

### Upload training data to S3 Bucket

In [12]:
# S3 object keys always use '/' as the delimiter, so build the key explicitly
# rather than with os.path.join, which follows the local OS path separator.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the train/ prefix that now holds train.csv.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [13]:
# Echo the destination bucket as a sanity check.
bucket_name

'sagemaker-house-prediction-xgbbost'

### Training

First we'll need to specify training parameters to the estimator. This includes:

- The xgboost algorithm container
- The IAM role to use
- Training instance type and count
- S3 location for output data
- Algorithm hyperparameters

And then a .fit() function which specifies:

- S3 location of the input data. In this notebook only a training channel is passed in; no validation set is supplied.

In [14]:
# SageMaker session handed to the Estimator below.
sess = sagemaker.Session()

### XGBoost Hyperparameters


There are a lot of hyperparameters; a few of them are:

1. Subsample
    - Subsample ratio of the training instance. 
    - Setting it to 0.5 means that XGBoost randomly collects half of the data instances to grow trees. 
    - This prevents overfitting.

        Optional

        Valid values: Float. Range: [0,1].

        Default value: 1
2. Eta
    - Step size shrinkage, prevents overfitting.
3. Gamma
    - Minimum loss reduction to create a partition, larger = more conservative
4. Alpha
    - L1 regularization term; larger = more conservative
5. Lambda
    - L2 regularization term; larger = more conservative



In [16]:
# Configure the training job: container image, IAM role, a single ml.m4.xlarge
# training instance, and the S3 prefix that receives the model artifacts.
# `instance_count` / `instance_type` are the SageMaker SDK v2 names for the
# former `train_instance_count` / `train_instance_type` arguments.
xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
                                    sagemaker_session=sess)

# NOTE(review): the hyperparameter notes in this notebook describe `lambda`
# (L2 regularization term), but the name set here is `lambda_bias` — confirm
# which was intended. `lambda` is a Python keyword, so it would have to be
# passed as `**{'lambda': 0.8}`.
# NOTE(review): only a 'train' channel is passed to fit() in this notebook;
# confirm `early_stopping_rounds` has the intended effect without a validation channel.
xgb.set_hyperparameters(eta=0.06,
                        alpha=0.8,
                        lambda_bias=0.8,
                        gamma=50,
                        min_child_weight=6,
                        subsample=0.5,
                        silent=0,
                        early_stopping_rounds=5,
                        objective='reg:linear',
                        num_round=1000)
                        

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [17]:
# Launch the training job; only the 'train' channel is supplied.
xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-14-12-13-25-561


2023-03-14 12:13:25 Starting - Starting the training job...
2023-03-14 12:13:49 Starting - Preparing the instances for training......
2023-03-14 12:14:50 Downloading - Downloading input data...
2023-03-14 12:15:15 Training - Downloading the training image...
2023-03-14 12:15:51 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-03-14:12:16:05:INFO] Running standalone xgboost training.[0m
[34m[2023-03-14:12:16:05:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-03-14:12:16:05:INFO] File size need to be processed in the node: 0.6mb. Available memory size in the node: 8599.93mb[0m
[34m[2023-03-14:12:16:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[12:16:05] S3DistributionType set as FullyReplicated[0m
[34m[12:16:05] 9898x15 matrix with 148470 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[12:16:05] src/tree/updater_prune.cc:74: tree pruning end, 1

In [18]:
# Deploy the trained model to a real-time endpoint backed by one ml.m4.xlarge instance.
# NOTE(review): no endpoint clean-up (e.g. delete_endpoint) appears in the visible
# part of this notebook — confirm the endpoint is removed when no longer needed.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2023-03-14-12-17-14-846
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-03-14-12-17-14-846
INFO:sagemaker:Creating endpoint with name xgboost-2023-03-14-12-17-14-846


------!

### Evaluation

There are many ways to compare the performance of a machine learning model, but let's start by simply comparing actual to predicted values. 

First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.

Note: For inference with CSV format, SageMaker XGBoost requires that the data does NOT include the target variable.

In [21]:
# Columns of the test split (not yet restricted to the training feature list).
test_data.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_living15',
       'sqft_lot15'],
      dtype='object')

In [22]:
# Columns the model was trained on, for comparison with the test split.
train_data.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_basement', 'yr_built',
       'yr_renovated', 'zipcode', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [24]:
# Build the inference matrix: remove the target and `sqft_above`, which leaves
# the remaining columns in the same order as the training features.
test_features = test_data.drop(columns=['price', 'sqft_above'])
test_data_array = test_features.to_numpy()

In [25]:
# Send request payloads as CSV. In SageMaker SDK v2 the predictor takes its
# request content type ("text/csv") from the serializer, so no separate
# content-type assignment is needed. CSVSerializer replaces the deprecated
# `csv_serializer` shim.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()


In [26]:
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
# Parse the whole response. Slicing off the first character (`predictions[1:]`)
# would strip the leading digit of the first predicted price.
# Assumes the endpoint returns plain comma-separated values — the (4243,) shape
# recorded for this cell is consistent with that; TODO confirm for other container versions.
predictions_array = np.fromstring(predictions, sep=',') # and turn the prediction into an array
print(predictions_array.shape)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(4243,)


In [27]:
from sklearn.metrics import r2_score

# Coefficient of determination between the actual test prices and the endpoint's predictions.
test_r2 = r2_score(test_data['price'], predictions_array)
print("R2 score : %.2f" % test_r2)

R2 score : 0.83
