In [1]:
% matplotlib inline
    

In [3]:
# Download train.csv from https://www.kaggle.com/c/boston-housing/
# and place it in the directory that SRC_PATH (defined in the next cell) points to.


In [9]:
import pandas as pd

from os.path import expanduser

# Directory holding this chapter's files, rooted at the current user's home.
# The trailing slash is kept because file names are appended by plain concatenation.
home_dir = expanduser("~")
SRC_PATH = home_dir + '/SageMaker/mastering-ml-on-aws/chapter3/'


In [10]:
# Load the downloaded train.csv from the chapter directory and preview the first rows.
train_csv_path = SRC_PATH + 'train.csv'
housing_df = pd.read_csv(train_csv_path)
housing_df.head()


Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [11]:
# Columns used as model inputs. Of the columns shown in the preview above,
# 'ID', 'rad' and 'black' are not included here.
training_features = [
    'crim',
    'zn',
    'indus',
    'chas',
    'nox',
    'rm',
    'age',
    'dis',
    'tax',
    'ptratio',
    'lstat',
]
# Column the model is trained to predict.
label = 'medv'

In [13]:
from sklearn.model_selection import train_test_split

# Keep only the label and the selected features, with the label as the first column.
# NOTE(review): label-first presumably matches what the SageMaker CSV training input
# expects -- confirm against the Linear Learner documentation.
housing_df_reordered = housing_df[[label] + training_features]

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and every downstream prediction and metric) reproducible across kernel restarts.
training_df, test_df = train_test_split(housing_df_reordered, test_size=0.2, random_state=42)
training_df.head()


Unnamed: 0,medv,crim,zn,indus,chas,nox,rm,age,dis,tax,ptratio,lstat
17,14.8,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,307,21.0,17.28
149,23.3,0.0456,0.0,13.89,1,0.55,5.888,56.0,3.1121,276,16.4,13.51
104,17.0,1.41385,0.0,19.58,1,0.871,6.129,96.0,1.7494,403,14.7,15.12
168,24.3,0.33983,22.0,5.86,0,0.431,6.108,34.9,8.0555,330,19.1,9.16
223,20.6,0.03306,0.0,5.19,0,0.515,6.059,37.3,4.8122,224,20.2,8.51


In [39]:
# Write both splits as CSV without header or index column.
# The training file keeps the label as its first column; the testing file contains
# only the feature columns. The former no-argument .reindex() calls were no-ops
# (the index is not written anyway), so they are dropped.
training_df.to_csv('training-housing.csv', header=False, index=False)
test_df[training_features].to_csv('testing-housing.csv', header=False, index=False)

In [40]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

# SageMaker session and execution role used by the calls in this notebook.
sess = sagemaker.Session()
role = get_execution_role()

# S3 layout: everything for this example lives under <bucket>/<prefix>/.
bucket = "mastering-ml-aws"
prefix = "chapter3/linearmodels"

train_path = prefix + '/train'
validation_path = prefix + '/validation'

# Upload each local CSV under its own key prefix.
for local_csv, key_prefix in (('training-housing.csv', train_path),
                              ('testing-housing.csv', validation_path)):
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=key_prefix)

# S3 URIs of the two prefixes, reused by the training and transform cells.
s3_train_data = 's3://{}/{}'.format(bucket, train_path)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_path)


In [21]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the 'linear-learner' container image for the region of the default boto3 session.
region_name = boto3.Session().region_name
container = get_image_uri(region_name, 'linear-learner')

# S3 location passed as output_path to the estimator and the transformer.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [2]:
import boto3
import sagemaker
from sagemaker.session import s3_input

sess = sagemaker.Session()

# Estimator built around the container image resolved earlier in the notebook.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# feature_dim is tied to the feature list so the two cannot drift apart.
linear.set_hyperparameters(
    feature_dim=len(training_features),
    predictor_type='regressor',
    mini_batch_size=1,
)

# NOTE(review): the 'test' channel points at testing-housing.csv, which was written
# WITHOUT the label column, while the training CSV has the label first. Confirm the
# algorithm accepts unlabeled data on this channel; otherwise upload a labeled copy
# of the test split for it.
train_channel = s3_input(s3_train_data, content_type='text/csv')
test_channel = s3_input(s3_validation_data, content_type='text/csv')
linear.fit({'train': train_channel, 'test': test_channel})




In [106]:
# Build a transformer from the fitted estimator (the log line below shows a model
# being created); its results are written under s3_output_location.
transformer = linear.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_output_location,
)

INFO:sagemaker:Creating model with name: linear-learner-2019-01-04-00-02-36-561


In [107]:
# Create a transform job over the uploaded validation prefix, sending the input as CSV.
transformer.transform(s3_validation_data, content_type='text/csv')

INFO:sagemaker:Creating transform job with name: linear-learner-2019-01-04-00-09-38-808


In [108]:
# Wait on the transform job; the dots printed below are its progress output.
transformer.wait()

..........................................!


In [109]:
# Display the S3 location configured as the transformer's output path.
transformer.output_path

's3://mastering-ml-aws/chapter3/linearmodels/output'

In [112]:
!aws s3 cp s3://mastering-ml-aws/chapter3/linearmodels/output/testing-housing.csv.out - | head

{"score":18.911674499511719}
{"score":41.916255950927734}
{"score":20.833599090576172}
{"score":38.696208953857422}
{"score":30.833646774291992}
{"score":19.361166000366211}
{"score":24.461696624755859}
{"score":24.615261077880859}
{"score":4.382085800170898}
{"score":25.914594650268555}


In [110]:
# Download the transform output object into the working directory.
# The key is derived from `prefix` rather than hard-coded, so it stays in sync
# with the output location configured earlier.
s3 = boto3.resource('s3')
output_key = '{}/output/testing-housing.csv.out'.format(prefix)
s3.Bucket(bucket).download_file(output_key, 'testing-housing.csv.out')

In [111]:
# The downloaded file holds one JSON object per line, so read it in JSON Lines mode.
predictions_file = 'testing-housing.csv.out'
predictions = pd.read_json(predictions_file, lines=True)
predictions.head()

Unnamed: 0,score
0,18.911674
1,41.916256
2,20.833599
3,38.696209
4,30.833647


In [113]:
# Pair actual labels with predicted scores by position.
# NOTE(review): assumes the transform output preserves the row order of
# testing-housing.csv -- confirm.
evaluation_df = pd.DataFrame({
    'actual': test_df[label].tolist(),
    'predicted': predictions['score'].tolist(),
})
evaluation_df.head()

Unnamed: 0,actual,predicted
0,14.3,18.911674
1,50.0,41.916256
2,23.2,20.833599
3,46.0,38.696209
4,30.8,30.833647


In [114]:
from sklearn.metrics import r2_score

# R^2 of the predictions against the held-out labels.
r2_score(y_true=evaluation_df['actual'], y_pred=evaluation_df['predicted'])

0.7962794586033369