In [None]:
%%sh
pip -q install pandas scikit-learn joblib

# Vanilla scikit-learn code (no SageMaker)

In [None]:
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# NOTE: `sklearn.externals.joblib` was removed from scikit-learn (0.23); joblib is
# installed as its own package by the first cell, so it is imported directly above.

# Load the dataset: 'medv' is the regression target, every other column is a feature.
data = pd.read_csv('housing.csv')
labels = data[['medv']]
samples = data.drop(['medv'], axis=1)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, test_size=0.1, random_state=123)

# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and removed
# in 1.2. Scaling the features in an explicit pipeline step is the documented
# replacement. For an unpenalized least-squares fit the predictions should match
# the old behaviour up to floating-point error.
regr = make_pipeline(StandardScaler(), LinearRegression())
regr.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = regr.predict(X_test)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

# Persist the fitted model (scaler + regressor) next to the notebook.
joblib.dump(regr, 'model.joblib')

In [None]:
%%sh
# Abort at the first failing command. Without this the cell's status is that of the
# final `ls`, which can succeed on a model.joblib written by an earlier cell and so
# hide a failed training run.
set -e
# Both SageMaker variables are exported with empty values; the data and model
# locations are passed explicitly through --training and --model-dir.
export SM_CHANNEL_TRAINING=
export SM_MODEL_DIR=
python sklearn-boston-housing.py --normalize True --test-size 0.1 --training . --model-dir .
# Confirm the script produced the model artifact.
ls -l model.joblib

## Run with SageMaker Local Mode

In [None]:
%%sh
# Install (or upgrade) joblib into the current directory rather than site-packages.
pip install --quiet --upgrade --target . joblib

In [None]:
import sagemaker
from sagemaker.sklearn import SKLearn

In [None]:
# Local Mode: the training channel and the output location both point at the
# current directory through file:// URIs.
training = output = 'file://.'

# Estimator for the training script; instance_type='local' is what makes this a
# Local Mode run.
sk = SKLearn(
    entry_point='sklearn-boston-housing.py',
    role=sagemaker.get_execution_role(),
    framework_version='0.23-1',
    instance_type='local',
    instance_count=1,
    hyperparameters={'normalize': True, 'test-size': 0.1},
    output_path=output,
)

# Train, feeding the dataset location through the 'training' channel.
sk.fit({'training': training})

In [None]:
# Deploy the estimator fitted above to a single-instance endpoint in Local Mode
# and keep the returned predictor for the prediction cells below.
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='local')

In [None]:
# Build a sample request body: the first 10 rows without the 'medv' target column,
# rendered as CSV text with neither header row nor index column.
data = pd.read_csv('housing.csv')
payload = data.head(10).drop(columns=['medv']).to_csv(header=False, index=False)
print(payload)

In [None]:
# This cell repeats the deployment done by the earlier deploy cell. Deploying a
# second time would start another endpoint and overwrite the only reference to the
# first one, so it could no longer be deleted from this notebook. Deploy here only
# when no predictor exists yet in this kernel session.
# NOTE: the name `sk_predictor` survives delete_endpoint(); to redeploy after a
# deletion, run the first deploy cell (which always deploys) instead of this one.
if 'sk_predictor' not in globals():
    sk_predictor = sk.deploy(initial_instance_count=1,
                             instance_type='local')

In [None]:
# Request body: the first 10 rows without the 'medv' target column, as CSV text
# with neither header row nor index column.
data = pd.read_csv('housing.csv')
payload = data.head(10).drop(columns=['medv']).to_csv(header=False, index=False)

# Exchange CSV with the endpoint in both directions.
sk_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()
sk_predictor.serializer = sagemaker.serializers.CSVSerializer()

# Invoke the endpoint and show the result.
response = sk_predictor.predict(payload)
print(response)

In [None]:
# Tear down the endpoint behind `sk_predictor` once predictions are done.
sk_predictor.delete_endpoint()

## Run with SageMaker managed infrastructure

In [None]:
# Work in the session's default bucket, keeping everything under one key prefix.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = 'sklearn-boston-housing'

# Upload the dataset under <prefix>/training and build the output location
# under <prefix>/output/ in the same bucket.
training = sess.upload_data(path='housing.csv', key_prefix=f'{prefix}/training')
output = f's3://{bucket}/{prefix}/output/'

# Show both locations, one per line.
print(training, output, sep='\n')

In [None]:
# Same training script and hyperparameters as the Local Mode run, now on a single
# ml.m5.large instance and writing to the `output` location defined earlier.
sk = SKLearn(
    entry_point='sklearn-boston-housing.py',
    framework_version='0.23-1',
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.large',
    instance_count=1,
    hyperparameters={'normalize': True, 'test-size': 0.1},
    output_path=output,
)

# Train, feeding the dataset location through the 'training' channel.
sk.fit({'training': training})

In [None]:
# Deploy the estimator fitted above to a single ml.t2.medium instance and keep
# the returned predictor for invoking the endpoint.
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

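To get predictions from this endpoint, re-run the payload and prediction cells from the Local Mode section above: they only use `sk_predictor`, which now refers to the endpoint deployed in the previous cell.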

In [None]:
# Delete the endpoint behind `sk_predictor` when finished with it.
sk_predictor.delete_endpoint()