In [None]:
%%sh
# Quietly install pandas, scikit-learn and joblib into the notebook environment.
pip -q install pandas scikit-learn joblib

# Adding source files for training

In [None]:
%%sh
# -p: do not fail when src/ already exists, so the cell can be re-run safely.
mkdir -p src

In [None]:
%%writefile src/printmetrics.py

from sklearn.metrics import mean_squared_error, r2_score

def printmetrics(y_test, y_pred):
    """Print the mean squared error and the R^2 score of a set of predictions.

    Args:
        y_test: ground-truth target values.
        y_pred: predicted target values, compared against ``y_test``.
    """
    mse = mean_squared_error(y_test, y_pred)
    print('Mean squared error: %.2f' % mse)
    r2 = r2_score(y_test, y_pred)
    print('Coefficient of determination: %.2f' % r2)

In [None]:
%%writefile src/sklearn-boston-housing.py

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
import argparse, os
import printmetrics

def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    Presumably called by the SageMaker scikit-learn serving container when the
    endpoint starts -- confirm against the SageMaker SDK documentation.
    """
    model_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_path)

def str_to_bool(value):
    """Convert a command-line string to a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including ``'False'`` -- would be parsed as ``True``.

    Args:
        value: the raw argument; a ``bool`` is returned unchanged.

    Returns:
        ``True`` for true/t/yes/y/1 and ``False`` for false/f/no/n/0
        (case-insensitive, surrounding whitespace ignored).

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


if __name__ == '__main__':

    # Hyperparameters and data/model locations arrive as command-line
    # arguments; the two directories default to the SM_* environment variables.
    parser = argparse.ArgumentParser()
    parser.add_argument('--normalize', type=str_to_bool, default=False)
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--random-state', type=int, default=123)
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])

    # parse_known_args tolerates extra, unrecognised arguments instead of aborting.
    args, _ = parser.parse_known_args()
    normalize = args.normalize
    test_size = args.test_size
    random_state = args.random_state
    model_dir = args.model_dir
    training_dir = args.training

    # Load the dataset and split it into the target column and the features.
    # NOTE(review): src/sklearn-boston-housing-joblib.py reads the same
    # housing.csv as whitespace-delimited with a target column named 'mdev';
    # confirm which delimiter and column name match the actual file.
    filename = os.path.join(training_dir, 'housing.csv')
    data = pd.read_csv(filename)
    labels = data[['medv']]
    samples = data.drop(['medv'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(samples, labels,
                                                        test_size=test_size, random_state=random_state)

    # Fit a linear regression and report its metrics on the held-out split.
    regr = LinearRegression(normalize=normalize)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    printmetrics.printmetrics(y_test, y_pred)

    # Save the fitted model under the file name that model_fn() loads.
    model = os.path.join(model_dir, 'model.joblib')
    joblib.dump(regr, model)

In [None]:
import sagemaker
from sagemaker.sklearn import SKLearn

# Execution role handed to the estimator below.
role = sagemaker.get_execution_role()

# file:// locations: the 'training' channel is read from, and the output is
# written to, the current directory.
training = 'file://.'
output = 'file://.'

# Instance type 'local' -- presumably SageMaker local mode; confirm.
sk = SKLearn(
    entry_point='sklearn-boston-housing.py',
    source_dir='src',
    role=role,
    train_instance_count=1,
    train_instance_type='local',
    output_path=output,
    hyperparameters={'normalize': True, 'test-size': 0.1},
)

sk.fit({'training': training})

# Adding libraries for training

In [None]:
%%sh
# -p: do not fail when lib/ already exists, so the cell can be re-run safely.
mkdir -p lib
# -t lib: install joblib into ./lib rather than into the active environment.
pip install -q -t lib joblib

In [None]:
%%writefile src/sklearn-boston-housing-joblib.py

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib
import argparse, os

from printmetrics import printmetrics

def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    Presumably called by the SageMaker scikit-learn serving container when the
    endpoint starts -- confirm against the SageMaker SDK documentation.
    """
    model_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(model_path)

def str_to_bool(value):
    """Convert a command-line string to a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including ``'False'`` -- would be parsed as ``True``.

    Args:
        value: the raw argument; a ``bool`` is returned unchanged.

    Returns:
        ``True`` for true/t/yes/y/1 and ``False`` for false/f/no/n/0
        (case-insensitive, surrounding whitespace ignored).

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


if __name__ == '__main__':

    # Hyperparameters and data/model locations arrive as command-line
    # arguments; the two directories default to the SM_* environment variables.
    parser = argparse.ArgumentParser()
    parser.add_argument('--normalize', type=str_to_bool, default=False)
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--random-state', type=int, default=123)
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])

    # parse_known_args tolerates extra, unrecognised arguments instead of aborting.
    args, _ = parser.parse_known_args()
    normalize = args.normalize
    test_size = args.test_size
    random_state = args.random_state
    model_dir = args.model_dir
    training_dir = args.training

    # Load the dataset and split it into the target column and the features.
    # NOTE(review): src/sklearn-boston-housing.py reads the same housing.csv
    # with the default comma delimiter and a target column named 'medv';
    # confirm which delimiter and column name match the actual file.
    filename = os.path.join(training_dir, 'housing.csv')
    data = pd.read_csv(filename, delim_whitespace=True)
    labels = data[['mdev']]
    samples = data.drop(['mdev'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(samples, labels,
                                                        test_size=test_size, random_state=random_state)

    # Fit a linear regression and report its metrics on the held-out split.
    regr = LinearRegression(normalize=normalize)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    printmetrics(y_test, y_pred)

    # Save the fitted model under the file name that model_fn() loads.
    model = os.path.join(model_dir, 'model.joblib')
    joblib.dump(regr, model)

In [None]:
import sagemaker
from sagemaker.sklearn import SKLearn

# Execution role handed to the estimator below.
role = sagemaker.get_execution_role()

# file:// locations: the 'training' channel is read from, and the output is
# written to, the current directory.
training = 'file://.'
output = 'file://.'

# Same setup as the previous training run, except that the entry point is the
# joblib variant of the script and lib/joblib is passed through `dependencies`.
sk = SKLearn(
    entry_point='sklearn-boston-housing-joblib.py',
    source_dir='src',
    dependencies=['lib/joblib'],
    role=role,
    train_instance_count=1,
    train_instance_type='local',
    output_path=output,
    hyperparameters={'normalize': True, 'test-size': 0.1},
)

sk.fit({'training': training})

In [None]:
# This will fail with "No module named 'joblib'"
# NOTE(review): presumably the joblib copy passed through
# dependencies=['lib/joblib'] is not importable when the serving container
# loads the script -- confirm.
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='local')

In [None]:
# Installing libraries in the container

In [None]:
# Deploy the estimator fitted above to a single endpoint instance of type 'local'.
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='local')

In [None]:
# pandas is only imported inside the training scripts, not in the notebook
# kernel, so import it here before using `pd`.
import pandas as pd

# sep=r'\s+' is the documented equivalent of delim_whitespace=True.
# NOTE(review): src/sklearn-boston-housing.py reads the same housing.csv with
# the default comma delimiter and a target column named 'medv'; confirm which
# delimiter and column name match the actual file.
data = pd.read_csv('housing.csv', sep=r'\s+')

# First 10 rows without the target column, as header-less CSV text.
payload = data[:10].drop(['mdev'], axis=1)
payload = payload.to_csv(header=False, index=False)
print(payload)

In [None]:
%%sh
# -t . installs joblib into the current directory; --upgrade replaces any copy already there.
# NOTE(review): how this copy reaches the serving container is not visible in this notebook -- confirm.
pip install -q -t . --upgrade joblib

In [None]:
from sagemaker.predictor import csv_serializer, csv_deserializer

# Use CSV for both directions: requests are serialized with csv_serializer,
# responses are parsed with csv_deserializer.
sk_predictor.serializer = csv_serializer
sk_predictor.deserializer = csv_deserializer
sk_predictor.content_type = 'text/csv'
sk_predictor.accept = 'text/csv'

# `payload` is the CSV text built in the earlier payload cell.
response = sk_predictor.predict(payload)
print(response)

In [None]:
# Delete the endpoint behind sk_predictor (created by the sk.deploy call above).
sk_predictor.delete_endpoint()

# Running on SageMaker managed infrastructure

In [None]:
# SageMaker session, its default bucket, and the key prefix used for this example.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = 'sklearn-boston-housing'

# Upload the local dataset under <prefix>/training and build the output
# location under <prefix>/output/ in the same bucket.
training = sess.upload_data(path='housing.csv', key_prefix=prefix + "/training")
output = 's3://{}/{}/output/'.format(bucket, prefix)

print(training, output, sep='\n')

In [None]:
# Train the first script again, this time on an ml.m5.large instance, using the
# `training` and `output` locations defined in the previous cell.
# source_dir='src' is required: the entry point was written to src/ and imports
# printmetrics from that directory, so it cannot be found from the working
# directory alone.
sk = SKLearn(entry_point='sklearn-boston-housing.py',
             source_dir='src',
             role=role,
             train_instance_count=1,
             train_instance_type='ml.m5.large',
             output_path=output,
             hyperparameters={
                  'normalize': True,
                  'test-size': 0.1
              }
)

sk.fit({'training':training})

In [None]:
# Deploy the estimator fitted above to a single ml.t2.medium endpoint instance.
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

To send predictions to this endpoint, you can reuse the payload and prediction cells from the local-mode section above.

In [None]:
# Delete the endpoint behind sk_predictor (created by the sk.deploy call above).
sk_predictor.delete_endpoint()