# Auto MPG prediction

#### Install required versions of Sagemaker SDK and Experiments

In [1]:
# Quote the requirement specifier: %pip hands the line to the system shell, where an
# unquoted ">" is output redirection (stdout would go to a file named "=2.15" and the
# version constraint would be silently dropped).
%pip install -U "sagemaker>=2.15"
%pip install sagemaker-experiments

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Initialize SageMaker session and SageMaker boto3 client

In [2]:
import requests
import boto3
import sagemaker
from sagemaker import get_execution_role
from io import StringIO

In [3]:
# SageMaker session, plus the region / default bucket / execution role
# that the rest of the notebook reads as globals
sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_session.region_name

# low-level SageMaker client (used later when creating experiment trials)
sm_client = boto3.client('sagemaker')

# echo the resolved configuration so the run is self-describing
print("Using default bucket: ", bucket)
print("Using  Region: ", region)
print("Using execution Role: ", role)

Using default bucket:  sagemaker-us-east-1-392525434032
Using  Region:  us-east-1
Using execution Role:  arn:aws:iam::392525434032:role/service-role/AmazonSageMaker-ExecutionRole-20220310T175822


### Import Python Libraries

In [4]:
import os
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Define S3 bucket prefixes for Data and Models

In [5]:
# root key prefix for everything this notebook writes to the bucket
prefix = "auto_mpg"

# raw ("bronze") data, one key prefix per split
raw_train_prefix, raw_val_prefix, raw_test_prefix = (
    f"{prefix}/data/bronze/{split_name}" for split_name in ("train", "val", "test")
)

# preprocessed ("gold") features, one key prefix per split
pp_train_prefix, pp_val_prefix, pp_test_prefix = (
    f"{prefix}/data/gold/{split_name}" for split_name in ("train", "val", "test")
)

# fitted preprocessor and trained ML model artifacts
pp_model_prefix = f"{prefix}/models/preprocessor"
ml_model_prefix = f"{prefix}/models/ml"


def get_s3_path(prefix, bucket=bucket):
    """
    Build the full ``s3://`` URI for a key prefix.

    prefix: key prefix inside the bucket (e.g. one of the ``*_prefix`` constants)
    bucket: bucket name; the default is the value the global ``bucket`` had
            when this function was defined
    returns: the string ``s3://<bucket>/<prefix>``
    """
    return "s3://{}/{}".format(bucket, prefix)


### Upload Raw Data to S3

In this step we perform the following:
1. Download the raw data
2. Split it into train, validation and test sets
3. Upload the three splits to S3

In [6]:
def upload_raw_data_to_s3(sess,
                          raw_train_prefix=raw_train_prefix,
                          raw_val_prefix=raw_val_prefix,
                          raw_test_prefix=raw_test_prefix,
                          split=0.8,
                          random_state=42):
    """
    Download the Auto MPG dataset, split it into train / val / test and upload each split to S3.

    The rows not sampled into the training set form a hold-out set; its last 10 rows
    become the test split and the remaining rows become the validation split.

    Parameters
    ----------
    sess : SageMaker session used for ``upload_data``
    raw_train_prefix, raw_val_prefix, raw_test_prefix : S3 key prefixes for each split
    split : fraction of rows sampled into the training set
    random_state : seed passed to ``DataFrame.sample`` so the split is reproducible;
        pass ``None`` for a different random split on every call

    Returns
    -------
    (train_path, val_path, test_path) : the values returned by ``sess.upload_data``
        for each uploaded CSV

    Raises
    ------
    requests.HTTPError if the dataset download does not return a success status.
    AssertionError if the three splits are not pairwise disjoint.
    """
    # filenames
    train_fn = "train.csv"
    val_fn = "val.csv"
    test_fn = "test.csv"

    column_names = {0: "mpg", 1: "cylinders", 2: "displacement", 3: "horsepower",
                    4: "weight", 5: "acceleration", 6: "model year", 7: "origin"}

    # download data; fail loudly instead of parsing an HTTP error page as if it were data
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    res = requests.get(data_url, timeout=60)
    res.raise_for_status()
    file = StringIO(res.text)

    # read data (raw string: '\s' is not a valid escape in a normal string literal)
    data = pd.read_csv(file, header=None, delimiter=r'\s+', low_memory=False, na_values="?")
    # column 8 is dropped; the remaining columns are named below
    data_frame = data.drop(columns=8)
    # NOTE(review): missing values are filled with the mean of the full dataset,
    # i.e. before the split; confirm this is acceptable for the evaluation.
    data_frame = data_frame.fillna(data_frame.mean())
    data_frame = data_frame.rename(index=int, columns=column_names)

    # train / hold-out split (seeded so re-running the notebook gives the same split)
    train_df = data_frame.sample(frac=split, random_state=random_state)
    holdout_df = data_frame.drop(train_df.index)

    # the last 10 hold-out rows are the test data, the rest is the validation data
    val_df = holdout_df[:-10]
    test_df = holdout_df[-10:]

    # the three splits must be pairwise disjoint
    train_idx, val_idx, test_idx = set(train_df.index), set(val_df.index), set(test_df.index)
    assert train_idx.isdisjoint(test_idx), "overlap between train and test"
    assert train_idx.isdisjoint(val_idx), "overlap between train and val"
    assert val_idx.isdisjoint(test_idx), "overlap between val and test"

    try:
        # save data locally and upload data to s3
        train_df.to_csv(train_fn, index=False, sep=',', encoding='utf-8')
        train_path = sess.upload_data(path=train_fn, bucket=bucket, key_prefix=raw_train_prefix)

        val_df.to_csv(val_fn, index=False, sep=',', encoding='utf-8')
        val_path = sess.upload_data(path=val_fn, bucket=bucket, key_prefix=raw_val_prefix)

        test_df.to_csv(test_fn, index=False, sep=',', encoding='utf-8')
        test_path = sess.upload_data(path=test_fn, bucket=bucket, key_prefix=raw_test_prefix)
    finally:
        # delete local versions of the data, even when an upload failed part-way
        for local_fn in (train_fn, val_fn, test_fn):
            if os.path.exists(local_fn):
                os.remove(local_fn)

    print("Path to raw train data:", train_path)
    print("Path to raw val data:", val_path)
    print("Path to raw test data:", test_path)

    return train_path, val_path, test_path

In [7]:
# Run the download / split / upload step with the default prefixes and keep the
# three returned paths for later cells.
train_path, val_path, test_path = upload_raw_data_to_s3(sess)

Path to raw train data: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/train/train.csv
Path to raw val data: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/val/val.csv
Path to raw test data: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/test/test.csv


## Preprocess Data

In [8]:
# Use FrameworkProcessor so that custom dependencies can be shipped with the job;
# the plain sklearn processor has no provision for custom dependencies.
# https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb
from datetime import datetime
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import FrameworkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# timestamp string with ":" replaced by "-"
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S").replace(":", "-")

# file names passed to the preprocessing script as command-line arguments
TRAIN_FN = 'train.csv'
VAL_FN = 'val.csv'
TRAIN_FEATS_FN = 'train_feats.npy'
VAL_FEATS_FN = 'val_feats.npy'


sklearn_processor = FrameworkProcessor(
    # SageMaker appends its own timestamp to the base job name. Embedding a second
    # timestamp here only made the name long enough to be truncated (the generated
    # name ended in "...-11-34--2022-10-30-11-34-22-698"), so use the plain base name.
    base_job_name="auto-mpg-feature-eng",
    framework_version="1.0-1",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    estimator_cls=SKLearn
)

# The validation split is mounted under the script's "test" input/output slots,
# matching the --test-* arguments passed below.
sklearn_processor.run(
    code="train.py",
    source_dir="scripts/preprocessor/",
    inputs=[
        ProcessingInput(source=get_s3_path(raw_train_prefix), destination="/opt/ml/processing/input/train"),
        ProcessingInput(source=get_s3_path(raw_val_prefix), destination="/opt/ml/processing/input/test")
    ],
    outputs=[
        ProcessingOutput(output_name="train_features", source="/opt/ml/processing/train", destination=get_s3_path(pp_train_prefix)),
        ProcessingOutput(output_name="val_features", source="/opt/ml/processing/test", destination=get_s3_path(pp_val_prefix)),
        ProcessingOutput(output_name="preprocessor_model", source="/opt/ml/processing/output", destination=get_s3_path(pp_model_prefix)),
    ],
    arguments=["--train-filename", TRAIN_FN,
               "--test-filename", VAL_FN,
               "--train-feats-filename", TRAIN_FEATS_FN,
               "--test-feats-filename", VAL_FEATS_FN],
)


Job Name:  auto-mpg-feature-eng-30-Oct-2022-11-34--2022-10-30-11-34-22-698
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/train', 'LocalPath': '/opt/ml/processing/input/train', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/val', 'LocalPath': '/opt/ml/processing/input/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/auto-mpg-feature-eng-30-Oct-2022-11-34--2022-10-30-11-34-22-698/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistrib

## Train Model

In [9]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [31]:
# Load the experiment if it already exists, otherwise create it.
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S").replace(":", "-")
experiment_name = "auto-mg-experiment"
try:
    auto_experiment = Experiment.load(experiment_name=experiment_name)
    print(f'experiment {experiment_name} was loaded')
except Exception as ex:
    # Only a missing experiment is an expected failure. Anything else (permissions,
    # throttling, networking, ...) must surface instead of being silently swallowed,
    # which would leave `auto_experiment` undefined or stale for the next cell.
    if "ResourceNotFound" not in str(ex):
        raise
    auto_experiment = Experiment.create(experiment_name = experiment_name,
                                        description = "Regression on Auto MPG dataset",
                                        tags = [{'Key': 'Name', 'Value': f"auto-mg-experiment-{current_time}"},
                                                {'Key': 'MLEngineer', 'Value': "Temiloluwa Adeoti"},
                                               ])
    print(f'experiment {experiment_name} was created')


experiment auto-mg-experiment was loaded


In [34]:
from sagemaker.sklearn.estimator import SKLearn

# timestamp used to make the trial name and the training-job name unique
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S").replace(":", "-")
n_estimators = 10
trail_name = f"auto-mg-{n_estimators}-estimators"

# register a new trial for this run under the experiment
training_job_trial = Trial.create(
    trial_name=f"{trail_name}-{current_time}",
    experiment_name=auto_experiment.experiment_name,
    sagemaker_boto_client=sm_client,
    tags=[
        {'Key': 'Name', 'Value': f"auto-mg-{current_time}"},
        {'Key': 'MLEngineer', 'Value': "Temiloluwa Adeoti"},
    ],
)

# scikit-learn estimator; each metric regex captures the value printed between
# "<metric>=" and the following ";" in the training logs
model = SKLearn(
    entry_point="train.py",
    source_dir="./scripts/model",
    framework_version="1.0-1",
    instance_type="ml.m5.xlarge",
    role=role,
    output_path=get_s3_path(ml_model_prefix),  # model output path
    hyperparameters={"n_estimators": n_estimators},
    metric_definitions=[
        {"Name": "train:mae", "Regex": "train_mae=(.*?);"},
        {"Name": "test:mae", "Regex": "test_mae=(.*?);"},
        {"Name": "train:mse", "Regex": "train_mse=(.*?);"},
        {"Name": "test:mse", "Regex": "test_mse=(.*?);"},
        {"Name": "train:rmse", "Regex": "train_rmse=(.*?);"},
        {"Name": "test:rmse", "Regex": "test_rmse=(.*?);"},
    ],
    enable_sagemaker_metrics=True,
)

# train on the preprocessed features; the validation features go in the "test" channel
model.fit(
    job_name=f"auto-mpg-{current_time}",
    inputs={
        "train": get_s3_path(pp_train_prefix),
        "test": get_s3_path(pp_val_prefix),
    },
    experiment_config={
        "TrialName": training_job_trial.trial_name,
        "TrialComponentDisplayName": f"Training-auto-mg-run-{current_time}",
    },
    logs="All",
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: auto-mpg-30-Oct-2022-13-08-48


2022-10-30 13:08:48 Starting - Starting the training job...
2022-10-30 13:09:11 Starting - Preparing the instances for trainingProfilerReport-1667135328: InProgress
.........
2022-10-30 13:10:31 Downloading - Downloading input data...
2022-10-30 13:11:12 Training - Downloading the training image...
2022-10-30 13:11:45 Uploading - Uploading generated training model[34m2022-10-30 13:11:37,728 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-10-30 13:11:37,732 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-30 13:11:37,740 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-10-30 13:11:38,149 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-30 13:11:38,161 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-30 13:11:38,176 sagemaker-training-toolkit I

## Inference Pipeline Model

In [76]:
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.pipeline import PipelineModel
from sagemaker.serializers import CSVSerializer
from datetime import datetime

# the pipeline model and its endpoint share one timestamped name
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S").replace(":", "-")
model_name = endpoint_name = f"inference-pipeline-{current_time}"
pp_model_path = f"{get_s3_path(pp_model_prefix)}/model.tar.gz"

print("preprocessor model path ", pp_model_path)

# stage 1: preprocessor, served from the artifact written by the processing job
sklearn_processor_model = SKLearnModel(
    model_data=pp_model_path,
    role=role,
    entry_point="scripts/preprocessor/inference.py",
    dependencies=["scripts/preprocessor/custom_preprocessor.py"],
    framework_version="1.0-1",
    sagemaker_session=sess,
)

# stage 2: regression model, built from the fitted estimator
reg_model = model.create_model(
    entry_point="inference.py",
    source_dir="./scripts/model",
)

# chain both containers behind one endpoint: preprocessor first, then regressor
inference_pipeline = PipelineModel(
    name=model_name,
    role=role,
    models=[sklearn_processor_model, reg_model],
    sagemaker_session=sess,
)

predictor = inference_pipeline.deploy(
    initial_instance_count=1,
    instance_type="ml.c4.xlarge",
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),  # to ensure input is csv
)

preprocessor model path  s3://sagemaker-us-east-1-392525434032/auto_mpg/models/preprocessor/model.tar.gz


INFO:sagemaker:Creating model with name: inference-pipeline-30-Oct-2022-15-15-21
INFO:sagemaker:Creating endpoint-config with name inference-pipeline-30-Oct-2022-15-15-21
INFO:sagemaker:Creating endpoint with name inference-pipeline-30-Oct-2022-15-15-21


---------------!

### Download Test Data From S3

In [88]:
from pprint import pprint

# fetch the raw test split from S3 into the working directory
sess.download_data(path=".", bucket=bucket, key_prefix=raw_test_prefix)

# keep every line except the first one (the CSV header)
with open("test.csv", "r") as f:
    test_data = list(f)[1:]

pprint([f"id: {row_id}, raw_input: {row}" for row_id, row in enumerate(test_data)])

['id: 0, raw_input: 39.0,4,86.0,64.0,1875.0,16.4,81,1\n',
 'id: 1, raw_input: 32.3,4,97.0,67.0,2065.0,17.8,81,3\n',
 'id: 2, raw_input: 32.9,4,119.0,100.0,2615.0,14.8,81,3\n',
 'id: 3, raw_input: 22.4,6,231.0,110.0,3415.0,15.8,81,1\n',
 'id: 4, raw_input: 29.0,4,135.0,84.0,2525.0,16.0,82,1\n',
 'id: 5, raw_input: 36.0,4,105.0,74.0,1980.0,15.3,82,2\n',
 'id: 6, raw_input: 36.0,4,98.0,70.0,2125.0,17.3,82,1\n',
 'id: 7, raw_input: 34.0,4,108.0,70.0,2245.0,16.9,82,3\n',
 'id: 8, raw_input: 32.0,4,91.0,67.0,1965.0,15.7,82,3\n',
 'id: 9, raw_input: 32.0,4,144.0,96.0,2665.0,13.9,82,3\n']


### Make Predictions with predictor

In [93]:
from sagemaker.predictor import Predictor
from sagemaker.deserializers import JSONLinesDeserializer

# attach to the deployed endpoint: requests are sent as CSV, responses parsed as JSON Lines
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer(),
    deserializer=JSONLinesDeserializer(),
)

# send the first `num_of_samples` raw test rows and display the endpoint's answer
num_of_samples = 1
response = predictor.predict(test_data[:num_of_samples])
response

[[{'id': 0, 'prediction': 34.96311893939394}]]

### Clean Up

In [74]:
# Delete the endpoint first: it is the resource that keeps running, so its removal
# must not be blocked by a failure of the local file clean-up.
predictor.delete_endpoint()

# remove downloaded test data (it may already be gone if this cell is re-run)
if os.path.exists("test.csv"):
    os.remove("test.csv")