# Auto MPG prediction

#### Install required versions of Sagemaker SDK and Experiments

In [1]:
%pip install -U sagemaker>=2.15
%pip install sagemaker-experiments

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


### Initialize SageMaker session and SageMaker boto3 client

In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role

In [3]:
# SageMaker session, plus the region, default bucket and execution role used below
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()
role = get_execution_role()

# low-level SageMaker API client
sm_client = boto3.client("sagemaker")

# echo the resolved environment
for label, value in (
    ("Using default bucket: ", bucket),
    ("Using  Region: ", region),
    ("Using execution Role: ", role),
):
    print(label, value)

Using default bucket:  sagemaker-us-east-1-392525434032
Using  Region:  us-east-1
Using execution Role:  arn:aws:iam::392525434032:role/service-role/AmazonSageMaker-ExecutionRole-20220310T175822


#### Import Python Libraries

In [4]:
import os
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Define S3 bucket prefixes for Data and Models

In [5]:
prefix = "auto_mpg"

# raw data path ("bronze" layer), one prefix per split
raw_train_prefix, raw_val_prefix, raw_test_prefix = (
    f"{prefix}/data/bronze/{split_name}" for split_name in ("train", "val", "test")
)

# preprocessed features path ("gold" layer), one prefix per split
pp_train_prefix, pp_val_prefix, pp_test_prefix = (
    f"{prefix}/data/gold/{split_name}" for split_name in ("train", "val", "test")
)

# preprocessor and ml models
pp_model_prefix = f"{prefix}/models/preprocessor"
ml_model_prefix = f"{prefix}/models/ml"

#### Upload Raw Data to S3

In this step we perform the following:
1. Download the raw data
2. Split it into train, validation and test sets
3. Upload the splits to S3

In [6]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

--2022-10-15 12:32:46--  https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: ‘auto-mpg.data.2’


2022-10-15 12:32:46 (457 KB/s) - ‘auto-mpg.data.2’ saved [30286/30286]



In [7]:
def upload_raw_data_to_s3(sess, file_name="auto-mpg.data", split=0.8, random_state=None):
    """
    Read the MPG dataset, split it into train / val / test, then upload each split to S3.

    Parameters
    ----------
    sess : object exposing ``upload_data(path=..., bucket=..., key_prefix=...)``
        The SageMaker session used for the uploads.
    file_name : str
        Local path of the whitespace-delimited raw data file.
    split : float
        Fraction of rows sampled into the training set.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        original behaviour of a different random split on every call.

    Returns
    -------
    tuple
        ``(train_path, val_path, test_path)`` as returned by ``sess.upload_data``.

    Notes
    -----
    Reads the module-level ``bucket``, ``raw_train_prefix``, ``raw_val_prefix`` and
    ``raw_test_prefix``. Each split is written to a CSV in the working directory and
    removed again once its upload has finished or failed.
    """
    column_names = {
        0: "mpg",
        1: "cylinders",
        2: "displacement",
        3: "horsepower",
        4: "weight",
        5: "acceleration",
        6: "model year",
        7: "origin",
    }

    # read data; "?" marks a missing value. The raw string avoids the invalid "\s" escape.
    data = pd.read_csv(file_name, header=None, delimiter=r"\s+", low_memory=False, na_values="?")
    # column 8 is not used as a feature (presumably the free-text car name — verify against the data file)
    data_frame = data.drop(columns=8)
    # fill missing values with the column mean
    data_frame = data_frame.fillna(data_frame.mean())
    data_frame = data_frame.rename(columns=column_names)

    # train / held-out split
    train_df = data_frame.sample(frac=split, random_state=random_state)
    holdout_df = data_frame.drop(train_df.index)

    # the last 10 held-out rows form the test set, everything before them the validation set
    val_df = holdout_df[:-10]
    test_df = holdout_df[-10:]

    assert set(train_df.index).isdisjoint(test_df.index), "overlap between train and test"
    assert set(train_df.index).isdisjoint(val_df.index), "overlap between train and val"
    assert set(val_df.index).isdisjoint(test_df.index), "overlap between val and test"

    # upload data to s3, one CSV per split
    splits = (
        ("train.csv", train_df, raw_train_prefix),
        ("val.csv", val_df, raw_val_prefix),
        ("test.csv", test_df, raw_test_prefix),
    )
    uploaded_paths = []
    for local_name, frame, key_prefix in splits:
        frame.to_csv(local_name, index=False, sep=',', encoding='utf-8')
        try:
            uploaded_paths.append(
                sess.upload_data(path=local_name, bucket=bucket, key_prefix=key_prefix)
            )
        finally:
            # never leave the temporary CSV behind, even when the upload raises
            os.remove(local_name)

    train_path, val_path, test_path = uploaded_paths
    return train_path, val_path, test_path

In [8]:
# upload the three splits and show where each one landed
train_path, val_path, test_path = upload_raw_data_to_s3(sess)
for label, uploaded_path in zip(
    ("raw train path:", "raw val path:", "raw test path:"),
    (train_path, val_path, test_path),
):
    print(label, uploaded_path)

raw train path: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/train/train.csv
raw val path: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/val/val.csv
raw test path: s3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/test/test.csv


#### Preprocess Data

In [29]:
from datetime import datetime
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

def get_s3_path(pth, bucket=bucket):
    """ Build the full s3:// URI of key ``pth`` inside ``bucket`` """
    return "s3://{}/{}".format(bucket, pth)


# timestamp string for this run (day-month-year-time)
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S")

# scikit-learn processing job definition: one ml.m5.xlarge instance
sklearn_processor = SKLearnProcessor(
    framework_version="1.0-1",
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    base_job_name="auto-mpg-feature-eng",
)

# raw train / val prefixes copied into the container; the val split is placed
# under the ".../input/test" directory and passed as --test-filename
processing_inputs = [
    ProcessingInput(source=get_s3_path(raw_train_prefix), destination="/opt/ml/processing/input/train"),
    ProcessingInput(source=get_s3_path(raw_val_prefix), destination="/opt/ml/processing/input/test"),
]

# container directories copied back to S3 when the job finishes
processing_outputs = [
    ProcessingOutput(output_name="train_features", source="/opt/ml/processing/train", destination=get_s3_path(pp_train_prefix)),
    ProcessingOutput(output_name="val_features", source="/opt/ml/processing/test", destination=get_s3_path(pp_val_prefix)),
    ProcessingOutput(output_name="preprocessor_model", source="/opt/ml/processing/output", destination=get_s3_path(pp_model_prefix)),
]

sklearn_processor.run(
    code="preprocess.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=["--train-filename", "train.csv", "--test-filename", "val.csv"],
)



Job Name:  autompg-feature-eng-2022-10-15-12-56-04-718
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/train', 'LocalPath': '/opt/ml/processing/input/train', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/auto_mpg/data/bronze/val', 'LocalPath': '/opt/ml/processing/input/test', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-392525434032/autompg-feature-eng-2022-10-15-12-56-04-718/input/code/preprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3Compre

#### Train Model

In [None]:
from sagemaker.sklearn.estimator import SKLearn



#### Inference Pipeline

In [38]:
# output configuration of the most recent processing job
latest_job_description = sklearn_processor.jobs[-1].describe()
output_config = latest_job_description['ProcessingOutputConfig']['Outputs']

# entry 2 is read as the preprocessor model output
# NOTE(review): assumes describe() lists outputs in the order they were passed to run() — confirm
preprocessor_output_uri = output_config[2]['S3Output']['S3Uri']
pp_model_path = preprocessor_output_uri + '/pl.joblib'
pp_model_path

's3://sagemaker-us-east-1-392525434032/auto_mpg/models/preprocessor/pl.joblib'

#### Create Experiment, Trial and Tracker

#### SKLearn Model

Train Model in Script Mode

In [None]:
def get_dir(pth):
    """ Return everything before the last "/" of ``pth`` (empty string when there is no "/") """
    parent, _separator, _leaf = pth.rpartition("/")
    return parent



# Reduce the uploaded object URIs to their parent prefixes.
# The ".csv" guard makes the cell idempotent: re-running it no longer strips
# one more path component from an already-shortened prefix.
train_path = get_dir(train_path) if train_path.endswith(".csv") else train_path
val_path = get_dir(val_path) if val_path.endswith(".csv") else val_path
test_path = get_dir(test_path) if test_path.endswith(".csv") else test_path

In [None]:
# show the three data locations, one per line
for data_location in (train_path, val_path, test_path):
    print(data_location)

### Set up SageMaker Experiments

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
# One timestamp shared by the experiment and trial names.
# `strftime` was never imported on its own; use the already-imported `time` module.
create_date = time.strftime("%Y-%m-%d-%H-%M-%S")

# Both objects are created through the same boto3 SageMaker client.
auto_experiment = Experiment.create(experiment_name = "auto-experiment-{}".format(create_date),
                                    description = "auto experiment",
                                    sagemaker_boto_client=sm_client,
                                    tags = [{'Key': 'auto-experiment', 'Value': 'demo1'}])


demo_trial = Trial.create(trial_name = "auto-trial-{}".format(create_date),
                          experiment_name = auto_experiment.experiment_name,
                          sagemaker_boto_client=sm_client,
                          tags = [{'Key': 'auto-trial', 'Value': 'demo1'}])

In [None]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"
script_path = "model.py"

# settings for the script-mode estimator; `script_path` is the training entry point
estimator_settings = {
    "entry_point": script_path,
    "framework_version": FRAMEWORK_VERSION,
    "instance_type": "ml.c4.xlarge",
    "role": role,
    "sagemaker_session": sess,
    "hyperparameters": {"n_estimators": 2},
    "tags": [{'Key': 'auto', 'Value': 'demo1'}],
}

sklearn = SKLearn(**estimator_settings)

In [None]:
# the training job reads a single "train" channel and is attached to the demo trial
training_channels = {"train": train_path}
trial_association = {
    "TrialName": demo_trial.trial_name,
    "TrialComponentDisplayName": "TrainingJob",
}

sklearn.fit(training_channels, experiment_config=trial_association)

In [None]:
# loss values logged to the tracker below
# NOTE(review): these are hard-coded, presumably placeholders rather than measured results — confirm
train_loss, test_loss = 0.5, 0.6

In [None]:
# Record the evaluation losses and the training data location as a trial component.
with Tracker.create(display_name="Training-Evaluation", sagemaker_boto_client=sm_client) as tracker:
    tracker.log_parameters(
        {
            "train-mse-loss": train_loss,
            "test-mse-loss": test_loss ,
        }
    )
    # Log the location the estimator was actually fitted on. `train_path` has
    # already been reduced to the training prefix earlier in the notebook, so
    # wrapping it in get_dir() again would log its parent folder instead.
    tracker.log_input(name="auto", media_type="s3/uri", value=train_path)

# attach the finished tracker component to the demo trial
demo_trial.add_trial_component(tracker.trial_component)

In [17]:
# Read each split back by its full object URI. `train_path`, `val_path` and
# `test_path` are reduced to S3 prefixes earlier in the notebook, and
# pd.read_csv needs a file, so the URIs are rebuilt from the raw-data prefixes
# and the file names used at upload time.
train_df = pd.read_csv(f"{get_s3_path(raw_train_prefix)}/train.csv")
test_df = pd.read_csv(f"{get_s3_path(raw_test_prefix)}/test.csv")
val_df = pd.read_csv(f"{get_s3_path(raw_val_prefix)}/val.csv")

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# name of the column to predict
target = "mpg"

# input columns accepted by the custom preprocessor, in dataset order
original_features = [
    "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model year", "origin",
]


class CustomFeaturePreprocessor(BaseEstimator, TransformerMixin):
    """
    This is a custom transformer class that does the following

        1. converts model year to age (``reference_year - model year``)
        2. converts data type of categorical columns

    The transformer is stateless and never modifies the frame it is given:
    ``transform`` works on a copy, so calling it repeatedly on the same input
    always yields the same result.
    """

    # columns the transformer accepts; anything else is rejected in transform()
    feat = original_features
    # columns cast to the pandas "category" dtype
    new_datatypes = {'cylinders': 'category', 'origin': 'category'}
    # year the age is measured from
    # NOTE(review): presumably the newest two-digit model year in the data — confirm
    reference_year = 82

    def fit(self, X, y=None):
        """ Nothing is learned from the data; returns self. """
        return self

    def transform(self, X, y=None):
        """
        Return a transformed copy of ``X``.

        Raises AssertionError when ``X`` has a column that is not listed in ``feat``.
        """
        unexpected_columns = set(X.columns) - set(self.feat)
        assert not unexpected_columns, "input does not have the right features"

        # work on a copy so the caller's frame is left untouched
        X = X.copy()

        # convert model year to age
        X["model year"] = self.reference_year - X["model year"]

        # change data types of cylinders and origin
        X = X.astype(self.new_datatypes)

        return X

    def fit_transform(self, X, y=None):
        """ Fit (a no-op) and transform ``X`` in one call. """
        return self.fit(X, y).transform(X)

    
# column groups are chosen by dtype: "category" columns versus everything else
categorical_columns = make_column_selector(dtype_include="category")
numerical_columns = make_column_selector(dtype_exclude="category")

# one-hot encode the categorical features, standard-scale the numerical ones
ct = ColumnTransformer(
    [
        ("categorical-feats", OneHotEncoder(), categorical_columns),
        ("numerical-feats", StandardScaler(), numerical_columns),
    ]
)

# custom preprocessing runs first, then the per-column transforms
pl = Pipeline(
    [
        ("custom-preprocessing", CustomFeaturePreprocessor()),
        ("column-preprocessing", ct),
    ]
)

# target as an (n_rows, 1) column; features are every column after the first one
train_target = train_df["mpg"].to_numpy().reshape(-1, 1)
train_data = train_df.iloc[:, 1:]

# fit the preprocessing pipeline on the training features and transform them
a = pl.fit_transform(train_data)


def save_numpy(np_array, path):
    """ Write ``np_array`` to the file at ``path`` using ``np.save`` """
    with open(path, mode="wb") as out_file:
        np.save(out_file, np_array)
        
#save_numpy(a, "a.npy")

In [12]:
# shape of the matrix returned by pl.fit_transform on the training features
a.shape

(318, 13)

In [20]:
# put the target column in front of the transformed features and show the combined shape
target_and_features = np.concatenate((train_target, a), axis=1)
target_and_features.shape

(318, 14)

In [25]:
from datetime import datetime

# timestamp of "now" formatted as day-month-year-time
current_time = datetime.now().strftime("%d-%b-%Y-%H:%M:%S")
# print the variable that was just assigned; `timestampStr` is not defined anywhere
print(current_time)

15-Oct-2022-12:52:45
