# renfe-guru sagemaker example

## 0. python general imports

In [1]:
import pandas as pd
import logging
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pd.show_versions()



INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.3.final.0
python-bits      : 64
OS               : Linux
OS-release       : 3.10.0-1062.4.3.el7.x86_64
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 0.25.1
numpy            : 1.16.2
pytz             : 2018.9
dateutil         : 2.8.0
pip              : 20.0.2
setuptools       : 41.6.0
Cython           : 0.29.6
pytest           : 4.3.1
hypothesis       : None
sphinx           : 1.8.5
blosc            : None
feather          : None
xlsxwriter       : 1.1.5
lxml.etree       : 4.3.2
html5lib         : 1.0.1
pymysql          : None
psycopg2         : 2.8.3 (dt dec pq3 ext)
jinja2           : 2.10.3
IPython          : 7.4.0
pandas_datareader: None
bs4              : 4.7.1
bottleneck       : 1.2.1
fastparquet      : 0.2.1
gcsfs            : None
lxml.etree       : 4.3.2
mat

In [3]:
def get_logger():
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    return logger

logger = get_logger()

logger.info("hi!")

2020-04-17 13:41:03,721 - root - INFO - hi!


## 1. data loading

the dataset can be downloaded here: https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing

In [4]:
#renfe = pd.read_parquet('../data/raw/renfe.parquet')  # about 60MB in .parquet file, but 3.5Gb in memory, be careful!
#renfe = pd.read_csv('../data/raw/renfe.csv',infer_datetime_format=True)  


In [5]:
# modified below code so that data can be read from  s3
path = "renfe.parquet"
bucket = "dsla"
folder = "ml-in-production/data" #-sagemaker

bucket_uri = f's3://{bucket}/{folder}/{path}'

print(bucket_uri)

renfe = pd.read_parquet(bucket_uri,engine='auto',columns=None)

2020-04-17 13:41:09,938 - s3fs - DEBUG - Setting up s3fs instance
2020-04-17 13:41:10,030 - s3fs - DEBUG - Get directory listing page for dsla/ml-in-production/data


s3://dsla/ml-in-production/data/renfe.parquet


2020-04-17 13:41:10,578 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 61947502-62013038
2020-04-17 13:41:11,276 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 7138430-12449078
2020-04-17 13:41:19,262 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 4-12381213
2020-04-17 13:41:22,503 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 7274184-32057204
2020-04-17 13:41:27,429 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 26814423-52160157
2020-04-17 13:41:32,418 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 46917374-47785943
2020-04-17 13:41:32,746 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 48187868-62013038
2020-04-17 13:41:35,488 - s3fs - DEBUG - Fetch: dsla/ml-in-production/data/renfe.parquet, 58081695-58673628


In [6]:
renfe.head()

Unnamed: 0,insert_date,origin,destination,start_date,end_date,train_type,price,train_class,fare
0,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 13:40:00,2019-08-29 16:10:00,AVE,47.3,Turista,Promo
1,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 14:45:00,2019-08-29 17:15:00,AVE,53.4,Turista,Promo
2,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 14:58:00,2019-08-29 17:50:00,ALVIA,,Preferente,Promo
3,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 15:45:00,2019-08-29 18:15:00,AVE,61.45,Preferente,Promo
4,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 16:45:00,2019-08-29 19:17:00,AVE,60.3,Turista,Promo


In [7]:
renfe.dtypes

insert_date    datetime64[ns]
origin                 object
destination            object
start_date     datetime64[ns]
end_date       datetime64[ns]
train_type             object
price                 float64
train_class            object
fare                   object
dtype: object

In [8]:
renfe.info()  # with deep memory usage will take a while...</div><i class="fa fa-lightbulb-o "></i>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10800510 entries, 0 to 10800509
Data columns (total 9 columns):
insert_date    datetime64[ns]
origin         object
destination    object
start_date     datetime64[ns]
end_date       datetime64[ns]
train_type     object
price          float64
train_class    object
fare           object
dtypes: datetime64[ns](3), float64(1), object(5)
memory usage: 741.6+ MB


## 2. data wrangling

first of all, null values will be dropped. all null values are due to:
- scrapping errors, specially at the beggining of the process
- trains with no ticket available (usually full, canceled, etc.)

In [9]:
# filtering null values, inplace to modify original df

renfe.dropna(inplace=True)

as the goal is to predict ticket price in advance, some interesting features can be derived:
- trip duration (in hours)
- time to departure (in days)
- hour of departure (24h)
- week day of departure

In [10]:
# feature engineering / generation

def add_features(renfe_df):

    renfe_df['duration'] = (renfe_df['end_date'] - renfe_df['start_date']).dt.seconds / 3600
    renfe_df['time_to_departure'] = (renfe_df['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \
                                   - renfe_df['insert_date'].dt.tz_localize('UTC')).dt.days
    renfe_df['hour'] = renfe_df['start_date'].dt.hour
    renfe_df['weekday'] = renfe_df['start_date'].dt.dayofweek

add_features(renfe)

following, perform train - validation - test splits:

In [11]:
# train - test split

from sklearn.model_selection import train_test_split

renfe_train_validation, renfe_test = train_test_split(renfe)
renfe_train, renfe_validation = train_test_split(renfe_train_validation)

# to avoid chained assignment 'pandas warning'

renfe_train = renfe_train.copy()
renfe_validation = renfe_validation.copy()
renfe_test = renfe_test.copy()

In [12]:
logger.info(f'n obs in training set are: {renfe_train.shape[0]}')
logger.info(f'n obs in validation set are: {renfe_validation.shape[0]}')
logger.info(f'n obs in test set are: {renfe_test.shape[0]}')

2020-04-17 13:58:09,321 - root - INFO - n obs in training set are: 5699503
2020-04-17 13:58:09,322 - root - INFO - n obs in validation set are: 1899835
2020-04-17 13:58:09,323 - root - INFO - n obs in test set are: 2533113


data looks like this so far:

In [13]:
renfe_train.head().T

Unnamed: 0,9990470,10243411,7292010,6467341,1184519
insert_date,2019-08-01 03:26:13,2019-08-04 11:48:31,2019-05-26 07:16:37,2019-05-17 09:18:11,2019-09-02 09:34:21
origin,SEVILLA,SEVILLA,MADRID,VALENCIA,SEVILLA
destination,MADRID,MADRID,VALENCIA,MADRID,MADRID
start_date,2019-09-13 19:45:00,2019-08-14 14:45:00,2019-07-14 14:10:00,2019-05-23 08:00:00,2019-09-15 16:15:00
end_date,2019-09-13 22:17:00,2019-08-14 17:15:00,2019-07-14 16:03:00,2019-05-23 09:47:00,2019-09-15 18:54:00
train_type,AVE,AVE,AVE,AVE,AVE
price,47.3,53.5,33.65,57.75,69.4
train_class,Turista,Preferente,Turista,Turista,Preferente
fare,Promo,Promo,Promo,Promo,Promo
duration,2.53333,2.5,1.88333,1.78333,2.65


following, there are some categorical columns that have to be encoded (most ML algorithms will need that):

In [14]:
# preprocessing

from sklearn.preprocessing import OrdinalEncoder
import joblib

encode_cols = ['train_type', 'train_class', 'fare', 'origin', 'destination']
encoder = OrdinalEncoder()
encoder.fit(renfe[encode_cols])  # warning, it should be fit only on training data!
joblib.dump(encoder, '../output/pickle_data/encoder.joblib')

for split, df in {'training': renfe_train, 
                  'validation': renfe_validation, 
                  'test': renfe_test}.items():
    logger.info(f'transforming {split} set...')
    df.loc[:,encode_cols] = encoder.transform(df.loc[:,encode_cols])

2020-04-17 13:58:11,992 - root - INFO - transforming training set...
2020-04-17 13:58:23,331 - root - INFO - transforming validation set...
2020-04-17 13:58:27,002 - root - INFO - transforming test set...


with those columns encoded, data looks like this:

In [15]:
renfe_train.head().T

Unnamed: 0,9990470,10243411,7292010,6467341,1184519
insert_date,2019-08-01 03:26:13,2019-08-04 11:48:31,2019-05-26 07:16:37,2019-05-17 09:18:11,2019-09-02 09:34:21
origin,4,4,2,5,4
destination,2,2,5,2,2
start_date,2019-09-13 19:45:00,2019-08-14 14:45:00,2019-07-14 14:10:00,2019-05-23 08:00:00,2019-09-15 16:15:00
end_date,2019-09-13 22:17:00,2019-08-14 17:15:00,2019-07-14 16:03:00,2019-05-23 09:47:00,2019-09-15 18:54:00
train_type,2,2,2,2,2
price,47.3,53.5,33.65,57.75,69.4
train_class,4,2,4,4,2
fare,8,8,8,8,8
duration,2.53333,2.5,1.88333,1.78333,2.65


## 3. upload data to S3

to use sagemaker using aws apis (sagemaker or boto) data must be formated in a particular way and stored in aws s3

In [16]:
# target must be in the first position of csv columns for xgboost via sagemaker API

target = 'price'
features = ['train_type', 'train_class', 'fare', 'duration', 'time_to_departure', 'hour', 'weekday']

renfe_train[[target] + features].head()

Unnamed: 0,price,train_type,train_class,fare,duration,time_to_departure,hour,weekday
9990470,47.3,2.0,4.0,8.0,2.533333,43,19,4
10243411,53.5,2.0,2.0,8.0,2.5,10,14,2
7292010,33.65,2.0,4.0,8.0,1.883333,49,14,6
6467341,57.75,2.0,4.0,8.0,1.783333,5,8,3
1184519,69.4,2.0,2.0,8.0,2.65,13,16,6


data must be pushed to s3, aws credentials must be properly set for this purpose (`/home/user/.aws/credentials`).

In [17]:
# S3 bucket for saving code and model artifacts (will be created if it doesn't exists).
BUCKET_NAME = 'eu.com.syngenta-datascience-model-training'
#BUCKET_NAME = 'dsla'
EXPERIMENT_NAME = 'ml-in-production-sagemaker'

SCHEMA_NAME = 's3:/' 

TRAIN_CONTAINER_NAME= 'train'
VALIDATION_CONTAINER_NAME = 'validation'
MODEL_CONTAINER_NAME =  'model'

#import getpass
#USER_NAME = getpass.getuser()
#print(USER_NAME)

USER_NAME = 'Preetam.Balijepalli@syngenta.com' 

s3_train =  f'{SCHEMA_NAME}/{BUCKET_NAME}/{USER_NAME}/{ EXPERIMENT_NAME}/{TRAIN_CONTAINER_NAME}'
s3_validation =  f'{SCHEMA_NAME}/{BUCKET_NAME}/{USER_NAME}/{ EXPERIMENT_NAME}/{VALIDATION_CONTAINER_NAME}'
s3_model_output = f'{SCHEMA_NAME}/{BUCKET_NAME}/{USER_NAME}/{ EXPERIMENT_NAME}/{MODEL_CONTAINER_NAME}'

print(s3_train)
print(s3_validation)
print(s3_model_output)

s3://eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train
s3://eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation
s3://eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/model


In [18]:
import s3fs
fs = s3fs.S3FileSystem()

renfe_train[[target] + features].to_csv(s3_train + '/train.csv', index=False, header=False)  # .csv file without header
renfe_validation[[target] + features].to_csv(s3_validation + '/validation.csv', index=False, header=False)

2020-04-17 13:58:44,513 - s3fs - DEBUG - Setting up s3fs instance
2020-04-17 13:58:44,690 - s3fs - DEBUG - Setting up s3fs instance
2020-04-17 13:58:45,860 - s3fs - DEBUG - Initiate upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>
2020-04-17 13:58:45,861 - s3fs - DEBUG - CALL: create_multipart_upload - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv', 'ACL': ''}
2020-04-17 13:58:46,338 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, final=False, loc=5247748, buffer loc=5247748
2020-04-17 13:58:46,340 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-pr

2020-04-17 13:59:47,851 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, 10
2020-04-17 13:59:47,851 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 10, 'UploadId': 'aNb89ioyYAXswXO2x2oU6D6SV00AdpTmNtgTFOSBLvMdQpIjcG8J1dp4V5SpcY0RbKRgMJ5gZVX19y0djLQt0g7Px6MDDw61bdbOJ3iFnEQoakGK2aTXKk0QSFj3O5OU', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv'}
2020-04-17 13:59:56,358 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, final=False, loc=57725632, buffer loc=5247886
2020-04-17 13:59:56,360 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-

2020-04-17 14:01:13,604 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, 20
2020-04-17 14:01:13,605 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 20, 'UploadId': 'aNb89ioyYAXswXO2x2oU6D6SV00AdpTmNtgTFOSBLvMdQpIjcG8J1dp4V5SpcY0RbKRgMJ5gZVX19y0djLQt0g7Px6MDDw61bdbOJ3iFnEQoakGK2aTXKk0QSFj3O5OU', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv'}
2020-04-17 14:01:20,936 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, final=False, loc=110203013, buffer loc=5247701
2020-04-17 14:01:20,938 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml

2020-04-17 14:02:31,752 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, 30
2020-04-17 14:02:31,752 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 30, 'UploadId': 'aNb89ioyYAXswXO2x2oU6D6SV00AdpTmNtgTFOSBLvMdQpIjcG8J1dp4V5SpcY0RbKRgMJ5gZVX19y0djLQt0g7Px6MDDw61bdbOJ3iFnEQoakGK2aTXKk0QSFj3O5OU', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv'}
2020-04-17 14:02:37,694 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, final=False, loc=162681012, buffer loc=5247634
2020-04-17 14:02:37,696 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml

2020-04-17 14:03:48,070 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, 40
2020-04-17 14:03:48,070 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 40, 'UploadId': 'aNb89ioyYAXswXO2x2oU6D6SV00AdpTmNtgTFOSBLvMdQpIjcG8J1dp4V5SpcY0RbKRgMJ5gZVX19y0djLQt0g7Px6MDDw61bdbOJ3iFnEQoakGK2aTXKk0QSFj3O5OU', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv'}
2020-04-17 14:04:01,446 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/train/train.csv>, final=False, loc=215160065, buffer loc=5247998
2020-04-17 14:04:01,448 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml

2020-04-17 14:04:31,573 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv>, 2
2020-04-17 14:04:31,574 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 2, 'UploadId': '8p3awhvZHmxxaRfgBF2VMZsbrM4mvxaOCeXSSGzT2tOqRxAwM.mrA4lDdrbXH_UQG6Y.XNJ7gY__ez2WpnKM5.uiBe1eVwC2I2a1Nfo_G53dj996JXdN3.6UQbuOoSOg', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv'}
2020-04-17 14:04:35,523 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv>, final=False, loc=15744543, buffer loc=5248041
2020-04-17 14:04:35,525 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.

2020-04-17 14:05:43,054 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv>, final=False, loc=62975135, buffer loc=5247425
2020-04-17 14:05:43,057 - s3fs - DEBUG - Upload chunk <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv>, 12
2020-04-17 14:05:43,057 - s3fs - DEBUG - CALL: upload_part - ({},) - {'Bucket': 'eu.com.syngenta-datascience-model-training', 'PartNumber': 12, 'UploadId': '8p3awhvZHmxxaRfgBF2VMZsbrM4mvxaOCeXSSGzT2tOqRxAwM.mrA4lDdrbXH_UQG6Y.XNJ7gY__ez2WpnKM5.uiBe1eVwC2I2a1Nfo_G53dj996JXdN3.6UQbuOoSOg', 'Key': 'Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/validation/validation.csv'}
2020-04-17 14:05:48,675 - s3fs - DEBUG - Upload for <File-like object S3FileSystem, eu.com.syngenta-datascience-model-training/Preetam.

In [19]:
import boto3

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(BUCKET_NAME)

for object_summary in my_bucket.objects.filter(Prefix="Preetam.Balijepalli@syngenta.com"):
    print(object_summary.key)

Preetam.Balijepalli@syngenta.com/
Preetam.Balijepalli@syngenta.com/DEMO-hpo-automl-dm/output/
Preetam.Balijepalli@syngenta.com/DEMO-hpo-automl-dm/test/automl-test.csv
Preetam.Balijepalli@syngenta.com/DEMO-hpo-automl-dm/train/automl-train.csv
Preetam.Balijepalli@syngenta.com/DEMO-hpo-xgboost-dm/test/test.csv
Preetam.Balijepalli@syngenta.com/DEMO-hpo-xgboost-dm/train/train.csv
Preetam.Balijepalli@syngenta.com/DEMO-hpo-xgboost-dm/validation/validation.csv
Preetam.Balijepalli@syngenta.com/keras-sagemaker/
Preetam.Balijepalli@syngenta.com/keras-sagemaker/model/
Preetam.Balijepalli@syngenta.com/keras-sagemaker/output/
Preetam.Balijepalli@syngenta.com/keras-sagemaker/output/datascience-model-training-2020-04-16-12-42-56/output/model.tar.gz
Preetam.Balijepalli@syngenta.com/keras-sagemaker/output/datascience-model-training-2020-04-16-13-18-16/output/model.tar.gz
Preetam.Balijepalli@syngenta.com/keras-sagemaker/output/datascience-model-training-2020-04-16-13-42-34/output/model.tar.gz
Preetam.Bal

## 4. train model with sagemaker api

first, using sagemaker api for xgboost image, can launch a model training job using the following code. please note that `role` and `region` information must be specified. can be fetched from aws programatically or hardcoded. __please check your aws console while training__

In [None]:
import sys
!{sys.executable} -m pip install requests==2.20.1


In [20]:
from datetime import datetime
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator

#AWS Debugger need to verified with latest version of sdk installed.

#from smdebug.xgboost import Hook
#dtrain = xgb.DMatrix("train.libsvm")
#dtest = xgb.DMatrix("test.libsmv")
#hook = Hook.create_from_json_file()
#hook.train_data = dtrain  # required
#hook.validation_data = dtest  # optional
#hook.hyperparameters = params  # optional

# IAM execution role that gives SageMaker access to resources in your AWS account.
ROLE = 'arn:aws:iam::170605107178:role/SYN-Datascience-SageMaker-Role'
REGION = 'eu-central-1'
PREFIX_NAME = 'datascience-model-training'


train_channel = sagemaker.session.s3_input(s3_train, content_type='text/csv') 
valid_channel = sagemaker.session.s3_input(s3_validation, content_type='text/csv')


#sess = boto3.Session()
#sm = sess.client('sagemaker')
#role = get_execution_role()

data_channels = {'train': train_channel, 
                 'validation': valid_channel}

In [21]:
import boto3
region_name = boto3.Session(profile_name='mfa').region_name
container = get_image_uri(REGION, 'xgboost', '0.90-1')
print('Using SageMaker container: {} ({})'.format(container, region_name))

Using SageMaker container: 492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3 (eu-west-1)


In [22]:
subnets_config = ['subnet-0bdd33f41f946b22a', 'subnet-0c7c8959343746db7']
security_groups_config = [ "sg-1ad4ea70",
              "sg-99d781f1",
              "sg-dfa1f7b7"]

# the instance type to be used for training. using 'local' will not trigger a job on SageMaker
train_instance_type = 'ml.m4.xlarge'


# create the model object
xgb_model = Estimator(container,
                      ROLE, 
                      train_instance_count=1, 
                      train_instance_type=train_instance_type,
                      train_volume_size = 5,
                      subnets=subnets_config, 
                      security_group_ids=security_groups_config,
                      output_path=s3_model_output,
                      sagemaker_session=sagemaker.Session()
                     )

xgb_model.set_hyperparameters(max_depth = 4,
                                  eta = .2,
                              gamma = 4,
                              min_child_weight = 8,
                              silent = 0,
                              objective = "reg:linear",
                              num_round = 8)

# fit the model: this would start the Training Job
# define the job_name that will also be used for the model file
TRAINING_JOB_NAME = PREFIX_NAME+ '-'  + datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

xgb_model.fit(inputs=data_channels, 
              #logs=True, 
              job_name=TRAINING_JOB_NAME 
)

2020-04-17 14:20:56,755 - sagemaker - INFO - Creating training-job with name: datascience-model-training-2020-04-17-14-20-56


2020-04-17 14:20:57 Starting - Starting the training job...
2020-04-17 14:20:59 Starting - Launching requested ML instances...
2020-04-17 14:21:58 Starting - Preparing the instances for training......
2020-04-17 14:22:59 Downloading - Downloading input data
2020-04-17 14:22:59 Training - Downloading the training image...
2020-04-17 14:23:28 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[3

In [None]:
%%sh

profile='mfa'
arn_role='arn:aws:iam::170605107178:role/SYN-Datascience-SageMaker-Role'
training_image='492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3'
bucket='eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com'
region='eu-central-1'

prefix='ml-in-production-sagemaker'
training_job_name=datascience-model-training_`date '+%Y_%m_%d_%H_%M_%S'`

training_data=$bucket/$prefix/train
eval_data=$bucket/$prefix/validation

train_source={S3DataSource={S3DataType=S3Prefix,S3DataDistributionType=FullyReplicated,S3Uri=$training_data}}
eval_source={S3DataSource={S3DataType=S3Prefix,S3DataDistributionType=FullyReplicated,S3Uri=$eval_data}}


aws --profile $profile \
    --region $region \
    sagemaker create-training-job \
    --training-job-name $training_job_name\
    --algorithm-specification TrainingImage=$training_image,TrainingInputMode=File \
    --role-arn $arn_role \
    --input-data-config ChannelName=train,DataSource=$train_source,CompressionType=None,RecordWrapperType=None ChannelName=validation,DataSource=$eval_source,CompressionType=None,RecordWrapperType=None \
    --output-data-config S3OutputPath=$bucket$prefix/model \
    --resource-config InstanceCount=1,InstanceType=ml.c4.8xlarge,VolumeSizeInGB=50 \
    --stopping-condition MaxRuntimeInSeconds=3600 --debug


## 5. train model with boto3 api

a training job can be created using boto3 __and not sagemaker api__. a dictionary with all details must be specified:

In [None]:
import boto3
import time

PREFIX_NAME = 'datascience-model-training'
TRAINING_JOB_NAME = PREFIX_NAME +  '-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# NEED TO REMOVE THIS TEMP HACK
ROLE = "arn:aws:iam::170605107178:role/SYN-Datascience-SageMaker-Role"
container = "492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3"
REGION ="eu-central-1"

#vpc_config = estimator.get_vpc_config()
#print(vpc_config)

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": ROLE,
    "OutputDataConfig": {
        "S3OutputPath": s3_model_output
    },
    "VpcConfig": {
         "Subnets": [
          "subnet-0bdd33f41f946b22a",
          "subnet-0c7c8959343746db7"
          ],
          "SecurityGroupIds": [
              "sg-1ad4ea70",
              "sg-99d781f1",
              "sg-dfa1f7b7"
              ]
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": TRAINING_JOB_NAME,
    "HyperParameters": {
        "max_depth":"4",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"4",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"8"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_train,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_validation,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        }
    ]
}

client = boto3.session.Session(profile_name='mfa').client('sagemaker', region_name=REGION)
client.create_training_job(**create_training_params)
status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']
print(status)

# this loop will query status until completed, there is no more info available, go to aws console for more...
while status !='Completed' and status!='Failed':
    time.sleep(16)
    status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']
    logger.info('training job created with boto3 api is:' + status)

## 6. serving model with sagemaker api

to deploy a model and create an endpoint using sagemaker api, `deploy` method of sagemaker estimator can be used. takes some time... go grab some cofee!

In [23]:
ENDPOINT_NAME = 'ml-in-production-sagemaker-api-endpoint'
MODEL_NAME = 'ml-in-production-sagemaker-api-model'

xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium',
                                 endpoint_name=ENDPOINT_NAME,
                                 model_name=MODEL_NAME)

2020-04-17 14:28:37,592 - sagemaker - INFO - Creating model with name: ml-in-production-sagemaker-api-model
2020-04-17 14:28:38,836 - sagemaker - INFO - Creating endpoint with name ml-in-production-sagemaker-api-endpoint


-------------------------------------------------------------------------------------------------!

## 7. serving model with boto3 api

not straightforward, involves 3 steps, too low level to explain here, does not worth the pain having sagemaker api and mlflow... let's try mlflow with an sklearn model instead :-D

## 8. invoking endpoint (just boto3 api option)

In [25]:
def get_active_endpoints(app_name):
  sage_client = boto3.client('sagemaker', region_name=REGION)
  app_endpoints = sage_client.list_endpoints(NameContains=app_name)["Endpoints"]
  return list(filter(lambda en : en == app_name, [str(endpoint["EndpointName"]) for endpoint in app_endpoints]))

app_name = "ml-in-production-sagemaker-api-endpoint"
print("The following endpoints exist for the `{an}` application: {eps}".format(an=app_name, eps=get_active_endpoints(app_name)))

The following endpoints exist for the `ml-in-production-sagemaker-api-endpoint` application: ['ml-in-production-sagemaker-api-endpoint']


In [None]:
import boto3
runtime = boto3.client('runtime.sagemaker')

logger.info('getting a sample of 100 elements from test split')
test_sample = renfe_test.sample(100)

body = test_sample[features].to_csv(header=False, index=False)

logger.info('calling endpoint...')
response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                   ContentType='text/csv',
                                   Body=body) # data must be passed as .csv (string)
display(response)
y_pred = list(map(lambda x: float(x), response['Body'].read().decode().split(',')))  # result is a string and must be parsed

let's compare result with reality:

In [None]:
from sklearn.metrics import mean_absolute_error
y_true = test_sample['price']
logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

## 9. deploy model using mlflow the easy way (sklearn version)

In [None]:
from sklearn.ensemble import RandomForestRegressor

X = renfe_train[features]
y = renfe_train[target]

rf = RandomForestRegressor(n_estimators=256, 
                           n_jobs=32,  # adapt to your processor(s)
                           verbose=1,
                           max_depth=8)  # limit max depth to keep serialized model under 100MB (or it will be unable to deploy in AWS)
rf.fit(X, y)

check results for this model:

In [None]:
y_pred=rf.predict(test_sample[features])
y_true=test_sample['price']

logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

In [None]:
import mlflow.sklearn

MODEL_PATH = '../output/price_pred_model'
!rm -rf $MODEL_PATH  # '!' can be used to execute bash commands in jupyter cells

mlflow.sklearn.save_model(sk_model=rf, path=MODEL_PATH)
logger.info("model saved!")

using `mlflow.sagemaker` module, this model can be deployed directly in AWS, with just one line of code...

In [None]:
import mlflow.sagemaker

ENDPOINT_NAME = 'ml-in-prod-mad-mlf-api-ep'


aws_id = "170605107178" # from the aws-cli output
arn = "arn:aws:iam::170605107178:role/SYN-Datascience-SageMaker-Role"
app_name = "iris-rf-1"
image_url = aws_id + ".dkr.ecr." + REGION + ".amazonaws.com/mlflow-pyfunc:1.2.0" 
print(image_url)

s3_model_output ="s3://eu.com.syngenta-datascience-model-training/Preetam.Balijepalli@syngenta.com/ml-in-production-sagemaker/model"




mlflow.sagemaker.deploy(app_name=ENDPOINT_NAME, 
                         model_uri=MODEL_PATH, 
                         execution_role_arn=ROLE, 
                         bucket=s3_model_output,
                         region_name=REGION, 
                         mode='create',  # try 'replace'
                         instance_type='ml.t2.medium', 
                         instance_count=1)



endpoint can be invoked the usual way, using boto3. authenticating against aws using plain curl or requests is much harder than just store its credentials and using aws sdk:

In [None]:
import json
body = test_sample[features].to_json(orient='split')
response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                   ContentType='application/json',
                                   Body=body)

y_pred = json.loads(response['Body'].read().decode())
y_true = test_sample[target]

logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

## 10. references

- https://www.mlflow.org/docs/latest/models.html#built-in-deployment-tools
- https://aws.amazon.com/sagemaker/features/

In [None]:
# Delete end point with boto3
import boto3

ENDPOINT_NAME = 'ml-in-production-sagemaker-api-endpoint'
client = boto3.session.Session().client('sagemaker', region_name=REGION) #profile_name='mfa'
client.delete_endpoint(EndpointName=ENDPOINT_NAME)
client.delete_endpoint_config(EndpointConfigName=ENDPOINT_NAME)