In [54]:
import pandas as pd  
import numpy as np 
import io
import os

# Sagemaker Python SDK
import sagemaker.amazon.common as smac
import sagemaker
from sagemaker import get_execution_role
import boto3

import matplotlib.pyplot as plt  
%matplotlib inline

In [55]:
role = get_execution_role()
input_bucket = 'wwcteammarywilkeslillian'
in_data_key = 'combined_male_female.csv'
training_data_location = 's3://{}/{}'.format(input_bucket, in_data_key)
df = pd.read_csv(training_data_location, index_col = 0)
print('uploaded training data from location: {}'.format(training_data_location))

uploaded training data from location: s3://wwcteammarywilkeslillian/combined_male_female.csv


In [56]:
#convert category data type to encoded columns, do not drop any columns
#GENDER, OUTCOME, COUNTY, AGE
df_encoded_sg = pd.get_dummies(df, columns=['OUTCOME','GENDER','COUNTY'], drop_first = False)
#For AWS linear learner model the first column of a csv file is assumed to be the label or
# target variable for prediction.  In this use case the target variable is OUTCOME.  
df_encoded_sg['OUTCOME'] = df_encoded_sg['OUTCOME_Case']
outcome = df_encoded_sg['OUTCOME']
df_encoded_sg.drop(labels=['OUTCOME'], axis=1,inplace = True)
df_encoded_sg.insert(0, 'OUTCOME', outcome)
df_encoded_sg.drop(labels = ['OUTCOME_Case', 'OUTCOME_Death'], axis = 1, inplace = True)

# convert dataframe to csv; save the original cleaned file to notebook instance
df_encoded_sg.to_csv('covidsg.csv', header=False, index=False)
#Shuffle the data
df_shuffled_sg = df_encoded_sg.sample(frac=1).reset_index(drop=True)


In [57]:
df_shuffled_sg.head(3)

Unnamed: 0,OUTCOME,AGE,MONTH,DAY,GENDER_Female,GENDER_Male,COUNTY_Alachua,COUNTY_Baker,COUNTY_Bay,COUNTY_Bradford,...,COUNTY_St. Lucie,COUNTY_Sumter,COUNTY_Suwannee,COUNTY_Taylor,COUNTY_Union,COUNTY_Unknown,COUNTY_Volusia,COUNTY_Wakulla,COUNTY_Walton,COUNTY_Washington
0,1,20,7,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,42,5,16,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,48,7,12,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Split the data for training, validation, and test into separate dataframes
# produces a 60%, 20%, 20% split for training, validation and test sets
sg_train_data, sg_validation_data, sg_test_data = np.split(df_shuffled_sg.sample(frac=1), [int(.6*len(df_shuffled_sg)), int(.8*len(df_shuffled_sg))])


In [60]:
sg_train_data.shape

(190653, 74)

In [61]:
sg_validation_data.shape

(63551, 74)

In [62]:
sg_test_data.shape

(63552, 74)

In [64]:
sg_train_data.head(3)

Unnamed: 0,OUTCOME,AGE,MONTH,DAY,GENDER_Female,GENDER_Male,COUNTY_Alachua,COUNTY_Baker,COUNTY_Bay,COUNTY_Bradford,...,COUNTY_St. Lucie,COUNTY_Sumter,COUNTY_Suwannee,COUNTY_Taylor,COUNTY_Union,COUNTY_Unknown,COUNTY_Volusia,COUNTY_Wakulla,COUNTY_Walton,COUNTY_Washington
213209,1,17,7,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174213,1,63,7,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
309260,1,35,4,12,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
#Save dataframes to csv files in local notebook instance
sg_train_data.to_csv('sgtrain.csv', header=False, index=False)
sg_validation_data.to_csv('sgvalidation.csv', header=False, index=False)
sg_test_data.to_csv('sgtest.csv', header=False, index=False)
output_bucket = 'wwcteammarywilkeslillian'
prefix = 'sglinear2'

boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'covidsg.csv')).upload_file('covidsg.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'sgtrain.csv')).upload_file('sgtrain.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'sgvalidation.csv')).upload_file('sgvalidation.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'sgtest.csv')).upload_file('sgtest.csv')
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [66]:
# sess = sagemaker.Session()
session = sagemaker.Session()
artifact_output_location = 's3://{}/{}/output'.format(output_bucket, prefix)

print("The model artifact will be loaded to: ", artifact_output_location)

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type= 'ml.c4.xlarge',
                                       output_path=artifact_output_location,
                                       sagemaker_session=session)
linear.set_hyperparameters(epochs=15,
                           predictor_type='binary_classifier')

#set training data
s3_train_file = 's3://{}/{}/'.format(output_bucket, prefix, 'sgtrain.csv')
train_data_file = sagemaker.session.s3_input(s3_data=s3_train_file, content_type='text/csv')


#set evaluation data
s3_eval_file = 's3://{}/{}/'.format(output_bucket, prefix, 'sgvalidation.csv')
eval_data_file = sagemaker.session.s3_input(s3_data=s3_eval_file, content_type='text/csv')

#set test data
s3_test_file = 's3://{}/{}/'.format(output_bucket, prefix, 'sgtest.csv')
test_data_file = sagemaker.session.s3_input(s3_data=s3_test_file, content_type='text/csv')

#start the training process
linear.fit({'train': train_data_file, 'validation': eval_data_file, 'test': test_data_file})

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


The model artifact will be loaded to:  s3://wwcteammarywilkeslillian/sglinear2/output
2020-08-14 21:26:45 Starting - Starting the training job...
2020-08-14 21:26:47 Starting - Launching requested ML instances......
2020-08-14 21:28:09 Starting - Preparing the instances for training.........
2020-08-14 21:29:29 Downloading - Downloading input data...
2020-08-14 21:30:09 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/14/2020 21:30:13 INFO 140081364297536] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000

# DEPLOY

In [68]:
#delete previous endpoint
import sagemaker
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)

In [69]:

#deploy new endpoint
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: linear-learner-2020-08-14-21-26-45-415


-----------------!

# INFERENCE

In [70]:
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [71]:
import numpy as np
predictions = []
test_shape = sg_test_data.shape
test_shape

(63552, 74)

In [72]:
sg_test_data.iloc[1, 1:]

AGE                  51
MONTH                 6
DAY                  27
GENDER_Female         0
GENDER_Male           1
                     ..
COUNTY_Unknown        0
COUNTY_Volusia        0
COUNTY_Wakulla        0
COUNTY_Walton         0
COUNTY_Washington     0
Name: 208734, Length: 73, dtype: int64

In [74]:
#check if modelis working
result = linear_predictor.predict(sg_test_data.iloc[1, 1:])
result

{'predictions': [{'score': 0.9917246699333191, 'predicted_label': 1}]}

In [82]:
#Run the test data and evaluate the model
predictions = []
for rowNum in range(0,test_shape[0]):
    array = sg_test_data.iloc[rowNum, 1:]
    result = linear_predictor.predict(array)
    predictions += [r['predicted_label'] for r in result['predictions']]

predictions = np.array(predictions)

In [83]:
len(predictions)

63552

In [84]:
predictions[0]

1

In [85]:
sg_test_data.head(1)

Unnamed: 0,OUTCOME,AGE,MONTH,DAY,GENDER_Female,GENDER_Male,COUNTY_Alachua,COUNTY_Baker,COUNTY_Bay,COUNTY_Bradford,...,COUNTY_St. Lucie,COUNTY_Sumter,COUNTY_Suwannee,COUNTY_Taylor,COUNTY_Union,COUNTY_Unknown,COUNTY_Volusia,COUNTY_Wakulla,COUNTY_Walton,COUNTY_Washington
246623,1,88,4,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
pd.crosstab(np.where(sg_test_data['OUTCOME'] == 1, 1, 0), predictions, rownames=['actuals'], colnames=['predictions'])

predictions,1
actuals,Unnamed: 1_level_1
0,932
1,62620


In [105]:
result = np.where(predictions == 0)
unique, counts = np.unique(result[0], return_counts=True)
dict(zip(unique, counts))


{}

In [106]:
#delete previous endpoint
import sagemaker
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)