In [43]:
import pandas as pd  
import numpy as np 
import io
import os

# Sagemaker Python SDK
import sagemaker.amazon.common as smac
import sagemaker
from sagemaker import get_execution_role
import boto3

import matplotlib.pyplot as plt  
%matplotlib inline

In [44]:
# IAM role of this notebook instance; it is handed to the estimator further down.
role = get_execution_role()

# S3 location of the source CSV.
input_bucket, in_data_key = 'wwcteammarywilkeslillian', 'combined_male_female.csv'
training_data_location = 's3://' + input_bucket + '/' + in_data_key

# Read the CSV directly from S3; the file's first column becomes the row index.
df = pd.read_csv(training_data_location, index_col=0)
print('uploaded training data from location: ' + training_data_location)

uploaded training data from location: s3://wwcteammarywilkeslillian/combined_male_female.csv


In [45]:
# One-hot encode the categorical columns. drop_first=False keeps an indicator
# column for every level, so no reference level is removed.
# AGE is not in this list, so get_dummies is not asked to encode it.
categorical_columns = ['OUTCOME', 'GENDER', 'COUNTY']
df_encoded_sg = pd.get_dummies(df, columns=categorical_columns, drop_first=False)


In [46]:
# The AWS linear learner treats the first column of a CSV file as the label,
# so the OUTCOME target has to end up at position 0.
#
# The label is the sum of the two OUTCOME indicator columns created by get_dummies.
# NOTE(review): if Case and Death are the only OUTCOME levels, this sum is the
# same for every row and the label carries no signal -- confirm the intended target.
outcome = (df_encoded_sg['OUTCOME_Case'] + df_encoded_sg['OUTCOME_Death']).rename('OUTCOME')

# Remove the two indicator columns, then place the combined label in front.
df_encoded_sg = df_encoded_sg.drop(columns=['OUTCOME_Case', 'OUTCOME_Death'])
df_encoded_sg.insert(0, 'OUTCOME', outcome)


In [47]:
# Shuffle all rows (frac=1 samples the whole frame) and renumber the index from 0.
# NOTE(review): the save/split cells that follow read df_encoded_sg, not
# df_shuffled_sg -- confirm whether this shuffled copy is still needed.
df_shuffled_sg = (
    df_encoded_sg
    .sample(frac=1)
    .reset_index(drop=True)
)



In [48]:
 # Keep a local CSV copy of the full encoded frame on the notebook instance (no header row, no index column).
df_encoded_sg.to_csv(path_or_buf='covidsg.csv', index=False, header=False)

In [49]:
# Split the data into separate training, validation and test dataframes
# using a 60% / 20% / 20% partition of the shuffled rows.
#
# The shuffle is seeded so that Restart & Run All reproduces the same split,
# and the partitions are taken with plain positional slices instead of
# np.split, which relies on a deprecated pandas code path for DataFrames.
SPLIT_SEED = 42

n_rows = len(df_encoded_sg)
train_end = int(.6 * n_rows)        # first 60% of the shuffled rows
validation_end = int(.8 * n_rows)   # next 20%; the remainder is the test set

shuffled_sg = df_encoded_sg.sample(frac=1, random_state=SPLIT_SEED)
sg_train_data = shuffled_sg.iloc[:train_end]
sg_validation_data = shuffled_sg.iloc[train_end:validation_end]
sg_test_data = shuffled_sg.iloc[validation_end:]

# Every row must land in exactly one partition.
assert len(sg_train_data) + len(sg_validation_data) + len(sg_test_data) == n_rows

In [50]:
# Write each partition to a CSV file on the local notebook instance.
# Header row and index column are omitted in every file.
split_outputs = [
    ('sgtrain.csv', sg_train_data),
    ('sgvalidation.csv', sg_validation_data),
    ('sgtest.csv', sg_test_data),
]
for csv_name, split_frame in split_outputs:
    split_frame.to_csv(csv_name, header=False, index=False)

In [51]:
# Destination bucket and key prefix for the prepared CSV files.
output_bucket = 'wwcteammarywilkeslillian'
prefix = 'sglinear2'

# One Bucket handle serves all uploads; there is no need to rebuild the
# boto3 resource for every file.
s3_bucket = boto3.resource('s3').Bucket(output_bucket)

for local_file in ('covidsg.csv', 'sgtrain.csv', 'sgvalidation.csv', 'sgtest.csv'):
    # S3 keys always use '/' as the separator, so build the key explicitly
    # instead of using os.path.join, which follows the local OS convention.
    s3_key = '{}/{}'.format(prefix, local_file)
    s3_bucket.Object(s3_key).upload_file(local_file)

In [52]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the linear-learner training image for the region of the default boto3 session.
current_region = boto3.Session().region_name
container = get_image_uri(current_region, 'linear-learner')


'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [55]:
# Configure the linear-learner estimator and the three input channels.
# Training itself is started in a separate cell via linear.fit(...).
session = sagemaker.Session()
artifact_output_location = 's3://{}/{}/output'.format(output_bucket, prefix)

print("The model artifact will be loaded to: ", artifact_output_location)

linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=artifact_output_location,
                                       sagemaker_session=session)
linear.set_hyperparameters(epochs=15,
                           predictor_type='binary_classifier')

# Each channel must point at its own CSV object. The format string needs three
# placeholders (bucket, prefix, file name); with only two, the file name is
# silently dropped and every channel resolves to the same 's3://bucket/prefix/'
# location, which mixes the train, validation, test and full-dataset files.

# training data
s3_train_file = 's3://{}/{}/{}'.format(output_bucket, prefix, 'sgtrain.csv')
train_data_file = sagemaker.session.s3_input(s3_data=s3_train_file, content_type='text/csv')

# evaluation (validation) data
s3_eval_file = 's3://{}/{}/{}'.format(output_bucket, prefix, 'sgvalidation.csv')
eval_data_file = sagemaker.session.s3_input(s3_data=s3_eval_file, content_type='text/csv')

# test data
s3_test_file = 's3://{}/{}/{}'.format(output_bucket, prefix, 'sgtest.csv')
test_data_file = sagemaker.session.s3_input(s3_data=s3_test_file, content_type='text/csv')


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


The model artifact will be loaded to:  s3://wwcteammarywilkeslillian/sglinear2/output


In [54]:
# Start the training job with the train, validation and test channels.
# NOTE(review): the error recorded under this cell names 'ml.t2.medium', which
# does not match the 'ml.c4.xlarge' instance type set on the estimator, and this
# cell's execution count (54) precedes the estimator cell's (55). The output
# appears to come from an earlier run -- re-run this cell after the estimator cell.
training_channels = {
    'train': train_data_file,
    'validation': eval_data_file,
    'test': test_data_file,
}
linear.fit(training_channels)

ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: 1 validation error detected: Value 'ml.t2.medium' at 'resourceConfig.instanceType' failed to satisfy constraint: Member must satisfy enum value set: [ml.p2.xlarge, ml.m5.4xlarge, ml.m4.16xlarge, ml.c5n.xlarge, ml.p3.16xlarge, ml.m5.large, ml.p2.16xlarge, ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.c5.4xlarge, ml.c5n.18xlarge, ml.g4dn.xlarge, ml.g4dn.12xlarge, ml.c4.8xlarge, ml.g4dn.2xlarge, ml.c5.9xlarge, ml.g4dn.4xlarge, ml.c5.xlarge, ml.g4dn.16xlarge, ml.c4.xlarge, ml.g4dn.8xlarge, ml.c5n.2xlarge, ml.c5n.4xlarge, ml.c5.18xlarge, ml.p3dn.24xlarge, ml.p3.2xlarge, ml.m5.xlarge, ml.m4.10xlarge, ml.c5n.9xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.m5.24xlarge, ml.m4.2xlarge, ml.p2.8xlarge, ml.m5.2xlarge, ml.p3.8xlarge, ml.m4.4xlarge]