In [20]:
import boto3
import pandas as pd
import io
from io import StringIO
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner 

In [2]:
#create sagemaker session and get execution role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [3]:
#get training data from csv which is saved in your bucket
s3 = boto3.client('s3')
ion_data_obj = s3.get_object(Bucket = 'sunithadatasets', Key = 'data/train_data_with_features.csv')
ion_data = pd.read_csv(io.BytesIO(ion_data_obj['Body'].read()))

In [4]:
#get test data with features from csv which is saved in your bucket
test_data_obj = s3.get_object(Bucket = 'sunithadatasets',Key = 'data/test_data_with_features.csv')
test_data = pd.read_csv(io.BytesIO(test_data_obj['Body'].read()))

In [8]:
 ion_data.head()

Unnamed: 0,time,signal,open_channels,batch,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,0.0001,-2.76,0,0,0.0,0.0,0.0,0.0
1,0.0002,-2.8557,0,0,0.0,0.0,0.0,0.0
2,0.0003,-2.4074,0,0,0.0,0.0,0.0,0.0
3,0.0004,-3.1404,0,0,0.0,0.0,0.0,0.0
4,0.0005,-3.1525,0,0,0.0,0.0,0.0,0.0


In [6]:
#used for predictions
test_values=test_data[['signal','simple_moving_avg_50','rolling_std_50','simple_moving_avg_50_batch','moving_std_50_batch']]

In [7]:
test_values.head()

Unnamed: 0,signal,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,-2.6498,0.0,0.0,0.0,0.0
1,-2.8494,0.0,0.0,0.0,0.0
2,-2.86,0.0,0.0,0.0,0.0
3,-2.435,0.0,0.0,0.0,0.0
4,-2.6155,0.0,0.0,0.0,0.0


In [21]:
#get only columns which you want to feed into your model
X = ion_data[['open_channels','signal','simple_moving_avg_50','rolling_std_50','simple_moving_avg_50_batch','moving_std_50_batch']]
x_train,x_test = train_test_split(X,test_size = 0.2,stratify=X.open_channels)

In [23]:
# Save this X (training data into a file X_train)
bucket ='sunithadatasets'
csv_buffer = StringIO()
x_train.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'data/x_train.csv').put(Body=csv_buffer.getvalue())


{'ResponseMetadata': {'RequestId': 'C0F859FC6D210D4F',
  'HostId': '3Hdh1iELg6FmZKnwcBO5J26WbzkZTAUAozzJ7MlgOJ5kyX/wRTPlAMJg11q/WOykIk8Bn5Ctnwg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '3Hdh1iELg6FmZKnwcBO5J26WbzkZTAUAozzJ7MlgOJ5kyX/wRTPlAMJg11q/WOykIk8Bn5Ctnwg=',
   'x-amz-request-id': 'C0F859FC6D210D4F',
   'date': 'Wed, 25 Mar 2020 21:21:06 GMT',
   'etag': '"cd6533cf21a87f934ae0fbc8266e8db0"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"cd6533cf21a87f934ae0fbc8266e8db0"'}

In [24]:
bucket ='sunithadatasets'
csv_buffer = StringIO()
x_test.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'data/x_test.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '71097984B7D28D0E',
  'HostId': 'Yti1oiuWFUHBfVK6Y7ac5GXDmYURCUzsO6iO33ywGzhmf4wzCs7DeuPXZ2QXMCTdm8+HqgLvCqg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Yti1oiuWFUHBfVK6Y7ac5GXDmYURCUzsO6iO33ywGzhmf4wzCs7DeuPXZ2QXMCTdm8+HqgLvCqg=',
   'x-amz-request-id': '71097984B7D28D0E',
   'date': 'Wed, 25 Mar 2020 21:21:20 GMT',
   'etag': '"9eb24dd129a0f413bc119a768b9481d9"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"9eb24dd129a0f413bc119a768b9481d9"'}

In [35]:
X.head()

Unnamed: 0,open_channels,signal,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,0,-2.76,0.0,0.0,0.0,0.0
1,0,-2.8557,0.0,0.0,0.0,0.0
2,0,-2.4074,0.0,0.0,0.0,0.0
3,0,-3.1404,0.0,0.0,0.0,0.0
4,0,-3.1525,0.0,0.0,0.0,0.0


In [25]:
#assign region,set client/service and bucket

region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')

bucket = sagemaker.Session().default_bucket()


In [26]:
from sagemaker.amazon.amazon_estimator import get_image_uri

In [27]:
container = get_image_uri(region,'xgboost',repo_version='0.90-1')

In [28]:
#instantiating an sagemaker container where we run our models. speicifying the details.
xgb = sagemaker.estimator.Estimator(
container,
role,
train_instance_count =1,
#train_instance_type = 'ml.m4.xlarge',
train_instance_type = 'ml.m5.4xlarge',
output_path='s3://{}/{}/output'.format('sunithadatasets','data'),
sagemaker_session = sagemaker_session)

In [29]:
xgb.set_hyperparameters(
num_round = 250,
#rate_drop = 0.3,
early_stopping_rounds = 10,
#alpha = 0.25,
num_class = 11,
objective = "multi:softmax" )

objective_metric_name = 'validation:f1'

In [30]:
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    'lambda': ContinuousParameter(0.01, 10, scaling_type="Logarithmic")
}

In [31]:
#now we need to pass the x_data to the model. hence set the path
s3_input_train = sagemaker.s3_input(s3_data = 's3://{}/{}/x_train'.format('sunithadatasets','data'),content_type ='csv')

In [32]:
s3_input_validation = sagemaker.s3_input(s3_data = 's3://{}/{}/x_test'.format('sunithadatasets','data'),content_type ='csv')

In [34]:

tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=10,
    strategy='Random'
)

tuner_log.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata=False)

In [37]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# check jobs have finished
#status_log = boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
 #   HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']
#status_linear = boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
 #   HyperParameterTuningJobName=tuner_linear.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

#assert status_log == 'Completed', "First must be completed, was {}".format(status_log)
#assert status_linear == 'Completed', "Second must be completed, was {}".format(status_linear)

df_log = sagemaker.HyperparameterTuningJobAnalytics(tuner_log.latest_tuning_job.job_name).dataframe()
#df_linear = sagemaker.HyperparameterTuningJobAnalytics(tuner_linear.latest_tuning_job.job_name).dataframe()
#df_log['scaling'] = 'log'
#df_linear['scaling'] = 'linear'
#df = pd.concat([df_log, df_linear], ignore_index=True)

In [41]:
df_log.sort_values('FinalObjectiveValue',ascending=False)

Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,alpha,lambda
8,0.639808,350.0,2020-03-25 21:44:37+00:00,sagemaker-xgboost-200325-2126-012-5bc39bab,Completed,2020-03-25 21:38:47+00:00,0.134073,5.277374
13,0.639799,348.0,2020-03-25 21:35:26+00:00,sagemaker-xgboost-200325-2126-007-034bbbea,Completed,2020-03-25 21:29:38+00:00,5.491171,0.16614
7,0.639781,345.0,2020-03-25 21:44:19+00:00,sagemaker-xgboost-200325-2126-013-9886dffc,Completed,2020-03-25 21:38:34+00:00,1.037658,2.096381
4,0.639677,343.0,2020-03-25 21:44:15+00:00,sagemaker-xgboost-200325-2126-016-630af7bc,Completed,2020-03-25 21:38:32+00:00,2.335517,5.12663
2,0.639641,342.0,2020-03-25 21:44:26+00:00,sagemaker-xgboost-200325-2126-018-fd740ec1,Completed,2020-03-25 21:38:44+00:00,0.018886,2.176522
6,0.639594,344.0,2020-03-25 21:44:20+00:00,sagemaker-xgboost-200325-2126-014-342f131a,Completed,2020-03-25 21:38:36+00:00,0.01567,5.001017
18,0.63951,340.0,2020-03-25 21:35:28+00:00,sagemaker-xgboost-200325-2126-002-c819a209,Completed,2020-03-25 21:29:48+00:00,0.804632,0.440205
3,0.639467,370.0,2020-03-25 21:45:31+00:00,sagemaker-xgboost-200325-2126-017-6310f5ba,Completed,2020-03-25 21:39:21+00:00,0.251007,0.030732
11,0.639452,345.0,2020-03-25 21:35:14+00:00,sagemaker-xgboost-200325-2126-009-6163657b,Completed,2020-03-25 21:29:29+00:00,0.013693,1.334477
16,0.639449,345.0,2020-03-25 21:35:13+00:00,sagemaker-xgboost-200325-2126-004-efcd30b0,Completed,2020-03-25 21:29:28+00:00,0.010539,0.422354


In [44]:
predictor=tuner_log.deploy(initial_instance_count =1, instance_type = "ml.m4.xlarge")

2020-03-25 21:44:37 Starting - Preparing the instances for training
2020-03-25 21:44:37 Downloading - Downloading input data
2020-03-25 21:44:37 Training - Training image download completed. Training in progress.
2020-03-25 21:44:37 Uploading - Uploading generated training model
2020-03-25 21:44:37 Completed - Training job completed[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter _tuning_objective_metric value validation:f1 to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CS



-------------!

In [52]:
#now we need to pass the x_data to the model. hence set the path
s3_input_train = sagemaker.s3_input(s3_data = 's3://{}/{}/X_train'.format('sunithadatasets','data'),content_type ='csv')
xgb.fit({'train' : s3_input_train})
#now, after training the model, lets deploy
predictor = xgb.deploy(initial_instance_count =1, instance_type = "ml.m4.xlarge")

In [45]:
import numpy as np
from sagemaker.predictor import csv_serializer

In [46]:
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [47]:
y_pred = predict(test_values.to_numpy())

In [50]:
y_pred[0].value_counts()

AttributeError: 'numpy.float64' object has no attribute 'value_counts'

In [51]:
#convert array into a dataframe
df = pd.DataFrame(y_pred)

In [55]:
df[0].value_counts()

0.0     731399
1.0     615463
3.0     202722
2.0     147055
4.0      79686
7.0      59558
5.0      45341
8.0      43300
6.0      42850
9.0      26611
10.0      6015
Name: 0, dtype: int64

In [57]:
submission9 =pd.DataFrame()
submission9['time'] = test_data.time
submission9['open_channels'] = y_pred.astype(int)

submission9.time = submission9.time.apply(lambda x: '{:.4f}'.format(x))
#submission6.to_csv('data/XGBoost_submission5.csv',index=False)

In [58]:
submission9.open_channels.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10,  9])

In [59]:
# Save final kagggle file to csv
bucket ='sunithadatasets'
csv_buffer = StringIO()
submission9.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'data/XGBoost_submission9.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'BA5128CB2547DDC3',
  'HostId': '61eZthbyxRvvaizhj7to6Ihdy21XRB4QM4/Jg5xVmBhx4bdMzkZhRT61fyUVIGFGHD/XuYi5C7I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '61eZthbyxRvvaizhj7to6Ihdy21XRB4QM4/Jg5xVmBhx4bdMzkZhRT61fyUVIGFGHD/XuYi5C7I=',
   'x-amz-request-id': 'BA5128CB2547DDC3',
   'date': 'Wed, 25 Mar 2020 22:24:38 GMT',
   'etag': '"b19c6743c03911ef00b7e957986b2398"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"b19c6743c03911ef00b7e957986b2398"'}