In [80]:
import boto3
import pandas as pd
import io
from io import StringIO
import sagemaker
from sagemaker import get_execution_role

In [81]:
#create sagemaker session and get execution role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [82]:
#get training data from csv which is saved in your bucket
s3 = boto3.client('s3')
ion_data_obj = s3.get_object(Bucket = 'sunithadatasets', Key = 'data/train_data_with_features.csv')
ion_data = pd.read_csv(io.BytesIO(ion_data_obj['Body'].read()))

In [83]:
#get test data with features from csv which is saved in your bucket
test_data_obj = s3.get_object(Bucket = 'sunithadatasets',Key = 'data/test_data_with_features.csv')
test_data = pd.read_csv(io.BytesIO(test_data_obj['Body'].read()))

In [84]:
 ion_data.head()

Unnamed: 0,time,signal,open_channels,batch,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,0.0001,-2.76,0,0,0.0,0.0,0.0,0.0
1,0.0002,-2.8557,0,0,0.0,0.0,0.0,0.0
2,0.0003,-2.4074,0,0,0.0,0.0,0.0,0.0
3,0.0004,-3.1404,0,0,0.0,0.0,0.0,0.0
4,0.0005,-3.1525,0,0,0.0,0.0,0.0,0.0


In [85]:
#used for predictions
test_values=test_data[['signal','simple_moving_avg_50','rolling_std_50','simple_moving_avg_50_batch','moving_std_50_batch']]

In [86]:
test_values.head()

Unnamed: 0,signal,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,-2.6498,0.0,0.0,0.0,0.0
1,-2.8494,0.0,0.0,0.0,0.0
2,-2.86,0.0,0.0,0.0,0.0
3,-2.435,0.0,0.0,0.0,0.0
4,-2.6155,0.0,0.0,0.0,0.0


In [87]:
#get only columns which you want to feed into your model
X = ion_data[['open_channels','signal','simple_moving_avg_50','rolling_std_50','simple_moving_avg_50_batch','moving_std_50_batch']]


In [88]:
# Save this X (training data into a file X_train)
bucket ='sunithadatasets'
csv_buffer = StringIO()
X.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'data/X_train.csv').put(Body=csv_buffer.getvalue())

MemoryError: 

In [None]:
X.head()

In [None]:
#assign region,set client/service and bucket

region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')

bucket = sagemaker.Session().default_bucket()


In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

In [None]:
container = get_image_uri(region,'xgboost',repo_version='0.90-1')

In [None]:
#instantiating an sagemaker container where we run our models. speicifying the details.
xgb = sagemaker.estimator.Estimator(
container,
role,
train_instance_count =1,
#train_instance_type = 'ml.m4.xlarge',
train_instance_type = 'ml.m5.4xlarge',
output_path='s3://{}/{}/output'.format('sunithadatasets','data'),
sagemaker_session = sagemaker_session)

In [None]:
xgb.set_hyperparameters(
num_round = 100,
#rate_drop = 0.3,
alpha = 0.25,
num_class = 11,
objective = "multi:softmax" )
objective_metric_name = 'validation:f1'

In [79]:
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    'lambda': ContinuousParameter(0.01, 10, scaling_type="Logarithmic") 
}

NameError: name 'ContinuousParameter' is not defined

In [None]:

tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=10,
    strategy='Random',
    num_class = 11
)

tuner_log.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata=False)

In [52]:
#now we need to pass the x_data to the model. hence set the path
s3_input_train = sagemaker.s3_input(s3_data = 's3://{}/{}/X_train'.format('sunithadatasets','data'),content_type ='csv')

In [53]:
xgb.fit({'train' : s3_input_train})

2020-03-24 16:38:28 Starting - Starting the training job...
2020-03-24 16:38:30 Starting - Launching requested ML instances...
2020-03-24 16:39:28 Starting - Preparing the instances for training......
2020-03-24 16:40:20 Downloading - Downloading input data
2020-03-24 16:40:20 Training - Downloading the training image...
2020-03-24 16:40:41 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[16:40:43] 5000001x5 matrix with 25000005 entries loaded

In [54]:
#now, after training the model, lets deploy
predictor = xgb.deploy(initial_instance_count =1, instance_type = "ml.m4.xlarge")

-------------!

In [55]:
import numpy as np
from sagemaker.predictor import csv_serializer

In [65]:
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [66]:
y_pred = predict(test_values.to_numpy())

In [74]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [71]:
#convert array into a dataframe
df = pd.DataFrame(y_pred)

In [73]:
df[0].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  7.,  6.,  8., 10.,  9.])

In [75]:
submission6 =pd.DataFrame()
submission6['time'] = test_data.time
submission6['open_channels'] = y_pred.astype(int)

submission6.time = submission6.time.apply(lambda x: '{:.4f}'.format(x))
#submission6.to_csv('data/XGBoost_submission5.csv',index=False)

In [76]:
submission6.open_channels.unique()

array([ 0,  1,  2,  3,  4,  5,  7,  6,  8, 10,  9])

In [77]:
# Save final kagggle file to csv
bucket ='sunithadatasets'
csv_buffer = StringIO()
submission6.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'data/XGBoost_submission8.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'B8B68758E2DC4ED4',
  'HostId': '70Onu0JTZUakCsbRMdEy+iH1AfddreAAAPcYezxTkfmFw89ET4rwvLR/RxgXfO5Go0+Ad2dMqDk=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '70Onu0JTZUakCsbRMdEy+iH1AfddreAAAPcYezxTkfmFw89ET4rwvLR/RxgXfO5Go0+Ad2dMqDk=',
   'x-amz-request-id': 'B8B68758E2DC4ED4',
   'date': 'Tue, 24 Mar 2020 17:45:51 GMT',
   'etag': '"4623c60bb7c38588e40165522b47b750"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"4623c60bb7c38588e40165522b47b750"'}