In [None]:
%pip install s3fs

In [None]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri

import numpy as np
import io
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw dataset directly from the project bucket
# (pandas reads s3:// URLs through the s3fs package installed above).
data = pd.read_csv('s3://group3-finalproject/Medicaldataset.csv')

In [None]:
# Preview the first few rows to sanity-check the load.
data.head()

In [None]:
# 80% of the rows go to training; the remaining 20% is halved into
# validation and test. Both splits reuse one seed so the partition is reproducible.
split_seed = 1200
train, testval = train_test_split(data, random_state=split_seed, train_size=0.8)
val, test = train_test_split(testval, random_state=split_seed, train_size=0.5)

In [None]:
# (rows, columns) of the train, validation and test splits, in that order.
tuple(split.shape for split in (train, val, test))

In [None]:
# Shared boto3 S3 resource handle; upload_to_s3 below reads it as a global.
s3 = boto3.resource('s3')

def upload_to_s3(df, bucket, filename):
    """Write ``df`` as a headerless, index-free CSV object in S3.

    Args:
        df: DataFrame to serialize.
        bucket: Name of the destination S3 bucket.
        filename: Object key to write inside ``bucket``.

    Returns:
        None. The CSV text is sent with a single ``put`` call on the
        module-level ``s3`` resource.
    """
    # With no target buffer, to_csv hands back the CSV text directly.
    csv_text = df.to_csv(header=False, index=False)
    s3.Object(bucket, filename).put(Body=csv_text)
    

In [None]:
# Publish the training and validation splits as CSV objects in the project bucket
# (only these two splits are uploaded here).
# NOTE(review): SageMaker's built-in XGBoost reads headerless CSV with the label
# in the first column - confirm the dataset's column order matches.
for split_df, object_key in ((train, 'train.csv'), (val, 'val.csv')):
    upload_to_s3(split_df, 'group3-finalproject', object_key)

In [None]:
# Example lookup of the XGBoost container image URI for a fixed region (eu-west-3).
# The estimator configuration below resolves the URI for the session's own region instead.
example = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region='eu-west-3',
    version='0.90-1',
)

In [None]:
# Execution role for the training job and the region of the default boto3 session.
role = sagemaker.get_execution_role()
region_name = boto3.Session().region_name

# XGBoost container image for that region.
# sagemaker.image_uris.retrieve replaces the older get_image_uri helper,
# which still works but emits a warning.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region_name,
    version='0.90-1',
)

# S3 prefix passed to the estimator as its output path.
output_location = 's3://group3-finalproject/'

# For the list of possible XGBoost parameters, see
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters
hyperparams = dict(
    num_round='20',
    objective='reg:squarederror',
)

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    output_path=output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

In [None]:
# Describe the CSV objects uploaded earlier as training-job input channels.
# sagemaker.inputs.TrainingInput is the SDK v2 name for the former
# sagemaker.session.s3_input; the rest of this notebook already uses v2-only
# APIs (image_uris.retrieve, instance_count, serializers.CSVSerializer).
train_channel = sagemaker.inputs.TrainingInput(
    's3://group3-finalproject/train.csv',
    content_type='text/csv'
)
val_channel = sagemaker.inputs.TrainingInput(
    's3://group3-finalproject/val.csv',
    content_type='text/csv'
)

# Channel names the estimator receives: 'train' is fitted on,
# 'validation' is evaluated alongside it.
channels_for_training = {
    'train': train_channel,
    'validation': val_channel
}

In [None]:
# Launch the training job on the train/validation channels.
# logs=False keeps the job's log stream out of the cell output.
estimator.fit(inputs=channels_for_training, logs=False)

In [None]:
# Name of the training job started by fit(), read through the public
# latest_training_job handle instead of the private _current_job_name attribute.
estimator.latest_training_job.name

In [None]:
# Collect the RMSE metrics reported for the training and validation channels.
# The job name comes from the public latest_training_job handle rather than
# the estimator's private _current_job_name attribute.
metrics = sagemaker.analytics.TrainingJobAnalytics(
    estimator.latest_training_job.name,
    metric_names=['train:rmse', 'validation:rmse']
)

In [None]:
# Show the collected metric values as a table.
metrics.dataframe()

In [None]:
# Deploy the trained model behind a single-instance real-time endpoint.
# The CSV serializer lets the predictor send request payloads as CSV text.
# NOTE(review): the endpoint stays up until it is deleted
# (e.g. predictor.delete_endpoint()) - confirm cleanup happens later in the notebook.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=sagemaker.serializers.CSVSerializer(),
)
