# Preprocessing and model creation

Installing and importing all the libraries we need

In [None]:
%pip install s3fs

In [None]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri

import numpy as np
import io
import pandas as pd
from sklearn.model_selection import train_test_split

Loading the dataset we have saved in our S3 bucket

In [None]:
# Read the raw CSV directly from the project bucket into a DataFrame.
data = pd.read_csv('s3://group3-finalproject/Medicaldataset.csv')

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
data.head()

This particular dataset does not need any cleaning. Even so, we include a cleaning pipeline below so that the same preprocessing can be applied automatically to any future dataset.

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# NOTE(review): the column names passed to ColumnTransformer below look like
# placeholders ('num_feature1', 'cat_feature1', ...) - replace them with the
# real column names of the dataset before fitting.

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, ['num_feature1', 'num_feature2']),
    ('cat', categorical_transformer, ['cat_feature1', 'cat_feature2']),
])

# Single-step wrapper; call pipeline.fit_transform(...) on the data to apply it.
pipeline = Pipeline([('preprocessor', preprocessor)])


Splitting the data into train, validation and test sets

In [None]:
# 80% of the rows are used for training; the remaining 20% is split in half
# into validation and test. A fixed seed keeps both splits repeatable.
train, testval = train_test_split(
    data,
    train_size=0.8,
    random_state=1200,
)
val, test = train_test_split(
    testval,
    train_size=0.5,
    random_state=1200,
)

In [None]:
# Sanity check: (rows, columns) of each split.
train.shape, val.shape, test.shape

Create a Python function to upload the training and validation data to the S3 bucket

In [None]:
# Shared S3 resource handle used by upload_to_s3.
s3 = boto3.resource('s3')

def upload_to_s3(df, bucket, filename):
    """Serialize a DataFrame to CSV in memory and store it as an S3 object.

    The CSV is written without a header row and without the index column.

    Args:
        df: DataFrame to upload.
        bucket: Name of the destination S3 bucket.
        filename: Object key to write inside ``bucket``.
    """
    buffer = io.StringIO()
    df.to_csv(buffer, header=False, index=False)
    # Named ``s3_object`` so the builtin ``object`` is not shadowed.
    s3_object = s3.Object(bucket, filename)
    s3_object.put(Body=buffer.getvalue())
    

Use the function to upload the data

In [None]:
# Push the training split first, then the validation split, to the project bucket.
for frame, key in ((train, 'train.csv'), (val, 'val.csv')):
    upload_to_s3(frame, 'group3-finalproject', key)

Retrieve the Amazon ECR URI for a specific version of the XGBoost algorithm Docker image in the AWS region "eu-west-3" using the Amazon SageMaker Python SDK.

In [None]:
# Example lookup of the XGBoost 0.90-1 image URI with the region spelled out explicitly.
example = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region='eu-west-3',
    version='0.90-1',
)

Initialize an Amazon SageMaker estimator for an XGBoost model and set up the model's output to be stored in a specified S3 bucket.

In [None]:
# IAM role and region are taken from the environment the notebook runs in.
role = sagemaker.get_execution_role()
region_name = boto3.Session().region_name

# XGBoost 0.90-1 image for the current region.
# (image_uris.retrieve supersedes get_image_uri, which still works but warns.)
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region_name,
    version='0.90-1',
)

# Passed to the estimator as output_path.
output_location = 's3://group3-finalproject/'

# XGBoost hyperparameters; the full list of options is documented at
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters
hyperparams = dict(
    num_round='20',
    objective='reg:squarederror',
)

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    output_path=output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

We proceed to create the train and validation datasets to train the model

In [None]:
# Describe the two CSV inputs for the training job.
# TrainingInput is the SDK v2 class; sagemaker.session.s3_input is only a
# deprecated alias for it and emits a warning when used.
train_channel = sagemaker.inputs.TrainingInput(
    's3://group3-finalproject/train.csv',
    content_type='text/csv'
)
val_channel = sagemaker.inputs.TrainingInput(
    's3://group3-finalproject/val.csv',
    content_type='text/csv'
)


# Channel names map to the inputs handed to estimator.fit().
channels_for_training = {
    'train': train_channel,
    'validation': val_channel
}

Training the model

In [None]:
# Launch the training job with the train and validation channels.
# NOTE(review): logs=False presumably turns off log streaming while waiting - confirm against the SDK docs.
estimator.fit(inputs=channels_for_training, logs=False)

In [None]:
# Name of the training job started by fit(), read through the public
# latest_training_job handle rather than the private _current_job_name attribute.
estimator.latest_training_job.name

Using `TrainingJobAnalytics` to retrieve the training and validation metrics of the job

In [None]:
# Collect the train/validation RMSE series reported by the training job.
# The job name comes from the public latest_training_job handle rather than
# the private _current_job_name attribute.
metrics = sagemaker.analytics.TrainingJobAnalytics(
    estimator.latest_training_job.name,
    metric_names=['train:rmse', 'validation:rmse']
)

Metric results

In [None]:
# Display the collected metrics as a DataFrame.
metrics.dataframe()