# Step 4: Splitting into training, testing and validation datasets

In [None]:
import pandas as pd
import numpy as np

# Load the dataset from a CSV file in the working directory.
# NOTE(review): presumably produced by an earlier cleaning step — confirm the file exists before running.
df = pd.read_csv("cleaned_data.csv")

In [None]:
# Separate the target from the predictors.
# Note that the target column name ends with a trailing space.

y = df[['Life expectancy ']]
X = df.drop(['Life expectancy '], axis=1)


In [None]:
# Turn both the features and the target into float32 NumPy arrays.

X, y = (np.asarray(frame).astype(np.float32) for frame in (X, y))

In [None]:
# Split the data into training (70%), testing (15%) and validation (15%) sets.
from sklearn.model_selection import train_test_split

# A fixed seed keeps the three partitions identical across kernel restarts, so the
# uploaded CSV files, the trained model and the final plot are reproducible.
# The 30% holdout gets its own name instead of temporarily reusing X_test / y_test.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

In [None]:
# Assemble the training frame with the target as the first column ('Target'),
# followed by the feature columns labelled 0..n-1.
# The SageMaker built-in algorithm expects the label in the first CSV column.

train_data = pd.DataFrame(
    np.column_stack((y_train[:, 0], X_train)),
    columns=['Target', *range(X_train.shape[1])],
)

train_data.head()

In [None]:
# Assemble the validation frame in the same layout as the training frame:
# target first ('Target'), then the feature columns labelled 0..n-1.
val_data = pd.DataFrame(
    np.column_stack((y_val[:, 0], X_val)),
    columns=['Target', *range(X_val.shape[1])],
)

val_data.head()

In [None]:
# Persist both frames as CSV files without a header row or index column,
# so each file contains only the target followed by the feature values.

train_data.to_csv(path_or_buf='train.csv', index=False, header=False)
val_data.to_csv(path_or_buf='validation.csv', index=False, header=False)

In [None]:
# Upload the training and validation CSV files to the S3 bucket used for training.
import os

import boto3

bucket_name = 'awssagemaker-xgboost'

# These names are needed here, but the notebook otherwise only defines them in the
# Step 5 setup cell further down; without them a fresh-kernel "Run All" stops with a
# NameError in this cell. The values match the ones assigned in that later cell.
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'

s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# S3 object keys always use '/' as the separator, so build them explicitly rather
# than with os.path.join, whose separator depends on the operating system.
with open('train.csv', 'rb') as f:
    s3_bucket.Object('{}/train/{}'.format(prefix, key)).upload_fileobj(f)

# Print the training data location in S3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket_name, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

with open('validation.csv', 'rb') as f:
    s3_bucket.Object('{}/validation/{}'.format(prefix, key)).upload_fileobj(f)

# Print the validation data location in S3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket_name, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

In [None]:
# S3 location passed to the estimator as its output path.
output_location = f's3://{bucket_name}/{prefix}/output'
print(f'training artifacts will be uploaded to: {output_location}')

# Step 5: Training the XGBoost Model

In [None]:
import sagemaker
import boto3
from sagemaker import Session

# Names used to build the S3 object paths for this experiment
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'

# SageMaker session handed to the estimator, plus the default bucket of a fresh session
sagemaker_session = sagemaker.Session()
bucket = Session().default_bucket()

# Execution role attached to this notebook instance (chosen under "Create an IAM role"
# when the instance was opened); it gives training and hosting access to the data.
role = sagemaker.get_execution_role()

In [None]:
# Show the execution role returned by sagemaker.get_execution_role() as a sanity check.
# NOTE(review): the printed value presumably contains the AWS account id — clear this output before sharing.
print(role)

In [None]:
# Display the region of the default boto3 session; the same expression selects the XGBoost image below.
boto3.Session().region_name

In [None]:
# Look up the image URI of the XGBoost container (version 1.0-1) for the current region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version='1.0-1',
)

In [None]:
# Estimator that runs the XGBoost container on a single ml.m4.xlarge instance
# and writes its output to `output_location`.
Xgboost_regressor = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session,
)

# Hyperparameters for the training job; these can be tuned to improve model performance.
Xgboost_regressor.set_hyperparameters(
    **{
        'max_depth': 5,
        'objective': 'reg:squarederror',
        'colsample_bytree': 0.3,
        'alpha': 10,
        'eta': 0.1,
        'num_round': 100,
    }
)



In [None]:
# Describe the "train" and "validation" input channels (CSV objects under an S3 prefix)
# and start the training job with them.
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

train_input = sagemaker.session.TrainingInput(
    s3_data=s3_train_data,
    s3_data_type='S3Prefix',
    content_type='csv',
)
valid_input = sagemaker.session.TrainingInput(
    s3_data=s3_validation_data,
    s3_data_type='S3Prefix',
    content_type='csv',
)

data_channels = dict(train=train_input, validation=valid_input)

Xgboost_regressor.fit(inputs=data_channels)

# Step 6: Deploy the model

In [None]:
# Host the trained model behind a real-time endpoint on one ml.m4.xlarge instance.
predictor = Xgboost_regressor.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

In [None]:
# Display the name of the endpoint created by deploy().
predictor.endpoint_name

# Step 7: Predicting the values using the trained model

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer 

In [None]:
# Configure how the predictor encodes requests and decodes responses.
# In SageMaker Python SDK v2 (which this notebook uses, see `sagemaker.serializers`),
# `Predictor.content_type` and `Predictor.accept` are read-only properties derived from
# the serializer and deserializer; assigning to them raises AttributeError, so only
# the serializer and deserializer are set here.
# NOTE(review): predictions below are fetched via invoke_endpoint and parsed as CSV;
# if predictor.predict() is ever used, confirm that JSONDeserializer matches the
# format the XGBoost container returns.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [None]:

# Serialise the test features as CSV text without header or index; this string is the
# request body sent to the endpoint. (The stray `type(X_test)` expression, whose result
# was never displayed, has been removed.)
X_test_df = pd.DataFrame(X_test)

# Convert DataFrame to CSV string
X_test_csv = X_test_df.to_csv(index=False, header=False)

In [None]:
# Send the CSV payload to the deployed endpoint.
# The endpoint name comes from the predictor created by deploy(); a hard-coded,
# timestamped name would only match the endpoint of one past run.
response = predictor.sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType='text/csv',
    Body=X_test_csv
)

# Decode the raw response body
response_body = response['Body'].read().decode('utf-8')
# Keep only the first line of the response and drop any extra lines
# NOTE(review): presumably all predictions arrive comma-separated on that first line — confirm.
lines = response_body.split('\n')
cleaned_response = lines[0]

# Print the cleaned response
print(cleaned_response)

In [None]:
import pandas as pd
import numpy as np

# Parse the cleaned CSV response string into a NumPy array of predictions.
from io import StringIO

# StringIO lets pandas read the in-memory CSV text as if it were a file.
# The parsed predictions get their own name so that `df`, which holds the
# source dataset loaded at the top of the notebook, is not overwritten.
predictions_df = pd.read_csv(StringIO(cleaned_response), header=None)

# Convert the DataFrame to a NumPy array
predicted_values = predictions_df.to_numpy()

print(predicted_values)


In [None]:
# Inspect the dimensions of the prediction array before plotting it against y_test.
predicted_values.shape

In [None]:
import matplotlib.pyplot as plt

# Plot predictions (x axis) against the true test targets (y axis).
# Both arrays are flattened explicitly: the predictions were parsed from a single CSV
# line while y_test is a 2-D column, so this avoids relying on implicit flattening.
fig, ax = plt.subplots()
ax.scatter(np.ravel(predicted_values), np.ravel(y_test))
ax.set_xlabel('Predicted life expectancy')
ax.set_ylabel('Actual life expectancy')
ax.set_title('XGBoost predictions vs. actual values (test set)')
plt.show()

In [None]:
# Tear down the endpoint now that the predictions have been collected.
# NOTE(review): presumably the hosted endpoint keeps incurring charges until it is deleted — confirm.
predictor.delete_endpoint()