In [5]:
import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('preprocessed-online-retail-dataset')
for obj in bucket.objects.all():
    print(obj.key)


Preprocessed_Online_Retail.csv


In [7]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve

# Setup SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Specify your bucket and data location
bucket = 'preprocessed-online-retail-dataset'
data_location = f's3://{bucket}/Preprocessed_Online_Retail.csv'

# Get the XGBoost image URI for the region
container = retrieve('xgboost', boto3.Session().region_name, version='1.0-1')

# Setup Estimator
xgboost = sagemaker.estimator.Estimator(container,
                                         role,
                                         instance_count=1, 
                                         instance_type='ml.m4.xlarge',
                                         output_path=f's3://{bucket}/output',
                                         sagemaker_session=sagemaker_session)

# Set hyperparameters
xgboost.set_hyperparameters(objective='reg:linear',
                            num_round=100)

# Specify the input data configuration using the correct class
s3_input_train = TrainingInput(s3_data=data_location, content_type='csv')

# Start the training
xgboost.fit({'train': s3_input_train})


2024-12-18 10:26:04 Starting - Starting the training job...
2024-12-18 10:26:19 Starting - Preparing the instances for training...
2024-12-18 10:26:43 Downloading - Downloading input data...
2024-12-18 10:27:13 Downloading - Downloading the training image...
2024-12-18 10:28:04 Training - Training image download completed. Training in progress...[34m[2024-12-18 10:28:16.736 ip-10-0-84-200.ec2.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINF

In [9]:
# Format the features as a CSV string for the payload
payload = "12345,10,20.5,67890"

In [12]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve

# Setup SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Specify your bucket and data location
bucket = 'preprocessed-online-retail-dataset'
data_location = f's3://{bucket}/Preprocessed_Online_Retail.csv'

# Get the XGBoost image URI for the region
container = retrieve('xgboost', boto3.Session().region_name, version='1.0-1')

# Setup Estimator for training
xgboost = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/output',
    sagemaker_session=sagemaker_session
)

# Set hyperparameters
xgboost.set_hyperparameters(objective='reg:linear', num_round=100)

# Specify the input data configuration
s3_input_train = TrainingInput(s3_data=data_location, content_type='csv')

# Start training
xgboost.fit({'train': s3_input_train})

2024-12-18 10:53:04 Starting - Starting the training job...
2024-12-18 10:53:19 Starting - Preparing the instances for training...
2024-12-18 10:53:53 Downloading - Downloading input data...
2024-12-18 10:54:28 Downloading - Downloading the training image......
2024-12-18 10:55:34 Training - Training image download completed. Training in progress.
2024-12-18 10:55:34 Uploading - Uploading generated training model[34m[2024-12-18 10:55:27.848 ip-10-0-174-182.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m

In [14]:
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve

import sagemaker
import boto3

# Initialize SageMaker session
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# S3 Bucket for model artifacts
bucket = 'preprocessed-online-retail-dataset'
data_location = f's3://{bucket}/Preprocessed_Online_Retail.csv'

# Retrieve XGBoost Docker image for your region
container = retrieve('xgboost', boto3.Session().region_name, version='1.0-1')

# Setup XGBoost Estimator
xgboost = Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',  # Change instance type if needed
    output_path=f's3://{bucket}/output',
    sagemaker_session=sagemaker_session
)

# Set model hyperparameters
xgboost.set_hyperparameters(objective='reg:linear', num_round=100)

# Input data configuration
s3_input_train = TrainingInput(s3_data=data_location, content_type='csv')

# Train the model
xgboost.fit({'train': s3_input_train})

# Deploy the model to create a real-time endpoint
predictor = xgboost.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',  # Choose the instance type for deployment
    endpoint_name='xgboost-retail-endpoint'  # Name of the endpoint
)

print("Model has been deployed!")

2024-12-18 10:59:40 Starting - Starting the training job...
2024-12-18 11:00:02 Starting - Preparing the instances for training...
2024-12-18 11:00:45 Downloading - Downloading input data......
2024-12-18 11:01:21 Downloading - Downloading the training image...
2024-12-18 11:02:06 Training - Training image download completed. Training in progress...[34m[2024-12-18 11:02:22.237 ip-10-0-219-188.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34

------!Model has been deployed!


In [20]:
import os

# List all files in the current working directory
print("Current directory:", os.getcwd())
print("Files in the directory:", os.listdir())

Current directory: /home/ec2-user/SageMaker
Files in the directory: ['lost+found', '.ipynb_checkpoints', 'pre-processed-online-retail.ipynb', '.sparkmagic']


In [21]:
import os
print("Files in the directory:", os.listdir())

Files in the directory: ['lost+found', 'Preprocessed_Online_Retail.csv', '.ipynb_checkpoints', 'pre-processed-online-retail.ipynb', '.sparkmagic']


In [23]:
import pandas as pd

# Read the CSV file
file_path = 'Preprocessed_Online_Retail.csv'
data = pd.read_csv(file_path)

# Check the data types of all columns
print(data.dtypes)

InvoiceNo        int64
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID       int64
Country         object
TotalValue     float64
Year             int64
Month            int64
Day              int64
dtype: object


In [24]:
# Drop non-numeric columns (e.g., 'InvoiceNo')
numeric_data = data.select_dtypes(include=['number'])

# Extract the first row as the payload
row = numeric_data.iloc[0]
payload = ','.join(map(str, row.values))
print("Corrected Payload:", payload)

Corrected Payload: 536365.0,6.0,2.55,17850.0,15.3,2010.0,12.0,1.0


In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the original CSV
file_path = 'Preprocessed_Online_Retail.csv'
data = pd.read_csv(file_path)

# Transform categorical features (example using LabelEncoder)
label_encoder = LabelEncoder()

# Encode non-numeric features (example)
data['StockCode'] = label_encoder.fit_transform(data['StockCode'])
data['Description'] = label_encoder.fit_transform(data['Description'])
data['Country'] = label_encoder.fit_transform(data['Country'])

# Verify the transformed dataset
print("Transformed data types:\n", data.dtypes)

# Recheck feature count after preprocessing
numeric_data = data.select_dtypes(include=['number'])
print("Feature count after encoding:", len(numeric_data.columns))

# Prepare the payload with 11 features
row = numeric_data.iloc[0]  # First row for testing
payload = ','.join(map(str, row.values))
print("Corrected Payload:", payload)

Transformed data types:
 InvoiceNo        int64
StockCode        int64
Description      int64
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID       int64
Country          int64
TotalValue     float64
Year             int64
Month            int64
Day              int64
dtype: object
Feature count after encoding: 11
Corrected Payload: 536365.0,71.0,66.0,6.0,2.55,17850.0,1.0,15.3,2010.0,12.0,1.0


In [33]:
import boto3

# SageMaker runtime client
runtime = boto3.client('sagemaker-runtime')

# Endpoint name (replace with your endpoint name)
endpoint_name = 'xgboost-retail-endpoint'

# Payload with 11 features
payload = '536365.0,71.0,66.0,6.0,2.55,17850.0,1.0,15.3,2010.0,12.0,1.0'

# Invoke the endpoint
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=payload
)

# Decode and print the prediction result
prediction = response['Body'].read().decode('utf-8')
print("Prediction Result:", prediction)


Prediction Result: 536370.9375


In [34]:
# Clean up the endpoint
import boto3

sagemaker = boto3.client('sagemaker')
sagemaker.delete_endpoint(EndpointName='xgboost-retail-endpoint')
print("Endpoint deleted successfully.")

Endpoint deleted successfully.
