In [1]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install s3fs

Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import pandas as pd
import numpy as np

# Preprocess the dataset
#
# Reads the raw customer export and builds the model-ready frame `storedata`:
# the date columns become day-difference features, the identifier and raw date
# columns are dropped, and the categorical columns are one-hot encoded.

df = pd.read_excel("storedata_total.xlsx")

# Parse every date column up front. errors="coerce" turns unparseable values
# into NaT, so the dropna() below removes those rows for all three columns
# ("created" used to be parsed after dropna() and without coercion).
for date_col in ("firstorder", "lastorder", "created"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

# Drop rows with any missing value
df = df.dropna()

df = df.assign(
    # Days between the first and the last order
    first_last_days_diff=(df["lastorder"] - df["firstorder"]).dt.days,
    # Days between the first order and the creation of the customer record
    created_first_days_diff=(df["created"] - df["firstorder"]).dt.days,
)

# The identifier and the raw dates are not used as features
df = df.drop(columns=["custid", "created", "firstorder", "lastorder"])

# One-hot encode favday and city. dtype=int keeps the indicator columns as 0/1
# on every pandas version; pandas 2.x defaults to bool, which would reach the
# training CSV as True/False.
df = pd.get_dummies(
    df, prefix=["favday", "city"], columns=["favday", "city"], dtype=int
)

storedata = df



In [4]:
def split_datasets(df, seed=None):
    """Shuffle the rows of ``df`` and split them 70/15/15.

    The "retained" label is placed in the first column of every returned
    array, followed by the feature columns in their original order.

    Args:
        df: DataFrame with a "retained" label column plus feature columns.
            The frame is left unmodified, so the function can be called
            repeatedly on the same object.
        seed: Optional seed for a reproducible shuffle. When ``None``
            (default) the global NumPy random state is used.

    Returns:
        Tuple ``(feature_names, train, validation, test)`` where
        ``feature_names`` is the list of feature column names and the other
        three are 2-D NumPy arrays holding the first 70%, the next 15% and
        the remaining rows of the shuffled data.

    Raises:
        KeyError: if ``df`` has no "retained" column.
    """
    # Read the label without popping it: pop() would strip the column from
    # the caller's frame and make a second call fail with KeyError.
    y = df["retained"]
    X_pre = df.drop(columns=["retained"])
    feature_names = list(X_pre.columns)

    # Label first, features after, as one 2-D array
    y_pre = y.to_numpy().reshape(len(y), 1)
    X = np.concatenate((y_pre, X_pre.to_numpy()), axis=1)

    # Shuffle rows in place (axis 0)
    if seed is None:
        np.random.shuffle(X)
    else:
        np.random.default_rng(seed).shuffle(X)

    train, validation, test = np.split(X, [int(.7*len(X)), int(.85*len(X))])
    return feature_names, train, validation, test


import boto3
from sagemaker import get_execution_role

# Split dataset
# Pass a copy so `storedata` keeps its label column and this cell can be re-run.
feature_names, train, validation, test = split_datasets(storedata.copy())


# Save datasets in Amazon S3
default_bucket = "amazon-sagemaker-438465168169-us-east-1-e9eb0fb68840"
# Key prefix under which every dataset split is stored
s3_prefix = "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev"

# Execution role of this notebook; the training cell passes it to the estimator
role = get_execution_role()
print(role)

# Each split is written to s3://<bucket>/<prefix>/<name>/<name>.csv with no
# header row and no index column.
for split_name, split_rows in (
    ("train", train),
    ("validation", validation),
    ("test", test),
):
    pd.DataFrame(split_rows).to_csv(
        f"s3://{default_bucket}/{s3_prefix}/{split_name}/{split_name}.csv",
        header=False,
        index=False,
    )

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
arn:aws:iam::438465168169:role/datazone_usr_role_dey1k5789rqn1j_6lz335u9nebcx3


In [5]:
from pprint import pprint

import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Common S3 location of the dataset splits and of the training output
s3_base_uri = f"s3://{default_bucket}/dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev"

# Training and Validation Input for SageMaker Training job
s3_input_train = TrainingInput(
    s3_data=f"{s3_base_uri}/train/", content_type="csv")
s3_input_validation = TrainingInput(
    s3_data=f"{s3_base_uri}/validation/", content_type="csv")

# Hyperparameters held constant across all tuning jobs
# NOTE(review): rate_drop looks like a DART-booster parameter and
# tweedie_variance_power like a tweedie-objective parameter; presumably neither
# affects "binary:logistic" with the default booster - confirm against the
# XGBoost parameter docs before removing them.
fixed_hyperparameters = {
    "eval_metric":"auc",
    "objective":"binary:logistic",
    "num_round":"100",
    "rate_drop":"0.3",
    "tweedie_variance_power":"1.4"
}

# Use the built-in SageMaker algorithm

sess = sagemaker.Session()
# Resolve the XGBoost image for the region the session (and so the training
# jobs) run in, rather than a hard-coded region.
container = sagemaker.image_uris.retrieve("xgboost", sess.boto_region_name, "1.2-2")

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    hyperparameters=fixed_hyperparameters,
    instance_type="ml.m5.xlarge",
    output_path=f"{s3_base_uri}/output",
    sagemaker_session=sess
)

# Search space explored by the tuner
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}
objective_metric_name = "validation:auc"
tuner = HyperparameterTuner(
    estimator, objective_metric_name,
    hyperparameter_ranges, max_jobs=10, max_parallel_jobs=2)

# Tune
tuner.fit({
    "train":s3_input_train,
    "validation":s3_input_validation
    },include_cls_metadata=False)

## Explore the best model generated
# Query through the session's own SageMaker client so the lookup uses the same
# region and credentials as the tuning job itself.
tuning_job_result = sess.sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print("%d training jobs have completed" %job_count)

## Get the best training job

if tuning_job_result.get("BestTrainingJob",None):
    print("Best Model found so far:")
    pprint(tuning_job_result["BestTrainingJob"])
else:
    print("No training jobs have reported results yet.")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.SecurityGroupIds
.....................................................................!
10 training jobs have completed
Best Model found so far:
{'CreationTime': datetime.datetime(2025, 2, 13, 3, 28, 10, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc',
                                                 'Value': 0.97816002368927},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2025, 2, 13, 3, 28, 48, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:438465168169:training-job/sagemaker-xgboost-2502