## Step 1: Import Packages and Declare Constants

In [10]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [11]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

In [12]:
# Replace this value with the name of the S3 bucket created for this project.
# The data/script uploads and the get_pipeline() call below all use this bucket.
default_bucket = "amazon-sagemaker-438465168169-us-east-1-e9eb0fb68840"

In [13]:
# Region, execution role and SageMaker session of the current environment.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# Names and versions handed to get_pipeline() further down.
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# Scikit-learn image URI for the version/region above; it is passed to the
# pipeline as its custom image URI.
clarify_image = sagemaker.image_uris.retrieve(
    framework='sklearn',
    version=sklearn_processor_version,
    region=region,
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


## Step 2: Generate Baseline Dataset

The baseline data is used by the SageMaker Clarify step to generate SHAP values.

In [14]:
def preprocess_data(file_path):
    """Load the store dataset from an Excel workbook and build the feature table.

    Steps, in order:
      1. Parse ``firstorder`` and ``lastorder`` as datetimes; values that cannot
         be parsed become NaT.
      2. Drop every row that contains a null (including the NaT values from 1).
      3. Add ``first_last_days_diff`` (lastorder - firstorder, in days) and
         ``created_first_days_diff`` (created - firstorder, in days).
      4. Drop ``custid``, ``created``, ``firstorder`` and ``lastorder``.
      5. One-hot encode ``favday`` and ``city`` as 0/1 integer columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Excel file, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns, followed by the two day-difference
        columns, followed by the ``favday_*`` and ``city_*`` indicator columns.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the workbook.
    """
    df = pd.read_excel(file_path)
    ## Convert to datetime columns (unparseable values are coerced to NaT)
    df["firstorder"] = pd.to_datetime(df["firstorder"], errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"], errors='coerce')
    ## Drop rows with null values
    df = df.dropna()
    ## 'created' is parsed only for the surviving rows and only used for the difference below
    created = pd.to_datetime(df['created'])
    df = df.assign(
        ## Days between the last order and the first order
        first_last_days_diff=(df['lastorder'] - df['firstorder']).dt.days,
        ## Days between when the customer record was created and the first order
        created_first_days_diff=(created - df['firstorder']).dt.days,
    )
    ## Drop identifier and raw date columns (returns a new frame instead of mutating in place)
    df = df.drop(columns=['custid', 'created', 'firstorder', 'lastorder'])
    ## Apply one hot encoding on favday and city columns.
    ## dtype is pinned to uint8 (the pre-2.0 pandas default): pandas >= 2.0 emits
    ## bool columns, which to_csv would write as "True"/"False" instead of 1/0.
    df = pd.get_dummies(
        df,
        prefix=['favday', 'city'],
        columns=['favday', 'city'],
        dtype="uint8",
    )
    return df

In [16]:
# Features only: the "retained" column is removed before sampling.
baseline_data = preprocess_data("storedata_total.xlsx").drop(columns=["retained"])
# A fixed random_state keeps the small baseline sample reproducible; without it
# every run of this cell writes a different baseline.csv.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

In [19]:
# Write the baseline rows as a bare CSV: no header row, no index column.
baseline_sample.to_csv("baseline.csv", header=False, index=False)

## Step 3: Generate Batch Dataset

In [20]:
# Features only: the "retained" column is removed before sampling.
batch_data = preprocess_data("storedata_total.xlsx").drop(columns=["retained"])
# A fixed random_state keeps the 20% batch sample reproducible; without it
# every run of this cell writes a different batch.csv.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [21]:
# Write the batch rows as a bare CSV: no header row, no index column.
batch_sample.to_csv("batch.csv", header=False, index=False)

## Step 4: Copy Data and Scripts to S3 Bucket

In [24]:
# s3_client is a boto3 S3 *resource*; the next cell reuses it for the scripts.
s3_client = boto3.resource('s3')

# (local file, destination key) pairs, uploaded in this order.
# NOTE(review): the pipeline definition printed further down shows an InputData
# default of s3://<bucket>/data/storedata_total.csv, which does not match the
# first key below (different prefix and extension) - confirm which is intended.
data_uploads = [
    ("storedata_total.xlsx", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/data/storedata_total.xlsx"),
    ("batch.csv", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/data/batch/batch.csv"),
    ("baseline.csv", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/input/baseline/baseline.csv"),
]
for local_file, object_key in data_uploads:
    s3_client.Bucket(default_bucket).upload_file(local_file, object_key)

In [25]:
# Pipeline scripts and the keys they are copied to, uploaded in this order.
script_uploads = [
    ("pipelines/preprocess.py", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/input/code/preprocess.py"),
    ("pipelines/evaluate.py", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/input/code/evaluate.py"),
    ("pipelines/generate_config.py", "dzd_4gqe4c7fmeaa5j/dey1k5789rqn1j/dev/input/code/generate_config.py"),
]
for script_path, script_key in script_uploads:
    s3_client.Bucket(default_bucket).upload_file(script_path, script_key)

## Step 5: Get the Pipeline Instance

In [29]:
from pipelines.pipeline import get_pipeline

# Gather the settings defined in the earlier cells, then build the pipeline object.
pipeline_args = {
    "region": region,
    "role": role,
    "default_bucket": default_bucket,
    "model_package_group_name": model_package_group_name,
    "pipeline_name": pipeline_name,
    "custom_image_uri": clarify_image,
    "sklearn_processor_version": sklearn_processor_version,
}
pipeline = get_pipeline(**pipeline_args)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds
sagemaker.config INFO - Applied value from config key = SageMaker.Model.VpcConfig
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Ap

In [30]:
# Display the pipeline definition (a JSON string) to inspect its parameters and steps.
pipeline.definition()

'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-438465168169-us-east-1-e9eb0fb68840/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-438465168169-us-east-1-e9eb0fb68840/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpec

## Step 6: Submit the pipeline to SageMaker and start execution

In [31]:
# Submit the pipeline definition to SageMaker using the execution role;
# the response includes the PipelineArn.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:438465168169:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': '60ef11db-0d77-465c-ae2f-951a59c3c455',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '60ef11db-0d77-465c-ae2f-951a59c3c455',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Thu, 13 Feb 2025 17:51:39 GMT'},
  'RetryAttempts': 0}}

Start Pipeline Execution

In [32]:
# Start a run of the pipeline and keep the handle for the status checks below.
execution = pipeline.start()

Next, describe the execution and list its steps to see its status and details.

In [33]:
# Show the execution's status and metadata (e.g. PipelineExecutionStatus, FailureReason).
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:438465168169:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:438465168169:pipeline/ChurnModelSMPipeline/execution/ktdv0b5br8wa',
 'PipelineExecutionDisplayName': 'execution-1739469107640',
 'PipelineExecutionStatus': 'Failed',
 'PipelineExperimentConfig': {'ExperimentName': 'churnmodelsmpipeline',
  'TrialName': 'ktdv0b5br8wa'},
 'FailureReason': 'Step failure: One or multiple steps failed.',
 'CreationTime': datetime.datetime(2025, 2, 13, 17, 51, 47, 588000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 2, 13, 17, 51, 50, 316000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:438465168169:user-profile/d-69qrstv0krfc/e37b481b-6b4c-4419-848a-2a0a5995e4a1',
  'UserProfileName': 'e37b481b-6b4c-4419-848a-2a0a5995e4a1',
  'DomainId': 'd-69qrstv0krfc',
  'IamIdentity': {'Arn': 'arn:aws:sts::438465168169:assumed-role/datazone_usr_role_dey1k5789rqn1j_6lz335u9ne

We can list the execution steps to check out the status and artifacts:

In [34]:
# Per-step status; a failed step carries its own FailureReason.
execution.list_steps()

[{'StepName': 'ChurnModelProcess',
  'StartTime': datetime.datetime(2025, 2, 13, 17, 51, 48, 311000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2025, 2, 13, 17, 51, 49, 985000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': "ClientError: Failed to invoke sagemaker:CreateProcessingJob. Error Details: The account-level service limit 'ml.m5.xlarge for processing job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.\nRetry not appropriate on execution of step with PipelineExecutionArn arn:aws:sagemaker:us-east-1:438465168169:pipeline/churnmodelsmpipeline/execution/ktdv0b5br8wa and StepId ChurnModelProcess. No retry policy configured for the exception type SAGEMAKER_RESOURCE_LIMIT.",
  'Metadata': {},
  'AttemptCount': 1}]