In [1]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd
from io import BytesIO

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
bucket_name = "sagemaker-studio-619071335465-gekmibeales"

In [3]:
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
sklearn_processor_version="0.23-1"
model_package_group_name="ChurnModelPackageGroup"
pipeline_name= "ChurnModelSMPipeline"
clarify_image = sagemaker.image_uris.retrieve(framework='sklearn',version=sklearn_processor_version,region=region)

In [4]:
s3 = boto3.client('s3')

file_key = 'storedata_total.csv'

s3_object = s3.get_object(Bucket=bucket_name, Key=file_key)
file_content = s3_object['Body'].read()

df = pd.read_csv(BytesIO(file_content))
df.head()

Unnamed: 0,custid,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city
0,6H6T6N,0,9/28/2012,8/11/2013,8/11/2013,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL
1,APCENR,1,12/19/2010,4/1/2011,1/19/2014,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL
2,7UP6MS,0,10/3/2010,12/1/2010,7/6/2011,0,0.0,0.0,33.58,0.059908,0,0,0,Wednesday,DEL
3,7ZEW8G,0,10/22/2010,3/28/2011,3/28/2011,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM
4,8V726M,1,11/27/2010,11/29/2010,1/28/2013,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM


In [5]:
df = pd.DataFrame(df)
df.to_csv('data/storedata_total.csv', index=False)

In [6]:
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    ## Convert to datetime columns
    df["firstorder"]=pd.to_datetime(df["firstorder"],errors='coerce')
    df["lastorder"] = pd.to_datetime(df["lastorder"],errors='coerce')
    ## Drop Rows with null values
    df = df.dropna()
    ## Create Column which gives the days between the last order and the first order
    df["first_last_days_diff"] = (df['lastorder']-df['firstorder']).dt.days
    ## Create Column which gives the days between when the customer record was created and the first order
    df['created'] = pd.to_datetime(df['created'])
    df['created_first_days_diff']=(df['created']-df['firstorder']).dt.days
    ## Drop Columns
    df.drop(['custid','created','firstorder','lastorder'],axis=1,inplace=True)
    ## Apply one hot encoding on favday and city columns
    df = pd.get_dummies(df,prefix=['favday','city'],columns=['favday','city'])
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)
    return df

In [7]:
baseline_data = preprocess_data("data/storedata_total.csv")
baseline_data.pop("retained")
baseline_sample = baseline_data.sample(frac=0.0002)

pd.DataFrame(baseline_sample).to_csv("data/baseline.csv",header=False,index=False)

In [8]:
batch_data = preprocess_data("data/storedata_total.csv")
batch_data.pop("retained")
batch_sample = batch_data.sample(frac=0.2)

pd.DataFrame(batch_sample).to_csv("data/batch.csv",header=False,index=False)

In [9]:
s3_client = boto3.resource('s3')
s3_client.Bucket(bucket_name).upload_file("data/storedata_total.csv","data/storedata_total.csv")
s3_client.Bucket(bucket_name).upload_file("data/batch.csv","data/batch/batch.csv")
s3_client.Bucket(bucket_name).upload_file("data/baseline.csv","input/baseline/baseline.csv")

In [10]:
s3_client.Bucket(bucket_name).upload_file("pipelines/customerchurn/Preprocess.py","input/code/Preprocess.py")
s3_client.Bucket(bucket_name).upload_file("pipelines/customerchurn/Evaluate.py","input/code/Evaluate.py")
s3_client.Bucket(bucket_name).upload_file("pipelines/customerchurn/Generate_config.py","input/code/Generate_config.py")

In [11]:
from pipelines.customerchurn.Pipeline import get_pipeline

pipeline = get_pipeline(
    region = region,
    role=role,
    default_bucket=bucket_name,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
    custom_image_uri=clarify_image,
    sklearn_processor_version=sklearn_processor_version
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [12]:
pipeline.definition()



'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-619071335465-gekmibeales/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-619071335465-gekmibeales/data/batch/batch.csv"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": {"Get": "Parameters.ProcessingInstanceType"}, "InstanceCount": {"Get": "Parameters.ProcessingInstanceCount"}, "VolumeSizeInGB": 30}}, "AppSpecification": {"ImageUri

In [13]:
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-2:619071335465:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': 'c16b2ab7-4ec6-414b-ba1f-aaf92b6ada0e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c16b2ab7-4ec6-414b-ba1f-aaf92b6ada0e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Sat, 31 Aug 2024 14:25:04 GMT'},
  'RetryAttempts': 0}}

In [14]:
execution = pipeline.start()

In [15]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:619071335465:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:619071335465:pipeline/ChurnModelSMPipeline/execution/lhlcfp7xmjw5',
 'PipelineExecutionDisplayName': 'execution-1725114305621',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 8, 31, 14, 25, 5, 559000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 8, 31, 14, 25, 5, 559000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-2:619071335465:user-profile/d-nhxkawwhz6jg/default-20240813T223656',
  'UserProfileName': 'default-20240813T223656',
  'DomainId': 'd-nhxkawwhz6jg',
  'IamIdentity': {'Arn': 'arn:aws:sts::619071335465:assumed-role/AmazonSageMaker-ExecutionRole-20240813T223657/SageMaker',
   'PrincipalId': 'AROAZAI4G5QUY2D2EWVUE:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-2:619071335465:user-profile/d-nhxkawwhz6jg/default-20240813

In [29]:
execution.list_steps()

[{'StepName': 'ClarifyProcessingStep',
  'StartTime': datetime.datetime(2024, 8, 31, 14, 35, 21, 367000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 31, 14, 52, 55, 787000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:619071335465:processing-job/pipelines-lhlcfp7xmjw5-ClarifyProcessingSte-UED8WHhqA5'}},
  'AttemptCount': 1},
 {'StepName': 'ChurnModelConfigFile',
  'StartTime': datetime.datetime(2024, 8, 31, 14, 32, 48, 225000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 31, 14, 35, 20, 393000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-2:619071335465:processing-job/pipelines-lhlcfp7xmjw5-ChurnModelConfigFile-WLBUbZ46Dv'}},
  'AttemptCount': 1},
 {'StepName': 'ChurnTransform',
  'StartTime': datetime.datetime(2024, 8, 31, 14, 32, 48, 225000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 31, 14, 37, 27, 

In [30]:
sagemaker_client = boto3.client('sagemaker')

tuning_job_name = '0g26hwmpigll-ChurnHyp-VmPm5Wz2y5'

tuning_job_result = sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
best_training_job = tuning_job_result['BestTrainingJob']['TrainingJobName']

print(f"The best training job is: {best_training_job}")

The best training job is: 0g26hwmpigll-ChurnHyp-VmPm5Wz2y5-002-4dea62f1


In [35]:
model_artifact = f"s3://{bucket_name}/output/{best_training_job}/output/model.tar.gz"

print(model_artifact)

s3://sagemaker-studio-619071335465-gekmibeales/output/0g26hwmpigll-ChurnHyp-VmPm5Wz2y5-002-4dea62f1/output/model.tar.gz


In [37]:
image_uri_xgboost = sagemaker.image_uris.retrieve(framework='xgboost',version="1.0-1",region=region)

print(image_uri_xgboost)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


In [46]:
model = sagemaker.model.Model(
    model_data=model_artifact,
    role=role,
    image_uri=image_uri_xgboost,
    sagemaker_session=sagemaker.Session()
)


predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name="churn-prediction-2024-08-31-02"

)

endpoint_name = "churn-prediction-2024-08-31-02"
endpoint_description = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

print("Endpoint Status:", endpoint_description['EndpointStatus'])
print("Endpoint Config Name:", endpoint_description['EndpointConfigName'])

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-08-31-21-49-40-438


INFO:sagemaker:Creating endpoint-config with name churn-prediction-2024-08-31-02
INFO:sagemaker:Creating endpoint with name churn-prediction-2024-08-31-02


-----!Endpoint Status: InService
Endpoint Config Name: churn-prediction-2024-08-31-02


In [56]:
from io import StringIO

input_data = pd.read_csv('data/baseline.csv', header=None)

csv_buffer = StringIO()
input_data.to_csv(csv_buffer, header=False, index=False)
csv_buffer.seek(0)


client = boto3.client('sagemaker-runtime', region_name=region)

response = client.invoke_endpoint(
    EndpointName='churn-prediction-2024-08-31-02',
    Body=csv_buffer.getvalue(), 
    ContentType='text/csv'
)

result = response['Body'].read().decode('utf-8')
print(result)


0.9733109474182129,0.9569705724716187,0.0025336432736366987,0.9771817326545715,0.0012906994670629501,0.783551037311554
