In [1]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
# using in built algorithms from sagemaker to learn about sagemaker

In [2]:
!pip install mxnet



In [3]:
# sagemaker libraries
import pandas as pd
import numpy as np
import io
import boto3
import sagemaker
from sagemaker import get_execution_role, Session
from sklearn.preprocessing import StandardScaler

In [4]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
bucket_name='sagemaker-us-east-1-729296914025'

In [5]:
# for clustering we also need additional scaling because of the distances
data_unscaled = pd.read_csv("starbucks_data/starbucks_imputed.csv")

In [6]:
data_unscaled

Unnamed: 0,time,value,gender,age,became_member_on,income,reward,offer_received,offer_viewed,offer_completed,...,last_time_offer_received,last_time_offer_completed,first_time_offer_received,first_time_offer_completed,mean_time_between_actions,amount_of_completed_user_paths,email,social,web,mobile
0,696,127.60,M,33.0,20170421,72000.000000,25.0,5,4,3,...,576,576,0,False,26.400000,0,12.0,8.0,10.0,12.0
1,654,79.46,O,40.0,20180109,57000.000000,39.0,5,5,3,...,504,576,0,False,36.333333,2,13.0,5.0,11.0,10.0
2,708,196.86,F,59.0,20160304,90000.000000,50.0,5,3,3,...,504,510,0,False,37.263158,1,11.0,11.0,8.0,11.0
3,672,154.05,F,24.0,20161111,60000.000000,39.0,4,4,3,...,504,600,0,False,27.391304,3,11.0,8.0,9.0,11.0
4,696,48.34,F,26.0,20170621,73000.000000,22.0,5,4,3,...,504,696,0,False,23.200000,0,12.0,8.0,10.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16989,666,20.87,M,50.0,20170928,50324.807692,34.0,5,5,0,...,576,0,0,False,37.000000,0,10.0,2.0,10.0,6.0
16990,708,6.45,M,55.0,20171125,52760.000000,70.0,4,4,0,...,504,0,0,False,70.800000,0,8.0,8.0,6.0,8.0
16991,654,31.33,M,47.0,20161203,46141.666667,36.0,4,3,1,...,576,576,0,False,24.782609,1,8.0,7.0,6.0,8.0
16992,708,9.97,M,54.0,20171027,53399.166667,34.0,5,5,0,...,504,0,0,False,54.461538,0,10.0,2.0,8.0,6.0


In [7]:
data_unscaled.gender = np.where(data_unscaled.gender == "M", 0, np.where(data_unscaled.gender=="F", 1, 2))

In [8]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_unscaled)

In [9]:
data_scaled_df = pd.DataFrame(columns=data_unscaled.columns, data=data_scaled)
data_scaled_df.to_csv("starbucks_data/starbucks_imputed_scaled.csv", index=False)

In [10]:
data = "starbucks_imputed_scaled.csv"
data_dir = "starbucks_data"
prefix = "kmeans"
prefix_cluster = 'customers_clustered'

In [11]:
sagemaker_session.upload_data(path=f"{data_dir}/{data}")

's3://sagemaker-us-east-1-729296914025/data/starbucks_imputed_scaled.csv'

In [12]:
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

customers_clustered/kmeans-2021-12-22-14-21-19-823/output/model.tar.gz
customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/framework/training_job_end.ts
customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/incremental/2021122214/1640183040.algo-1.json
customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/incremental/2021122214/1640183100.algo-1.json
customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/training_job_end.ts
customers_clustered/kmeans-2021-12-22-14-42-50-791/output/model.tar.gz
customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/framework/training_job_end.ts
customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/system/incremental/2021122214/1640184360.algo-1.json
customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/system/incremental/2021122214/1640184420.algo-1.json
customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/system/training_job_end

In [13]:
print(role)

arn:aws:iam::729296914025:role/service-role/AmazonSageMaker-ExecutionRole-20210801T161261


In [14]:
output_path_cluster='s3://{}/{}/'.format(bucket_name, prefix_cluster)

In [15]:
from sagemaker import KMeans
kmeans_customers = KMeans(role=role,
                         instance_count=1,
                         instance_type='ml.c4.xlarge',
                         output_path=output_path_cluster, # specified, above
                         k=2, 
                         epochs=20,
                         sagemaker_session=sagemaker_session)

In [16]:
s3_client = boto3.client('s3')
obj_list=s3_client.list_objects(Bucket=bucket_name)

# print object(s)in S3 bucket
files=[]
for contents in obj_list['Contents']:
    files.append(contents['Key'])
    
print(files)

['customers_clustered/kmeans-2021-12-22-14-21-19-823/output/model.tar.gz', 'customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/framework/training_job_end.ts', 'customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/incremental/2021122214/1640183040.algo-1.json', 'customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/incremental/2021122214/1640183100.algo-1.json', 'customers_clustered/kmeans-2021-12-22-14-21-19-823/profiler-output/system/training_job_end.ts', 'customers_clustered/kmeans-2021-12-22-14-42-50-791/output/model.tar.gz', 'customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/framework/training_job_end.ts', 'customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/system/incremental/2021122214/1640184360.algo-1.json', 'customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-output/system/incremental/2021122214/1640184420.algo-1.json', 'customers_clustered/kmeans-2021-12-22-14-42-50-791/profiler-o

In [17]:
customers = pd.read_csv('s3://sagemaker-us-east-1-729296914025/data/starbucks_imputed_scaled.csv', delimiter=",") 

In [18]:
customers

Unnamed: 0,time,value,gender,age,became_member_on,income,reward,offer_received,offer_viewed,offer_completed,...,last_time_offer_received,last_time_offer_completed,first_time_offer_received,first_time_offer_completed,mean_time_between_actions,amount_of_completed_user_paths,email,social,web,mobile
0,0.948956,0.183743,-0.758233,-1.290239,0.289966,0.404583,-0.842935,0.476725,0.465344,0.637901,...,0.464983,0.703987,-0.019422,0.0,-0.795001,-0.804229,0.697364,0.481531,0.632085,0.943784
1,0.176659,-0.198520,3.145252,-0.844766,1.119536,-0.312878,-0.207975,0.476725,1.236786,0.637901,...,-0.821974,0.703987,-0.019422,0.0,-0.146002,1.053214,1.023416,-0.468083,0.941412,0.303492
2,1.169612,0.733713,1.193510,0.364376,-0.576339,1.265537,0.290922,0.476725,-0.306098,0.637901,...,-0.821974,0.448149,-0.019422,0.0,-0.085252,0.124493,0.371312,1.431146,0.013433,0.623638
3,0.507643,0.393773,1.193510,-1.862990,-0.507236,-0.169386,-0.207975,-0.455228,0.465344,0.637901,...,-0.821974,0.797019,-0.019422,0.0,-0.730234,1.981936,0.371312,0.481531,0.322759,0.623638
4,0.948956,-0.445633,1.193510,-1.735712,0.307092,0.452414,-0.978997,0.476725,0.465344,0.637901,...,-0.821974,1.169147,-0.019422,0.0,-1.004075,-0.804229,0.697364,0.481531,0.632085,0.303492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16989,0.397315,-0.663763,-0.758233,-0.208375,0.333380,-0.632158,-0.434746,0.476725,1.236786,-1.230827,...,0.464983,-1.528780,-0.019422,0.0,-0.102445,-0.804229,0.045260,-1.417698,0.632085,-0.977091
16990,1.169612,-0.778268,-0.758233,0.109820,0.350249,-0.515681,1.198008,-0.455228,0.465344,-1.230827,...,-0.821974,-1.528780,-0.019422,0.0,2.105893,-0.804229,-0.606843,0.481531,-0.605219,-0.336799
16991,0.176659,-0.580704,-0.758233,-0.399292,-0.499359,-0.832241,-0.344038,-0.455228,-0.306098,-0.607918,...,0.464983,0.703987,-0.019422,0.0,-0.900674,0.124493,-0.606843,0.164993,-0.605219,-0.336799
16992,1.169612,-0.750316,-0.758233,0.046181,0.341857,-0.485109,-0.434746,0.476725,1.236786,-1.230827,...,-0.821974,-1.528780,-0.019422,0.0,1.038412,-0.804229,0.045260,-1.417698,0.013433,-0.977091


In [19]:
aws_conform = customers.values.astype("float32")
converted_data = kmeans_customers.record_set(aws_conform)

In [20]:
%%time
kmeans_customers.fit(converted_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-03-27 16:14:26 Starting - Starting the training job...ProfilerReport-1648397666: InProgress
...
2022-03-27 16:15:15 Starting - Preparing the instances for training......
2022-03-27 16:16:23 Downloading - Downloading input data......
2022-03-27 16:17:19 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/27/2022 16:17:25 INFO 140308594788160 integration.py:636] worker started[0m
[34m[03/27/2022 16:17:25 INFO 140308594788160] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true',

In [21]:
%%time

kmeans_predict = kmeans_customers.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium"
)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-----------!CPU times: user 196 ms, sys: 14.8 ms, total: 211 ms
Wall time: 5min 31s


In [22]:
cluster_info=kmeans_predict.predict(aws_conform)

In [23]:
cluster_info

[label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 4.169379234313965
     }
   }
 },
 label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 4.866910457611084
     }
   }
 },
 label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 3.183271884918213
     }
   }
 },
 label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 4.122851848602295
     }
   }
 },
 label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 0.0
     }
   }
 }
 label

In [24]:
cluster_labels = [cluster.label['closest_cluster'].float32_tensor.values[0] for cluster in cluster_info]
distance_values = [distance.label['distance_to_cluster'].float32_tensor.values[0] for distance in cluster_info]

In [25]:
distance_values_filled = [value if np.isnan(value) == False else 0 for value in distance_values ]

In [26]:
distance_values_filled

[4.169379234313965,
 4.866910457611084,
 3.183271884918213,
 4.122851848602295,
 5.289279937744141,
 4.579742431640625,
 4.119615077972412,
 3.6546714305877686,
 3.889218807220459,
 4.129252910614014,
 5.587949752807617,
 5.357957363128662,
 6.065849781036377,
 3.7690048217773438,
 5.154435634613037,
 4.1017537117004395,
 4.363648414611816,
 6.2440409660339355,
 3.3372390270233154,
 4.146577835083008,
 4.276836395263672,
 4.3949480056762695,
 5.29843807220459,
 3.7674293518066406,
 4.133883953094482,
 3.6795554161071777,
 3.8271210193634033,
 3.6707093715667725,
 4.352197170257568,
 4.046826362609863,
 2.787902355194092,
 4.995475769042969,
 3.9467415809631348,
 5.168069362640381,
 3.1303019523620605,
 4.072144031524658,
 4.862765789031982,
 3.236647367477417,
 3.8422188758850098,
 6.062986850738525,
 3.178926944732666,
 2.35575795173645,
 6.03234338760376,
 4.441544532775879,
 3.4444236755371094,
 3.765934705734253,
 5.109836578369141,
 4.088720798492432,
 4.296672821044922,
 4.512427

In [27]:
data_unscaled["cluster"] = list(map(int, cluster_labels))
data_unscaled["distance"] = list(map(int, distance_values_filled))

In [28]:
data_unscaled[data_unscaled.cluster == 0].amount_of_completed_user_paths.describe()

count    6847.000000
mean        1.698262
std         1.125658
min         0.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         6.000000
Name: amount_of_completed_user_paths, dtype: float64

In [29]:
data_unscaled[data_unscaled.cluster == 1].amount_of_completed_user_paths.describe()

count    10147.000000
mean         0.304326
std          0.551349
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          3.000000
Name: amount_of_completed_user_paths, dtype: float64

In [30]:
#two clusters seem good, but we want more

In [31]:
kmeans_customers_3 = KMeans(role=role,
                         instance_count=1,
                         instance_type='ml.c4.xlarge',
                         output_path=output_path_cluster, # specified, above
                         k=3, 
                         epochs=20,
                         sagemaker_session=sagemaker_session)

In [32]:
%%time
kmeans_customers_3.fit(converted_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-03-27 16:23:43 Starting - Starting the training job...
2022-03-27 16:24:00 Starting - Preparing the instances for trainingProfilerReport-1648398223: InProgress
.........
2022-03-27 16:25:44 Downloading - Downloading input data......
2022-03-27 16:26:44 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/27/2022 16:26:40 INFO 140024591497024 integration.py:636] worker started[0m
[34m[03/27/2022 16:26:40 INFO 140024591497024] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', 

In [33]:
%%time

kmeans_predict_3 = kmeans_customers_3.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium"
)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-----------------------!CPU times: user 329 ms, sys: 36.2 ms, total: 365 ms
Wall time: 11min 32s


In [34]:
cluster_info=kmeans_predict_3.predict(aws_conform)

cluster_labels = [cluster.label['closest_cluster'].float32_tensor.values[0] for cluster in cluster_info]
distance_values = [distance.label['distance_to_cluster'].float32_tensor.values[0] for distance in cluster_info]
distance_values_filled = [value if np.isnan(value) == False else 0 for value in distance_values ]
data_unscaled["cluster"] = list(map(int, cluster_labels))
data_unscaled["distance"] = list(map(int, distance_values_filled))

In [35]:
print(data_unscaled[data_unscaled.cluster == 0].amount_of_completed_user_paths.describe())
print(data_unscaled[data_unscaled.cluster == 1].amount_of_completed_user_paths.describe())
print(data_unscaled[data_unscaled.cluster == 2].amount_of_completed_user_paths.describe())

count    4883.000000
mean        1.937948
std         1.132026
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         6.000000
Name: amount_of_completed_user_paths, dtype: float64
count    5625.000000
mean        0.087822
std         0.284940
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         2.000000
Name: amount_of_completed_user_paths, dtype: float64
count    6486.000000
mean        0.733734
std         0.771421
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: amount_of_completed_user_paths, dtype: float64


In [36]:
#even with 3 clusters we do not get the desired results