# Performing Cluster Analysis with the built-in KMeans Algorithm

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes in the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
# Restore `unlabeled_normalized_df` from IPython's %store database.
# NOTE(review): nothing in this notebook creates the variable, so it must have
# been saved with `%store unlabeled_normalized_df` in an earlier notebook/session.
%store -r unlabeled_normalized_df

In [None]:
# Sanity-check that the restored data is present before using it for training.
unlabeled_normalized_df.head()

In [None]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session and the execution role that is passed to the estimator below.
session = sagemaker.Session()
role = get_execution_role()
# NOTE(review): `bucket` is not referenced by any later cell visible in this notebook.
bucket = session.default_bucket()

In [None]:
from sagemaker import KMeans

# Configure the built-in KMeans estimator: a single ml.c4.xlarge training
# instance, running under the execution role obtained above.
# `k=2` is the KMeans `k` hyperparameter (presumably the number of clusters to fit).
estimator = KMeans(
    role=role,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    k=2)

In [None]:
# Convert the DataFrame to a float32 NumPy array; the same array is reused
# later as the prediction input.
data_np = unlabeled_normalized_df.values.astype('float32')

# Build the training record set from the `estimator` created in the previous
# cell. (No `kmeans` name is defined anywhere in this notebook, so calling
# `kmeans.record_set(...)` fails with NameError on a fresh kernel.)
record_set = estimator.record_set(data_np)

In [None]:
# Train the KMeans estimator on the record set built in the previous cell.
estimator.fit(record_set)

In [None]:
# Deploy the trained model on a single ml.t2.medium instance; the returned
# `predictor` is what the following cells use to obtain cluster assignments.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium')

In [None]:
# Run the same float32 array used for training through the deployed predictor.
# Each element of `results` is later unpacked by `extract_values`.
results = predictor.predict(data_np)
results

In [None]:
def extract_values(item):
    """Unpack one prediction record into a plain dictionary.

    Args:
        item: A prediction record exposing a ``label`` mapping whose
            ``'closest_cluster'`` and ``'distance_to_cluster'`` entries each
            carry a ``float32_tensor.values`` sequence.

    Returns:
        dict: ``{"closest_cluster": int, "distance_to_cluster": <raw value>}``,
        built from the first element of each tensor's ``values``.
    """
    labels = item.label

    def first_value(key):
        # Only the first tensor element is read for each label.
        return labels[key].float32_tensor.values[0]

    return {
        # The cluster id arrives as a float tensor value; cast it to int.
        "closest_cluster": int(first_value("closest_cluster")),
        "distance_to_cluster": first_value("distance_to_cluster"),
    }

In [None]:
# Try the helper on the first prediction to check the unpacked structure.
extract_values(results[0])

In [None]:
# Unpack every prediction once, then split the dictionaries into two parallel
# lists (same order as `results`).
unpacked_results = [extract_values(prediction) for prediction in results]

closest_cluster_list = [entry["closest_cluster"] for entry in unpacked_results]
distance_to_cluster_list = [entry["distance_to_cluster"] for entry in unpacked_results]

In [None]:
# Show the cluster assignments of the first six rows.
closest_cluster_list[:6]

In [None]:
# Attach both prediction columns in a single `assign`, which returns a new
# frame and leaves `unlabeled_normalized_df` unchanged.
df = unlabeled_normalized_df.assign(
    closest_cluster=closest_cluster_list,
    distance_to_cluster=distance_to_cluster_list,
)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Plot one marker-only series per cluster so that each cluster gets its own
# colour and its own legend entry.
groups = df.groupby("closest_cluster")

fig, ax = plt.subplots()
for cluster_id, members in groups:
    ax.plot(members["x"], members["y"], marker="o", linestyle="", label=cluster_id)

ax.legend()