In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from util import k_means_plot

In [None]:
# Build a synthetic 2-D data set: 5000 points drawn around four fixed centers.
# The NumPy global seed is fixed first so the draw is repeatable.
np.random.seed(0)
blob_centers = [[4, 4], [-2, -1], [2, -3], [1, 1]]
Feature_Matrix, Response_Vector = make_blobs(
    n_samples=5000,
    centers=blob_centers,
    cluster_std=0.9,
)

# Quick look at the raw point cloud before clustering.
plt.scatter(Feature_Matrix[:, 0], Feature_Matrix[:, 1], marker='.')
plt.show()

### Setting up K-Means

In [None]:
# n_clusters: the number of clusters to form (and of centroids to generate).
# init:       "k-means++" centroid initialisation.
# n_init:     number of times the algorithm is run with different centroid seeds.
k_means = KMeans(
    n_clusters=4,
    init="k-means++",
    n_init=12,
)
k_means.fit(Feature_Matrix)

In [None]:
# Cluster assignment produced by the fitted model, one entry per sample.
labels = k_means.labels_
# Distinct cluster ids that actually occur in the assignment.
np.unique(labels)

In [None]:
# Centroid coordinates found by the fitted model, one row per cluster.
cluster_centers = k_means.cluster_centers_
print(cluster_centers)
# Number of centroids (rows of the array).
cluster_centers.shape[0]

In [None]:
# Plot the clustering result with the project helper from util.
# NOTE(review): presumably draws the samples coloured by label together with
# the centroids — confirm against util.k_means_plot.
k_means_plot(labels, cluster_centers, Feature_Matrix)

### Customer Segmentation 

In [None]:
import pandas as pd
# Load the customer segmentation data set from the local data directory.
df = pd.read_csv("data/cust_segmentation.csv")
# Show the dtype of every column to spot non-numeric ones.
df.dtypes

In [None]:
# K-means doesn't handle categorical data, so drop the 'Address' column.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns='Address', errors='ignore')
df.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns: everything after the first column of df, except the
# 'Clus_km' label column that a later cell writes back into df. Without this
# exclusion, re-running this cell after that one would feed the previous
# cluster labels into the clustering as an extra feature.
feature_columns = [col for col in df.columns[1:] if col != "Clus_km"]
X = df[feature_columns].to_numpy()
X = np.nan_to_num(X)  # Replace NaN values with 0

# Standardize every feature so no single column dominates the distances.
Clus_dataSet = StandardScaler().fit_transform(X)

# Set up and fit K-means on the standardized data.
clusterNum = 3
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(Clus_dataSet)

# Cluster assignment for every row of the data set.
labels = k_means.labels_
print(labels)

### Insights

In [None]:
# Store each row's cluster assignment next to its features.
df["Clus_km"] = labels
# Preview the first five rows (head() defaults to 5).
df.head()

In [None]:
# Per-cluster mean of every column: a view of the centroids in the
# original (unscaled) units of the data.
df.groupby(by='Clus_km').mean()

In [None]:
# Bubble chart of the raw (unscaled) features, coloured by cluster.
# All three visual channels now read from X: the x axis previously used the
# standardized Clus_dataSet column while y and marker size used raw values,
# so the 'Age' axis was not in the same units as the rest of the plot.
# NOTE(review): column positions assume X is ordered
# [Age, Edu, Years Employed, Income, ...] — confirm against df.dtypes.
area = np.pi * (X[:, 1]) ** 2  # marker size grows with the square of column 1
plt.scatter(X[:, 0], X[:, 3], s=area, c=labels.astype(float), alpha=0.5)
plt.xlabel('Age', fontsize=18)
plt.ylabel('Income', fontsize=16)

plt.show()

In [None]:
# Scatter two columns of the standardized data, coloured by K-means label,
# and overlay the cluster centroids on the same axes.
x_col, y_col = 0, 3  # column indices of Clus_dataSet shown on the x / y axis
x_feature = Clus_dataSet[:, x_col]
y_feature = Clus_dataSet[:, y_col]

# Create a scatter plot with colors based on cluster labels
plt.figure(figsize=(8, 6))
plt.scatter(x_feature, y_feature, c=labels, cmap='viridis', marker='o', s=50, alpha=0.7)

# Add cluster centroids.
# k_means was fitted on Clus_dataSet, so its centers are already in the
# standardized coordinate system of the points; they must not be rescaled
# again, and they must be read from the same two columns as the data.
centroids = k_means.cluster_centers_
plt.scatter(centroids[:, x_col], centroids[:, y_col], c='red', marker='x', s=200, label='Centroids')

# Add plot details
plt.title("K-Means Clustering Results")
plt.xlabel(f"Standardized feature (column {x_col})")
plt.ylabel(f"Standardized feature (column {y_col})")
plt.legend()
plt.grid()
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Reduce data to 2 dimensions for visualization
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(Clus_dataSet)

# Project the K-means centers with the same fitted PCA so they are drawn in
# the coordinate system of the reduced points. The centers are taken straight
# from the model, which was fitted on Clus_dataSet.
centroids_pca = pca.transform(k_means.cluster_centers_)

# Visualize the clusters in reduced 2D space
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='viridis', marker='o', s=50, alpha=0.7)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c='red', marker='x', s=200, label='Centroids')
plt.title("K-Means Clustering Results (PCA Reduced)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid()