# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Step 1: Load the dataset
# The rest of this script reads the columns 'Total_Purchases',
# 'Frequency_of_Purchases' and 'Avg_Spend_Per_Transaction' from this file.
# Replace the path below with your actual dataset file.
df = pd.read_csv('customer_purchase_history.csv')

# Step 2: Explore the dataset (optional)
print(df.head())  # Show the first few rows
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()  # Column dtypes and non-null counts (reveals missing values)

# Step 3: Preprocess the data
# Columns used as clustering features.
feature_columns = ['Total_Purchases', 'Frequency_of_Purchases', 'Avg_Spend_Per_Transaction']

# Handle missing values (if any).
# Rows with a missing feature are removed from df itself, not only from X:
# a later step stores one cluster label per row of X back onto df, and that
# assignment raises a length-mismatch ValueError if X has fewer rows than df.
# When no feature value is missing this keeps every row, exactly as before.
# (Alternatively, impute instead of dropping, e.g. with fillna().)
df = df.dropna(subset=feature_columns)
X = df[feature_columns]

# Normalize the data using StandardScaler (important for K-Means, which is
# distance-based and would otherwise be dominated by large-valued features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Determine the optimal number of clusters (K) using the Elbow Method
range_k = range(1, 11)  # Candidate cluster counts: 1 through 10
inertia = []  # Sum of squared distances recorded for each candidate K

for k in range_k:
    # fit() returns the fitted estimator, so construction and fitting are chained
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Curve to choose the optimal K
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range_k, inertia, marker='o')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
plt.show()

# Read the plot: the K at which the curve starts to flatten (the "elbow") is
# the value to carry into the next step.

# Step 5: Apply K-Means with the optimal number of clusters (say K=3)
optimal_k = 3  # For example, we assume K=3 based on the elbow method
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)  # One label per row of X_scaled

# X can have fewer rows than df (dropna() removes rows with missing features),
# so the labels are attached through X's index instead of by position. A plain
# positional assignment raises a length-mismatch ValueError in that case.
# Rows of df that are absent from X receive NaN in 'Cluster'.
df['Cluster'] = pd.Series(cluster_labels, index=X.index)

# Step 6: Evaluate the clustering using Silhouette Score
# Score against the label array itself: it is guaranteed to match X_scaled
# row-for-row, whereas df['Cluster'] may contain extra NaN rows.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')

# Step 7: Analyze the clusters
# The fitted centers live in the scaled feature space; undo the scaling so the
# centroid values can be read in the original units of each feature.
scaled_centers = kmeans.cluster_centers_
centroids = scaler.inverse_transform(scaled_centers)
centroids_df = pd.DataFrame(data=centroids, columns=X.columns)
print(f"Centroids of the clusters:\n{centroids_df}")

# Step 8: Visualize the clusters (if 2 or 3 features are used for simplicity)
# Two of the features are plotted against each other; color encodes the cluster.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    df['Total_Purchases'],
    df['Frequency_of_Purchases'],
    c=df['Cluster'],
    cmap='viridis',
)
ax.set_title('Customer Segments Based on Purchase History')
ax.set_xlabel('Total Purchases')
ax.set_ylabel('Frequency of Purchases')
fig.colorbar(points, label='Cluster')  # Colorbar drawn for the scatter's color mapping
plt.show()

# Optional: Preview the first rows of the feature columns next to the assigned cluster
preview_columns = ['Total_Purchases', 'Frequency_of_Purchases', 'Avg_Spend_Per_Transaction', 'Cluster']
preview = df[preview_columns].head()
print(preview)
