In [None]:
%pip install numpy pandas scikit-learn matplotlib seaborn

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# --- Load the raw inputs ------------------------------------------------------
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Left join from the transaction side: every transaction row is kept and the
# matching customer profile columns are attached through CustomerID.
data = pd.merge(transactions, customers, how="left", on="CustomerID")

# --- Build one feature row per customer ---------------------------------------
# Total spend, total units, number of transactions, and the first Region value
# encountered for each CustomerID.
clustering_data = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Quantity=("Quantity", "sum"),
        TransactionID=("TransactionID", "count"),
        Region=("Region", "first"),
    )
    .reset_index()
)

# Expand Region into indicator columns; drop_first leaves out the first level
# so the indicators are not perfectly collinear.
clustering_data = pd.get_dummies(clustering_data, columns=["Region"], drop_first=True)

# --- Standardize the numeric features -----------------------------------------
# Only the three aggregate columns are scaled; the Region indicators are left
# as produced by get_dummies.
numerical_cols = ["TotalValue", "Quantity", "TransactionID"]
scaler = StandardScaler()
scaler.fit(clustering_data[numerical_cols])
clustering_data[numerical_cols] = scaler.transform(clustering_data[numerical_cols])

# --- Separate the identifier from the feature matrix --------------------------
# CustomerID is not a feature: pull it out of the frame and keep it so cluster
# labels can be matched back to customers afterwards.
customer_ids = clustering_data.pop("CustomerID")

# Preview the prepared feature table
print("Clustering Data (after preprocessing):", clustering_data.head(), sep="\n")

In [None]:
# Perform KMeans clustering
optimal_k = 4  # You can experiment with values between 2 and 10
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)

# Fit on the feature columns only. This cell adds "Cluster" and "CustomerID"
# to clustering_data, so excluding them explicitly keeps the cell idempotent:
# re-running it must not feed the previous labels or the IDs into KMeans.
feature_cols = [
    col for col in clustering_data.columns if col not in ("Cluster", "CustomerID")
]
clustering_data["Cluster"] = kmeans.fit_predict(clustering_data[feature_cols])

# Re-attach the customer identifiers next to the cluster labels
clustering_data["CustomerID"] = customer_ids

In [None]:
# Davies-Bouldin Index (DB Index): evaluated on the feature columns only, so
# the label column and the customer identifier are removed before scoring.
feature_matrix = clustering_data.drop(columns=["Cluster", "CustomerID"])
cluster_labels = clustering_data["Cluster"]
db_index = davies_bouldin_score(feature_matrix, cluster_labels)
print(f"Davies-Bouldin Index (DB Index): {db_index}")

# Optional: inertia (sum of squared distances) as reported by the fitted model
inertia = kmeans.inertia_
print(f"Inertia: {inertia}")

In [None]:
# Visualize clusters with a pairplot of two of the scaled features.
# pairplot returns a grid holding several axes; plt.title would only label the
# last subplot, so the title is set on the grid's figure instead.
pair_grid = sns.pairplot(clustering_data, hue="Cluster", vars=["TotalValue", "Quantity"])
pair_grid.figure.suptitle("Cluster Visualization", y=1.02)  # y > 1 keeps it clear of the top row
plt.show()

# Visualize the number of customers in each cluster.
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to hue. dodge=False keeps one full-width bar per
# cluster, and the redundant legend (if one was drawn) is removed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    x="Cluster",
    hue="Cluster",
    data=clustering_data,
    palette="viridis",
    dodge=False,
    ax=ax,
)
count_legend = ax.get_legend()
if count_legend is not None:
    count_legend.remove()
ax.set_title("Customer Count per Cluster")
ax.set_xlabel("Cluster")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Per-cluster averages of the three aggregate features (values are taken from
# clustering_data as it stands at this point in the notebook).
summary_cols = ["TotalValue", "Quantity", "TransactionID"]
cluster_summary = clustering_data.groupby("Cluster")[summary_cols].mean().reset_index()

print("Cluster Summary:", cluster_summary, sep="\n")

# Persist the CustomerID -> Cluster assignment as a CSV file
assignments = clustering_data[["CustomerID", "Cluster"]]
assignments.to_csv("CustomerClusters.csv", index=False)