In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the customer table and the transaction log from CSV files resolved
# against the current working directory. Both frames are kept as module-level
# globals because customer_segmentation() reads them directly.
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

In [9]:
def customer_segmentation(min_k=2, max_k=10, random_state=42):
    """Cluster customers with KMeans and keep the K with the lowest Davies-Bouldin index.

    Reads the module-level ``transactions`` and ``customers`` DataFrames,
    sums ``TotalValue`` and ``Quantity`` per ``CustomerID``, attaches each
    customer's ``Region`` (one-hot encoded, first level dropped), standardizes
    the resulting features and fits one KMeans model per candidate K.
    The labels of the best-scoring model are stored in a ``Cluster`` column.

    Side effects:
        * prints the Davies-Bouldin index for every K and the best result;
        * writes the scatter plot to ``Pranav_Dhobi_Clustering_Best.png``;
        * writes the labelled table to ``Pranav_Dhobi_Clustering_Results.csv``.

    Args:
        min_k: Smallest number of clusters to try (values below 2 are raised to 2).
        max_k: Largest number of clusters to try; capped at ``n_samples - 1``
            because the Davies-Bouldin index is undefined beyond that.
        random_state: Seed passed to every KMeans fit.

    Raises:
        ValueError: If there are too few customers to evaluate any K in range.
    """
    # Aggregate transaction data by customer
    customer_summary = (
        transactions.groupby('CustomerID')
        .agg({'TotalValue': 'sum', 'Quantity': 'sum'})
        .reset_index()
    )

    # Add region information from customers dataset
    customer_summary = customer_summary.merge(customers[['CustomerID', 'Region']], on='CustomerID')

    # One-hot encode regions
    customer_summary = pd.get_dummies(customer_summary, columns=['Region'], drop_first=True)

    # Prepare data for clustering
    features = customer_summary.drop('CustomerID', axis=1)
    scaled_features = StandardScaler().fit_transform(features)

    # davies_bouldin_score requires 2 <= n_clusters <= n_samples - 1, so clamp
    # the sweep instead of failing deep inside scikit-learn on small inputs.
    n_samples = scaled_features.shape[0]
    lowest_k = max(2, min_k)
    highest_k = min(max_k, n_samples - 1)
    if highest_k < lowest_k:
        raise ValueError(
            f"Need at least {lowest_k + 1} customers to evaluate K={lowest_k}; got {n_samples}."
        )

    # Apply KMeans clustering with a flexible number of clusters
    best_db_index = float("inf")
    best_k = 0
    best_model = None

    for k in range(lowest_k, highest_k + 1):
        # n_init is pinned: its default changed between scikit-learn releases,
        # which would otherwise alter results (and emit a FutureWarning).
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        clusters = kmeans.fit_predict(scaled_features)
        db_index = davies_bouldin_score(scaled_features, clusters)
        print(f"K={k}, DB Index={db_index:.3f}")

        # Lower Davies-Bouldin index is better; ties keep the smaller K.
        if db_index < best_db_index:
            best_db_index = db_index
            best_k = k
            best_model = kmeans

    # Print the best cluster details at the end
    print(f"\nBest clustering result: K={best_k}, DB Index={best_db_index:.3f}\n")

    # Add the best cluster labels to the dataset
    customer_summary['Cluster'] = best_model.labels_

    # Visualize the clusters for the best number of clusters
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(
        x=customer_summary['TotalValue'],
        y=customer_summary['Quantity'],
        hue=customer_summary['Cluster'],
        palette='viridis',
        legend='full',  # list every cluster; numeric hue is otherwise abbreviated
        ax=ax,
    )
    ax.set_title(f'Customer Segments (Best K={best_k})')
    ax.set_xlabel('Total Value')
    ax.set_ylabel('Quantity')
    fig.savefig("Pranav_Dhobi_Clustering_Best.png")
    plt.close(fig)
    print("Cluster visualization saved as 'Pranav_Dhobi_Clustering_Best.png'.")
    print(f"Best number of clusters: {best_k}, with DB Index: {best_db_index:.3f}")

    # Save clustering results
    customer_summary.to_csv("Pranav_Dhobi_Clustering_Results.csv", index=False)
    print("Clustering results saved as 'Pranav_Dhobi_Clustering_Results.csv'.")


In [10]:
# Entry point: run the full segmentation pipeline when this cell/module is
# executed as the main program.
if __name__ == "__main__":
    customer_segmentation()



K=2, DB Index=1.597




K=3, DB Index=1.059




K=4, DB Index=0.959




K=5, DB Index=0.794




K=6, DB Index=0.766




K=7, DB Index=0.731




K=8, DB Index=0.632




K=9, DB Index=0.633




K=10, DB Index=0.635

Best clustering result: K=8, DB Index=0.632

Cluster visualization saved as 'Pranav_Dhobi_Clustering_Best.png'.
Best number of clusters: 8, with DB Index: 0.632
Clustering results saved as 'Pranav_Dhobi_Clustering_Results.csv'.
