In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

In [3]:
# Input CSV locations read by load_data(). These are relative paths, so they
# resolve against the working directory the notebook kernel is started in.
CUSTOMERS_PATH = "Customers.csv"
TRANSACTIONS_PATH = "Transactions.csv"
PRODUCTS_PATH = "Products.csv"

In [4]:
def load_data():
    """Read the three input CSV files into DataFrames.

    The file locations come from the module-level constants
    ``CUSTOMERS_PATH``, ``TRANSACTIONS_PATH`` and ``PRODUCTS_PATH``.

    Returns:
        tuple: ``(customers, transactions, products)`` DataFrames, in that order.
    """
    source_paths = (CUSTOMERS_PATH, TRANSACTIONS_PATH, PRODUCTS_PATH)
    # Files are read in the same order as the returned tuple.
    customers, transactions, products = (pd.read_csv(path) for path in source_paths)

    print("Data loaded successfully!")
    return customers, transactions, products

In [5]:
def create_customer_features(customers, transactions, products):
    """Build one feature row per customer from profile, transaction and product data.

    The caller's DataFrames are not modified: parsed dates and the encoded
    region are added to new frames created with ``assign``.

    Args:
        customers: DataFrame with at least ``CustomerID``, ``Region`` and
            ``SignupDate`` columns.
        transactions: DataFrame with ``TransactionID``, ``CustomerID``,
            ``ProductID``, ``TransactionDate``, ``Quantity`` and ``TotalValue``.
        products: DataFrame with ``ProductID`` and ``Category`` columns.

    Returns:
        DataFrame with every ``customers`` column (``SignupDate`` parsed to
        datetime) plus ``Region_encoded``, ``transaction_count``,
        ``total_value``, ``avg_value``, ``total_quantity``, ``avg_quantity``,
        ``unique_products``, ``first_purchase``, ``last_purchase``,
        ``days_since_first``, ``days_since_last`` and one column per product
        category holding that category's share of the customer's spend.
        Missing values are filled with 0.
    """
    # assign() returns new frames, so the inputs stay exactly as the caller
    # passed them (no parsed-date overwrite, no extra Region_encoded column).
    customers = customers.assign(
        SignupDate=pd.to_datetime(customers['SignupDate']),
        Region_encoded=LabelEncoder().fit_transform(customers['Region']),
    )
    transactions = transactions.assign(
        TransactionDate=pd.to_datetime(transactions['TransactionDate'])
    )

    by_customer = transactions.groupby('CustomerID')

    # Named aggregation ties each output name to its source column and
    # function instead of relying on the order of flattened MultiIndex columns.
    customer_metrics = by_customer.agg(
        transaction_count=('TransactionID', 'count'),
        total_value=('TotalValue', 'sum'),
        avg_value=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        unique_products=('ProductID', 'nunique'),
    ).reset_index()

    pur_dates = by_customer.agg(
        first_purchase=('TransactionDate', 'min'),
        last_purchase=('TransactionDate', 'max'),
    ).reset_index()

    # Recency is measured against the latest transaction in the data set,
    # not against the wall-clock date.
    cur_date = transactions['TransactionDate'].max()
    pur_dates['days_since_first'] = (cur_date - pur_dates['first_purchase']).dt.days
    pur_dates['days_since_last'] = (cur_date - pur_dates['last_purchase']).dt.days

    customer_features = customers.merge(customer_metrics, on='CustomerID', how='left')
    customer_features = customer_features.merge(pur_dates, on='CustomerID', how='left')

    # Spend per (customer, category), pivoted so each category is a column.
    cat_preferences = (
        transactions.merge(products[['ProductID', 'Category']], on='ProductID')
        .groupby(['CustomerID', 'Category'])['TotalValue']
        .sum()
        .unstack(fill_value=0)
    )

    # Convert absolute spend into each category's share of the customer's total.
    cat_preferences = cat_preferences.div(cat_preferences.sum(axis=1), axis=0)

    final_features = customer_features.merge(cat_preferences, on='CustomerID', how='left')
    # NOTE(review): customers without any transaction end up with 0 in every
    # metric, including days_since_last and the purchase-date columns, so they
    # look like the most recent buyers. Kept as-is to preserve existing results.
    final_features = final_features.fillna(0)

    return final_features


In [6]:
def normalize_features(df, exclude_cols):
    """Standardise the numeric columns of ``df`` with a ``StandardScaler``.

    Every real numeric column is scaled regardless of its width (int32,
    float32, ... as well as int64/float64). Boolean and non-numeric columns,
    and anything listed in ``exclude_cols``, are copied through unchanged.

    Args:
        df: Input DataFrame; it is not modified.
        exclude_cols: Column names that must never be scaled.

    Returns:
        tuple: ``(df_normalized, scaler)`` where ``df_normalized`` is a copy of
        ``df`` with the selected columns replaced by their scaled values and
        ``scaler`` is the ``StandardScaler`` fitted on those columns.
    """
    scaler = StandardScaler()
    df_normalized = df.copy()

    # Matching only 'int64'/'float64' would silently leave narrower numeric
    # columns unscaled, which distorts any distance-based model downstream.
    # is_numeric_dtype() is True for bool, so bool is filtered out explicitly.
    num_cols = [
        col
        for col, dtype in df.dtypes.items()
        if col not in exclude_cols
        and pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    ]

    df_normalized[num_cols] = scaler.fit_transform(df_normalized[num_cols])
    return df_normalized, scaler

In [7]:
def find_optimal_clusters(data, max_clusters=10):
    """Pick the KMeans cluster count with the lowest Davies-Bouldin index.

    A KMeans model (``random_state=42``, ``n_init=10``) is fitted for every
    cluster count from 2 up to and including ``max_clusters``.

    Args:
        data: Feature matrix handed to ``KMeans.fit_predict`` and
            ``davies_bouldin_score``.
        max_clusters: Largest number of clusters to try.

    Returns:
        tuple: ``(optimal_clusters, db_scores)`` where ``db_scores[i]`` is the
        score obtained with ``i + 2`` clusters.
    """
    db_scores = [
        davies_bouldin_score(
            data,
            KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(data),
        )
        for k in range(2, max_clusters + 1)
    ]

    # Scores start at k=2, so shift the winning position by two.
    best_position = np.argmin(db_scores)
    return best_position + 2, db_scores


In [8]:
def visualize_clusters(data, clusters, save_path=None):
    """Scatter-plot the clusters on the first two principal components.

    Args:
        data: Feature matrix; it is projected to two dimensions with PCA.
        clusters: Cluster label per row of ``data``, used to colour the points.
        save_path: Where to write the figure. When falsy nothing is written.

    The figure is always closed before returning, so it is never left open.
    """
    projected = PCA(n_components=2).fit_transform(data)

    fig, ax = plt.subplots(figsize=(10, 8))
    points = ax.scatter(projected[:, 0], projected[:, 1], c=clusters, cmap='viridis')
    fig.colorbar(points, ax=ax)
    ax.set_title('customer Segments')
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')

    if save_path:
        fig.savefig(save_path)
    plt.close(fig)


In [9]:
def generate_cluster_profiles(original_data, clusters):
    """Summarise each cluster with size, value, frequency and recency figures.

    Profiles are built for the labels that actually occur in ``clusters``, in
    ascending order, so non-contiguous or offset labels (for example
    ``[1, 3, 3]``) are reported correctly rather than being assumed to be
    ``0..k-1``.

    Args:
        original_data: DataFrame with ``avg_value``, ``transaction_count``,
            ``days_since_last``, ``total_value`` and ``unique_products``
            columns. It is not modified.
        clusters: One cluster label per row of ``original_data``.

    Returns:
        DataFrame with one row per distinct label and the columns ``Cluster``,
        ``Size``, ``Avg_Transaction_Value``, ``Avg_Purchase_Frequency``,
        ``Avg_Recency``, ``Total_Revenue`` and ``Avg_Products_Purchased``.
    """
    data_with_clusters = original_data.copy()
    data_with_clusters['Cluster'] = clusters

    profiles = []
    # np.unique returns the distinct labels sorted ascending.
    for cluster in np.unique(data_with_clusters['Cluster']):
        cluster_data = data_with_clusters[data_with_clusters['Cluster'] == cluster]
        profiles.append({
            'Cluster': cluster,
            'Size': len(cluster_data),
            'Avg_Transaction_Value': cluster_data['avg_value'].mean(),
            'Avg_Purchase_Frequency': cluster_data['transaction_count'].mean(),
            'Avg_Recency': cluster_data['days_since_last'].mean(),
            'Total_Revenue': cluster_data['total_value'].sum(),
            'Avg_Products_Purchased': cluster_data['unique_products'].mean(),
        })

    return pd.DataFrame(profiles)


In [10]:
def main(output_path="clustering_results"):
    """Run the customer segmentation pipeline end to end and save its outputs.

    Steps: load the CSV inputs, build per-customer features, standardise them,
    pick the cluster count with the lowest Davies-Bouldin index, fit KMeans
    with that count, then write the plots and CSV summaries.

    Args:
        output_path: Directory that receives ``cluster_visualization.png``,
            ``cluster_profiles.csv``, ``customer_clusters.csv`` and
            ``db_scores.png``. It is created if missing. The default keeps the
            previous hard-coded location.

    Returns:
        dict with the keys ``optimal_clusters``, ``db_index``,
        ``cluster_profiles`` and ``customer_clusters``.

    Raises:
        Exception: any error from a pipeline step is printed and re-raised.
    """
    os.makedirs(output_path, exist_ok=True)

    try:
        print("Loading data...")
        customers, transactions, products = load_data()

        print("Creating customer features...")
        customer_features = create_customer_features(customers, transactions, products)

        # Identifiers, free text and raw dates are not usable as clustering inputs.
        exclude_cols = ['CustomerID', 'CustomerName', 'Region', 'SignupDate',
                        'first_purchase', 'last_purchase']

        cluster_features = customer_features.drop(exclude_cols, axis=1)

        print("Normalizing features...")
        features_normalized, scaler = normalize_features(cluster_features, ['CustomerID'])

        print("Finding optimal number of clusters...")
        optimal_clusters, db_scores = find_optimal_clusters(features_normalized)

        print(f"Performing clustering with {optimal_clusters} clusters...")
        kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(features_normalized)

        db_index = davies_bouldin_score(features_normalized, clusters)
        print(f"DB Index: {db_index:.4f}")

        print("generating visualization.")
        visualize_clusters(
            features_normalized,
            clusters,
            os.path.join(output_path, "cluster_visualization.png"),
        )

        # Profiles use the unscaled features so the numbers stay interpretable.
        cluster_profiles = generate_cluster_profiles(customer_features, clusters)
        customer_clusters = pd.DataFrame({
            'CustomerID': customer_features['CustomerID'],
            'Cluster': clusters
        })

        cluster_profiles.to_csv(os.path.join(output_path, "cluster_profiles.csv"), index=False)
        customer_clusters.to_csv(os.path.join(output_path, "customer_clusters.csv"), index=False)

        # db_scores[0] corresponds to 2 clusters, hence the offset on the x axis.
        plt.figure(figsize=(10, 6))
        plt.plot(range(2, len(db_scores) + 2), db_scores, marker='o')
        plt.xlabel('No. of Clusters')
        plt.ylabel('DB Index')
        plt.title('DB Index vs No. of Clusters')
        plt.savefig(os.path.join(output_path, "db_scores.png"))
        plt.close()

        print("customer segmentation completed successfully!")
        return {
            'optimal_clusters': optimal_clusters,
            'db_index': db_index,
            'cluster_profiles': cluster_profiles,
            'customer_clusters': customer_clusters
        }

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

# Entry point: run the whole pipeline and keep the returned summary dict in
# `results` so later cells can inspect it.
if __name__ == "__main__":
    results = main()

Loading data...
Data loaded successfully!
Creating customer features...
Normalizing features...
Finding optimal number of clusters...
Performing clustering with 8 clusters...
DB Index: 1.5823
generating visualization.
customer segmentation completed successfully!
