In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score


In [None]:
# Read the customer profiles and the transaction log from the working directory.
customers, transactions = (
    pd.read_csv(csv_path) for csv_path in ('Customers.csv', 'Transactions.csv')
)

In [None]:
# Parse each frame's date column into pandas datetimes, overwriting the column in place.
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [None]:
# Per-customer spend summary: one row per CustomerID holding the summed
# TotalValue, the count of TransactionID entries, and the mean TotalValue.
customer_spending = (
    transactions
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        avg_spent_per_txn=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
    .reset_index()
)

In [None]:
# Attach the spend aggregates to every customer profile. The left join keeps
# customers that have no transactions; their aggregate columns come back as NaN
# and are set to 0 here. Only the aggregate columns are filled: a frame-wide
# fillna(0) would also overwrite any missing profile field (for example a
# missing SignupDate) with 0 and corrupt the saved customer table.
data = (
    customers
    .merge(customer_spending, on='CustomerID', how='left')
    .fillna(dict.fromkeys(customer_spending.columns.drop('CustomerID'), 0))
)

In [None]:
# Numeric spend columns used as clustering inputs.
features = [
    'total_spent',
    'total_transactions',
    'avg_spent_per_txn',
]
X = data.loc[:, features]

In [None]:
# Standardise every feature so that differences in scale do not dominate
# the distance computations used by K-Means.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Score K-Means for K = 2..10 with the Davies-Bouldin index (lower is better),
# then pick the K with the smallest score.

# Pass 1: fit one model per candidate K and record its score.
db_indices = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    db_indices.append(davies_bouldin_score(X_scaled, labels))

# Pass 2: keep the first K whose score is strictly lower than anything seen so far.
best_k, best_db_index = 0, float('inf')
for candidate_k, candidate_score in zip(range(2, 11), db_indices):
    if candidate_score < best_db_index:
        best_k, best_db_index = candidate_k, candidate_score

print(f"Optimal number of clusters: {best_k} with DB Index: {best_db_index:.4f}")

In [None]:
# Refit K-Means with the selected number of clusters and store each
# customer's cluster label alongside their profile.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_scaled)
data['Cluster'] = kmeans.labels_

In [None]:
# Write the customer table to disk, omitting the row index.
data.to_csv(
    path_or_buf='Customer_Segments.csv',
    index=False,
)

In [None]:
# Line chart of the Davies-Bouldin index for each candidate K (2..10).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(2, 11), db_indices, marker='o', linestyle='-')
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Davies-Bouldin Index")
ax.set_title("DB Index vs Number of Clusters")
plt.show()

In [None]:
# Scatter of total spend against transaction count, coloured by assigned cluster.
fig, ax = plt.subplots()
sns.scatterplot(
    data=data,
    x='total_spent',
    y='total_transactions',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_xlabel("Total Spent")
ax.set_ylabel("Total Transactions")
ax.set_title("Customer Segmentation Clusters")
plt.show()