In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three input tables (customers, products, transactions) from CSV.
# NOTE: these are absolute paths on one specific machine; point them at the
# local copies of the files before running elsewhere.
customers_file = "C:/Users/rajes/Downloads/Customers.csv"
products_file = "C:/Users/rajes/Downloads/Products.csv"
transactions_file = "C:/Users/rajes/Downloads/Transactions.csv"

# Read the files in the same order as they are declared above.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (customers_file, products_file, transactions_file)
)

# Step 1: Build one wide table with a row per transaction.
# Left joins keep every transaction even when its product or customer record
# is missing from the lookup tables.
transactions_with_products = pd.merge(
    transactions_df,
    products_df,
    on="ProductID",
    how="left",
)
data = pd.merge(
    transactions_with_products,
    customers_df,
    on="CustomerID",
    how="left",
)

# Step 2: Feature engineering
# Per-customer summary: output column name -> (source column, aggregation).
customer_aggregations = {
    "total_spent": ("TotalValue", "sum"),
    "total_quantity": ("Quantity", "sum"),
    "unique_products": ("ProductID", "nunique"),
    # Region is treated as constant per customer, so the first value is used.
    "regions": ("Region", "first"),
}
customer_features = (
    data.groupby("CustomerID").agg(**customer_aggregations).reset_index()
)

# Product category preferences: count each customer's transactions per
# category, then pivot the categories into columns (0 where a customer has no
# transactions in that category).
category_counts = data.groupby(["CustomerID", "Category"]).size()
product_preferences = category_counts.unstack(fill_value=0).reset_index()

# One row per customer: the summary columns followed by the category counts.
customer_data = customer_features.merge(
    product_preferences,
    on="CustomerID",
    how="left",
)

# Step 3: Prepare data for clustering
# Everything except the customer identifier and the region label is used as a
# clustering feature.
non_feature_columns = ["CustomerID", "regions"]
clustering_features = customer_data.drop(non_feature_columns, axis=1)

# Standardise the features; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
scaler.fit(clustering_features)
scaled_clustering_features = scaler.transform(clustering_features)

# Step 4: Perform clustering and evaluate Davies-Bouldin Index
# Fit KMeans for every cluster count from 2 to 10 and record the
# Davies-Bouldin (DB) index of each resulting partition.
#   db_scores      -- DB index per cluster count, in the order 2..10
#   cluster_models -- n_clusters -> (fitted model, labels, DB index)
db_scores = []
cluster_models = {}

for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(scaled_clustering_features)
    db_index = davies_bouldin_score(scaled_clustering_features, cluster_labels)
    db_scores.append(db_index)
    cluster_models[n_clusters] = (kmeans, cluster_labels, db_index)

# Find the number of clusters with the lowest DB index.
# The comparison key must be the score itself: keying on the score's position
# in db_scores always selects the first entry (2 clusters) regardless of the
# actual values. On ties, min() returns the first minimum, i.e. the smallest
# cluster count.
optimal_n_clusters = min(cluster_models, key=lambda k: cluster_models[k][2])

# Retrieve the clustering results for the optimal number of clusters
best_model, best_labels, best_db_index = cluster_models[optimal_n_clusters]

# Kept for any later code that reads this name: the lowest DB index found.
optimal_clusters = best_db_index

customer_data["Cluster"] = best_labels

# Step 5: Visualize DB Index values
# Draw the DB index against the number of clusters on an explicit
# Figure/Axes pair, write it to disk, and release the figure.
db_index_plot_path = "db_index_plot.png"

fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(x=range(2, 11), y=db_scores, marker="o", color="blue", ax=ax)
ax.set_title("DB Index vs Number of Clusters")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Davies-Bouldin Index")
ax.set_xticks(range(2, 11))  # one tick per evaluated cluster count
ax.grid()
fig.savefig(db_index_plot_path)
plt.close(fig)

# Persist the per-customer table (including the assigned cluster) as CSV,
# without the dataframe index.
clustering_results_path = "Clustering_Results.csv"
customer_data.to_csv(clustering_results_path, index=False)

# Report the chosen configuration and where the artefacts were written.
summary_lines = (
    ("Optimal Number of Clusters:", optimal_n_clusters),
    ("Davies-Bouldin Index:", best_db_index),
    ("Clustering results saved to:", clustering_results_path),
    ("DB Index plot saved to:", db_index_plot_path),
)
for label, value in summary_lines:
    print(label, value)


FileNotFoundError: [Errno 2] No such file or directory: 'Customers.csv'