In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import os

# Pipeline: load the click-behaviour CSV, cluster rows on
# "click_to_purchase_ratio" with K-Means (k=3), label each cluster
# Low / Medium / High by ascending cluster centre, and write the result to a
# new CSV. Rows whose ratio is missing or infinite are excluded from the fit
# and left unlabeled, because KMeans rejects NaN/inf input.

# Step 1: Load the Dataset
file_path = r"C:\Users\samhe\OneDrive\Documents\AI Project 3 - Customer Behavior\click_behavior_analysis.csv"
df = pd.read_csv(file_path)

# Convert transaction_date to datetime format; unparseable values become NaT
df["transaction_date"] = pd.to_datetime(df["transaction_date"], errors="coerce")

# Step 2: Remove the Existing Click Efficiency Column (if it exists)
df = df.drop(columns=["click_efficiency"], errors="ignore")

# Step 3: Prepare Data for Clustering (Extract Click-to-Purchase Ratio)
# Treat +/-inf like a missing value so only finite ratios reach KMeans.
ratio = df["click_to_purchase_ratio"].replace([np.inf, -np.inf], np.nan)
valid_rows = ratio.notna()
skipped_count = int((~valid_rows).sum())
if skipped_count:
    print(f"Skipping {skipped_count} row(s) with missing or infinite click_to_purchase_ratio")

# KMeans expects a 2-D array of shape (n_samples, n_features)
click_purchase_ratio_values = ratio[valid_rows].to_numpy().reshape(-1, 1)

# Step 4: Apply K-Means Clustering (3 Clusters for Low, Medium, High)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
# Cluster ids stay in a local Series (indexed like the rows that were fitted)
# rather than a scratch DataFrame column, so no input column can be clobbered.
cluster_ids = pd.Series(
    kmeans.fit_predict(click_purchase_ratio_values),
    index=df.index[valid_rows],
)

# Step 5: Assign Meaningful Labels (Low, Medium, High)
# Cluster ids are arbitrary; rank them by the centre's Click-to-Purchase Ratio
cluster_order = np.argsort(kmeans.cluster_centers_.flatten())

# Map cluster numbers to efficiency labels (smallest centre -> "Low")
efficiency_labels = dict(zip(cluster_order, ("Low", "Medium", "High")))
# Index alignment leaves rows that were excluded from the fit as NaN.
df["click_efficiency"] = cluster_ids.map(efficiency_labels)

# Step 6: Save the Updated Dataset
updated_file_path = r"C:\Users\samhe\OneDrive\Documents\AI Project 3 - Customer Behavior\click_behavior_analysis_kmeans.csv"
df.to_csv(updated_file_path, index=False)

print(f"✅ Updated dataset saved as: {updated_file_path}")
print(df["click_efficiency"].value_counts())  # Show distribution of Low, Medium, High

✅ Updated dataset saved as: C:\Users\samhe\OneDrive\Documents\AI Project 3 - Customer Behavior\click_behavior_analysis_kmeans.csv
click_efficiency
Low       45658
Medium    39530
High      14812
Name: count, dtype: int64
