In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import joblib

In [3]:
# Load the raw transaction export. The path is relative to the notebook's
# working directory.
RAW_DATA_PATH = "Online Retail.xlsx"
df = pd.read_excel(RAW_DATA_PATH)

In [4]:
# Keep only the rows usable for customer-level analysis. The four conditions
# are combined into a single mask and the result is materialised with .copy(),
# so later cells can add columns to `df` without pandas raising
# SettingWithCopyWarning (the frame is no longer a slice of another frame).
has_customer = df['CustomerID'].notna()
positive_quantity = df['Quantity'] > 0
positive_price = df['UnitPrice'] > 0
# Invoice numbers starting with 'C' are dropped (presumably cancellations —
# TODO confirm against the dataset description).
not_c_invoice = ~df['InvoiceNo'].astype(str).str.startswith('C')

df = df.loc[has_customer & positive_quantity & positive_price & not_c_invoice].copy()

In [5]:
# Line-level revenue. assign() returns a new frame rather than writing a column
# into the filtered `df`, which avoids SettingWithCopyWarning.
df = df.assign(TotalPrice=df['Quantity'] * df['UnitPrice'])

# Collapse transactions to one row per CustomerID. Resulting column order:
# Frequency, TotalQuantity, TotalSpending, Variety (then Recency, added below).
customer_df = df.groupby('CustomerID').agg(
    Frequency=('InvoiceNo', 'nunique'),      # distinct invoices
    TotalQuantity=('Quantity', 'sum'),       # units bought
    TotalSpending=('TotalPrice', 'sum'),     # summed line revenue
    Variety=('StockCode', 'nunique'),        # distinct stock codes
    LastPurchase=('InvoiceDate', 'max'),     # helper column, dropped below
)

# Recency = whole days between the customer's last invoice and the most recent
# invoice date in the cleaned data.
latest_date = df['InvoiceDate'].max()
customer_df = (
    customer_df
    .assign(Recency=(latest_date - customer_df['LastPurchase']).dt.days)
    .drop(columns='LastPurchase')
)


In [6]:
# Replace the count/amount columns with log(1 + x) of themselves (presumably to
# reduce right-skew before scaling).
# NOTE: the columns are overwritten, so executing this cell more than once on
# the same `customer_df` applies the transform repeatedly.
log_columns = ['Frequency', 'TotalQuantity', 'TotalSpending']
customer_df = customer_df.assign(
    **{name: np.log1p(customer_df[name]) for name in log_columns}
)

In [7]:
# Columns fed to the clustering model, in the order the scaler will see them.
features = ['Frequency', 'TotalQuantity', 'Variety', 'TotalSpending', 'Recency']
feature_matrix = customer_df[features]

# Fit the scaler on the customer features, then standardise those same rows.
scaler = StandardScaler()
scaler.fit(feature_matrix)
scaled_features = scaler.transform(feature_matrix)

In [11]:
# Elbow search: fit K-Means for every candidate k and record its inertia.
inertia = []
K_range = range(2, 10)
for k in K_range:
    # n_init is set explicitly: scikit-learn changed its default (10 -> 'auto')
    # in 1.4 and warns about the unset value in 1.2/1.3, so leaving it implicit
    # makes the inertia curve depend on the installed version.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(scaled_features)
    inertia.append(km.inertia_)

# A step counts as "flat" when inertia changes by less than this fraction of
# the previous value.
ELBOW_REL_DROP = 0.05

optimal_k = 4  # Default fallback when no step is flat enough
# NOTE(review): this picks the first k whose *own* addition changed inertia by
# less than the threshold; a conventional elbow reading would pick the previous
# k instead. Selection rule intentionally left unchanged — confirm intent.
for k_candidate, prev_inertia, cur_inertia in zip(K_range[1:], inertia, inertia[1:]):
    if abs(cur_inertia - prev_inertia) < ELBOW_REL_DROP * prev_inertia:
        optimal_k = k_candidate
        break

print(f"Optimal number of clusters: {optimal_k}")


Optimal number of clusters: 4


In [16]:
# Fit the final model with the selected cluster count and label every customer.
# n_init is set explicitly so the persisted model does not depend on the
# installed scikit-learn version (the default changed from 10 to 'auto' in 1.4).
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
customer_df['Cluster'] = kmeans.fit_predict(scaled_features)


In [17]:
# Persist the fitted clustering model and the scaler it was trained with; both
# are needed to score new customers consistently.
joblib.dump(value=kmeans, filename="kmeans_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")


['scaler.pkl']

In [19]:
# Write the per-customer table (CustomerID index included) to disk.
# Frequency, TotalQuantity and TotalSpending are stored as log1p-transformed
# values, because the transform above overwrote those columns.
SEGMENTS_CSV_PATH = "customer_segments.csv"
customer_df.to_csv(SEGMENTS_CSV_PATH, index=True)


In [None]:

# Final status line for the notebook run.
status_message = "✅ Model and Scaler saved successfully!"
print(status_message)

✅ Model and Scaler saved successfully!
