In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1. Read the raw transaction export into a DataFrame.
#    The file is decoded as latin-1 rather than pandas' default UTF-8.
df = pd.read_csv(
    "OnlineRetail.csv",
    encoding='latin-1',
)

# 2. RFM preprocessing: collapse the transaction lines into one row per customer.
# Rows with no CustomerID are dropped before any aggregation.
df = df.dropna(subset=['CustomerID'])

# Derived / cleaned columns used by the aggregations below.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d-%m-%Y %H:%M')
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

# NOTE(review): every remaining row is aggregated as-is; if the export contains
# cancellations/returns they are counted too -- confirm that is intended.
by_customer = df.groupby('CustomerID')
latest_invoice = df['InvoiceDate'].max()

# Recency   = whole days between the customer's last invoice and the newest
#             invoice in the data set.
# Frequency = number of distinct invoices.
# Monetary  = sum of Quantity * UnitPrice.
recency = (latest_invoice - by_customer['InvoiceDate'].max()).dt.days
frequency = by_customer['InvoiceNo'].nunique()
monetary = by_customer['TotalAmount'].sum()

rfm = pd.DataFrame({'Recency': recency, 'Frequency': frequency, 'Monetary': monetary})

# 3. Hierarchical clustering, restricted to a subset to keep it fast.
# head(2000) takes the first 2000 rows of rfm (it is not a random draw).
rfm_subset = rfm.head(2000)

# Standardise the three RFM columns; the scaler is fitted on the subset only.
scaler = StandardScaler()
scaler.fit(rfm_subset)
rfm_scaled = scaler.transform(rfm_subset)

# Ward-linkage agglomerative clustering into 3 groups; labels_ holds the
# cluster index assigned to each row of rfm_scaled.
h_model = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
cluster_labels = h_model.fit(rfm_scaled).labels_

# 4. Train a classifier so we can predict for new inputs
# AgglomerativeClustering has no predict() method, so a random forest is
# trained to reproduce its labels from the scaled RFM features.
# random_state is fixed so that re-running the cell yields the same forest
# (and therefore the same saved model) instead of a different one each time.
clf = RandomForestClassifier(random_state=42)
clf.fit(rfm_scaled, cluster_labels)

# 5. Persist both artifacts: the classifier expects scaled features, so the
#    fitted scaler is saved alongside it.
joblib.dump(value=clf, filename="retail_h_classifier.pkl")
joblib.dump(value=scaler, filename="retail_scaler.pkl")

['retail_scaler.pkl']