## Exploratory Data Cleaning

In [2]:
import pandas as pd

# 1. Load the raw Excel
df = pd.read_excel("/Users/varun/Desktop/dynamic-audience-segmentation/data/raw/Online Retail.xlsx")

# 2. Peek at the data
print(df.shape)
print(df.columns.tolist())
df.head()

(541909, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
df = df[(df.Quantity > 0) & (df.UnitPrice > 0)]

In [4]:
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

In [5]:
df = df.dropna(subset=["CustomerID"])
df["CustomerID"] = df["CustomerID"].astype(int).astype(str)

In [6]:
df = df.sort_values(["CustomerID","InvoiceDate"])
df["time_diff"] = df.groupby("CustomerID")["InvoiceDate"] \
                    .diff().dt.total_seconds().div(60)
# flag new sessions
df["new_session"] = (df["time_diff"] > 30) | df["time_diff"].isna()
df["session_id"] = df.groupby("CustomerID")["new_session"] \
                      .cumsum().astype(int).astype(str)

## Feature Aggregation & Export

In [7]:
# Example: compute RFM per session
session_agg = df.groupby(["CustomerID","session_id"]).agg(
    session_start=("InvoiceDate","min"),
    session_end=("InvoiceDate","max"),
    total_items=("Quantity","sum"),
    total_value=("UnitPrice","sum"),
    num_products=("StockCode","nunique")
).reset_index()

# Derive more features
session_agg["session_duration_mins"] = (
    (session_agg["session_end"] - session_agg["session_start"])
    .dt.total_seconds() / 60
)
session_agg["hour_of_day"] = session_agg["session_start"].dt.hour

In [9]:
session_agg.to_csv("/Users/varun/Desktop/dynamic-audience-segmentation/data/processed/sessions.csv", index=False)

In [10]:
def stream_batches(df, batch_size=20):
    for start in range(0, len(df), batch_size):
        yield df.iloc[start:start+batch_size].to_dict(orient="records")

def extract_features(sess):
    return {
        "total_items": sess["total_items"],
        "total_value": sess["total_value"],
        "num_products": sess["num_products"],
        "session_duration": sess["session_duration_mins"],
        "hour_of_day": sess["hour_of_day"]
    }

In [14]:
from river import cluster as river_cluster
from sklearn.cluster import MiniBatchKMeans

In [15]:
river_kmeans = cluster.KMeans(n_clusters=5)
minibatch_kmeans = MiniBatchKMeans(n_clusters=5, batch_size=100, random_state=42)

NameError: name 'cluster' is not defined

In [None]:
history_centers_river = []
history_centers_sklearn = []

for batch in stream_batches(session_agg, batch_size=20):
    for sess in batch:
        feats = extract_features(sess)
        
        # River KMeans
        river_kmeans.learn_one(feats)
        
        # MiniBatchKMeans
        X = np.array([list(feats.values())])
        minibatch_kmeans.partial_fit(X)
        
    # Record centroids over time
    history_centers_river.append(river_kmeans.centers)
    history_centers_sklearn.append(minibatch_kmeans.cluster_centers_)

In [None]:
def plot_centroids(history, title):
    for t, centers in enumerate(history):
        arr = np.array(centers)
        plt.scatter(arr[:, 0], arr[:, 1], label=f"t={t}")
    plt.title(title)
    plt.legend()
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()

# Plot drift for each model
plot_centroids(history_centers_river, "River KMeans Centroid Drift")
plot_centroids(history_centers_sklearn, "MiniBatchKMeans Centroid Drift")

In [13]:
!pip install river

Collecting river
  Downloading river-0.22.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting pandas<3.0.0,>=2.2.3 (from river)
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting scipy<2.0.0,>=1.14.1 (from river)
  Downloading scipy-1.15.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (61 kB)
Downloading river-0.22.0-cp312-cp312-macosx_11_0_arm64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading scipy-1.15.3-cp312-cp312-macosx_14_0_arm64.whl (22.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.4/22.4 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packa