# Analysing customer behaviour: Next Purchase in 30 Days
This notebook sets up a reproducible workflow to predict whether a customer will make a purchase within the next 30 days using the existing `data/transaction-dataset.csv`. It includes environment setup, data ingestion, label generation, simple RFM-style features, a Logistic Regression baseline, evaluation, and outputs.

> Plain-English naming used throughout:
> - snapshot_date: the reference date for a customer (their most recent transaction inside a split window). This was previously called anchor_date.
> - label_next_30d: 1 if the customer purchases within 30 days after the snapshot_date, else 0.
> - predicted_probability_30d: the model’s estimated probability that a customer will purchase within 30 days.
> - is_top_10_percent: whether the customer falls in the top 10% by predicted_probability_30d for a given split.
> - split windows: train/val/test time ranges built from the transaction dates, using only “safe” snapshot dates that are at least 30 days before the dataset’s max date.


In [1]:
# 1) Set Up Environment and Paths
from pathlib import Path
import os
import datetime as dt

# Resolve the project root. An explicit PROJECT_ROOT environment variable wins;
# otherwise use the working directory when it contains data/, else its parent.
_nb_cwd = Path.cwd().resolve()
if (_nb_cwd / "data").exists():
    _candidate_root = _nb_cwd
else:
    _candidate_root = _nb_cwd.parent
project_root = Path(os.getenv("PROJECT_ROOT", str(_candidate_root))).resolve()

# Standard sub-directories. outputs/ and logs/ are created here; data/ is only referenced.
data_dir, output_dir, logs_dir = (project_root / sub for sub in ("data", "outputs", "logs"))
for _needed_dir in (output_dir, logs_dir):
    _needed_dir.mkdir(parents=True, exist_ok=True)

# A set VSCODE_PID environment variable is treated as "running inside VS Code Jupyter".
in_vscode = bool(os.getenv("VSCODE_PID"))
print(f"Project root: {project_root}")
print(f"Running in VS Code: {in_vscode}")

Project root: /Users/primeiscrime/Desktop/customer
Running in VS Code: True


In [2]:
# 2) Install/Verify Dependencies
# Installs (and upgrades) the third-party packages quietly into the kernel's environment.
# NOTE(review): no versions are pinned and --upgrade is passed, so results may differ between runs.
# NOTE(review): the trailing `|| true` is shell syntax; presumably it keeps the cell from failing
# when pip exits non-zero (e.g. offline) — TODO confirm.
%pip install -q --disable-pip-version-check --no-input --upgrade numpy pandas matplotlib seaborn scikit-learn pytest pyarrow || true

# Underscore aliases: in this cell the modules are only used to report their versions.
import numpy as _np
import pandas as _pd
import matplotlib as _mpl
import seaborn as _sns
import sklearn as _sk

# Print the resolved versions so the run records which library versions were used.
print({
    "numpy": _np.__version__,
    "pandas": _pd.__version__,
    "matplotlib": _mpl.__version__,
    "seaborn": _sns.__version__,
    "scikit_learn": _sk.__version__,
})

Note: you may need to restart the kernel to use updated packages.
{'numpy': '2.0.2', 'pandas': '2.3.3', 'matplotlib': '3.9.4', 'seaborn': '0.13.2', 'scikit_learn': '1.6.1'}
{'numpy': '2.0.2', 'pandas': '2.3.3', 'matplotlib': '3.9.4', 'seaborn': '0.13.2', 'scikit_learn': '1.6.1'}


In [3]:
# 3) Imports and Global Config
import sys
import json
import logging
from typing import Tuple, Dict, Any, Optional
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import timedelta

# Reproducibility: the seed comes from the SEED environment variable (default 42)
# and is applied to NumPy's legacy global random state.
RANDOM_SEED = int(os.environ.get("SEED", 42))
np.random.seed(RANDOM_SEED)

# Display defaults for pandas output and a common seaborn theme for all plots.
for _option, _value in (("display.max_columns", 100), ("display.width", 120)):
    pd.set_option(_option, _value)
sns.set_theme(style="whitegrid")

In [4]:
# 4) Logging Configuration
# Log level is taken from the LOGLEVEL environment variable (default INFO).
log_level = os.getenv("LOGLEVEL", "INFO").upper()
logger = logging.getLogger("customer_behaviour")
logger.setLevel(log_level)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")

# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise attach a second console/file handler pair and every record
# would be emitted twice. Detach (and close) whatever a previous run attached.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Console handler
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(log_level)
ch.setFormatter(formatter)
logger.addHandler(ch)

# File handler
fh = logging.FileHandler(logs_dir / "notebook.log")
fh.setLevel(log_level)
fh.setFormatter(formatter)
logger.addHandler(fh)

logger.info(f"Python {sys.version}")
logger.info(f"Project root: {project_root}")
logger.info(f"Outputs: {output_dir}")

2025-10-25 23:30:57,183 | INFO | Python 3.9.6 (default, Aug  8 2025, 19:06:38) 
[Clang 17.0.0 (clang-1700.3.19.1)]
2025-10-25 23:30:57,184 | INFO | Project root: /Users/primeiscrime/Desktop/customer
2025-10-25 23:30:57,185 | INFO | Outputs: /Users/primeiscrime/Desktop/customer/outputs
2025-10-25 23:30:57,184 | INFO | Project root: /Users/primeiscrime/Desktop/customer
2025-10-25 23:30:57,185 | INFO | Outputs: /Users/primeiscrime/Desktop/customer/outputs


In [6]:
# 5) Data I/O Helpers
from typing import List
def load_csv(path: Path, **kwargs) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file.
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The loaded, non-empty DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError: For any other failure, including a file that parses to an
            empty frame (the empty-frame ValueError is raised inside the ``try``
            and is therefore re-wrapped by the generic handler).
    """
    try:
        frame = pd.read_csv(path, **kwargs)
        if not frame.empty:
            return frame
        raise ValueError(f"CSV at {path} loaded but is empty.")
    except FileNotFoundError as err:
        raise FileNotFoundError(f"CSV not found at {path}") from err
    except Exception as err:
        raise RuntimeError(f"Failed to load CSV {path}: {err}") from err

def save_csv(df: pd.DataFrame, path: Path, index: bool = False) -> None:
    """Write ``df`` to ``path`` as CSV, creating parent directories if needed.

    Args:
        df: Frame to persist.
        path: Destination file.
        index: Whether to write the index column (off by default).

    Raises:
        RuntimeError: If the directory cannot be created or the write fails.
    """
    try:
        target_dir = path.parent
        target_dir.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=index)
    except Exception as err:
        raise RuntimeError(f"Failed to save CSV to {path}: {err}") from err

def load_json(path: Path) -> Dict[str, Any]:
    """Parse the JSON document stored at ``path``.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If the file does not exist.
        RuntimeError: If the file cannot be read or is not valid JSON.
    """
    try:
        with open(path, "r") as handle:
            payload = json.load(handle)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"JSON not found at {path}") from err
    except Exception as err:
        raise RuntimeError(f"Failed to load JSON {path}: {err}") from err
    return payload

def save_json(obj: Dict[str, Any], path: Path) -> None:
    """Serialise ``obj`` to ``path`` as indented JSON, creating parent directories.

    Raises:
        RuntimeError: If the directory cannot be created, the file cannot be
            opened, or ``obj`` is not JSON-serialisable.
    """
    try:
        destination_dir = path.parent
        destination_dir.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as handle:
            json.dump(obj, handle, indent=2)
    except Exception as err:
        raise RuntimeError(f"Failed to save JSON to {path}: {err}") from err

In [7]:
# 6) Data Ingestion and Preview (clean summaries)
# The CSV location can be overridden with the TX_CSV environment variable.
CSV_PATH = Path(os.getenv("TX_CSV", data_dir / "transaction-dataset.csv"))
logger.info(f"Loading CSV from {CSV_PATH}")
df = load_csv(CSV_PATH)
logger.info(f"Loaded shape: {df.shape}")

# Column presence check — performed before any column is touched, so a schema
# problem (including a missing TransactionDate) surfaces as this ValueError
# rather than as a bare KeyError from the date parsing below.
expected_cols = [
    'CustomerID','ProductID','Quantity','Price','TransactionDate',
    'PaymentMethod','StoreLocation','ProductCategory','DiscountApplied(%)','TotalAmount'
]
missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing expected columns: {missing_cols}")

# Basic parsing and types: unparsable dates become NaT and those rows are dropped.
if df['TransactionDate'].dtype != 'datetime64[ns]':
    df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')
missing_dates = int(df['TransactionDate'].isna().sum())
if missing_dates > 0:
    logger.warning(f"Rows with unparsable TransactionDate: {missing_dates}")
    df = df.dropna(subset=['TransactionDate']).copy()

min_date = df['TransactionDate'].min()
max_date = df['TransactionDate'].max()
logger.info(f"Date range: {min_date.date()} to {max_date.date()}")

display(df.head())

# Clean numeric summary (avoids NaNs for non-numeric fields)
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
if numeric_cols:
    display(df[numeric_cols].describe(percentiles=[.25,.5,.75]).T)

# Clean categorical summary: n_unique and top-5 values per column
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
if cat_cols:
    cat_summary_rows = []
    for c in cat_cols:
        vc = df[c].value_counts(dropna=False)
        top5 = ", ".join([f"{idx}:{cnt}" for idx, cnt in vc.head(5).items()])
        cat_summary_rows.append({
            'column': c,
            'n_unique': df[c].nunique(dropna=True),
            'top5': top5
        })
    display(pd.DataFrame(cat_summary_rows))

# Missingness summary (only show columns with >0% missing)
missing_pct = df.isna().mean().sort_values(ascending=False)
missing_pct = missing_pct[missing_pct > 0]
if not missing_pct.empty:
    display(missing_pct)
else:
    print("No missing values detected across columns.")

2025-10-25 23:31:20,326 | INFO | Loading CSV from /Users/primeiscrime/Desktop/customer/data/transaction-dataset.csv
2025-10-25 23:31:20,456 | INFO | Loaded shape: (200000, 10)
2025-10-25 23:31:20,469 | INFO | Date range: 2023-01-01 to 2023-11-26
2025-10-25 23:31:20,456 | INFO | Loaded shape: (200000, 10)
2025-10-25 23:31:20,469 | INFO | Date range: 2023-01-01 to 2023-11-26


Unnamed: 0,CustomerID,ProductID,Quantity,Price,TransactionDate,PaymentMethod,StoreLocation,ProductCategory,DiscountApplied(%),TotalAmount
0,C000047,P000474,1,39.05,2023-01-01,Credit Card,Phoenix,Beauty,21.07,30.83
1,C000116,P000332,1,21.07,2023-01-01,Credit Card,Miami,Books,6.25,19.75
2,C000116,P000019,1,50.52,2023-01-01,Cash,Miami,Beauty,0.0,50.52
3,C000227,P000379,2,33.04,2023-01-01,Cash,Phoenix,Beauty,6.82,61.58
4,C000343,P000027,4,108.6,2023-01-01,Credit Card,Seattle,Home & Kitchen,24.54,327.81


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,200000.0,1.99845,1.069071,1.0,1.0,2.0,3.0,4.0
Price,200000.0,64.83465,76.479172,2.74,17.72,39.6,78.59,455.54
DiscountApplied(%),200000.0,4.905064,8.19717,0.0,0.0,0.0,7.54,30.0
TotalAmount,200000.0,123.078226,177.904795,2.17,26.73,62.91,139.92,1790.25


Unnamed: 0,column,n_unique,top5
0,CustomerID,26360,"C021143:147, C000720:144, C001928:112, C001429..."
1,ProductID,600,"P000213:383, P000470:381, P000512:379, P000090..."
2,PaymentMethod,4,"Credit Card:110230, Debit Card:39902, Digital ..."
3,StoreLocation,8,"Los Angeles:25250, Seattle:25125, Miami:25076,..."
4,ProductCategory,8,"Clothing:28871, Home & Kitchen:28232, Books:26..."


No missing values detected across columns.


In [8]:
# 7) Core Computation: Splits, Labels, and Features
from dataclasses import dataclass
from collections import Counter
from sklearn.metrics import roc_auc_score, average_precision_score

# Number of days after a snapshot in which a purchase counts as a positive label.
LABEL_HORIZON_DAYS = 30
# Look-back length (days) passed to build_features as its feature window.
FEATURE_WINDOW_DAYS = 90

@dataclass
class SplitWindows:
    """Boundary dates of the time-based train/val/test split (built by compute_split_windows)."""
    train_start: pd.Timestamp  # first day of the training window
    val_start: pd.Timestamp  # first day of the validation window
    test_start: pd.Timestamp  # first day of the test window
    last_safe_snapshot: pd.Timestamp  # last date we can safely take a snapshot (max_date - label_horizon)

def compute_split_windows(df: pd.DataFrame, label_horizon_days: int = 30) -> SplitWindows:
    """Derive the train/val/test start dates from the transaction date range.

    The "safe" range runs from the earliest transaction day to the latest day
    minus ``label_horizon_days``. Validation starts 60% of the way through that
    range and test starts 80% of the way through (whole days, truncated).

    NOTE(review): no check is made that the data spans more than the label
    horizon; with a shorter range the cut points fall before the first day.
    """
    dates = df['TransactionDate']
    first_day = dates.min().normalize()
    last_day = dates.max().normalize()
    last_safe_snapshot = last_day - pd.Timedelta(days=label_horizon_days)
    safe_days = (last_safe_snapshot - first_day).days

    def _cut(fraction: float) -> pd.Timestamp:
        # Date lying `fraction` of the way through the safe range.
        return first_day + pd.Timedelta(days=int(safe_days * fraction))

    return SplitWindows(
        train_start=first_day,
        val_start=_cut(0.6),
        test_start=_cut(0.8),
        last_safe_snapshot=last_safe_snapshot,
    )

# Compute the split boundaries once for the whole dataset and log them.
splits = compute_split_windows(df, label_horizon_days=LABEL_HORIZON_DAYS)
_split_summary = (
    f"Split windows: train>={splits.train_start.date()} "
    f"val>={splits.val_start.date()} "
    f"test>={splits.test_start.date()} "
    f"last_safe_snapshot={splits.last_safe_snapshot.date()}"
)
logger.info(_split_summary)

def filter_by_window(df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
    """Return a copy of the rows whose TransactionDate lies in ``[start, end]`` (both inclusive)."""
    dates = df['TransactionDate']
    inside = dates.between(start, end)
    return df.loc[inside].copy()

def last_snapshot_per_customer(df_window: pd.DataFrame) -> pd.DataFrame:
    """Pick the last transaction date per customer within the window as their snapshot (reference) date.

    Args:
        df_window: Transactions with at least 'CustomerID' and 'TransactionDate'.

    Returns:
        DataFrame with exactly one row per customer and the columns
        ['CustomerID', 'snapshot_date'], ordered by CustomerID, with a fresh
        0..n-1 index.
    """
    # Aggregate directly instead of idxmax + .loc: a label-based lookup returns
    # several rows per customer when df_window's index has repeated labels
    # (e.g. after a concat), and the previous full sort was unnecessary.
    snapshots = (
        df_window.groupby('CustomerID', as_index=False)['TransactionDate']
        .max()
        .rename(columns={'TransactionDate': 'snapshot_date'})
    )
    return snapshots.reset_index(drop=True)

def has_purchase_within(df_cust: pd.DataFrame, start_exclusive: pd.Timestamp, end_inclusive: pd.Timestamp) -> bool:
    """Tell whether any transaction date falls in ``(start_exclusive, end_inclusive]``."""
    dates = df_cust['TransactionDate']
    after_start = dates > start_exclusive
    up_to_end = dates <= end_inclusive
    return (after_start & up_to_end).any()

def label_next_30d(df_all: pd.DataFrame, snapshots: pd.DataFrame, horizon_days: int = 30) -> pd.DataFrame:
    """Attach the binary ``label_next_30d`` column to a copy of ``snapshots``.

    A snapshot is labelled 1 when the same customer has a transaction in
    ``(snapshot_date, snapshot_date + horizon_days]`` anywhere in ``df_all``,
    otherwise 0. Customers absent from ``df_all`` are labelled 0.
    """
    horizon = pd.Timedelta(days=horizon_days)
    history = {cid: grp.sort_values('TransactionDate') for cid, grp in df_all.groupby('CustomerID')}
    no_history = pd.DataFrame(columns=df_all.columns)
    flags = [
        int(
            has_purchase_within(
                history.get(snap.CustomerID, no_history),
                snap.snapshot_date,
                snap.snapshot_date + horizon,
            )
        )
        for snap in snapshots.itertuples(index=False)
    ]
    out = snapshots.copy()
    out['label_next_30d'] = flags
    return out

def window_stats(df_hist: pd.DataFrame) -> Dict[str, Any]:
    """Baseline RFM-style features for one customer's history slice.

    Windows are anchored on the most recent transaction date found in
    ``df_hist``. Returns a dict with recency, 30/90-day order counts and
    amounts, the mean 90-day discount, and one 90-day count per known product
    category (``cat_*``) and payment method (``pay_*``).

    NOTE(review): ``recency_days`` is the latest date minus itself, so it is 0
    for every non-empty history in this baseline version.
    """
    if df_hist.empty:
        # No history: counts are zero, recency and mean discount are unknown.
        return {
            'recency_days': np.nan,
            'orders_30d': 0, 'orders_90d': 0,
            'amount_30d': 0.0, 'amount_90d': 0.0,
            'avg_discount_90d': np.nan,
            **{f"cat_{c}": 0 for c in CAT_VALUES},
            **{f"pay_{p}": 0 for p in PAY_VALUES},
        }

    dates = df_hist['TransactionDate']
    latest = dates.max()
    in_30d = dates >= (latest - pd.Timedelta(days=30))
    in_90d = dates >= (latest - pd.Timedelta(days=90))
    orders_90d = int(in_90d.sum())

    # Category and payment mixes are counted over the 90-day slice only.
    recent_90d = df_hist.loc[in_90d]
    cat_counts = Counter(recent_90d['ProductCategory'])
    pay_counts = Counter(recent_90d['PaymentMethod'])

    return {
        'recency_days': (latest - dates.max()).days,
        'orders_30d': int(in_30d.sum()),
        'orders_90d': orders_90d,
        'amount_30d': float(df_hist.loc[in_30d, 'TotalAmount'].sum()),
        'amount_90d': float(recent_90d['TotalAmount'].sum()),
        'avg_discount_90d': float(recent_90d['DiscountApplied(%)'].mean()) if orders_90d > 0 else np.nan,
        **{f"cat_{c}": int(cat_counts.get(c, 0)) for c in CAT_VALUES},
        **{f"pay_{p}": int(pay_counts.get(p, 0)) for p in PAY_VALUES},
    }

# Fixed, sorted vocabularies (as strings) so every feature row gets the same cat_*/pay_* columns.
CAT_VALUES = sorted(set(df['ProductCategory'].astype(str)))
PAY_VALUES = sorted(set(df['PaymentMethod'].astype(str)))

def build_features(df_all: pd.DataFrame, snapshots: pd.DataFrame, feature_window_days: int = 90) -> pd.DataFrame:
    """Build one baseline feature row per snapshot.

    For each (CustomerID, snapshot_date) the customer's transactions dated in
    ``[snapshot_date - feature_window_days, snapshot_date]`` are summarised with
    ``window_stats``; the identifier columns are appended to each row.
    """
    lookback = pd.Timedelta(days=feature_window_days)
    history_by_customer = {
        cid: grp.sort_values('TransactionDate') for cid, grp in df_all.groupby('CustomerID')
    }
    empty_history = pd.DataFrame(columns=df_all.columns)

    rows = []
    for snap in snapshots.itertuples(index=False):
        customer_id, snapshot_date = snap.CustomerID, snap.snapshot_date
        hist = history_by_customer.get(customer_id, empty_history)
        if not hist.empty:
            dates = hist['TransactionDate']
            hist = hist.loc[(dates <= snapshot_date) & (dates >= (snapshot_date - lookback))]
        feats = window_stats(hist)
        feats.update({'CustomerID': customer_id, 'snapshot_date': snapshot_date})
        rows.append(feats)
    return pd.DataFrame.from_records(rows)

# Build split-specific snapshots: each window ends the day before the next one
# starts; the test window ends at the last safe snapshot date.
_one_day = pd.Timedelta(days=1)
train_df = filter_by_window(df, splits.train_start, splits.val_start - _one_day)
val_df   = filter_by_window(df, splits.val_start, splits.test_start - _one_day)
test_df  = filter_by_window(df, splits.test_start, splits.last_safe_snapshot)

snapshots_train, snapshots_val, snapshots_test = (
    last_snapshot_per_customer(window) for window in (train_df, val_df, test_df)
)
logger.info(f"Snapshots: train={len(snapshots_train)} val={len(snapshots_val)} test={len(snapshots_test)}")

# Labels are looked up against the full transaction table, not only the split window.
labels_train, labels_val, labels_test = (
    label_next_30d(df, snaps, LABEL_HORIZON_DAYS)
    for snaps in (snapshots_train, snapshots_val, snapshots_test)
)
_pos_rates = [lbl['label_next_30d'].mean() for lbl in (labels_train, labels_val, labels_test)]
logger.info(f"Positives (train/val/test): {_pos_rates[0]:.3f} / {_pos_rates[1]:.3f} / {_pos_rates[2]:.3f}")

# Features
X_train, X_val, X_test = (
    build_features(df, snaps, FEATURE_WINDOW_DAYS)
    for snaps in (snapshots_train, snapshots_val, snapshots_test)
)

# Label vectors as plain arrays, in the same row order as the snapshots.
y_train = labels_train['label_next_30d'].values
y_val   = labels_val['label_next_30d'].values
y_test  = labels_test['label_next_30d'].values

logger.info(f"Feature shapes: X_train={X_train.shape}, X_val={X_val.shape}, X_test={X_test.shape}")

2025-10-25 23:31:24,701 | INFO | Split windows: train>=2023-01-01 val>=2023-06-29 test>=2023-08-28 last_safe_snapshot=2023-10-27
2025-10-25 23:31:24,794 | INFO | Snapshots: train=23962 val=16693 test=16794
2025-10-25 23:31:24,794 | INFO | Snapshots: train=23962 val=16693 test=16794
2025-10-25 23:31:56,220 | INFO | Positives (train/val/test): 0.175 / 0.253 / 0.253
2025-10-25 23:31:56,220 | INFO | Positives (train/val/test): 0.175 / 0.253 / 0.253
2025-10-25 23:33:10,885 | INFO | Feature shapes: X_train=(23962, 20), X_val=(16693, 20), X_test=(16794, 20)
2025-10-25 23:33:10,885 | INFO | Feature shapes: X_train=(23962, 20), X_val=(16693, 20), X_test=(16794, 20)


In [None]:
# 8) Model: Logistic Regression Baseline and Evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Every column except the identifiers is a model input (all are numeric counts or amounts).
non_feature_cols = {'CustomerID','snapshot_date'}
feature_cols = [col for col in X_train.columns if col not in non_feature_cols]
numeric_features = feature_cols

# Missing values become 0.0, then columns are scaled to unit variance without centring.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scale', StandardScaler(with_mean=False)),
])
preprocess = ColumnTransformer(
    transformers=[('num', num_pipe, numeric_features)],
    remainder='drop',
)

# Class-weighted logistic regression as the baseline classifier.
clf = LogisticRegression(C=1.0, class_weight='balanced', solver='lbfgs', max_iter=1000)
pipe = Pipeline(steps=[('prep', preprocess), ('clf', clf)])
pipe.fit(X_train[numeric_features], y_train)

def evaluate_split(name: str, X: pd.DataFrame, y: np.ndarray) -> Dict[str, float]:
    """Score the fitted global ``pipe`` on one split and log the metrics.

    Returns a dict with ROC-AUC, PR-AUC (average precision) and the share of
    all positives captured among the 10% highest-scored rows (NaN when the
    split has no positives).
    """
    scores = pipe.predict_proba(X[numeric_features])[:, 1]
    roc = roc_auc_score(y, scores)
    pr = average_precision_score(y, scores)

    # Recall@Top10%: positives among the top-scored tenth, relative to all positives.
    top_n = max(1, int(0.10 * len(scores)))
    top_idx = np.argsort(scores)[-top_n:]
    total_pos = y.sum()
    if total_pos > 0:
        recall_at_k = float(y[top_idx].sum()) / float(total_pos)
    else:
        recall_at_k = np.nan

    logger.info(f"{name} ROC-AUC={roc:.3f} PR-AUC={pr:.3f} Recall@Top10%={recall_at_k:.3f}")
    return {"roc_auc": float(roc), "pr_auc": float(pr), "recall_at_10pct": recall_at_k}

# Evaluate validation first, then test; results are keyed by the lower-cased split name.
metrics = {
    _split.lower(): evaluate_split(_split, _X, _y)
    for _split, _X, _y in (('VAL', X_val, y_val), ('TEST', X_test, y_test))
}
save_json(metrics, output_dir / 'metrics_next_purchase_30d.json')
metrics

In [None]:
# 9) Curves and Plots
def plot_curves(X: pd.DataFrame, y: np.ndarray, tag: str):
    """Save side-by-side ROC and precision-recall curves for one split.

    Scores come from the fitted global ``pipe``. The figure is written to
    ``output_dir / curves_<tag lower-cased>.png`` and then closed, so nothing
    is rendered inline.
    """
    scores = pipe.predict_proba(X[numeric_features])[:, 1]
    fpr, tpr, _ = roc_curve(y, scores)
    precision, recall, _ = precision_recall_curve(y, scores)

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve plus the chance diagonal for reference.
    ax_roc.plot(fpr, tpr, label=f'ROC-AUC={roc_auc_score(y, scores):.3f}')
    ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax_roc.set_title(f'ROC – Next purchase in 30 days – {tag}')
    ax_roc.set_xlabel('False Positive Rate (FPR)')
    ax_roc.set_ylabel('True Positive Rate (TPR)')
    ax_roc.legend()

    # Right panel: precision-recall curve.
    ax_pr.plot(recall, precision, label=f'PR-AUC={average_precision_score(y, scores):.3f}')
    ax_pr.set_title(f'Precision–Recall – Next 30 days – {tag}')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.legend()

    fig.tight_layout()
    out_path = output_dir / f'curves_{tag.lower()}.png'
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
    logger.info(f"Saved curves to {out_path}")

# Draw and save the curves for the validation split, then the test split.
for _tag, _X, _y in (('VAL', X_val, y_val), ('TEST', X_test, y_test)):
    plot_curves(_X, _y, _tag)

In [None]:
# 10) Save Outputs and Artifacts
# Persist each split's snapshots with a `split` column, then re-save the baseline metrics.
for _split, _snaps in (('train', snapshots_train), ('val', snapshots_val), ('test', snapshots_test)):
    _snaps.assign(split=_split).to_csv(output_dir / f'snapshots_{_split}.csv', index=False)
save_json(metrics, output_dir / 'metrics_next_purchase_30d.json')
print("Saved snapshots and metrics to outputs/")

In [None]:
# 11) Inline Unit Tests (pytest) for Helpers — Python-only runner
import sys, tempfile, subprocess, textwrap
import json as _json
import pandas as pd

# Source of the throw-away test module. NOTE(review): these tests exercise plain
# json/pandas behaviour only; they do not import the notebook's own helper functions.
TEST_CODE = textwrap.dedent(
    """
    import json as _json
    from pathlib import Path
    import pandas as pd
    import pytest

    def test_json_roundtrip(tmp_path: Path):
        obj = {"a": 1, "b": [1,2,3]}
        p = tmp_path / "x.json"
        p.write_text(_json.dumps(obj))
        got = _json.loads(p.read_text())
        assert got == obj

    def test_pandas_read_missing(tmp_path: Path):
        with pytest.raises(FileNotFoundError):
            _ = pd.read_csv(tmp_path / "missing.csv")
    """
)

# Write the module into a TemporaryDirectory so the file (and anything pytest
# creates next to it) is removed when the run finishes; the previous
# NamedTemporaryFile(delete=False) left a stray test file behind on every run.
with tempfile.TemporaryDirectory() as _tmp_dir:
    test_path = os.path.join(_tmp_dir, "inline_helpers_test.py")
    with open(test_path, "w") as f:
        f.write(TEST_CODE)

    print("Running pytest on", test_path)
    # Run pytest in a child interpreter so its exit code can be inspected.
    res = subprocess.run([sys.executable, "-m", "pytest", "-q", test_path], text=True, capture_output=True)

print(res.stdout)
if res.returncode != 0:
    # Surface pytest's stderr and stop the cell with the same exit code.
    print(res.stderr, file=sys.stderr)
    raise SystemExit(res.returncode)

In [None]:
# 12) Profiling and Timing
import time, pstats, cProfile
# Wall-clock timing of build_features on the first 50 validation snapshots.
start = time.perf_counter()
_ = build_features(df, snapshots_val.head(50), FEATURE_WINDOW_DAYS)  # small sample for speed
elapsed = time.perf_counter() - start
print(f"Feature build (50 snapshots) took {elapsed:.3f}s")

# cProfile run on the first 100 validation snapshots; only the code between
# enable() and disable() is recorded.
prof = cProfile.Profile()
prof.enable()
_ = build_features(df, snapshots_val.head(100), FEATURE_WINDOW_DAYS)
prof.disable()
# Print the 10 entries with the highest cumulative time.
ps = pstats.Stats(prof).sort_stats('cumulative')
ps.print_stats(10)

In [None]:
# 13) Export/Run from VS Code Terminal
import subprocess, sys
from datetime import datetime
# Export this notebook to a timestamped .py script under outputs/ via nbconvert.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
export_py = output_dir / f"next_purchase_30d_{timestamp}.py"
cmd = [sys.executable, "-m", "jupyter", "nbconvert", "--to", "script", str(project_root / "notebooks" / "next_purchase_30d.ipynb"), "--output", str(export_py)]
print("Exporting to script:", " ".join(cmd))
try:
    _export_result = subprocess.run(cmd, check=False)
except Exception as e:
    # Raised only when the child process could not be started at all.
    print("Export failed (ok to ignore if nbconvert not installed):", e)
else:
    # With check=False a failing nbconvert (missing package, missing notebook)
    # does not raise, so the exit code has to be inspected explicitly;
    # otherwise a failed export would go unnoticed.
    if _export_result.returncode != 0:
        print("Export failed (ok to ignore if nbconvert not installed):", f"exit code {_export_result.returncode}")

print("You can run the exported script or prefer running this notebook interactively.")

## 14) Refine feature functions (fix recency and windows)
This section refines how recency and rolling windows are computed to ensure no label leakage and clearer feature semantics.

In [None]:
# 14) Refine feature functions (fix recency and windows)
def window_stats(snapshot_date: pd.Timestamp, df_hist: pd.DataFrame) -> Dict[str, Any]:
    """Refined RFM-style features for one customer, anchored on ``snapshot_date``.

    Args:
        snapshot_date: Reference date; no transaction after it is counted.
        df_hist: The customer's transactions to summarise.

    Returns:
        Dict with ``recency_days`` (days since the latest transaction strictly
        before the snapshot, NaN if there is none in ``df_hist``), order counts
        and amounts for the half-open windows ``(snapshot - 30d, snapshot]`` and
        ``(snapshot - 90d, snapshot]``, the mean 90-day discount, and 90-day
        counts per known category (``cat_*``) and payment method (``pay_*``).
    """
    if df_hist.empty:
        # No history: counts are zero, recency and mean discount are unknown.
        return {
            'recency_days': np.nan,
            'orders_30d': 0, 'orders_90d': 0,
            'amount_30d': 0.0, 'amount_90d': 0.0,
            'avg_discount_90d': np.nan,
            **{f"cat_{c}": 0 for c in CAT_VALUES},
            **{f"pay_{p}": 0 for p in PAY_VALUES},
        }

    dates = df_hist['TransactionDate']

    # Recency looks only at transactions strictly earlier than the snapshot itself.
    earlier = dates[dates < snapshot_date]
    last_prev = earlier.max() if not earlier.empty else pd.NaT
    recency = (snapshot_date - last_prev).days if pd.notna(last_prev) else np.nan

    up_to_snapshot = dates <= snapshot_date
    w30 = (dates > (snapshot_date - pd.Timedelta(days=30))) & up_to_snapshot
    w90 = (dates > (snapshot_date - pd.Timedelta(days=90))) & up_to_snapshot
    orders_90d = int(w90.sum())

    recent_90d = df_hist.loc[w90]
    cat_counts = Counter(recent_90d['ProductCategory'])
    pay_counts = Counter(recent_90d['PaymentMethod'])

    return {
        'recency_days': recency,
        'orders_30d': int(w30.sum()),
        'orders_90d': orders_90d,
        'amount_30d': float(df_hist.loc[w30, 'TotalAmount'].sum()),
        'amount_90d': float(recent_90d['TotalAmount'].sum()),
        'avg_discount_90d': float(recent_90d['DiscountApplied(%)'].mean()) if orders_90d > 0 else np.nan,
        **{f"cat_{c}": int(cat_counts.get(c, 0)) for c in CAT_VALUES},
        **{f"pay_{p}": int(pay_counts.get(p, 0)) for p in PAY_VALUES},
    }

def build_features(df_all: pd.DataFrame, snapshots: pd.DataFrame, feature_window_days: int = 90) -> pd.DataFrame:
    """Build one refined feature row per snapshot.

    For each (CustomerID, snapshot_date) the customer's transactions dated in
    ``[snapshot_date - feature_window_days, snapshot_date]`` are passed to the
    two-argument ``window_stats``; the identifier columns are appended.

    NOTE(review): because history is cut to the feature window before
    ``window_stats`` runs, ``recency_days`` is NaN whenever the previous
    purchase is older than that window.
    """
    lookback = pd.Timedelta(days=feature_window_days)
    history_by_customer = {
        cid: grp.sort_values('TransactionDate') for cid, grp in df_all.groupby('CustomerID')
    }
    empty_history = pd.DataFrame(columns=df_all.columns)

    rows = []
    for snap in snapshots.itertuples(index=False):
        customer_id, snapshot_date = snap.CustomerID, snap.snapshot_date
        hist = history_by_customer.get(customer_id, empty_history)
        if not hist.empty:
            dates = hist['TransactionDate']
            in_window = (dates <= snapshot_date) & (dates >= (snapshot_date - lookback))
            hist = hist.loc[in_window]
        feats = window_stats(snapshot_date, hist)
        feats.update({'CustomerID': customer_id, 'snapshot_date': snapshot_date})
        rows.append(feats)
    return pd.DataFrame.from_records(rows)

## 15) Transform API
Wraps the label and feature building into a reusable transform(transactions, params) function.

In [None]:
# 15) Transform API
def transform(transactions: pd.DataFrame, params: Optional[Dict[str, Any]] = None) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Build features and labels for next-30d prediction using one snapshot per customer per split
    (the customer's last transaction inside the split window).

    transactions: DataFrame with required columns:
      ['CustomerID','ProductID','Quantity','Price','TransactionDate',
       'PaymentMethod','StoreLocation','ProductCategory','DiscountApplied(%)','TotalAmount']
    params: optional dict with keys:
      - label_horizon_days (int)
      - feature_window_days (int)
      - split ('train'|'val'|'test') to select snapshots
    Returns: (X, y) where X are features and y is the binary label Series.
    Raises: ValueError if params['split'] is not one of 'train', 'val', 'test'.
    """
    p = {
        'label_horizon_days': LABEL_HORIZON_DAYS,
        'feature_window_days': FEATURE_WINDOW_DAYS,
        'split': 'train',
    }
    if params:
        p.update(params)
    splits_local = compute_split_windows(transactions, p['label_horizon_days'])
    one_day = pd.Timedelta(days=1)
    split = p['split']
    if split == 'train':
        start, end = splits_local.train_start, splits_local.val_start - one_day
    elif split == 'val':
        start, end = splits_local.val_start, splits_local.test_start - one_day
    elif split == 'test':
        start, end = splits_local.test_start, splits_local.last_safe_snapshot
    else:
        # Previously any unrecognised value (e.g. a typo such as 'valid') silently
        # selected the test window; fail loudly instead.
        raise ValueError(f"Unknown split {split!r}; expected 'train', 'val' or 'test'.")
    dwin = filter_by_window(transactions, start, end)
    snapshots = last_snapshot_per_customer(dwin)
    # Labels and features are both built from the same snapshots, so their rows line up.
    labels = label_next_30d(transactions, snapshots, p['label_horizon_days'])['label_next_30d']
    X = build_features(transactions, snapshots, p['feature_window_days'])
    return X, labels

## 16) Plot helper adjustments
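Re-declares the plot_curves helper from Section 9 with the same implementation, so it keeps using the numeric_features list consistently. This cell only defines the function; it does not draw or save any plots.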

In [None]:
# 16) Plot helper adjustments
def plot_curves(X: pd.DataFrame, y: np.ndarray, tag: str):
    """Save side-by-side ROC and precision-recall curves for one split.

    Scores come from the fitted global ``pipe`` using the ``numeric_features``
    columns. The figure is written to ``output_dir / curves_<tag lower-cased>.png``
    and then closed, so nothing is rendered inline.
    """
    scores = pipe.predict_proba(X[numeric_features])[:, 1]
    fpr, tpr, _ = roc_curve(y, scores)
    precision, recall, _ = precision_recall_curve(y, scores)

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve plus the chance diagonal for reference.
    ax_roc.plot(fpr, tpr, label=f'ROC-AUC={roc_auc_score(y, scores):.3f}')
    ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    ax_roc.set_title(f'ROC – Next purchase in 30 days – {tag}')
    ax_roc.set_xlabel('False Positive Rate (FPR)')
    ax_roc.set_ylabel('True Positive Rate (TPR)')
    ax_roc.legend()

    # Right panel: precision-recall curve.
    ax_pr.plot(recall, precision, label=f'PR-AUC={average_precision_score(y, scores):.3f}')
    ax_pr.set_title(f'Precision–Recall – Next 30 days – {tag}')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.legend()

    fig.tight_layout()
    out_path = output_dir / f'curves_{tag.lower()}.png'
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
    logger.info(f"Saved curves to {out_path}")

## 17) Rebuild features and retrain (refined)
Recomputes features with the refined logic and retrains the model; the refined metrics are saved to a separate JSON file so they can be compared with the initial baseline metrics from Section 8.

In [None]:
# 17) Rebuild features with refined logic and retrain
# Rebuild all three feature tables with the refined window functions.
X_train, X_val, X_test = (
    build_features(df, snaps, FEATURE_WINDOW_DAYS)
    for snaps in (snapshots_train, snapshots_val, snapshots_test)
)
logger.info(f"[REFINED] Feature shapes: X_train={X_train.shape}, X_val={X_val.shape}, X_test={X_test.shape}")

# Retrain baseline: the same `pipe` object is refitted in place, so any later
# call that uses `pipe` now scores with the refined model.
pipe.fit(X_train[numeric_features], y_train)
metrics_refined = {
    _split.lower(): evaluate_split(f'{_split}-REFINED', _X, _y)
    for _split, _X, _y in (('VAL', X_val, y_val), ('TEST', X_test, y_test))
}
save_json(metrics_refined, output_dir / 'metrics_next_purchase_30d_refined.json')
metrics_refined

## 18) Export ranked predictions (who is most likely to buy in 30 days)
Creates per-customer ranked prediction lists for Validation and Test splits with scores, true labels, and a Top-10% flag. Files are saved to the outputs/ folder.

In [None]:
# 18) Export ranked predictions for VAL and TEST
import numpy as _np
import pandas as _pd

def export_ranked_predictions(split_name: str, X: pd.DataFrame, snapshots: pd.DataFrame, labels: pd.DataFrame, top_pct: float = 0.10):
    """Score one split, rank customers by predicted probability and save the list.

    Args:
        split_name: Tag used in the output file name (lower-cased) and the message.
        X: Feature table; identifier columns are taken from it when present,
            otherwise from ``snapshots``.
        snapshots: Snapshot table used as the fallback source of identifiers.
        labels: Table holding the true ``label_next_30d`` values.
        top_pct: Fraction of rows whose score sets the flag cutoff.

    Returns:
        The 20 highest-ranked rows. The full ranking is written to
        ``output_dir / predictions_<split>.csv``.

    NOTE(review): the flag column is always named ``is_top_10_percent``
    regardless of ``top_pct``, and rows tied with the cutoff score are all flagged.
    """
    proba = pipe.predict_proba(X[numeric_features])[:, 1]

    def _identifier(column: str):
        # Prefer the identifier stored alongside the features, else the snapshot table.
        source = X if column in X.columns else snapshots
        return source[column].values

    ranked = pd.DataFrame({
        'CustomerID': _identifier('CustomerID'),
        'snapshot_date': _identifier('snapshot_date'),
        'predicted_probability_30d': proba,
        'label_next_30d': labels['label_next_30d'].values,
    })
    ranked = ranked.sort_values('predicted_probability_30d', ascending=False).reset_index(drop=True)

    ranked_scores = ranked['predicted_probability_30d']
    k = max(1, int(len(ranked) * top_pct))
    if len(ranked) >= k:
        cutoff = ranked_scores.iloc[k - 1]
    else:
        cutoff = ranked_scores.min()
    ranked['is_top_10_percent'] = ranked_scores >= cutoff

    out_path = output_dir / f'predictions_{split_name.lower()}.csv'
    ranked.to_csv(out_path, index=False)
    print(f"Saved {split_name} predictions to {out_path} (top {int(top_pct*100)}% flagged)")
    return ranked.head(20)

# Export and preview the ranked lists for the validation split, then the test split.
for _split_name, _X, _snaps, _labels in (
    ('VAL', X_val, snapshots_val, labels_val),
    ('TEST', X_test, snapshots_test, labels_test),
):
    display(export_ranked_predictions(_split_name, _X, _snaps, _labels))

## 19) What gets saved where (outputs map)
- metrics_next_purchase_30d.json: created in Section 8 (baseline metrics).
- metrics_next_purchase_30d_refined.json: created in Section 17 (refined features retrain).
- curves_val.png and curves_test.png: created in Section 9 (ROC and PR plots for Validation/Test); Section 16 only re-declares the plotting helper and does not save them again.
- snapshots_train.csv, snapshots_val.csv, snapshots_test.csv: created in Section 10 (one snapshot per customer per split).
- predictions_val.csv, predictions_test.csv: created in Section 18 (ranked customers with predicted_probability_30d and is_top_10_percent).

Tip: Open these files in `outputs/` after running the corresponding section to see the latest artifacts.
