In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Load the raw transaction table and parse the timestamp column.
# errors="coerce" converts unparseable timestamps to NaT instead of raising.
df = pd.read_csv("../data/raw/data.csv")
df = df.assign(
    TransactionStartTime=pd.to_datetime(df["TransactionStartTime"], errors="coerce")
)

In [3]:
# Reference point for recency: one day after the most recent transaction.
latest_transaction_time = df["TransactionStartTime"].max()
snapshot_date = latest_transaction_time + pd.Timedelta(days=1)

In [4]:
# Build one row per customer with the three RFM metrics:
#   Recency   - whole days between snapshot_date and the customer's latest transaction
#   Frequency - number of non-null TransactionId values
#   Monetary  - sum of Amount
# The latest timestamp is aggregated with the built-in "max" and converted to
# days in a single vectorised step, instead of calling a Python lambda once per
# customer. Output columns and their order are unchanged:
# CustomerId, Recency, Frequency, Monetary.
rfm = (
    df.groupby("CustomerId")
    .agg(
        Recency=("TransactionStartTime", "max"),
        Frequency=("TransactionId", "count"),
        Monetary=("Amount", "sum"),
    )
    # At this point "Recency" still holds the last transaction timestamp;
    # replace it in place with the day difference to the snapshot.
    .assign(Recency=lambda per_customer: (snapshot_date - per_customer["Recency"]).dt.days)
    .reset_index()
)

In [5]:
# Standardise the three RFM columns before clustering.
rfm_metrics = rfm[["Recency", "Frequency", "Monetary"]]
scaler = StandardScaler().fit(rfm_metrics)
rfm_scaled = scaler.transform(rfm_metrics)

In [6]:
# Segment customers into three clusters on the scaled RFM metrics.
# n_init is set explicitly: scikit-learn changed its default from 10 to "auto"
# (version 1.4), so leaving it unset gives version-dependent cluster
# assignments and a FutureWarning on the releases in between.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

In [7]:
# Mean RFM profile of each cluster.
rfm_by_cluster = rfm.groupby("Cluster")[["Recency", "Frequency", "Monetary"]]
cluster_stats = rfm_by_cluster.agg("mean")

# The cluster with the largest mean Recency is taken as the high-risk segment.
# NOTE(review): Frequency and Monetary means are computed above but are not
# consulted when picking the cluster - confirm this is intended.
mean_recency = cluster_stats["Recency"]
high_risk_cluster = mean_recency.idxmax()

In [8]:
# Binary proxy target: 1 for customers in the high-risk cluster, 0 otherwise.
in_high_risk_cluster = rfm["Cluster"].eq(high_risk_cluster)
rfm["is_high_risk"] = in_high_risk_cluster.astype(int)

In [9]:
# Persist the per-customer RFM table (including Cluster and is_high_risk)
# without writing the positional index.
rfm.to_csv(
    path_or_buf="../data/processed/rfm_with_labels.csv",
    index=False,
)

In [11]:
# Load the previously prepared feature table that the label will be attached to.
features = pd.read_csv(filepath_or_buffer="../data/processed/final_features.csv")

In [12]:
# Attach the proxy label to every feature row via CustomerId.
# A left join keeps all feature rows; rows whose CustomerId has no match in
# rfm get a missing label, which is then filled with 0 (not high risk).
# NOTE(review): unmatched customers are silently labelled 0 - confirm intended.
risk_labels = rfm[["CustomerId", "is_high_risk"]]
final_df = pd.merge(features, risk_labels, how="left", on="CustomerId").assign(
    is_high_risk=lambda merged: merged["is_high_risk"].fillna(0).astype(int)
)

In [13]:
# Write the labelled feature table (no positional index) and confirm completion.
final_df.to_csv(
    path_or_buf="../data/processed/final_with_target.csv",
    index=False,
)

print("✅ final_with_target.csv created with RFM proxy label.")

✅ final_with_target.csv created with RFM proxy label.
