In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount, then echo each
# full path on its own line (same traversal order as os.walk yields them).
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-personality-analysis/marketing_campaign.csv


In [26]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from ctgan import CTGAN

# ─────────────────────────────────────────────────────────────
# STEP 1: DATA LOADING & PREPROCESSING
# ─────────────────────────────────────────────────────────────

# Read the raw campaign export; the file is tab-separated despite its
# ".csv" extension, hence the explicit separator.
df = pd.read_csv(
    "/kaggle/input/customer-personality-analysis/marketing_campaign.csv",
    sep="\t",
)

# Quick sanity check on the number of rows / columns that were loaded.
print(f"Original shape: {df.shape}")


Original shape: (2240, 29)


In [27]:
 !pip install ctgan -q

In [28]:
pip install --upgrade ctgan

Note: you may need to restart the kernel to use updated packages.


Step 1 — Data preprocessing

In [29]:
# ── 1a. Clean Income ─────────────────────────────────────────
# Missing incomes are imputed with the column median so no rows are dropped.
df["Income"] = df["Income"].fillna(df["Income"].median())

# ── 1b. Feature Engineering ──────────────────────────────────
# Reference year used to turn a birth year into an age.
CURRENT_YEAR = 2024
df["Age"] = CURRENT_YEAR - df["Year_Birth"]

# Total spend = row-wise sum over every "Mnt*" (amount spent) column.
mnt_cols = [c for c in df.columns if c.startswith("Mnt")]
df["TotalSpent"] = df[mnt_cols].sum(axis=1)

# ── 1c. Simplify Education (→ 3 categories) ──────────────────
edu_map = {
    "Basic"      : "Low",
    "2n Cycle"   : "Mid",
    "Graduation" : "Mid",
    "Master"     : "High",
    "PhD"        : "High",
    # Identity entries keep this cell idempotent: without them, re-running the
    # cell maps the already-simplified labels to NaN and the fillna below
    # silently turns every row into "Mid".
    "Low"        : "Low",
    "Mid"        : "Mid",
    "High"       : "High",
}
# Any label not covered by the map falls back to "Mid".
df["Education"] = df["Education"].map(edu_map).fillna("Mid")

# ── 1d. Simplify Marital Status (→ 2 categories) ─────────────
marital_map = {
    "Single"    : "Single",
    "Divorced"  : "Single",
    "Widow"     : "Single",
    "Alone"     : "Single",
    "Absurd"    : "Single",
    "YOLO"      : "Single",
    "Married"   : "Partnered",
    "Together"  : "Partnered",
    # Identity entry for the same idempotency reason as in edu_map: a second
    # run would otherwise convert every "Partnered" row into "Single".
    "Partnered" : "Partnered",
}
# Any label not covered by the map falls back to "Single".
df["Marital_Status"] = df["Marital_Status"].map(marital_map).fillna("Single")

In [30]:
# ── 1e. Select & clean modelling columns ─────────────────────
# Numeric features (these are later scaled and clustered).
CONTINUOUS_COLS = [
    "Income", "Age", "TotalSpent", "Recency",
    "NumWebPurchases", "NumStorePurchases", "NumCatalogPurchases",
]
# Features that must be handled as categories by the synthesizer.
DISCRETE_COLS = [
    "Education", "Marital_Status", "Kidhome", "Teenhome",
    "AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3",
    "AcceptedCmp4", "AcceptedCmp5", "Response",
]

ALL_COLS = [*CONTINUOUS_COLS, *DISCRETE_COLS]

# Keep only the modelling columns, drop incomplete rows, renumber the index,
# and cast every discrete column to string so CTGAN treats it categorically.
df_clean = (
    df[ALL_COLS]
    .dropna()
    .reset_index(drop=True)
    .astype({name: str for name in DISCRETE_COLS})
)

print(f"Cleaned shape  : {df_clean.shape}")
print(f"Discrete cols  : {DISCRETE_COLS}")

Cleaned shape  : (2240, 17)
Discrete cols  : ['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']


# ─────────────────────────────────────────────────────────────
# STEP 2: CTGAN — TRAIN & GENERATE SYNTHETIC DATA
# ─────────────────────────────────────────────────────────────


In [31]:

print("\n[CTGAN] Training synthesizer for 100 epochs …")

synthesizer = CTGAN(
    epochs=100,
    batch_size=500,
    verbose=True,
)
# Seed the synthesizer so that "Restart & Run All" regenerates the same
# synthetic rows (and therefore the same clusters downstream).
# NOTE(review): GPU execution may still introduce small non-determinism.
synthesizer.set_random_state(42)

# Fit on the modelling columns only: a later cell adds a "source" column to
# df_clean, and re-running this cell afterwards must not train on it.
synthesizer.fit(df_clean[ALL_COLS], discrete_columns=DISCRETE_COLS)

print("\n[CTGAN] Generating 1,500 synthetic rows …")
synthetic_df = synthesizer.sample(1500)

# ── Post-generation sanity fixes ─────────────────────────────
# The generator can emit slightly negative values for quantities that are
# non-negative by definition, so floor those columns at zero.
non_neg_cols = ["Income", "Age", "TotalSpent", "Recency",
                "NumWebPurchases", "NumStorePurchases", "NumCatalogPurchases"]
synthetic_df[non_neg_cols] = synthetic_df[non_neg_cols].clip(lower=0)

# Cap Age to a realistic range
synthetic_df["Age"] = synthetic_df["Age"].clip(18, 100)

print(f"Synthetic data shape: {synthetic_df.shape}")
print("\nSynthetic sample:")
print(synthetic_df.head(3))


[CTGAN] Training synthesizer for 100 epochs …


Gen. (-01.26) | Discrim. (-00.04): 100%|██████████| 100/100 [00:23<00:00,  4.32it/s]


[CTGAN] Generating 1,500 synthetic rows …
Synthetic data shape: (1500, 17)

Synthetic sample:
         Income  Age  TotalSpent  Recency  NumWebPurchases  NumStorePurchases  \
0  93209.827803   63        1576        0                1                  5   
1  25955.680980   74           0       38                2                 11   
2  48841.769087   42         591       76                2                  3   

   NumCatalogPurchases Education Marital_Status Kidhome Teenhome AcceptedCmp1  \
0                    3      High      Partnered       1        0            0   
1                   11       Mid      Partnered       0        1            0   
2                    0       Mid      Partnered       0        1            0   

  AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 Response  
0            0            0            0            1        1  
1            0            0            0            0        0  
2            0            0            0            0       




# ─────────────────────────────────────────────────────────────
# STEP 3: COMBINE & CLUSTER (K-MEANS)
# ─────────────────────────────────────────────────────────────

In [32]:
# ── 3a. Combine real + synthetic ─────────────────────────────
# Tag each row with its origin so the share of synthetic rows per cluster
# can be reported later.
df_clean["source"]    = "real"
synthetic_df["source"] = "synthetic"
combined = pd.concat([df_clean, synthetic_df], ignore_index=True)

print(f"\nCombined dataset shape: {combined.shape}")

# ── 3b. Scale continuous features ────────────────────────────
# K-Means is distance based, so every feature is standardised first.
scaler     = StandardScaler()
X_scaled   = scaler.fit_transform(combined[CONTINUOUS_COLS])

# ── 3c. Elbow Method (print inertias, pick best k) ───────────
print("\n[Elbow Method] Inertia by k:")
inertias = {}
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled)
    inertias[k] = km.inertia_
    print(f"  k={k}  →  inertia={km.inertia_:,.0f}")

# Elbow heuristic.  The inertia drop gained by moving from k-1 to k shrinks as
# k grows, so "largest drop" would always favour the smallest k.  The elbow is
# instead the k after which the drop falls off the most, i.e. the largest
# second difference: drop(k) - drop(k+1).
drops     = {k: inertias[k - 1] - inertias[k] for k in range(3, 9)}
curvature = {k: drops[k] - drops[k + 1] for k in range(3, 8)}
optimal_k = max(curvature, key=curvature.get)  # k at the sharpest bend
optimal_k = max(4, min(optimal_k, 5))          # constrain to 4-5 per requirement
print(f"\n→ Optimal k selected: {optimal_k}")

# ── 3d. Final KMeans model ───────────────────────────────────
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
combined["Cluster"] = kmeans.fit_predict(X_scaled)


Combined dataset shape: (3740, 18)

[Elbow Method] Inertia by k:
  k=2  →  inertia=19,914
  k=3  →  inertia=17,561
  k=4  →  inertia=16,253
  k=5  →  inertia=15,312
  k=6  →  inertia=14,432
  k=7  →  inertia=13,753
  k=8  →  inertia=13,167

→ Optimal k selected: 4


# ─────────────────────────────────────────────────────────────
# STEP 4: CLUSTER PROFILES
# ─────────────────────────────────────────────────────────────


In [33]:
# Features summarised (as per-cluster means) in the profile table.
profile_cols = ["Income", "TotalSpent", "Age",
                "Recency", "NumWebPurchases", "NumStorePurchases"]

cluster_profiles = (
    combined
    .groupby("Cluster")[profile_cols]
    .mean()
    .round(2)
)

# Add cluster size
cluster_profiles["Count"] = combined.groupby("Cluster")["Income"].count()

# Number of synthetic rows per cluster.  A cluster that contains no synthetic
# rows is absent from the filtered groupby, so reindex against the full
# cluster list with 0 — otherwise its percentage would come out as NaN.
synthetic_counts = (
    combined[combined["source"] == "synthetic"]
    .groupby("Cluster")["Income"].count()
    .reindex(cluster_profiles.index, fill_value=0)
)
cluster_profiles["SyntheticPct"] = (
    synthetic_counts
    .div(cluster_profiles["Count"])
    .mul(100)
    .round(1)
)

print("\n" + "="*70)
print("CLUSTER PROFILES — Mean Values")
print("="*70)
print(cluster_profiles.to_string())
print("="*70)



CLUSTER PROFILES — Mean Values
           Income  TotalSpent    Age  Recency  NumWebPurchases  NumStorePurchases  Count  SyntheticPct
Cluster                                                                                               
0        73614.74     1350.65  56.00    47.37             5.51               8.71    841          23.5
1        64922.77      514.98  60.22    71.50             5.64               6.14    788          53.4
2        33673.37      115.59  51.63    47.42             2.03               3.12   1063          11.9
3        68801.37      509.44  58.83    15.32             6.04               5.68   1048          72.0


In [34]:
# Optional: attach a human-readable label to every cluster.
# Tier = rank of the cluster's mean TotalSpent (1 = lowest spend), truncated
# to an integer; the "Cluster N" part is the row position in the table.
spend_order = cluster_profiles["TotalSpent"].rank().astype(int)

label_map = {}
for i, (idx, rank) in enumerate(zip(spend_order.index, spend_order.tolist())):
    label_map[idx] = f"Cluster {i} (Tier {rank})"

# Swap the numeric cluster ids in the index for the readable labels.
cluster_profiles.index = cluster_profiles.index.map(label_map)

print("\nLabelled profiles:")
print(cluster_profiles.loc[:, ["Income", "TotalSpent", "Age", "Count"]].to_string())


Labelled profiles:
                      Income  TotalSpent    Age  Count
Cluster                                               
Cluster 0 (Tier 4)  73614.74     1350.65  56.00    841
Cluster 1 (Tier 3)  64922.77      514.98  60.22    788
Cluster 2 (Tier 1)  33673.37      115.59  51.63   1063
Cluster 3 (Tier 2)  68801.37      509.44  58.83   1048


# ─────────────────────────────────────────────────────────────
# STEP 5 ─ Drift Engine (February 2026 Baseline)
# ─────────────────────────────────────────────────────────────

In [35]:

# Macro-economic inputs to the Financial Stress Index; every value is a percentage.
MARKET_SIGNALS = dict(
    inflation=2.75,
    interest_rate=5.25,   # repo rate
    unemployment=5.00,
    gdp_growth=7.80,      # used as a moderating factor, not as a stress term
)

In [41]:
# One row per persona.  The last three fields are behavioural attributes
# expressed on a 0-1 scale.
CLUSTER_PERSONAS = pd.DataFrame(
    [
        ("C1", "Conservative Low-Income",  28_000,   210, 55, 0.20, 0.25, 0.18),
        ("C2", "Young Mid-Spender",        52_000,   780, 32, 0.55, 0.60, 0.10),
        ("C3", "Affluent High-Spender",    88_000, 1_850, 44, 0.70, 0.75, 0.05),
        ("C4", "Cautious Saver",           45_000,   430, 48, 0.35, 0.40, 0.12),
        ("C5", "Premium Power-Buyer",     130_000, 2_950, 39, 0.85, 0.90, 0.03),
    ],
    columns=[
        "Cluster", "Label", "Income", "TotalSpent", "Age",
        "risk_tolerance", "spending_propensity", "default_prob",
    ],
)

# ─────────────────────────────────────────────────────────────
# CLUSTER-SPECIFIC SENSITIVITY COEFFICIENTS
#   alpha → risk erosion      beta → spending erosion
#   gamma → default risk amplification
# ─────────────────────────────────────────────────────────────

SENSITIVITY = {
    "C1": {"alpha": 0.030, "beta": 0.025, "gamma": 0.020},  # high macro sensitivity
    "C2": {"alpha": 0.020, "beta": 0.018, "gamma": 0.015},  # moderate young earner
    "C3": {"alpha": 0.012, "beta": 0.010, "gamma": 0.008},  # resilient affluent
    "C4": {"alpha": 0.022, "beta": 0.020, "gamma": 0.017},  # cautious, mid-sensitive
    "C5": {"alpha": 0.008, "beta": 0.007, "gamma": 0.005},  # near-immune premium
}


In [37]:
def calculate_fsi(signals: dict) -> float:
    """
    Compute the GDP-moderated Financial Stress Index (FSI).

        raw FSI = 0.4×inflation + 0.3×interest_rate + 0.3×unemployment

    GDP growth acts as a moderator: every percentage point of growth above
    the 6 % baseline dampens the raw FSI by 1.5 %.  Growth at or below the
    baseline leaves the raw FSI untouched.

    Parameters
    ----------
    signals : dict
        Must contain the keys "inflation", "interest_rate", "unemployment"
        and "gdp_growth" (all in percent).

    Returns
    -------
    float
        The moderated FSI.  The three intermediate figures are also printed.

    Raises
    ------
    KeyError
        If one of the four required keys is missing from ``signals``.
    """
    raw_fsi = (
        0.4 * signals["inflation"] +
        0.3 * signals["interest_rate"] +
        0.3 * signals["unemployment"]
    )

    # GDP moderator: each 1 % above 6 % baseline dampens FSI by 1.5 %
    gdp_baseline  = 6.0
    gdp_bonus     = max(0, signals["gdp_growth"] - gdp_baseline)
    # Floor the factor at 0: without it, extreme growth figures push the
    # factor below zero and flip the sign of the stress index.
    dampening     = max(0.0, 1 - (0.015 * gdp_bonus))   # e.g. 7.8 % → factor ≈ 0.973
    moderated_fsi = raw_fsi * dampening

    print(f"  Raw FSI          : {raw_fsi:.4f}")
    print(f"  GDP Dampening    : {dampening:.4f}  (GDP={signals['gdp_growth']}%)")
    print(f"  Moderated FSI    : {moderated_fsi:.4f}")
    return moderated_fsi

In [38]:
def apply_drift(persona_row: pd.Series, fsi: float) -> pd.Series:
    """
    Return a copy of ``persona_row`` with its behavioural attributes drifted
    by one step under the given Financial Stress Index.

      risk_tolerance      ← risk_tolerance      − α × FSI
      spending_propensity ← spending_propensity − β × FSI
      default_prob        ← default_prob        + γ × FSI

    α, β and γ are looked up in SENSITIVITY via the row's "Cluster" value.
    Each result is clipped to the interval [0, 1]; the input row is left
    unmodified.
    """
    sensitivity = SENSITIVITY[persona_row["Cluster"]]
    shifted = persona_row.copy()

    # (attribute, coefficient key, direction of the shift)
    drift_rules = (
        ("risk_tolerance",      "alpha", -1.0),
        ("spending_propensity", "beta",  -1.0),
        ("default_prob",        "gamma", +1.0),
    )
    for attribute, coeff_key, direction in drift_rules:
        moved = persona_row[attribute] + direction * sensitivity[coeff_key] * fsi
        shifted[attribute] = np.clip(moved, 0.0, 1.0)

    return shifted

In [39]:
def predict_purchase_probability(
    income: float,
    total_spent: float,
    risk_tolerance: float,
    fsi: float,
    income_max: float = 130_000,
    spent_max:  float = 2_950,
) -> float:
    """
    Composite purchase-probability score, rounded to two decimals.

    Four components are put on a percentage scale and blended:

      +25 %  income relative to ``income_max``        (capacity)
      +35 %  historical spend relative to ``spent_max`` (habit proxy)
      +30 %  risk tolerance (expected on a 0-1 scale)
      −10 %  FSI relative to an assumed maximum of 10 (stress penalty)

    The blended score is then passed through a logistic curve centred on 50
    with steepness 0.07 and scaled to 100, which tapers the extremes.
    """
    # Percentage-scale components.
    capacity_pct = (income / income_max) * 100
    habit_pct    = (total_spent / spent_max) * 100
    appetite_pct = risk_tolerance * 100
    stress_pct   = (fsi / 10.0) * 100

    blended = (
        0.25 * capacity_pct +
        0.35 * habit_pct    +
        0.30 * appetite_pct -
        0.10 * stress_pct
    )

    # Logistic smoothing (midpoint 50, steepness 0.07), output scaled to 100.
    logistic = 100 / (1 + np.exp(-0.07 * (blended - 50)))
    bounded = np.clip(logistic, 0, 100)
    return round(float(bounded), 2)


def campaign_success_pct(cluster_df: pd.DataFrame, fsi: float) -> pd.Series:
    """
    Score every row of ``cluster_df`` with predict_purchase_probability.

    Parameters
    ----------
    cluster_df : pd.DataFrame
        Needs the columns "Income", "TotalSpent" and "risk_tolerance".
    fsi : float
        Financial Stress Index applied to every row.

    Returns
    -------
    pd.Series
        One purchase-probability score per input row, aligned on the index of
        ``cluster_df`` (no aggregation is performed).
    """
    # A row-wise apply on a frame with no rows hands back a DataFrame rather
    # than a Series, so return an explicitly empty Series in that case.
    if len(cluster_df) == 0:
        return pd.Series(index=cluster_df.index, dtype=float)

    return cluster_df.apply(
        lambda row: predict_purchase_probability(
            income         = row["Income"],
            total_spent    = row["TotalSpent"],
            risk_tolerance = row["risk_tolerance"],
            fsi            = fsi,
        ),
        axis=1,
    )

In [40]:
def _build_drift_report(before_df: pd.DataFrame, after_df: pd.DataFrame, fsi: float) -> pd.DataFrame:
    """Assemble one wide report row per cluster from the before/after persona frames."""
    report_rows = []
    # Both frames hold the same clusters in the same order, so pair them row by row.
    for (_, b), (_, a) in zip(before_df.iterrows(), after_df.iterrows()):
        cluster_id = b["Cluster"]
        coeffs = SENSITIVITY[cluster_id]

        report_rows.append({
            # Identity
            "Cluster"                        : cluster_id,
            "Label"                          : b["Label"],
            # Financials
            "Income"                         : int(b["Income"]),
            "TotalSpent"                     : int(b["TotalSpent"]),
            "Age"                            : int(b["Age"]),
            # Sensitivity coefficients
            "α (risk)"                       : coeffs["alpha"],
            "β (spend)"                      : coeffs["beta"],
            "γ (default)"                    : coeffs["gamma"],
            # BEFORE drift
            "RiskTol_Before"                 : round(b["risk_tolerance"],      4),
            "SpendProp_Before"               : round(b["spending_propensity"], 4),
            "DefaultProb_Before"             : round(b["default_prob"],        4),
            "PurchaseProb_Before (%)"        : round(b["purchase_prob"],       2),
            # AFTER drift (Feb 2026)
            "RiskTol_After"                  : round(a["risk_tolerance"],      4),
            "SpendProp_After"                : round(a["spending_propensity"], 4),
            "DefaultProb_After"              : round(a["default_prob"],        4),
            "PurchaseProb_After (%)"         : round(a["purchase_prob"],       2),
            # Deltas
            "ΔRiskTol"                       : round(a["risk_tolerance"]      - b["risk_tolerance"],      4),
            "ΔSpendProp"                     : round(a["spending_propensity"] - b["spending_propensity"], 4),
            "ΔDefaultProb"                   : round(a["default_prob"]        - b["default_prob"],        4),
            "ΔPurchaseProb (pp)"             : round(a["purchase_prob"]       - b["purchase_prob"],       2),
            # FSI context
            "FSI"                            : round(fsi, 4),
        })

    return pd.DataFrame(report_rows)


def _print_drift_report(report_df: pd.DataFrame, fsi: float) -> None:
    """Print the report as a series of narrow tables followed by a per-cluster verdict."""
    print("\n" + "=" * 65)
    print("  BEHAVIORAL DRIFT REPORT  |  Before vs After (Feb 2026)")
    print("=" * 65)

    # Split wide table into readable blocks
    identity_cols   = ["Cluster", "Label", "Income", "TotalSpent", "Age", "FSI"]
    coeff_cols      = ["Cluster", "α (risk)", "β (spend)", "γ (default)"]
    before_cols     = ["Cluster", "RiskTol_Before", "SpendProp_Before",
                       "DefaultProb_Before", "PurchaseProb_Before (%)"]
    after_cols      = ["Cluster", "RiskTol_After", "SpendProp_After",
                       "DefaultProb_After", "PurchaseProb_After (%)"]
    delta_cols      = ["Cluster", "ΔRiskTol", "ΔSpendProp",
                       "ΔDefaultProb", "ΔPurchaseProb (pp)"]
    success_cols    = ["Cluster", "Label",
                       "PurchaseProb_Before (%)", "PurchaseProb_After (%)",
                       "ΔPurchaseProb (pp)"]

    block_headers = [
        ("── Cluster Identity & Market Context", identity_cols),
        ("── Sensitivity Coefficients per Cluster", coeff_cols),
        ("── BEFORE Drift Attributes",  before_cols),
        ("── AFTER Drift Attributes (Feb 2026 FSI Applied)", after_cols),
        ("── Drift Deltas (After − Before)", delta_cols),
        ("── Campaign Success Probability Summary", success_cols),
    ]

    for title, cols in block_headers:
        print(f"\n{title}")
        print(report_df[cols].to_string(index=False))

    print("\n" + "=" * 65)
    print("  INTERPRETATION GUIDE")
    print("=" * 65)
    # NOTE(review): the "Moderate Stress Zone" wording is fixed text; it is not
    # derived from the FSI value.
    print(f"  FSI (Feb 2026)   : {fsi:.4f}  — Moderate Stress Zone")
    print(f"  Market Signals   : Inflation={MARKET_SIGNALS['inflation']}% | "
          f"Rate={MARKET_SIGNALS['interest_rate']}% | "
          f"Unemployment={MARKET_SIGNALS['unemployment']}% | "
          f"GDP={MARKET_SIGNALS['gdp_growth']}%")
    print()

    for _, row in report_df.iterrows():
        # A cluster counts as resilient while its purchase probability has
        # dropped by less than 3 percentage points.
        direction = "▲ RESILIENT" if row["ΔPurchaseProb (pp)"] > -3 else "▼ AT RISK"
        print(f"  {row['Cluster']} | {row['Label']:<28} | "
              f"Success: {row['PurchaseProb_After (%)']:>5.2f}%  [{direction}]  "
              f"Δ={row['ΔPurchaseProb (pp)']:+.2f}pp")

    print("=" * 65)


def run_drift_engine():
    """
    Run the full drift pipeline on the module-level personas and market signals.

    Steps: compute the FSI from MARKET_SIGNALS, drift every row of
    CLUSTER_PERSONAS, score purchase probability before (FSI = 0) and after
    (computed FSI) the drift, then build and print the comparison report.

    Returns
    -------
    tuple[pd.DataFrame, float]
        The wide report frame (one row per cluster) and the FSI that was applied.
    """
    print("=" * 65)
    print("  DriftLab — Behavioral Drift Engine  |  February 2026")
    print("=" * 65)

    # ── Compute FSI ───────────────────────────────────────────
    print("\n[1] Computing Financial Stress Index (FSI) …")
    fsi = calculate_fsi(MARKET_SIGNALS)

    # ── Apply drift to every cluster ─────────────────────────
    print("\n[2] Applying drift equations to cluster personas …")
    before_df = CLUSTER_PERSONAS.copy()   # copy so the module-level frame is not mutated
    after_rows = [apply_drift(row, fsi) for _, row in before_df.iterrows()]
    after_df  = pd.DataFrame(after_rows).reset_index(drop=True)

    # ── Compute purchase probabilities ───────────────────────
    print("\n[3] Computing Purchase Probability Scores …")
    before_df["purchase_prob"] = campaign_success_pct(before_df, fsi=0)   # no stress
    after_df["purchase_prob"]  = campaign_success_pct(after_df,  fsi=fsi) # with FSI

    # ── Assemble and print the side-by-side report ───────────
    report_df = _build_drift_report(before_df, after_df, fsi)
    _print_drift_report(report_df, fsi)

    return report_df, fsi


In [42]:
if __name__ == "__main__":
    # Run the engine; it prints the full report and hands back the report
    # frame together with the FSI that was applied.
    report_df, fsi = run_drift_engine()

    # Persist the report (without the row index) to the working directory
    # for downstream use.
    report_df.to_csv(path_or_buf="driftlab_report_feb2026.csv", index=False)
    print("\n✔  Report saved to  driftlab_report_feb2026.csv")


  DriftLab — Behavioral Drift Engine  |  February 2026

[1] Computing Financial Stress Index (FSI) …
  Raw FSI          : 4.1750
  GDP Dampening    : 0.9730  (GDP=7.8%)
  Moderated FSI    : 4.0623

[2] Applying drift equations to cluster personas …

[3] Computing Purchase Probability Scores …

  BEHAVIORAL DRIFT REPORT  |  Before vs After (Feb 2026)

── Cluster Identity & Market Context
Cluster                   Label  Income  TotalSpent  Age    FSI
     C1 Conservative Low-Income   28000         210   55 4.0623
     C2       Young Mid-Spender   52000         780   32 4.0623
     C3   Affluent High-Spender   88000        1850   44 4.0623
     C4          Cautious Saver   45000         430   48 4.0623
     C5     Premium Power-Buyer  130000        2950   39 4.0623

── Sensitivity Coefficients per Cluster
Cluster  α (risk)  β (spend)  γ (default)
     C1     0.030      0.025        0.020
     C2     0.020      0.018        0.015
     C3     0.012      0.010        0.008
     C4     0.022