In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load data (reuse Stage 08 synthetic if you don't have real project data)
DATA = Path("data/raw/eda_homework.csv")
if DATA.exists():
    # Real project data: parse `date` on load so later cells can sort on it.
    df = pd.read_csv(DATA, parse_dates=["date"])
else:
    # Fallback synthetic data. Seeded so every run rebuilds the same frame.
    np.random.seed(0)
    n = 100
    df = pd.DataFrame({
        # The rolling-mean cell sorts on and displays `date`, so the fallback
        # must provide it too: one row per consecutive day. This column uses
        # no random draws, so the seeded values below are unaffected.
        'date': pd.date_range("2021-02-01", periods=n, freq="D"),
        'income': np.random.normal(60000, 15000, n).astype(int),
        'spend': np.random.normal(2000, 600, n).astype(int),
        'credit_score': np.random.normal(680, 50, n).astype(int)
    })
df.head()

Unnamed: 0,date,region,age,income,transactions,spend
0,2021-02-01,West,37.6,28086.81,4,73.35
1,2021-02-02,North,43.0,33034.75,1,52.37
2,2021-02-03,South,38.2,50045.39,2,131.85
3,2021-02-04,South,24.9,39467.28,4,147.58
4,2021-02-05,South,59.8,31201.65,1,86.76


In [10]:
# Feature 1: spend expressed as a fraction of income.
# A zero income would make the division produce +/-inf (or NaN for 0/0); inf
# values distort summary statistics and are rejected by many estimators.
# `.where` masks zero incomes to NaN so those rows get a missing ratio instead.
df['spend_income_ratio'] = df['spend'] / df['income'].where(df['income'] != 0)
df[['income','spend','spend_income_ratio']].head()

Unnamed: 0,income,spend,spend_income_ratio
0,28086.81,73.35,0.002612
1,33034.75,52.37,0.001585
2,50045.39,131.85,0.002635
3,39467.28,147.58,0.003739
4,31201.65,86.76,0.002781


### Feature 1 — Spend-to-Income Ratio
- **Rationale:** Captures proportionality of spending relative to earnings.  
- **Why it matters:** Someone spending 3,000 with income 30,000 is different from spending 3,000 with income 100,000.  
- **Connection to EDA:** In Stage 08, income was right-skewed and positively related to spend. Ratio normalizes that relationship.

In [11]:
# Feature 2: trailing mean of `spend` over the current row and the two before it.
# Rolling windows follow row order, so rows are put in chronological order first.
# The sort is guarded (same pattern as the `credit_score` check further down) so
# a frame without a `date` column does not raise KeyError; in that case the
# window simply follows the existing row order.
if 'date' in df.columns:
    df = df.sort_values('date').copy()  # ensure date order if using synthetic Stage 08 data
# min_periods=1: the first two rows average over the history available so far
# instead of being NaN.
df['rolling_spend_mean'] = df['spend'].rolling(window=3, min_periods=1).mean()
# Preview only the columns that actually exist in this frame.
df[[c for c in ('date','spend','rolling_spend_mean') if c in df.columns]].head(10)

Unnamed: 0,date,spend,rolling_spend_mean
0,2021-02-01,73.35,73.35
1,2021-02-02,52.37,62.86
2,2021-02-03,131.85,85.856667
3,2021-02-04,147.58,110.6
4,2021-02-05,86.76,122.063333
5,2021-02-06,156.58,130.306667
6,2021-02-07,187.93,143.756667
7,2021-02-08,75.47,139.993333
8,2021-02-09,134.59,132.663333
9,2021-02-10,40.18,83.413333


### Feature 2 — Rolling Spend Mean (3-day window)
- **Rationale:** Smooths noise and captures short-term spending trends.  
- **Why it matters:** Helps models detect momentum or seasonality in spending behavior.  
- **Connection to EDA:** Stage 08 scatter plots showed noisy relationships; a rolling average can reveal clearer patterns.

In [12]:
# Feature 3: income x credit_score interaction, built only when the frame
# actually has a `credit_score` column.
has_credit_score = 'credit_score' in df.columns
if has_credit_score:
    df['income_x_credit'] = df['income'] * df['credit_score']
else:
    # Say so explicitly; otherwise the cell finishes with no output and the
    # reader cannot tell the feature was skipped.
    print("credit_score column not found; skipping income_x_credit")
# Jupyter only renders a cell's last *top-level* expression. A `.head()` nested
# inside the `if` body is evaluated and thrown away, so the preview lives here.
# When the feature was skipped this evaluates to None, which renders nothing.
df[['income','credit_score','income_x_credit']].head() if has_credit_score else None

### Feature 3 — Income × Credit Score (Interaction)
- **Rationale:** Combines earning power with creditworthiness.  
- **Why it matters:** High income with poor credit may differ from moderate income with excellent credit.  
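- **Connection to EDA:** Correlations between income and credit were weak individually; interaction can capture nonlinear effects.  
- **Note:** This feature is only created when a `credit_score` column is present. The synthetic fallback data includes one; the CSV previewed at the top of this notebook does not, so the cell above produces no output for it.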

In [13]:
# Persist the engineered frame under data/processed, creating the folder
# (and any missing parents) on first run.
OUT = Path("data/processed")
OUT.mkdir(parents=True, exist_ok=True)

# Build the target path once and reuse it for both the write and the message.
features_csv = OUT / "eda_with_features.csv"
df.to_csv(features_csv, index=False)  # index=False: do not write the row index as a column
print("Saved to", features_csv)

Saved to data/processed/eda_with_features.csv
