In [1]:
# 02-Feature-Engineering.ipynb
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 0) Create the folder for feature CSVs
os.makedirs("data/features", exist_ok=True)

# 1) Load raw data
df = pd.read_csv("data/raw/churn_data.csv", parse_dates=["join_date"])

# 2) Outlier removal (IQR method on key numerics)
num_cols = ["tenure_days", "num_logins", "avg_session_min", "monthly_spend"]
df_clean = df.copy()
for col in num_cols:
    q1, q3 = df_clean[col].quantile([0.25, 0.75])
    iqr    = q3 - q1
    lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
    df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
print(f"Rows before/after outlier removal: {len(df)} → {len(df_clean)}")

# 3) Feature creation
# 3a) Tenure in months
df_clean["tenure_months"] = (df_clean["tenure_days"] / 30).round(1)

# 3b) Simulate last_login & compute recency
np.random.seed(42)
df_clean["last_login"] = (
    pd.to_datetime(df_clean["join_date"])
    + pd.to_timedelta(
        np.random.randint(0, df_clean["tenure_days"], size=len(df_clean)),
        unit="D"
    )
)
df_clean["recency_days"] = (pd.Timestamp.now().normalize() - df_clean["last_login"]).dt.days

# 3c) Total spend over tenure
df_clean["monetary_total"] = (df_clean["monthly_spend"] * df_clean["tenure_months"]).round(2)

# 3d) Age buckets
df_clean["age_bin"] = pd.cut(
    df_clean["age"],
    bins=[17, 25, 35, 50, 70],
    labels=["18-25","26-35","36-50","51-70"]
)

# 4) One-hot encode categorical features
df_feat = pd.get_dummies(
    df_clean,
    columns=["gender", "age_bin"],
    drop_first=True
)

# 5) Select & scale features
numeric_feats = ["tenure_months", "recency_days", "num_logins", "avg_session_min", "monetary_total"]
cat_feats     = [c for c in df_feat.columns if c.startswith("gender_") or c.startswith("age_bin_")]

feature_cols = numeric_feats + cat_feats
X = df_feat[feature_cols]
y = df_feat["churn"]

scaler = StandardScaler()
X[numeric_feats] = scaler.fit_transform(X[numeric_feats])

# 6) Train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 7) Persist to CSV for modeling
X_train.to_csv("data/features/X_train.csv", index=False)
X_test .to_csv("data/features/X_test.csv",  index=False)
y_train.to_csv("data/features/y_train.csv", index=False)
y_test .to_csv("data/features/y_test.csv",  index=False)

print("✅ Saved feature files to data/features/")


Rows before/after outlier removal: 500 → 493
✅ Saved feature files to data/features/


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_feats] = scaler.fit_transform(X[numeric_feats])
