# FE Validation Notebook

이 노트북에서는 **좋은 FE(Feature Engineering)를 찾는 것**에만 집중합니다.

- 공통 전처리: 간단한 결측치 처리 + 현재 설계된 FE 4개 생성
- 기본 모델: `StandardScaler + LogisticRegression(class_weight="balanced")`
- 목표: 어떤 FE 조합이 F1 / AUC 향상에 도움이 되는지 순차적으로 비교

> 최종적으로 괜찮다고 판단된 FE만, 나중에 2번이 만드는 파이프라인에 반영하도록 제안하는 용도입니다.


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Relative path of the raw CSV that is read below.
DATA_PATH = "../data/raw_data.csv"

# Load the raw data into the notebook-level `df`; make_fe_dataframe() copies it.
df = pd.read_csv(DATA_PATH)
print("df shape =", df.shape)
print(df.columns)


df shape = (8000, 12)
Index(['user_id', 'gender', 'age', 'country', 'subscription_type',
       'listening_time', 'songs_played_per_day', 'skip_rate', 'device_type',
       'ads_listened_per_week', 'offline_listening', 'is_churned'],
      dtype='object')


In [6]:
def make_fe_dataframe(source_df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Return a copy of the data with median-imputed numerics and the 4 designed FE columns.

    According to the original author's note, the FE logic is reused from
    preprocessing_validation.ipynb.

    Args:
        source_df: Frame to transform. When omitted, the notebook-level ``df``
            is used, so existing ``make_fe_dataframe()`` calls keep working.
            The input frame is never mutated; a copy is transformed.

    Returns:
        A new DataFrame containing the original columns plus, whenever the
        required source columns are present:

        - ``engagement_score``: ``listening_time * songs_played_per_day``
        - ``listening_time_bin``: categorical with labels low / mid / high
        - ``skip_rate_cap``: ``skip_rate`` clipped to ``[0, 1.5]``
        - ``ads_pressure``: ``ads_listened_per_week / listening_time``,
          0.0 where ``listening_time`` is 0
    """
    df_tmp = (df if source_df is None else source_df).copy()

    # Median imputation for the numeric columns that exist and contain NaN.
    num_cols = [
        "age",
        "listening_time",
        "songs_played_per_day",
        "skip_rate",
        "ads_listened_per_week",
        "offline_listening",
    ]
    for c in num_cols:
        if c in df_tmp.columns and df_tmp[c].isnull().any():
            df_tmp[c] = df_tmp[c].fillna(df_tmp[c].median())

    # 1) engagement_score
    if {"listening_time", "songs_played_per_day"}.issubset(df_tmp.columns):
        df_tmp["engagement_score"] = (
            df_tmp["listening_time"] * df_tmp["songs_played_per_day"]
        )

    # 2) listening_time_bin
    if "listening_time" in df_tmp.columns:
        bin_labels = ["low", "mid", "high"]
        try:
            df_tmp["listening_time_bin"] = pd.qcut(
                df_tmp["listening_time"], 3, labels=bin_labels
            )
        except ValueError:
            # qcut raises ValueError when the tercile edges are not unique
            # (many identical values). Fall back to fixed edges; the open
            # upper edge keeps the bins increasing even when the column
            # maximum is <= 180, where [0, 60, 180, max] would itself raise.
            df_tmp["listening_time_bin"] = pd.cut(
                df_tmp["listening_time"],
                bins=[0, 60, 180, np.inf],
                labels=bin_labels,
                include_lowest=True,
            )

    # 3) skip_rate_cap
    if "skip_rate" in df_tmp.columns:
        df_tmp["skip_rate_cap"] = df_tmp["skip_rate"].clip(lower=0, upper=1.5)

    # 4) ads_pressure
    if {"ads_listened_per_week", "listening_time"}.issubset(df_tmp.columns):
        # Zero listening time is mapped to NaN to avoid dividing by zero, then
        # the undefined ratio is set to 0.0 -- the same convention
        # make_fe_dataframe_v2 uses for songs_per_minute -- so estimators that
        # reject NaN inputs can still be fitted on this column.
        lt_nonzero = df_tmp["listening_time"].replace(0, np.nan)
        df_tmp["ads_pressure"] = (
            df_tmp["ads_listened_per_week"] / lt_nonzero
        ).fillna(0.0)

    return df_tmp


def evaluate_with_logistic(feature_cols):
    """Score a StandardScaler + LogisticRegression baseline on the given columns.

    The FE frame is rebuilt with make_fe_dataframe(), split 80/20 with
    stratification on ``is_churned`` (random_state=42), and the pipeline is
    fitted on the training part only.

    Args:
        feature_cols: Column names of the FE frame to use as model inputs.

    Returns:
        ``(f1, auc)`` measured on the validation part: F1 from the predicted
        labels, ROC AUC from the predicted probability of the positive class.
    """
    frame = make_fe_dataframe()
    features = frame[feature_cols]
    target = frame["is_churned"]

    X_tr, X_va, y_tr, y_va = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    scaler_step = ("scaler", StandardScaler())
    model_step = (
        "model",
        LogisticRegression(max_iter=1000, class_weight="balanced"),
    )
    clf = Pipeline([scaler_step, model_step])
    clf.fit(X_tr, y_tr)

    positive_proba = clf.predict_proba(X_va)[:, 1]
    predicted_labels = clf.predict(X_va)

    return f1_score(y_va, predicted_labels), roc_auc_score(y_va, positive_proba)


In [7]:
# Set A uses only the raw numeric columns.
base_num = [
    "age",
    "listening_time",
    "songs_played_per_day",
    "skip_rate",
    "ads_listened_per_week",
    "offline_listening",
]

# The three numeric ones out of the four designed FE columns
# (listening_time_bin is one-hot encoded and tested separately later).
fe_num = ["engagement_score", "skip_rate_cap", "ads_pressure"]

# Set A: raw numeric columns only
f1_A, auc_A = evaluate_with_logistic(base_num)

# Set B: raw numeric columns + the 3 numeric FE columns (ads_pressure included).
# Only FE columns that make_fe_dataframe() actually produced are kept.
fe_exist = [c for c in fe_num if c in make_fe_dataframe().columns]
feature_cols_B = base_num + fe_exist
f1_B, auc_B = evaluate_with_logistic(feature_cols_B)

print("[Set A - 기본 수치형]   F1 = {:.4f}, AUC = {:.4f}".format(f1_A, auc_A))
print("[Set B - 기본+FE 3개] F1 = {:.4f}, AUC = {:.4f}".format(f1_B, auc_B))


[Set A - 기본 수치형]   F1 = 0.3320, AUC = 0.4895
[Set B - 기본+FE 3개] F1 = 0.3351, AUC = 0.4893


In [8]:
# Set C: raw numeric columns + 3 numeric FE columns + one-hot listening_time_bin
df_fe = make_fe_dataframe()

# Same lists as in the Set A/B cell, repeated so this cell can run on its own.
base_num = [
    "age",
    "listening_time",
    "songs_played_per_day",
    "skip_rate",
    "ads_listened_per_week",
    "offline_listening",
]
fe_num = ["engagement_score", "skip_rate_cap", "ads_pressure"]

X_num = df_fe[base_num + [c for c in fe_num if c in df_fe.columns]]

# One-hot encode listening_time_bin; drop_first=True omits the first category's column.
if "listening_time_bin" in df_fe.columns:
    lt_dummies = pd.get_dummies(df_fe["listening_time_bin"], prefix="lt_bin", drop_first=True)
    X_C = pd.concat([X_num, lt_dummies], axis=1)
else:
    X_C = X_num  # fall back to the numeric columns only if the bin column is missing

y = df_fe["is_churned"]

# Same stratified 80/20 split and pipeline as evaluate_with_logistic().
X_train, X_valid, y_train, y_valid = train_test_split(
    X_C, y, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_valid)
y_proba = pipe.predict_proba(X_valid)[:, 1]

f1_C = f1_score(y_valid, y_pred)
auc_C = roc_auc_score(y_valid, y_proba)

print("[Set C - 기본+FE 3개+bin] F1 = {:.4f}, AUC = {:.4f}".format(f1_C, auc_C))

[Set C - 기본+FE 3개+bin] F1 = 0.3369, AUC = 0.4886


In [10]:
from sklearn.ensemble import RandomForestClassifier

def evaluate_with_rf(X, y):
    """Fit a class-balanced RandomForest and score it on a held-out split.

    Args:
        X: Feature matrix.
        y: Binary target; also used to stratify the 80/20 split
            (random_state=42).

    Returns:
        ``(f1, auc)`` on the validation part: F1 from ``predict`` labels and
        ROC AUC from the predicted probability of the positive class.
    """
    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    forest_params = {
        "n_estimators": 300,
        "max_depth": None,
        "min_samples_split": 5,
        "class_weight": "balanced",
        "n_jobs": -1,
        "random_state": 42,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_tr, y_tr)

    positive_proba = forest.predict_proba(X_va)[:, 1]
    predicted_labels = forest.predict(X_va)

    return f1_score(y_va, predicted_labels), roc_auc_score(y_va, positive_proba)

# Rebuild the FE frame and evaluate Sets A/B/C with the RandomForest helper.
# base_num and fe_num come from the earlier cells.
df_fe = make_fe_dataframe()
y = df_fe["is_churned"]

# Set A: raw numeric columns only
X_A = df_fe[base_num]
f1_A_rf, auc_A_rf = evaluate_with_rf(X_A, y)

# Set B: raw numeric columns + the numeric FE columns present in the frame
fe_exist = [c for c in fe_num if c in df_fe.columns]
X_B = df_fe[base_num + fe_exist]
f1_B_rf, auc_B_rf = evaluate_with_rf(X_B, y)

# Set C: Set B + one-hot listening_time_bin
if "listening_time_bin" in df_fe.columns:
    lt_dummies = pd.get_dummies(df_fe["listening_time_bin"], prefix="lt_bin", drop_first=True)
    X_num_C = df_fe[base_num + fe_exist]
    X_C = pd.concat([X_num_C, lt_dummies], axis=1)
else:
    X_C = df_fe[base_num + fe_exist]

f1_C_rf, auc_C_rf = evaluate_with_rf(X_C, y)

print("[RF Set A] F1 = {:.4f}, AUC = {:.4f}".format(f1_A_rf, auc_A_rf))
print("[RF Set B] F1 = {:.4f}, AUC = {:.4f}".format(f1_B_rf, auc_B_rf))
print("[RF Set C] F1 = {:.4f}, AUC = {:.4f}".format(f1_C_rf, auc_C_rf))

[RF Set A] F1 = 0.0994, AUC = 0.5241
[RF Set B] F1 = 0.0950, AUC = 0.5355
[RF Set C] F1 = 0.0996, AUC = 0.5375


In [11]:
import numpy as np
from sklearn.metrics import f1_score

def evaluate_with_rf_best_f1(X, y):
    """Fit a class-balanced RandomForest and report the best F1 over a threshold grid.

    The split and the forest settings are the same as in evaluate_with_rf().
    Positive-class probabilities on the validation part are cut at 17 evenly
    spaced thresholds from 0.1 to 0.9 and the highest F1 is kept.

    NOTE(review): the threshold is chosen on the same validation part that the
    returned F1 is measured on, so ``best_f1`` is an optimistic estimate.

    Args:
        X: Feature matrix.
        y: Binary target; also used to stratify the 80/20 split.

    Returns:
        ``(best_f1, best_th, auc)``. On ties the lowest threshold wins; if no
        threshold yields F1 > 0 the result is ``(0.0, 0.5, auc)``.
    """
    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    forest = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=5,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(X_tr, y_tr)
    positive_proba = forest.predict_proba(X_va)[:, 1]

    thresholds = np.linspace(0.1, 0.9, 17)
    f1_by_threshold = [
        f1_score(y_va, (positive_proba >= cut).astype(int)) for cut in thresholds
    ]

    # max() returns the first of several equal maxima, i.e. the lowest threshold.
    top = max(range(len(thresholds)), key=f1_by_threshold.__getitem__)
    if f1_by_threshold[top] > 0.0:
        best_f1, best_th = f1_by_threshold[top], thresholds[top]
    else:
        best_f1, best_th = 0.0, 0.5

    return best_f1, best_th, roc_auc_score(y_va, positive_proba)

# Example: check the threshold-tuned F1 on Set C (X_C and y from the previous RF cell).
# NOTE: this rebinds auc_C, which the logistic-regression Set C cell also assigns.
best_f1_C, best_th_C, auc_C = evaluate_with_rf_best_f1(X_C, y)
print("[RF Set C] best F1 = {:.4f} @ th={:.2f}, AUC = {:.4f}".format(best_f1_C, best_th_C, auc_C))

[RF Set C] best F1 = 0.4090 @ th=0.15, AUC = 0.5375


In [12]:
# 1) make_fe_dataframe에 새 FE 2개 추가
def make_fe_dataframe_v2() -> pd.DataFrame:
    """Return the make_fe_dataframe() output extended with two extra FE columns.

    Added columns:

    - ``songs_per_minute``: ``songs_played_per_day / listening_time``, with
      0.0 wherever the ratio is undefined (zero listening time). The name
      suggests listening_time is in minutes -- TODO confirm the unit.
    - ``skip_intensity``: ``skip_rate * songs_played_per_day``.
    """
    enriched = make_fe_dataframe().copy()

    songs = enriched["songs_played_per_day"]

    # Replace zero listening time with NaN so the division cannot hit zero;
    # the resulting NaN ratios are then set to 0.0.
    nonzero_listening = enriched["listening_time"].replace(0, np.nan)
    ratio = songs / nonzero_listening
    enriched["songs_per_minute"] = ratio.fillna(0.0)

    enriched["skip_intensity"] = enriched["skip_rate"] * songs

    return enriched

# Build the v2 FE frame (adds songs_per_minute and skip_intensity).
df_fe2 = make_fe_dataframe_v2()
y2 = df_fe2["is_churned"]

# Set D: Set C + songs_per_minute + skip_intensity
base_plus_fe = base_num + fe_num + ["songs_per_minute", "skip_intensity"]

X_num_D = df_fe2[base_plus_fe]

# Append the one-hot listening_time_bin columns when the bin column exists.
if "listening_time_bin" in df_fe2.columns:
    lt_dummies2 = pd.get_dummies(
        df_fe2["listening_time_bin"], prefix="lt_bin", drop_first=True
    )
    X_D = pd.concat([X_num_D, lt_dummies2], axis=1)
else:
    X_D = X_num_D

best_f1_D, best_th_D, auc_D = evaluate_with_rf_best_f1(X_D, y2)
print("[RF Set D] best F1 = {:.4f} @ th={:.2f}, AUC = {:.4f}".format(best_f1_D, best_th_D, auc_D))

[RF Set D] best F1 = 0.4117 @ th=0.10, AUC = 0.5396


In [16]:
# Assumes X_D and y2 were already built by the Set D cell above.
# Same stratified 80/20 split as the evaluation helpers.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_D, y2, test_size=0.2, random_state=42, stratify=y2
)

# Same RandomForest settings as evaluate_with_rf_best_f1(); refitted here
# because the helper does not return the fitted model.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feat_names = X_D.columns

# Pair each feature name with its importance and show the top 20, highest first.
imp_df = pd.DataFrame({"feature": feat_names, "importance": importances})
imp_df.sort_values("importance", ascending=False).head(20)

Unnamed: 0,feature,importance
9,songs_per_minute,0.134559
6,engagement_score,0.13274
10,skip_intensity,0.127132
1,listening_time,0.120712
0,age,0.11747
2,songs_played_per_day,0.103746
3,skip_rate,0.08712
7,skip_rate_cap,0.08531
8,ads_pressure,0.038112
4,ads_listened_per_week,0.032824


In [17]:
# Hand-picked list of the engineered features whose importance looked good.
important_fe = ["engagement_score", "songs_per_minute", "skip_intensity"]

df_fe3 = make_fe_dataframe_v2()
y3 = df_fe3["is_churned"]

# Set E: raw numeric columns + the selected FE columns + one-hot listening_time_bin
X_num_E = df_fe3[base_num + important_fe]

if "listening_time_bin" in df_fe3.columns:
    lt_dummies3 = pd.get_dummies(
        df_fe3["listening_time_bin"], prefix="lt_bin", drop_first=True
    )
    X_E = pd.concat([X_num_E, lt_dummies3], axis=1)
else:
    X_E = X_num_E

best_f1_E, best_th_E, auc_E = evaluate_with_rf_best_f1(X_E, y3)
print("[RF Set E] best F1 = {:.4f} @ th={:.2f}, AUC = {:.4f}".format(best_f1_E, best_th_E, auc_E))

[RF Set E] best F1 = 0.4106 @ th=0.15, AUC = 0.5339


### FE 실험 요약 (RandomForest + best F1 기준)

- Set C (기본 + 수치형 FE 3개 + listening_time_bin 원-핫): best F1 ≈ 0.409, AUC ≈ 0.538  
- Set D (C + songs_per_minute + skip_intensity): best F1 ≈ 0.412, AUC ≈ 0.540  
- Set E (중요 FE만 선택): best F1 ≈ 0.411, AUC ≈ 0.534  

→ 성능/복잡도 균형 상, **최종 추천 FE 세트**:
- base_num: age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening
- fe_selected: (미확정 — 후보: Set E에서 사용한 engagement_score, songs_per_minute, skip_intensity + listening_time_bin 원-핫)
- 모델: RandomForest + threshold 튜닝 (약 th ≈ 0.10~0.15 구간)