In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the parent directory of the notebook importable.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Single seed used for NumPy's global RNG and passed as random_state elsewhere
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Do not truncate columns when displaying wide frames
pd.set_option('display.max_columns', None)

## 1. Load Data

In [2]:
# Read the processed per-user feature table; later cells reference it as `df`
data_path = '../data/user_features.parquet'
df = pd.read_parquet(data_path)

# Report the table dimensions and the mean of the `target` column (printed as
# the churn rate); the trailing expression renders the first rows as cell output
print(
    f"Dataset Shape: {df.shape}",
    f"Churn Rate: {df['target'].mean():.2%}",
    sep="\n",
)
df.head()

Dataset Shape: (19140, 47)
Churn Rate: 22.31%


Unnamed: 0_level_0,gender,level,platform,is_thumbs_up,is_thumbs_down,is_ad,is_error,is_song,length,downgrade,songs_last_1d,errors_last_1d,listen_time_last_1d,unique_artists_last_1d,unique_songs_last_1d,songs_last_3d,errors_last_3d,listen_time_last_3d,unique_artists_last_3d,unique_songs_last_3d,songs_last_7d,errors_last_7d,listen_time_last_7d,unique_artists_last_7d,unique_songs_last_7d,songs_last_14d,errors_last_14d,listen_time_last_14d,unique_artists_last_14d,unique_songs_last_14d,songs_last_30d,errors_last_30d,listen_time_last_30d,unique_artists_last_30d,unique_songs_last_30d,account_lifetime,avg_songs_per_day,thumbs_ratio,errors_per_song,trend_songs_7d_vs_30d,trend_listen_time_7d_vs_30d,total_sessions,avg_days_between_sessions,avg_songs_per_session,avg_session_duration,target,state_freq
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
1000025,M,paid,Windows,94,13,7,1,1662,417296.59169,0,212,0,52648.4459,197,208,535,1,132961.53374,440,502,687,1,173744.74643,552,645,1049,1,265573.95039,803,965,1662,1,417296.59169,1162,1468,100.460382,16.380778,0.878505,0.000602,1.653032,1.66543,17,5.909434,97.764706,24546.858335,1,0.012487
1000035,F,paid,Linux,117,15,6,1,1266,310364.8659,0,170,0,41213.98983,160,168,170,0,41213.98983,160,168,405,1,99705.547,347,388,574,1,140551.62871,472,544,1133,1,278412.10335,835,1042,63.350567,19.673486,0.886364,0.00079,1.429328,1.432487,21,3.016694,60.285714,14779.279329,0,0.010972
1000083,M,paid,Windows,21,2,8,0,501,122606.27093,0,213,0,52170.90103,203,211,250,0,61312.53977,236,247,406,0,100331.33604,358,391,501,0,122606.27093,427,478,501,0,122606.27093,427,478,34.668854,14.045868,0.913043,0.0,3.238931,3.273275,11,3.151714,45.545455,11146.02463,1,0.007524
1000103,F,paid,Linux,2,1,3,0,57,13554.73009,0,5,0,984.08263,5,5,5,0,984.08263,5,5,5,0,984.08263,5,5,18,0,3785.68327,18,18,18,0,3785.68327,18,18,47.459201,1.176247,0.666667,0.0,1.086957,1.039684,3,15.819734,19.0,4518.243363,0,0.031714
1000164,F,paid,Windows,38,6,20,1,847,209060.65753,0,184,0,42443.54099,173,183,216,0,50847.03993,202,215,313,0,75785.07918,286,306,479,1,117207.44962,400,450,513,1,126008.87092,426,480,99.1475,8.457525,0.863636,0.001181,2.438644,2.405699,15,6.609833,56.466667,13937.377169,0,0.021003


## 2. Preprocessing & Splitting

We will use a **Stratified Split** to maintain the churn ratio in both training and test sets.
We will also define a `ColumnTransformer` to handle:
- **Numerical Features**: Median imputation, then Standard Scaling.
- **Categorical Features**: Constant (`'missing'`) imputation, then One-Hot Encoding.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1. Split the frame into the label vector and the feature matrix
y = df['target']
X = df.drop(columns=["target"])

# 2. Assign every feature to a preprocessing group
# 'gender', 'level' and 'platform' are handled as categorical; every remaining
# column (including 'state_freq') is handled as numerical.
categorical_cols = ['gender', 'level', 'platform']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

print(f"Categorical Columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical Columns ({len(numerical_cols)}): {numerical_cols[:5]} ...")

# 3. Hold out 20% of the rows for testing, stratifying on the label so both
# partitions keep the same churn ratio
split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")
print(f"Train Churn Rate: {y_train.mean():.2%}")
print(f"Test Churn Rate: {y_test.mean():.2%}")

Categorical Columns (3): ['gender', 'level', 'platform']
Numerical Columns (43): ['is_thumbs_up', 'is_thumbs_down', 'is_ad', 'is_error', 'is_song'] ...

Training Shape: (15312, 46)
Test Shape: (3828, 46)
Train Churn Rate: 22.32%
Test Churn Rate: 22.31%


In [4]:
# 4. Build the preprocessing transformer
# Imputation and scaling live inside sklearn pipelines, so their statistics are
# learned only from the data the pipeline is fitted on.

# Numerical branch: median imputation followed by standardisation
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch
column_routes = [
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Smoke test: fit on the training split and report the resulting matrix shape
X_train_processed = preprocessor.fit_transform(X_train)
print(f"Processed Feature Matrix Shape: {X_train_processed.shape}")

Processed Feature Matrix Shape: (15312, 50)


## 3. Baseline Model Evaluation

We will evaluate the following industry-standard models:
1.  **Logistic Regression**: Simple baseline for interpretability.
2.  **Random Forest**: Robust bagging ensemble.
3.  **XGBoost**: Gradient boosting (often SOTA for tabular data).
4.  **LightGBM**: Faster and often more accurate gradient boosting.
5.  **CatBoost**: Excellent for categorical features (though we OHE them here).

**Metrics**:
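- **F1-Score**: Harmonic mean of precision and recall; the primary metric here, because churners are the minority class (about 22% of users).
- **ROC-AUC**: How well the model ranks churners above non-churners across all decision thresholds.
- **Accuracy**: Also reported for reference, but on its own it is misleading for imbalanced classes.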

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate
import warnings

# Silence UserWarning / FutureWarning messages so the evaluation output stays readable
for _noisy_category in (UserWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Candidate baselines keyed by display name; each receives the shared seed.
# XGBoost is created without use_label_encoder, which is deprecated.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_SEED,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_SEED,
    ),
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        random_state=RANDOM_SEED,
    ),
    "LightGBM": LGBMClassifier(
        random_state=RANDOM_SEED,
        verbose=-1,
    ),
    "CatBoost": CatBoostClassifier(
        verbose=0,
        random_state=RANDOM_SEED,
    ),
}

# Evaluation Function
def evaluate_models(models, X, y, preprocessor, cv=5):
    """Cross-validate each candidate model behind the shared preprocessor.

    Parameters
    ----------
    models : dict
        Mapping of display name -> unfitted classifier.
    X, y
        Features and labels, passed unchanged to ``cross_validate``.
    preprocessor
        Transformer used as the first step of every pipeline.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate``; the default keeps the 5-fold setup.

    Returns
    -------
    pandas.DataFrame
        One row per model with the mean and standard deviation of F1, ROC-AUC
        and accuracy across folds, plus the rounded per-fold F1 scores, sorted
        by mean F1 (best first). An empty ``models`` mapping yields an empty
        frame with the same columns.
    """
    # Pinning the column list lets an empty result still be sorted by F1;
    # a column-less frame would raise KeyError in sort_values.
    result_columns = [
        "Model",
        "F1 Score (Mean)",
        "F1 Score (Std)",
        "ROC-AUC (Mean)",
        "ROC-AUC (Std)",
        "Accuracy (Mean)",
        "Accuracy (Std)",
        "CV F1 Scores",
    ]

    results = []
    for name, model in models.items():
        # Preprocessing is part of the pipeline, so it is fitted per fold
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])

        cv_results = cross_validate(clf, X, y, cv=cv, scoring=['f1', 'roc_auc', 'accuracy'])

        results.append({
            "Model": name,
            "F1 Score (Mean)": cv_results['test_f1'].mean(),
            "F1 Score (Std)": cv_results['test_f1'].std(),
            "ROC-AUC (Mean)": cv_results['test_roc_auc'].mean(),
            "ROC-AUC (Std)": cv_results['test_roc_auc'].std(),
            "Accuracy (Mean)": cv_results['test_accuracy'].mean(),
            "Accuracy (Std)": cv_results['test_accuracy'].std(),
            "CV F1 Scores": np.round(cv_results['test_f1'], 3)  # per-fold scores
        })
        print(f"Evaluated {name}...")

    summary = pd.DataFrame(results, columns=result_columns)
    return summary.sort_values(by="F1 Score (Mean)", ascending=False)

# Score every baseline on the training split only; the test split is not used here
results_df = evaluate_models(
    models=models,
    X=X_train,
    y=y_train,
    preprocessor=preprocessor,
)
results_df

## 4. Hyperparameter Tuning

We will now optimize the hyperparameters for our top two performing models: **XGBoost** and **CatBoost**.
We use `RandomizedSearchCV`, which is cheaper than an exhaustive grid search because it evaluates only a fixed number (`n_iter`) of randomly sampled parameter combinations instead of every combination in the grid.


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search spaces for the 'classifier' step of each pipeline.
# sklearn addresses a pipeline step's parameters as '<step>__<param>', so the
# prefix is attached once here instead of being repeated on every key.
def _for_classifier_step(grid):
    """Return a copy of `grid` with every key prefixed by 'classifier__'."""
    return {f'classifier__{param}': values for param, values in grid.items()}


xgb_params = _for_classifier_step({
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 8],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5],
})

cat_params = _for_classifier_step({
    'iterations': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
})

# Helper function for tuning
def tune_model(model, params, X, y, preprocessor, n_iter=20, cv=3, scoring='f1'):
    """Run a randomized hyperparameter search for `model` behind the preprocessor.

    Parameters
    ----------
    model
        Unfitted classifier; becomes the 'classifier' step of the pipeline.
    params : dict
        Search space keyed by pipeline parameter name (e.g. 'classifier__max_depth').
    X, y
        Data passed to ``search.fit``.
    preprocessor
        Transformer used as the 'preprocessor' step of the pipeline.
    n_iter : int, default 20
        Number of parameter settings sampled.
    cv : int or cross-validation splitter, default 3
        Cross-validation strategy forwarded to ``RandomizedSearchCV``.
    scoring : str or callable, default 'f1'
        Metric forwarded to ``RandomizedSearchCV``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])

    search = RandomizedSearchCV(
        pipeline,
        param_distributions=params,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        verbose=1,
        random_state=RANDOM_SEED,  # fixes which candidates are sampled
        n_jobs=-1
    )

    search.fit(X, y)
    return search

# XGBoost: search, then report the best cross-validated F1 and its parameters
print("--- Tuning XGBoost ---")
xgb_model = XGBClassifier(eval_metric='logloss', random_state=RANDOM_SEED)
xgb_search = tune_model(
    model=xgb_model,
    params=xgb_params,
    X=X_train,
    y=y_train,
    preprocessor=preprocessor,
)

print(
    f"Best XGBoost F1: {xgb_search.best_score_:.4f}",
    f"Best XGBoost Params: {xgb_search.best_params_}",
    sep="\n",
)

# CatBoost: same procedure with its own search space
print("\n--- Tuning CatBoost ---")
cat_model = CatBoostClassifier(verbose=0, random_state=RANDOM_SEED)
cat_search = tune_model(
    model=cat_model,
    params=cat_params,
    X=X_train,
    y=y_train,
    preprocessor=preprocessor,
)

print(
    f"Best CatBoost F1: {cat_search.best_score_:.4f}",
    f"Best CatBoost Params: {cat_search.best_params_}",
    sep="\n",
)


--- Tuning XGBoost ---
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGBoost F1: 0.5699
Best XGBoost Params: {'classifier__subsample': 0.9, 'classifier__n_estimators': 500, 'classifier__max_depth': 6, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.9}


--- Tuning CatBoost ---
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best CatBoost F1: 0.5661
Best CatBoost Params: {'classifier__learning_rate': 0.1, 'classifier__l2_leaf_reg': 5, 'classifier__iterations': 500, 'classifier__depth': 6}

## 5. Ensemble Modeling (Stacking)

We will now combine our tuned **XGBoost** and **CatBoost** models using a `StackingClassifier`.
This technique uses a meta-model (Logistic Regression) to learn the best combination of the base models' predictions.
We use the optimal hyperparameters found in the previous step.

In [6]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate


# 1. Tuned base learners
# The hyperparameters are hardcoded; they mirror the best parameters printed by
# the tuning cell.
best_xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.2,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'gamma': 0.1,
    'eval_metric': 'logloss',
    'random_state': RANDOM_SEED,
}
best_xgb = XGBClassifier(**best_xgb_params)

best_cat_params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 6,
    'l2_leaf_reg': 5,
    'verbose': 0,
    'random_state': RANDOM_SEED,
}
best_cat = CatBoostClassifier(**best_cat_params)


# 2. Stacking ensemble
def _with_preprocessing(model):
    """Put `model` behind the shared preprocessor so it accepts the raw feature frame."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])


estimators = [
    ('xgb', _with_preprocessing(best_xgb)),
    ('cat', _with_preprocessing(best_cat)),
]

# Logistic Regression is the meta-learner trained on the base estimators' predictions
meta_learner = LogisticRegression(random_state=RANDOM_SEED)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

# 3. Cross-validate the ensemble on the training split
print("Evaluating Stacking Classifier (this may take a moment)...")
stack_scoring = ['f1', 'roc_auc', 'accuracy']
cv_results_stack = cross_validate(
    stacking_clf,
    X_train,
    y_train,
    cv=5,
    scoring=stack_scoring,
)

print(f"Stacking F1 Score: {cv_results_stack['test_f1'].mean():.4f} (+/- {cv_results_stack['test_f1'].std():.4f})")
print(f"Stacking ROC-AUC: {cv_results_stack['test_roc_auc'].mean():.4f}")
print(f"Stacking Accuracy: {cv_results_stack['test_accuracy'].mean():.4f}")

Evaluating Stacking Classifier (this may take a moment)...
Stacking F1 Score: 0.5794 (+/- 0.0204)
Stacking ROC-AUC: 0.8507
Stacking Accuracy: 0.8467
