# Churn Modeling Notebook

This notebook performs:
- EDA (activity trends, retention curves)
- Feature engineering (avg session length, recency, purchases, level progression)
- Modeling (Logistic Regression, Random Forest, XGBoost)
- Evaluation (Accuracy, ROC-AUC), feature importance
- Save best model to `models/churn_xgb.pkl`


In [None]:
# Imports and setup
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure project root on path
def find_project_root(start, marker='src'):
    """Return the closest directory at or above `start` that contains `marker`.

    Walks upward from `start` one parent at a time. When no ancestor holds a
    `marker` directory, the absolute form of `start` is returned, which keeps
    the earlier behaviour of using the working directory itself.
    """
    start = os.path.abspath(start)
    current = start
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root without a match
            return start
        current = parent

# The working directory alone is not enough when the kernel is started from a
# sub-folder of the project: `from src.preprocess import ...` would then fail.
root = find_project_root(os.getcwd())
if root not in sys.path:
    sys.path.insert(0, root)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from xgboost import XGBClassifier

from src.preprocess import load_data, create_features

%matplotlib inline
sns.set(style='whitegrid')

DATA_DIR = 'data'


In [None]:
# Load data through the project helper, then derive the train/dev/test feature frames
raw = load_data(DATA_DIR)
train_df, dev_df, test_df = create_features(raw)

# Report the shape of each split on one line
split_shapes = [split.shape for split in (train_df, dev_df, test_df)]
print('train/dev/test shapes:', *split_shapes)
# Last expression of the cell, so the preview is rendered as the cell output
train_df.head()


In [None]:
# EDA: churn distribution and retention proxy
# Resolve the label column once so the bar chart and the retention proxy always
# use the same column. Previously the proxy only looked for 'label' and was
# silently skipped when the column was called 'churn'.
label_name = next(
    (c for c in ('churn', 'is_churn', 'label', 'target') if c in train_df.columns),
    None,
)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: share of each label value
if label_name is None:
    ax[0].set_title('No churn/label column found')
else:
    rate_title = 'Churn Rate' if label_name == 'churn' else f'{label_name} Rate'
    train_df[label_name].value_counts(normalize=True).plot(kind='bar', ax=ax[0], title=rate_title)

# Right panel: recency histogram, falling back to purchase counts when recency
# is missing or entirely NaN
if 'recency_days' in train_df.columns and train_df['recency_days'].notna().any():
    train_df['recency_days'].plot(kind='hist', bins=30, ax=ax[1], title='Recency (days)')
elif 'purchase_count' in train_df.columns:
    train_df['purchase_count'].plot(kind='hist', bins=30, ax=ax[1], title='Purchase Count')
else:
    ax[1].set_title('No recency/purchase features')

plt.tight_layout()
plt.show()

# Retention proxy: running mean of (1 - label) over rows sorted by recency
if label_name is not None and 'recency_days' in train_df.columns:
    retention = train_df[['recency_days', label_name]].dropna().sort_values('recency_days')
    if not retention.empty:
        retention['active'] = 1 - retention[label_name]
        retention['cum_active_rate'] = retention['active'].expanding().mean()
        # Explicit figure + show() so the cell does not end on a bare Axes repr
        fig_ret, ax_ret = plt.subplots(figsize=(6, 4))
        retention.plot(x='recency_days', y='cum_active_rate', ax=ax_ret, title='Retention proxy by recency')
        plt.show()



In [None]:
# Prepare modeling data with robust coercion

def prepare_xy(df, label_col='churn'):
    """Split a frame into a float32 feature matrix, a label vector and feature names.

    The label column is `label_col` when present; otherwise the first of
    'is_churn', 'label', 'target' found in `df`. Every column other than the
    label and 'player_id' is treated as a feature, coerced to numeric
    (unparseable entries become NaN) and NaN-filled with 0.0.

    Returns:
        (X, y, feat_cols): X is a float32 ndarray, y holds the label values or
        is None when no label column exists, feat_cols lists the feature
        names in column order.
    """
    if label_col not in df.columns:
        fallbacks = (name for name in ('is_churn', 'label', 'target') if name in df.columns)
        label_col = next(fallbacks, label_col)

    target = df[label_col].values if label_col in df.columns else None

    skipped = {label_col, 'player_id'}
    feat_cols = [col for col in df.columns if col not in skipped]

    # Column-wise numeric coercion, then zero-fill whatever could not be parsed
    numeric = df[feat_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    return numeric.values.astype('float32'), target, feat_cols

X, y, feature_cols = prepare_xy(train_df)
# prepare_xy returns y=None when no label column exists; stop here with a clear
# message instead of letting the failure surface later inside model fitting.
if y is None:
    raise ValueError("No label column found in train_df (tried: churn, is_churn, label, target)")
# Hold out 20% for validation; stratify=y keeps the label proportions in both parts.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [None]:
# Train models and evaluate
models = {
    'logreg': LogisticRegression(max_iter=1000),
    'rf': RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    'xgb': XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
    ),
}

# Fit every candidate on the training part and score it on the held-out part
metrics = {}
for name, clf in models.items():
    clf.fit(X_tr, y_tr)
    scores = {
        'accuracy': accuracy_score(y_va, clf.predict(X_va)),
        # ROC-AUC is computed from the probability of the positive class (column 1)
        'roc_auc': roc_auc_score(y_va, clf.predict_proba(X_va)[:, 1]),
    }
    metrics[name] = scores
    print(f"{name}: acc={scores['accuracy']:.4f}, auc={scores['roc_auc']:.4f}")

# Selection rule: highest validation ROC-AUC (the first entry wins a tie)
best_name = max(metrics, key=lambda candidate: metrics[candidate]['roc_auc'])
print('Best model:', best_name, metrics[best_name])


In [None]:
# Feature importance for XGB
if 'xgb' in models:
    xgb = models['xgb']
    # Pair every feature name with the fitted model's importance, largest first
    fi = (
        pd.DataFrame({'feature': feature_cols, 'importance': xgb.feature_importances_})
        .sort_values('importance', ascending=False)
    )
    top_features = fi.head(20)
    fi_fig, fi_ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=top_features, x='importance', y='feature', ax=fi_ax)
    fi_ax.set_title('Top 20 Feature Importances (XGB)')
    fi_fig.tight_layout()
    plt.show()


In [None]:
# Save best model
import joblib, os
best_clf = models[best_name]
# Keep the output relative to the working directory, like DATA_DIR and the `src`
# import. The previous '../models' target wrote outside that directory and did
# not match the location announced in the notebook header (models/churn_xgb.pkl).
model_dir = 'models'
model_path = os.path.join(model_dir, 'churn_xgb.pkl')
os.makedirs(model_dir, exist_ok=True)
# NOTE: the file name stays fixed even when the selected model is not XGBoost.
# The feature list is stored next to the model so the column order used at fit
# time travels with it.
joblib.dump({'model': best_clf, 'features': feature_cols}, model_path)
print('Saved model to', model_path)


# Churn Modeling Notebook

This notebook performs:
- EDA (activity trends, retention curves)
- Feature engineering (avg session length, recency, purchases, level progression)
- Modeling (Logistic Regression, Random Forest, XGBoost)
- Evaluation (Accuracy, ROC-AUC), feature importance
- Save best model to `models/churn_xgb.pkl`


In [1]:
# Imports and setup
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from xgboost import XGBClassifier

from src.preprocess import load_data, create_features

%matplotlib inline
sns.set(style='whitegrid')

DATA_DIR = 'data'



ModuleNotFoundError: No module named 'src'

In [None]:
# Load raw data and create features
raw = load_data(DATA_DIR)
train_df, dev_df, test_df = create_features(raw)

# Peek: rich preview of the training frame, then the shape of every split
display(train_df.head())
print(*(split.shape for split in (train_df, dev_df, test_df)))


In [None]:
# EDA: retention curve (toy version)
# Expect columns: player_id, churn (1=churned), and optionally last_active or last_login timestamps
eda_df = train_df.copy()

# Distribution of churn
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
# value_counts() orders by frequency, so sort by label value and derive each tick
# label from the value it belongs to. Fixed labels were swapped whenever churned
# players were the majority, and mismatched the tick count with a single class.
churn_rates = eda_df['churn'].value_counts(normalize=True).sort_index()
churn_rates.plot(kind='bar', ax=ax[0], title='Churn Rate')
tick_names = {0: 'Active(0)', 1: 'Churn(1)'}
ax[0].set_xticklabels([tick_names.get(value, str(value)) for value in churn_rates.index], rotation=0)

# Activity trend: simple histogram of recency if present, else purchase counts
if 'recency_days' in eda_df.columns:
    eda_df['recency_days'].plot(kind='hist', bins=30, ax=ax[1], title='Recency (days)')
elif 'purchase_count' in eda_df.columns:
    eda_df['purchase_count'].plot(kind='hist', bins=30, ax=ax[1], title='Purchase count distribution')
else:
    # Neither feature exists: label the empty panel instead of raising KeyError
    ax[1].set_title('No recency/purchase features')
plt.tight_layout()
plt.show()

# Retention-like curve proxy: cumulative fraction of active by recency
if 'recency_days' in eda_df.columns:
    retention = eda_df[['recency_days', 'churn']].dropna().sort_values('recency_days')
    if not retention.empty:
        retention['active'] = 1 - retention['churn']
        retention['cum_active_rate'] = retention['active'].expanding().mean()
        # Explicit figure + show() so the cell does not end on a bare Axes repr
        fig_ret, ax_ret = plt.subplots(figsize=(6, 4))
        retention.plot(x='recency_days', y='cum_active_rate', ax=ax_ret, title='Retention proxy by recency')
        plt.show()



In [None]:
# Prepare modeling data

def prepare_xy(df, label_col='churn'):
    """Split a frame into a numeric feature matrix, a label vector and feature names.

    The label column is `label_col` when present; otherwise the first of
    'is_churn', 'label', 'target' found in `df`. Every column other than the
    label and 'player_id' is used as a feature.

    Features are coerced to numeric (unparseable entries become NaN), NaN is
    replaced by 0.0 and the matrix is returned as float32, so text columns or
    missing values do not reach the estimators as object/NaN data.

    Returns:
        (X, y, feature_cols): X is a float32 ndarray, y holds the label values
        or is None when no label column exists, feature_cols lists the feature
        names in column order.
    """
    if label_col not in df.columns:
        for alt in ['is_churn', 'label', 'target']:
            if alt in df.columns:
                label_col = alt
                break
    y = df[label_col].values if label_col in df.columns else None
    feature_cols = [c for c in df.columns if c not in {label_col, 'player_id'}]
    # Column-wise coercion, then zero-fill whatever could not be parsed
    numeric = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    X = numeric.values.astype('float32')
    return X, y, feature_cols

X, y, feature_cols = prepare_xy(train_df)
# prepare_xy returns y=None when no label column exists; stop here with a clear
# message instead of letting the failure surface later inside model fitting.
if y is None:
    raise ValueError("No label column found in train_df (tried: churn, is_churn, label, target)")
# Hold out 20% for validation; stratify=y keeps the label proportions in both parts.
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [None]:
# Train baseline models
# (n_jobs=None for the logistic regression and max_depth=None for the forest are
# the library defaults, so they are not spelled out.)
models = {
    'logreg': LogisticRegression(max_iter=1000),
    'rf': RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    'xgb': XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
    ),
}

# Fit every candidate on the training part and score it on the held-out part
metrics = {}
for name, clf in models.items():
    clf.fit(X_tr, y_tr)
    scores = {
        'accuracy': accuracy_score(y_va, clf.predict(X_va)),
        # ROC-AUC is computed from the probability of the positive class (column 1)
        'roc_auc': roc_auc_score(y_va, clf.predict_proba(X_va)[:, 1]),
    }
    metrics[name] = scores
    print(f"{name}: acc={scores['accuracy']:.4f}, auc={scores['roc_auc']:.4f}")

# Selection rule: highest validation ROC-AUC (the first entry wins a tie)
best_name = max(metrics, key=lambda candidate: metrics[candidate]['roc_auc'])
print('Best model:', best_name, metrics[best_name])



In [None]:
# Feature importance (XGBoost)
if 'xgb' in models:
    xgb = models['xgb']
    # Pair every feature name with the fitted model's importance, largest first
    fi = (
        pd.DataFrame({'feature': feature_cols, 'importance': xgb.feature_importances_})
        .sort_values('importance', ascending=False)
    )
    top_features = fi.head(20)
    fi_fig, fi_ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=top_features, x='importance', y='feature', ax=fi_ax)
    fi_ax.set_title('Top 20 Feature Importances (XGB)')
    fi_fig.tight_layout()
    plt.show()



In [None]:
# Save best model as models/churn_xgb.pkl (prefer XGB if best)
import joblib, os
out_dir = 'models'
out_path = 'models/churn_xgb.pkl'
# Bundle the winning estimator with the feature names it was fitted on
artifact = {'model': models[best_name], 'features': feature_cols}
best_clf = artifact['model']
os.makedirs(out_dir, exist_ok=True)
joblib.dump(artifact, out_path)
print('Saved model to models/churn_xgb.pkl')
