In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import recall_score
warnings.filterwarnings('ignore')

In [10]:
df_train = pd.read_csv('train.csv')

In [11]:
def feature_eng(df:pd.DataFrame):
  df.rename(columns = {'signup_date':'days_active'}, inplace=True)
  df['days_active'] = -1*df['days_active']
  df_copy = df.copy()
  df_copy.drop(columns = ['location','payment_method','customer_id','payment_plan'], inplace=True)
  inquiry_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
  df_copy['customer_service_inquiries'] = df_copy['customer_service_inquiries'].map(inquiry_mapping)
  sub_mapping = {'Free': 0, 'Student': 1, 'Family': 2, 'Premium': 3}
  df_copy['subscription_type'] = df_copy['subscription_type'].map(sub_mapping)
  bins = [18, 25, 35, 45, 55, 65, 80]
  labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65-79']
  df_copy['age_group'] = pd.cut(df_copy['age'], bins=bins, labels=labels, right=False)
  risk_mapping = {'18-24': 2,'65-79': 2,
                  '25-34': 1,'55-64': 1,
                  '35-44': 0,'45-54': 0 }
  df_copy['age_group_risk'] = df_copy['age_group'].astype(str).map(risk_mapping)
  df_copy.drop(columns = ['age','age_group'], inplace=True)
  def segment_notifications(x):
      if x < 5:
          return 'Non-Engager'
      elif x < 45:
          return 'Standard'
      else:
          return 'Extreme'
  df_copy['notif_segment'] = df_copy['notifications_clicked'].apply(segment_notifications)
  notif_map = {'Non-Engager': 0, 'Standard': 1, 'Extreme': 2}
  df_copy['notif_segment'] = df_copy['notif_segment'].map(notif_map)
  df_copy['extrovertness_index'] = (df_copy['num_shared_playlists'].div(df_copy['num_platform_friends']+1))
  df_copy.drop(columns = ['days_active','average_session_length',
                          'num_favorite_artists',
                          'num_playlists_created'], inplace=True)
  df_copy.drop(columns = ['weekly_songs_played', 'weekly_unique_songs',
                          'notifications_clicked','num_shared_playlists',
                          'num_platform_friends'], inplace=True)
  return df_copy

In [12]:
df_train = feature_eng(df_train)

#### Let's look at out feature distribution

In [13]:
# df_corr = df_train.corr()
# plt.figure(figsize=(15,15))
# sns.heatmap(df_corr, vmin = -1, vmax = 1, annot = True, cmap = 'viridis')
# plt.show()

In [14]:
x = df_train.drop(columns = ['churned'])
y = df_train[['churned']]

In [15]:
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


In [16]:

vt = VarianceThreshold(threshold=0.01)

X_train_v = vt.fit_transform(x_train)
X_test_v  = vt.transform(x_test)

kept_features = x_train.columns[~vt.get_support()]
dropped_features = x_train.columns[~vt.get_support()]

print("Dropped:", dropped_features)

Dropped: Index([], dtype='object')


#### So no columns need be dropped.


In [17]:
x = x_train.copy()

x = x.fillna(0)

vif_df = pd.DataFrame()
vif_df["feature"] = x.columns
vif_df["vif"] = [
    variance_inflation_factor(x.values, i)
    for i in range(x.shape[1])
]

vif_df.sort_values("vif", ascending=False)

Unnamed: 0,feature,vif
6,notif_segment,4.384745
3,weekly_hours,3.377705
4,song_skip_rate,3.374001
1,num_subscription_pauses,2.674287
0,subscription_type,2.535508
5,age_group_risk,2.358031
2,customer_service_inquiries,2.286171
7,extrovertness_index,1.078451


#### Let's keep outliers, since we'll use tree based models.

In [18]:
x_train.head()

Unnamed: 0,subscription_type,num_subscription_pauses,customer_service_inquiries,weekly_hours,song_skip_rate,age_group_risk,notif_segment,extrovertness_index
74492,2,3,0,4.545908,0.350329,0,2,0.093458
71355,2,3,0,34.921727,0.823311,1,1,0.157895
20708,3,3,2,24.431803,0.02765,0,1,0.109589
6832,3,1,2,37.691211,0.510033,0,1,0.597403
9974,3,3,2,12.854966,0.532784,2,1,0.347368


#### Let's standardize and not normalize, since we'll use ensamble, and they do not necessarily assume normalized dist.

In [19]:
features_to_scale = ['weekly_hours', 'song_skip_rate', 'extrovertness_index']
x_train_scaled = x_train.copy()
x_test_scaled = x_test.copy()
scaler = StandardScaler()
x_train_scaled[features_to_scale] = scaler.fit_transform(x_train[features_to_scale])
x_test_scaled[features_to_scale] = scaler.transform(x_test[features_to_scale])

In [20]:
# x_train_scaled.isna().sum(),x_test_scaled.isna().sum()
## All good

In [21]:

models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}
for name, model in models.items():
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred  = model.predict(x_test)

    recall_train = recall_score(y_train, y_train_pred)
    recall_test  = recall_score(y_test, y_test_pred)

    print(f"{name} trained successfully!")
    print(f"Recall on Training Data: {recall_train:.4f}")
    print(f"Recall on Test Data:     {recall_test:.4f}\n")

KeyboardInterrupt: 

#### Adaboost is giving most robust baseline.

we'll take these 3 and go with them.

In [22]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

# Hyperparameter grids
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

adb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.5, 1.0]
}

xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'tree_method': ['gpu_hist']  # GPU acceleration
}

# Models
models = {
    'RandomForest': (RandomForestClassifier(random_state=42, n_jobs=-1), rf_params),
    'AdaBoost': (AdaBoostClassifier(random_state=42), adb_params),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), xgb_params)
}

# Fit GridSearchCV for each model
best_models = {}

for name, (model, params) in models.items():
    print(f"Training {name} with GridSearchCV...")
    grid = GridSearchCV(model, params, cv=2, scoring='recall', n_jobs=-1)
    grid.fit(x_train, y_train)

    best_models[name] = grid.best_estimator_

    # Evaluate on train and test
    y_train_pred = grid.predict(x_train)
    y_test_pred = grid.predict(x_test)

    print(f"{name} Best Params: {grid.best_params_}")
    print(f"{name} Recall - Train: {recall_score(y_train, y_train_pred):.4f}, Test: {recall_score(y_test, y_test_pred):.4f}\n")


Training RandomForest with GridSearchCV...
RandomForest Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 100}
RandomForest Recall - Train: 0.8694, Test: 0.8651

Training AdaBoost with GridSearchCV...
AdaBoost Best Params: {'learning_rate': 1.0, 'n_estimators': 100}
AdaBoost Recall - Train: 0.8338, Test: 0.8428

Training XGBoost with GridSearchCV...


ValueError: 
All the 64 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
64 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/sklearn.py", line 1806, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/training.py", line 199, in train
    bst.update(dtrain, iteration=i, fobj=obj)
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 2433, in update
    _check_call(
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 323, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: Invalid Input: 'gpu_hist', valid values are: {'approx', 'auto', 'exact', 'hist'}


In [25]:
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Hyperparameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 5]
}

# GridSearchCV (2-fold)
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    scoring='recall',
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Train
grid_xgb.fit(x_train, y_train)

# Best model
best_xgb = grid_xgb.best_estimator_

# Evaluation
y_train_pred = best_xgb.predict(x_train)
y_test_pred  = best_xgb.predict(x_test)

print("Best Params:", grid_xgb.best_params_)
print(f"Recall on Train: {recall_score(y_train, y_train_pred):.4f}")
print(f"Recall on Test:  {recall_score(y_test, y_test_pred):.4f}")


Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best Params: {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 200}
Recall on Train: 0.8617
Recall on Test:  0.8646


#### We're good to move ahead with the model architecture.