In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import os
import optuna
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool

optuna.logging.set_verbosity(optuna.logging.WARNING)
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_PATH = os.path.join(os.getcwd(),"..", "data", "raw", "churn-data.csv")
df=pd.read_csv(DATA_PATH)

In [3]:
df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], inplace=True)

In [4]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# feature engineering based on previous analysis

In [5]:
class FeatureEngineering:
    def __init__(self, df):
        self.df = df.copy()

    # Age groups mapping
    def age_group(self, age):
        if age < 30:
            return "Young"
        elif age < 45:
            return "Adult"
        elif age < 60:
            return "Senior"
        else:
            return "Elderly"
    # Credit score tiers mapping based on VantageScore
    def credit_score_tier(self,score):
        if score >= 781:
            return "superprime"
        elif score >= 661:
            return "prime"
        elif score >= 601:
            return "near prime"
        elif score >= 300:
            return "subprime"
        else:
            return "Very Poor"
    def make_feature_extraction(self):
        df = self.df.copy()

        # -------------------------
        # 1. Zero balance indicator
        # -------------------------
        df["IsZeroBalance"] = (df["Balance"] == 0).astype(int)

        # -------------------------
        # 2. Age groups
        # -------------------------
        df["AgeGroup"] = df["Age"].apply(self.age_group)

        # -------------------------
        # 3. Credit score tiers VantageScore Credit Score
        # -------------------------

        df["CreditTier"] = df["CreditScore"].apply(self.credit_score_tier)

        # -------------------------
        # 4. Customer Value (improved)
        # Balance + Salary is better indicator
        # -------------------------
        df["CustomerValue"] = df["Balance"] + df["EstimatedSalary"]

        # -------------------------
        # 5. Age Ã— NumOfProducts
        # -------------------------
        df["AgeProduct"] = np.log1p(df["Age"] * df["NumOfProducts"])

        # -------------------------
        # 6. Activity Score 
        # -------------------------
        df["ActivityScore"] = df["IsActiveMember"] * df["NumOfProducts"]

        # -------------------------
        # 7. Log Balance/Salary Ratio (BEST version)
        # More stable, avoids skew
        # -------------------------
        df["LogBalanceSalaryRatio"] = (
            np.log1p(df["Balance"]) - np.log1p(df["EstimatedSalary"])
        )

        # -------------------------
        # 8. High balance flag
        # -------------------------
        df["HighBalance"] = (df["Balance"] > df["Balance"].median()).astype(int)

        # -------------------------
        # 9. Customer Lifetime Value (CLV)
        # -------------------------
        df["CLV"] = df["Tenure"] * df["Balance"]

        return df
fe = FeatureEngineering(df)
df_new_features = fe.make_feature_extraction()
df_new_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CreditScore            10000 non-null  int64  
 1   Geography              10000 non-null  object 
 2   Gender                 10000 non-null  object 
 3   Age                    10000 non-null  int64  
 4   Tenure                 10000 non-null  int64  
 5   Balance                10000 non-null  float64
 6   NumOfProducts          10000 non-null  int64  
 7   HasCrCard              10000 non-null  int64  
 8   IsActiveMember         10000 non-null  int64  
 9   EstimatedSalary        10000 non-null  float64
 10  Exited                 10000 non-null  int64  
 11  IsZeroBalance          10000 non-null  int64  
 12  AgeGroup               10000 non-null  object 
 13  CreditTier             10000 non-null  object 
 14  CustomerValue          10000 non-null  float64
 15  Age

# split dataset

In [6]:
df_new_features.to_csv(os.path.join(os.getcwd(),"..", "data", "processed", "churn-data-features.csv"), index=False)

In [7]:
X = df_new_features.drop('Exited', axis=1)
y = df_new_features['Exited']

#split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# check shape of the splits
print("df shape:", df_new_features.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

df shape: (10000, 20)
X_train shape: (8000, 19)
X_test shape: (2000, 19)
y_train shape: (8000,)
y_test shape: (2000,)


In [8]:
# Define feature categories
numerical_features = ['CreditScore', 'Tenure','Age', 'Balance', 'EstimatedSalary','LogBalanceSalaryRatio','CustomerValue','AgeProduct','CLV']
categorical_features=['Geography', 'Gender','AgeGroup', 'CreditTier']
ready_cols = list(set(X_train.columns.tolist()) - set(numerical_features) - set(categorical_features))

In [9]:
X_train[numerical_features].head()

Unnamed: 0,CreditScore,Tenure,Age,Balance,EstimatedSalary,LogBalanceSalaryRatio,CustomerValue,AgeProduct,CLV
2151,753,7,57,0.0,159475.08,-11.979649,159475.08,4.060443,0.0
8392,739,3,32,102128.27,63981.37,0.467632,166109.64,3.496508,306384.81
5006,755,0,37,113865.23,117396.25,-0.030539,231261.48,4.317488,0.0
4117,561,5,37,0.0,83093.25,-11.327731,83093.25,4.317488,0.0
7182,692,6,49,110540.43,107472.99,0.028142,218013.42,4.59512,663242.58


In [10]:
len(X_train.columns.tolist()) == len(ready_cols)+len(numerical_features)+len(categorical_features)

True

In [11]:
# check skewness of numerical features
skewed_feats = X_train[numerical_features].skew().sort_values(ascending=False)
print("Skewness of numerical features:\n", skewed_feats)    

Skewness of numerical features:
 Age                      1.035706
CLV                      0.877814
AgeProduct               0.259080
Tenure                   0.013124
EstimatedSalary          0.007126
CustomerValue           -0.074331
CreditScore             -0.079540
Balance                 -0.141721
LogBalanceSalaryRatio   -0.538151
dtype: float64


# pipline

In [12]:
# for non skewed numerical features
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])


#for categorical features

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False, drop='first'))
])


#for ready to use features
ready_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combine all pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features),
        ('ready', ready_pipeline, ready_cols)
    ]
)


In [13]:
#---------------------------------------------------------
# add noise features to test preprocessing robustness
#---------------------------------------------------------

# num_noise_features = 1
# for i in range(num_noise_features):
#     noise_feature_name = f'NoiseFeature_{i+1}'
#     X_train[noise_feature_name] = np.random.rand(X_train.shape[0])
#     X_test[noise_feature_name] = np.random.rand(X_test.shape[0])
#     ready_cols.append(noise_feature_name)


#---------------------------------------------------------
# Fit and transform the training data
#---------------------------------------------------------
X_train_processed = preprocessor.fit_transform(X_train) 
X_test_processed = preprocessor.transform(X_test)
print("Processed X_train shape:", X_train_processed.shape)
print("Processed X_test shape:", X_test_processed.shape)

Processed X_train shape: (8000, 24)
Processed X_test shape: (2000, 24)


In [14]:
#compute class weights to handle class imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i : class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weights_dict)

Class weights: {0: np.float64(0.6279434850863422), 1: np.float64(2.4539877300613497)}


-------------

# baseline xgb

In [16]:
# xgboost model with smote
xgb_model = xgb.XGBClassifier(scale_pos_weight=class_weights_dict[1]/class_weights_dict[0], random_state=42)
xgb_model.fit(X_train_processed, y_train)
y_test_pred_xgb = xgb_model.predict(X_test_processed)
print("Classification Report for Test Set:\n", classification_report(y_test, y_test_pred_xgb))
print('f1 ', f1_score(y_test, y_test_pred_xgb))
print('===========================================================')
y_train_pred_xgb = xgb_model.predict(X_train_processed)
print("Classification Report for Training Set:\n", classification_report(y_train, y_train_pred_xgb))
print('f1 ', f1_score(y_train, y_train_pred_xgb))

Classification Report for Test Set:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90      1593
           1       0.59      0.60      0.60       407

    accuracy                           0.83      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.84      0.83      0.84      2000

f1  0.5955882352941176
Classification Report for Training Set:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      6370
           1       0.87      0.99      0.93      1630

    accuracy                           0.97      8000
   macro avg       0.93      0.98      0.95      8000
weighted avg       0.97      0.97      0.97      8000

f1  0.925236321970782


#  XGBoost hyperparameters tunning  

In [20]:
def xgb_objective(trial):
    # XGBoost hyperparameters
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
        # Class imbalance handling
        "scale_pos_weight": class_weights_dict[1] / class_weights_dict[0],

        # Trial suggestions
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10)
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train_processed, y_train)

    preds = model.predict(X_test_processed)
    f1 = f1_score(y_test, preds)

    return f1


In [21]:
# Create and run study
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, show_progress_bar=False)
# Best results
print("Best F1 Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)
xgb_best_params = study.best_params

Best F1 Score: 0.6338185890257558
Best Hyperparameters: {'n_estimators': 279, 'max_depth': 9, 'learning_rate': 0.04539013027168558, 'subsample': 0.739458666139444, 'colsample_bytree': 0.7848215837184851, 'gamma': 3.3877904103418386, 'min_child_weight': 2}


In [23]:
xgb_model = xgb.XGBClassifier(**xgb_best_params,random_state=42)
xgb_model.fit(X_train_processed, y_train)
y_test_pred_xgb = xgb_model.predict(X_test_processed)
print("Classification Report for Test Set:\n", classification_report(y_test, y_test_pred_xgb))
print('f1 ', f1_score(y_test, y_test_pred_xgb))
print('===========================================================')
y_train_pred_xgb = xgb_model.predict(X_train_processed)
print("Classification Report for Training Set:\n", classification_report(y_train, y_train_pred_xgb))
print('f1 ', f1_score(y_train, y_train_pred_xgb))

Classification Report for Test Set:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.76      0.50      0.60       407

    accuracy                           0.87      2000
   macro avg       0.82      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000

f1  0.6044444444444445
Classification Report for Training Set:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      6370
           1       0.86      0.57      0.68      1630

    accuracy                           0.89      8000
   macro avg       0.88      0.77      0.81      8000
weighted avg       0.89      0.89      0.88      8000

f1  0.6834319526627219


# SHAP feature importance

In [24]:
feature_names=preprocessor.get_feature_names_out().tolist()

In [25]:
explainer = shap.Explainer(xgb_model, X_train_processed)
shap_values = explainer(X_train_processed)
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)

# Create DF
shap_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP_Value': mean_abs_shap
}).sort_values(by='SHAP_Value', ascending=False)



In [None]:
shap_df_xgb = shap_df.copy()
shap_df_xgb['cumsum'] = shap_df_xgb['SHAP_Value'].cumsum()
shap_df_xgb['cumsum_percent'] = 100 * shap_df_xgb['cumsum'] / shap_df_xgb['SHAP_Value'].sum()

top_features = shap_df_xgb[shap_df_xgb['cumsum_percent'] <= 90]['Feature'].tolist()
print(top_features)


['ready__NumOfProducts', 'num__Age', 'ready__ActivityScore', 'cat__Gender_Male', 'num__AgeProduct', 'cat__Geography_Germany', 'num__LogBalanceSalaryRatio', 'num__Balance', 'cat__AgeGroup_Senior', 'ready__IsActiveMember']


In [28]:
shap_df_xgb['Feature'][:10]


0          ready__NumOfProducts
1                      num__Age
2          ready__ActivityScore
3              cat__Gender_Male
4               num__AgeProduct
5        cat__Geography_Germany
6    num__LogBalanceSalaryRatio
7                  num__Balance
8          cat__AgeGroup_Senior
9         ready__IsActiveMember
Name: Feature, dtype: object

In [32]:
top_features = shap_df_xgb['Feature'][:10].tolist()
top_feature_indices = [feature_names.index(feat) for feat in top_features]
X_train_top = X_train_processed[:, top_feature_indices]
X_test_top = X_test_processed[:, top_feature_indices] 

In [33]:
  
# xgboost model with class weights
final_xgb_model = xgb.XGBClassifier(**xgb_best_params)
final_xgb_model.fit(X_train_top, y_train)
y_test_pred_xgb = final_xgb_model.predict(X_test_top)
print("Classification Report for test:\n", classification_report(y_test, y_test_pred_xgb))    
print('f1 ', f1_score(y_test, y_test_pred_xgb))
y_train_pred_xgb = final_xgb_model.predict(X_train_top)
print("Classification Report for train:\n", classification_report(y_train, y_train_pred_xgb))
print('f1 ', f1_score(y_train, y_train_pred_xgb))

Classification Report for test:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.78      0.48      0.59       407

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.85      2000

f1  0.5914634146341463
Classification Report for train:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      6370
           1       0.81      0.51      0.63      1630

    accuracy                           0.88      8000
   macro avg       0.85      0.74      0.78      8000
weighted avg       0.87      0.88      0.87      8000

f1  0.6278195488721805


# threshold tunning

In [34]:

probs = final_xgb_model.predict_proba(X_test_top)[:,1]
best_f1 = 0
best_thresh = 0.5

for thresh in np.arange(0.1, 0.9, 0.01):
    preds = (probs > thresh).astype(int)
    f1 = f1_score(y_test, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print("Best threshold:", best_thresh, "Best F1:", best_f1)


Best threshold: 0.32999999999999985 Best F1: 0.6525529265255293


In [35]:
probs = final_xgb_model.predict_proba(X_test_top)[:, 1]
y_test_pred_xgb = (probs > best_thresh).astype(int)
print("Classification Report for test:\n", classification_report(y_test, y_test_pred_xgb))    
print('f1 ', f1_score(y_test, y_test_pred_xgb))
print('===========================================================')
y_train_pred_xgb = (final_xgb_model.predict_proba(X_train_top)[:, 1] > best_thresh).astype(int)
print("Classification Report for train:\n", classification_report(y_train, y_train_pred_xgb))
print('f1 ', f1_score(y_train, y_train_pred_xgb))

Classification Report for test:
               precision    recall  f1-score   support

           0       0.91      0.92      0.91      1593
           1       0.66      0.64      0.65       407

    accuracy                           0.86      2000
   macro avg       0.79      0.78      0.78      2000
weighted avg       0.86      0.86      0.86      2000

f1  0.6525529265255293
Classification Report for train:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92      6370
           1       0.68      0.67      0.67      1630

    accuracy                           0.87      8000
   macro avg       0.80      0.79      0.80      8000
weighted avg       0.87      0.87      0.87      8000

f1  0.6747286821705426


-----------

# catboost model base

In [37]:
cat_model = CatBoostClassifier(verbose=False,random_seed=42)
train_pool = Pool(X_train_processed, y_train)
cat_model.fit(train_pool)

y_test_pred_cat = cat_model.predict(X_test_processed)
print("Classification Report for test:\n", classification_report(y_test, y_test_pred_cat)) 
print('f1 score for test',f1_score(y_test, y_test_pred_cat))
print('=====================================================')
y_train_pred_cat = cat_model.predict(X_train_processed)
print("Classification Report for train:\n", classification_report(y_train, y_train_pred_cat))
print('f1 score for train',f1_score(y_train, y_train_pred_cat))

Classification Report for test:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.80      0.49      0.61       407

    accuracy                           0.87      2000
   macro avg       0.84      0.73      0.77      2000
weighted avg       0.87      0.87      0.86      2000

f1 score for test 0.6118721461187214
Classification Report for train:
               precision    recall  f1-score   support

           0       0.91      0.98      0.95      6370
           1       0.91      0.64      0.75      1630

    accuracy                           0.91      8000
   macro avg       0.91      0.81      0.85      8000
weighted avg       0.91      0.91      0.91      8000

f1 score for train 0.7496402877697842


# Hyperparameters tuning for catboost

In [78]:
def cat_objective(trial):

    params = {
        "iterations": trial.suggest_int("iterations", 200, 1200),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10.0),

        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 5.0),

        "loss_function": "Logloss",
        "eval_metric": "F1",

        "verbose": False,
        "random_seed": 42,
        "use_best_model": False
    }

    # Pool
    train_pool = Pool(X_train_processed, y_train)

    model = CatBoostClassifier(**params)
    model.fit(
        train_pool,
        early_stopping_rounds=70,
        verbose=False
    )

    # Predict on validation
    preds = model.predict(X_test_processed)
    f1 = f1_score(y_test, preds)

    return f1

In [79]:
study = optuna.create_study(direction="maximize")
study.optimize(cat_objective, n_trials=120)

print("\n Best F1:", study.best_value)
print(" Best Params:\n", study.best_params)


ðŸŽ¯ Best F1: 0.6523388116308471
ðŸ”¥ Best Params:
 {'iterations': 265, 'depth': 4, 'learning_rate': 0.06529304400269391, 'l2_leaf_reg': 5.67095569918552, 'bagging_temperature': 0.6898766159711811, 'random_strength': 2.723472757442978, 'scale_pos_weight': 1.9383098788024475}


In [80]:
cat_best_params = study.best_params
cat_best_params["loss_function"] = "Logloss"
cat_best_params["eval_metric"] = "F1"
cat_best_params["verbose"] = False
cat_best_params["use_best_model"] = False
cat_best_params["random_seed"] = 42

In [81]:
tunned_cat_model = CatBoostClassifier(**cat_best_params)

train_pool = Pool(X_train_processed, y_train)
valid_pool = Pool(X_test_processed, y_test)

tunned_cat_model.fit(
    train_pool,
    eval_set=valid_pool,
    early_stopping_rounds=80,
    verbose=False
)

# Evaluate
y_test_pred_cat = tunned_cat_model.predict(X_test_processed)
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_cat))
print("F1:", f1_score(y_test, y_test_pred_cat))
print('===================================================')
y_train_pred_cat = tunned_cat_model.predict(X_train_processed)
print("\nClassification Report (Train):")
print(classification_report(y_train, y_train_pred_cat))
print("F1:", f1_score(y_train, y_train_pred_cat))


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1593
           1       0.67      0.63      0.65       407

    accuracy                           0.86      2000
   macro avg       0.79      0.78      0.78      2000
weighted avg       0.86      0.86      0.86      2000

F1: 0.6523388116308471

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      6370
           1       0.70      0.65      0.68      1630

    accuracy                           0.87      8000
   macro avg       0.81      0.79      0.80      8000
weighted avg       0.87      0.87      0.87      8000

F1: 0.6755386565272496


In [82]:
# SHAP feature importance
explainer = shap.Explainer(tunned_cat_model, X_train_processed)
shap_values = explainer(X_train_processed)
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)

# Create DF
shap_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP_Value': mean_abs_shap
}).sort_values(by='SHAP_Value', ascending=False)



In [89]:
shap_df_cat = shap_df.reset_index()
shap_df_cat['cumsum'] = shap_df_cat['SHAP_Value'].cumsum()
shap_df_cat['cumsum_percent'] = 100 * shap_df_cat['cumsum'] / shap_df_cat['SHAP_Value'].sum()

top_features = shap_df_cat[shap_df_cat['cumsum_percent'] <= 85]['Feature'].tolist()
print(top_features)

['ready__NumOfProducts', 'num__Age', 'cat__Gender_Male', 'ready__ActivityScore', 'ready__IsActiveMember', 'cat__Geography_Germany', 'cat__AgeGroup_Senior', 'num__Balance', 'num__AgeProduct']


In [90]:
shap_df_cat[:9]

Unnamed: 0,index,Feature,SHAP_Value,cumsum,cumsum_percent
0,22,ready__NumOfProducts,0.625382,0.625382,21.629099
1,2,num__Age,0.533949,1.159331,40.095963
2,11,cat__Gender_Male,0.237244,1.396575,48.301154
3,18,ready__ActivityScore,0.197086,1.593661,55.117461
4,19,ready__IsActiveMember,0.193563,1.787223,61.811911
5,9,cat__Geography_Germany,0.173982,1.961205,67.829147
6,13,cat__AgeGroup_Senior,0.168332,2.129537,73.650971
7,3,num__Balance,0.147881,2.277417,78.765487
8,7,num__AgeProduct,0.111177,2.388594,82.610591


In [96]:
top_features = shap_df_cat['Feature'][:9].tolist()
top_feature_indices = [feature_names.index(feat) for feat in top_features]
X_train_top = X_train_processed[:, top_feature_indices]
X_test_top = X_test_processed[:, top_feature_indices] 

In [97]:

final_cat_model = CatBoostClassifier(**cat_best_params ,snapshot_file='cat_model.cbs')

train_pool = Pool(X_train_top, y_train)
valid_pool = Pool(X_test_top, y_test)

final_cat_model.fit(
    train_pool,
    eval_set=valid_pool,
    early_stopping_rounds=80,
    verbose=False
)

# Evaluate
y_test_pred = final_cat_model.predict(X_test_top)
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))
print("F1:", f1_score(y_test, y_test_pred))

y_train_pred = final_cat_model.predict(X_train_top)
print("\nClassification Report (Train):")
print(classification_report(y_train, y_train_pred))
print("F1:", f1_score(y_train, y_train_pred))



Classification Report (Test):
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1593
           1       0.65      0.64      0.64       407

    accuracy                           0.86      2000
   macro avg       0.78      0.77      0.78      2000
weighted avg       0.85      0.86      0.86      2000

F1: 0.6418835192069393

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      6370
           1       0.67      0.65      0.66      1630

    accuracy                           0.86      8000
   macro avg       0.79      0.78      0.79      8000
weighted avg       0.86      0.86      0.86      8000

F1: 0.6583385384134915


In [98]:

probs = final_cat_model.predict_proba(X_test_top)[:,1]
best_f1 = 0
best_thresh = 0.5

for thresh in np.arange(0.1, 0.9, 0.01):
    preds = (probs > thresh).astype(int)
    f1 = f1_score(y_test, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print("Best threshold:", best_thresh, "Best F1:", best_f1)


Best threshold: 0.43999999999999984 Best F1: 0.6424384525205158


--------

In [112]:
top_features=list(set(shap_df_cat['Feature'][:9].tolist() + shap_df_xgb['Feature'][:10].tolist()))

top_feature_indices = [feature_names.index(feat) for feat in top_features]
X_train_top = X_train_processed[:, top_feature_indices]
X_test_top = X_test_processed[:, top_feature_indices] 

In [118]:
# voting classifier
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', final_xgb_model),
        ('cat', final_cat_model)
    ],
    voting='soft'  # Use 'soft' voting to average predicted probabilities
)
voting_clf.fit(X_train_top, y_train)
y_test_pred_voting = voting_clf.predict(X_test_top)
print("Classification Report for Voting Classifier (Test):\n", classification_report(y_test, y_test_pred_voting))    
print('f1 ', f1_score(y_test, y_test_pred_voting))
y_train_pred_voting = voting_clf.predict(X_train_top)
print("Classification Report for Voting Classifier (Train):\n", classification_report(y_train, y_train_pred_voting))
print('f1 ', f1_score(y_train, y_train_pred_voting))

Classification Report for Voting Classifier (Test):
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      1593
           1       0.72      0.56      0.63       407

    accuracy                           0.87      2000
   macro avg       0.81      0.75      0.77      2000
weighted avg       0.86      0.87      0.86      2000

f1  0.6308539944903582
Classification Report for Voting Classifier (Train):
               precision    recall  f1-score   support

           0       0.90      0.95      0.92      6370
           1       0.75      0.59      0.66      1630

    accuracy                           0.88      8000
   macro avg       0.82      0.77      0.79      8000
weighted avg       0.87      0.88      0.87      8000

f1  0.6579490708878183


In [124]:

probs = voting_clf.predict_proba(X_test_top)[:,1]
best_f1 = 0
best_thresh = 0.5

for thresh in np.arange(0.1, 0.9, 0.01):
    preds = (probs > thresh).astype(int)
    f1 = f1_score(y_test, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print("Best threshold:", best_thresh, "Best F1:", best_f1)


Best threshold: 0.3599999999999999 Best F1: 0.6543352601156069


--------

# save the final model and preprocessor

In [111]:
import os
import joblib
import datetime

# Base models directory
BASE_MODEL_DIR = os.path.join("..", "models")
os.makedirs(BASE_MODEL_DIR, exist_ok=True)

# Timestamp for versioning
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# --- XGBoost ---
xgb_dir = os.path.join(BASE_MODEL_DIR, f"xgb_{timestamp}")
os.makedirs(xgb_dir, exist_ok=True)

xgb_model_path = os.path.join(xgb_dir, "model.pkl")
xgb_checkpoint_path = os.path.join(xgb_dir, "checkpoint.json")

joblib.dump(final_xgb_model, xgb_model_path)
final_xgb_model.save_model(xgb_checkpoint_path)

# --- CatBoost ---
cat_dir = os.path.join(BASE_MODEL_DIR, f"catboost_{timestamp}")
os.makedirs(cat_dir, exist_ok=True)

cat_model_path = os.path.join(cat_dir, "model.pkl")
cat_checkpoint_path = os.path.join(cat_dir, "checkpoint.cbs")

joblib.dump(final_cat_model, cat_model_path)
final_cat_model.save_model(cat_checkpoint_path)

# --- Preprocessor ---
preprocessor_dir = os.path.join(BASE_MODEL_DIR, f"preprocessor_{timestamp}")
os.makedirs(preprocessor_dir, exist_ok=True)

preprocessor_path = os.path.join(preprocessor_dir, "preprocessor.pkl")
joblib.dump(preprocessor, preprocessor_path)

print("Models, checkpoints, and preprocessor saved successfully!")


Models, checkpoints, and preprocessor saved successfully!
