In [1]:
# !pip install scikit-learn

In [2]:
# !pip install pycaret

## Importing the necessary package

In [18]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score  # Import metrics
import shap
import pickle
from pycaret.classification import setup, compare_models, pull, save_model, load_model,models


In [2]:
data = pd.read_csv('1_prep_data.csv')
data.shape

(9997, 13)

In [3]:
data.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,EstimatedSalary,CreditScore_Custom,Age_Group_Custom,Balance,NumOfProducts,Tenure,IsActiveMember,Exited
0,15634602,619,France,Female,42.0,101348.88,Good,Middle-Aged,0.0,1,2,1,1
1,15647311,608,Spain,Female,41.0,112542.58,Good,Middle-Aged,83807.86,1,1,1,0
2,15619304,502,France,Female,42.0,113931.57,Fair,Middle-Aged,159660.8,3,8,0,1
3,15701354,699,France,Female,39.0,93826.63,Very Good,Middle-Aged,0.0,2,1,0,0
4,15737888,850,Spain,Female,43.0,79084.1,Excellent,Middle-Aged,125510.82,1,2,1,0


## Preprocessing 

- Shall remove  the Credit Score and Age as we have bukcets to it 
- the No. of products and tenure are already aligned
- Gender Label encode
- Geography, Gender, age_group ad credit_score need to be one_hot_encoded 
- EstimatedSalary and Balance need to be normalized (log transformation shall be done to handle the skewness)


In [4]:
## 1) Shall remove  the Credit Score and Age as we have bukcets to it 
from sklearn.model_selection import train_test_split
y = data['Exited']
X = data.drop(['Exited','Age','CreditScore','CustomerId'],axis=1)
primary_key = data['CustomerId']
x_train, x_temp, y_train, y_temp,test_primary_key,temp_primary_key = train_test_split(X, y,primary_key, test_size=0.2, random_state=42, stratify=y)

x_test, x_oos, y_test, y_oos,test_primary_key,oos_primary_key = train_test_split(x_temp, y_temp,temp_primary_key, test_size=0.5, random_state=42, stratify=y_temp)

# Now you have your three datasets:
print(f"Training set size: X_train={x_train.shape}, y_train={y_train.shape}")
print(f"Testing set size: X_test={x_test.shape}, y_test={y_test.shape}")
print(f"Out-of-sample set size: X_oos={x_oos.shape}, y_oos={y_oos.shape}")


Training set size: X_train=(7997, 9), y_train=(7997,)
Testing set size: X_test=(1000, 9), y_test=(1000,)
Out-of-sample set size: X_oos=(1000, 9), y_oos=(1000,)


In [5]:
y_train.value_counts()

Exited
0    6368
1    1629
Name: count, dtype: int64

In [6]:
y_test.value_counts()

Exited
0    796
1    204
Name: count, dtype: int64

In [7]:
y_oos.value_counts()

Exited
0    796
1    204
Name: count, dtype: int64

In [8]:
x_train.columns

Index(['Geography', 'Gender', 'EstimatedSalary', 'CreditScore_Custom',
       'Age_Group_Custom', 'Balance', 'NumOfProducts', 'Tenure',
       'IsActiveMember'],
      dtype='object')

In [9]:
x_train['Gender'].dtypes

dtype('O')

In [10]:
x_train['Balance'].dtypes

dtype('float64')

In [11]:
numeric_columns = [col for col in x_train.columns if x_train[col].dtype != 'O' and col!='CustomerId']
numeric_columns
categorical_columns = [col for col in x_train.columns if x_train[col].dtype == 'O']
print(categorical_columns)

['Geography', 'Gender', 'CreditScore_Custom', 'Age_Group_Custom']


In [12]:
labels_endcode = ['Gender']
one_hot_labels = [val for val in categorical_columns if val not in labels_endcode]
print(labels_endcode,one_hot_labels)


['Gender'] ['Geography', 'CreditScore_Custom', 'Age_Group_Custom']


In [13]:
log_transform_cols = ['EstimatedSalary', 'Balance']

### Model Selection Phase

#### i) Freezing the Preprocessing Pipeline

In [14]:
# --- 1. Define Preprocessing Steps ---

# --- 1.1.  Label Encoding ---
label_encode_processor = Pipeline(
    steps=[
         ("label_encoder", OrdinalEncoder()),
    ]
)

# --- 1.2. One-Hot Encoding ---
onehot_encode_processor = Pipeline(
    steps=[
        ("onehot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# --- 1.3. Log Transformation ---
def log_transform(x):
    # Handle non-positive values by adding a small constant
    x_adjusted = np.where(x <= 0, 1e-6, x)
    return np.log(x_adjusted)

log_transform_processor = Pipeline(
    steps=[
        ("log_transformer", FunctionTransformer(log_transform)),
    ]
)
# --- 2.  Column Transformer ---
# Apply the different transformations to the specified columns
preprocessor = ColumnTransformer(
    transformers=[
        ("label_encode", label_encode_processor, labels_endcode),
        ("onehot_encode", onehot_encode_processor, one_hot_labels),
        ("log_transform", log_transform_processor, log_transform_cols),
    ],
    remainder="passthrough",  # Keep the remaining columns as is
)

# --- 3.  Pipeline ---
# Combine the preprocessor with optional missing value handling

positive_class_count = np.sum(y_train == 1)
negative_class_count = np.sum(y_train == 0)
scale_pos_weight = negative_class_count / positive_class_count

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])



In [15]:
X_train_processed_default = pipeline.fit_transform(x_train)
X_test_processed_default = pipeline.transform(x_test)

# --- 7.  Convert processed data to DataFrame for PyCaret ---
#  Convert to dataframe for pycaret

X_train_processed_df = pd.DataFrame(X_train_processed_default)
X_train_processed_df['target'] = y_train.values # add target variable

X_test_processed_df = pd.DataFrame(X_test_processed_default)
X_test_processed_df['target'] = y_test.values



In [16]:
# all_models = models()

#### ii) Using Pycaret to find the best model

In [19]:
# --- 8. PyCaret Setup ---
# PyCaret setup
s = setup(X_train_processed_df, target='target', fold_strategy="stratifiedkfold") # Added fold_strategy

# --- 9. Compare Models ---
# Compare models and select the best one
best_model = compare_models()
pull()

Unnamed: 0,Description,Value
0,Session id,5624
1,Target,target
2,Target type,Binary
3,Original data shape,"(7997, 20)"
4,Transformed data shape,"(7997, 20)"
5,Transformed train set shape,"(5597, 20)"
6,Transformed test set shape,"(2400, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8494,0.8423,0.3974,0.7447,0.5166,0.4375,0.4684,0.183
ada,Ada Boost Classifier,0.8453,0.8213,0.3825,0.7283,0.5003,0.4195,0.4505,0.076
lightgbm,Light Gradient Boosting Machine,0.8437,0.8322,0.4412,0.6793,0.5333,0.4449,0.4606,0.134
rf,Random Forest Classifier,0.8421,0.8177,0.4246,0.6824,0.5222,0.4339,0.4521,0.178
xgboost,Extreme Gradient Boosting,0.8333,0.8105,0.4351,0.6325,0.5144,0.4182,0.4294,0.069
et,Extra Trees Classifier,0.826,0.7998,0.4351,0.6014,0.5036,0.4017,0.41,0.138
ridge,Ridge Classifier,0.8206,0.7569,0.2061,0.7094,0.3181,0.2493,0.3136,0.015
lda,Linear Discriminant Analysis,0.8206,0.7568,0.2825,0.6418,0.3904,0.3034,0.3396,0.013
lr,Logistic Regression,0.8197,0.7606,0.2553,0.6476,0.3652,0.2827,0.325,0.663
knn,K Neighbors Classifier,0.8106,0.7193,0.2798,0.5737,0.3742,0.2784,0.3042,0.073


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8494,0.8423,0.3974,0.7447,0.5166,0.4375,0.4684,0.183
ada,Ada Boost Classifier,0.8453,0.8213,0.3825,0.7283,0.5003,0.4195,0.4505,0.076
lightgbm,Light Gradient Boosting Machine,0.8437,0.8322,0.4412,0.6793,0.5333,0.4449,0.4606,0.134
rf,Random Forest Classifier,0.8421,0.8177,0.4246,0.6824,0.5222,0.4339,0.4521,0.178
xgboost,Extreme Gradient Boosting,0.8333,0.8105,0.4351,0.6325,0.5144,0.4182,0.4294,0.069
et,Extra Trees Classifier,0.826,0.7998,0.4351,0.6014,0.5036,0.4017,0.41,0.138
ridge,Ridge Classifier,0.8206,0.7569,0.2061,0.7094,0.3181,0.2493,0.3136,0.015
lda,Linear Discriminant Analysis,0.8206,0.7568,0.2825,0.6418,0.3904,0.3034,0.3396,0.013
lr,Logistic Regression,0.8197,0.7606,0.2553,0.6476,0.3652,0.2827,0.325,0.663
knn,K Neighbors Classifier,0.8106,0.7193,0.2798,0.5737,0.3742,0.2784,0.3042,0.073


##

#### ii) Ensemble model performance (XG Boost)
Handling imbalance with scale_pos_weight

In [20]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score  # Import metrics
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

positive_class_count = np.sum(y_train == 1)
negative_class_count = np.sum(y_train == 0)
scale_pos_weight = negative_class_count / positive_class_count

# --- 8.  XGBoost Classifier ---
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)  # Suppress a warning and set the evaluation metric
xgb_model.fit(X_train_processed_default, y_train)
xgb_y_pred = xgb_model.predict(X_test_processed_default)
xgb_y_prob = xgb_model.predict_proba(X_test_processed_default)[:, 1] # Probability estimates for ROC AUC

# --- 9. CatBoost Classifier ---
cat_model = CatBoostClassifier(verbose=0, scale_pos_weight=scale_pos_weight)  # Suppress verbose output
cat_model.fit(X_train_processed_default, y_train)
cat_y_pred = cat_model.predict(X_test_processed_default)
cat_y_prob = cat_model.predict_proba(X_test_processed_default)[:, 1] # Probability estimates for ROC AUC


# --- 10. Combine Primary Key with Predictions and Actuals ---
results_df_xgb = pd.DataFrame(
    {"customer_id": test_primary_key, "prediction_xgb": xgb_y_pred, "actual": y_test}
)
results_df_cat = pd.DataFrame(
    {"customer_id": test_primary_key, "prediction_cat": cat_y_pred, "actual": y_test}
)

# --- 11. Calculate and Print Performance Metrics ---
print("\n--- XGBoost Performance ---")
print(classification_report(y_test, xgb_y_pred))
print(f"Accuracy: {accuracy_score(y_test, xgb_y_pred):.4f}")
print(f"Precision: {precision_score(y_test, xgb_y_pred):.4f}")
print(f"Recall: {recall_score(y_test, xgb_y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, xgb_y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, xgb_y_prob):.4f}")

print("\n--- CatBoost Performance ---")
print(classification_report(y_test, cat_y_pred))
print(f"Accuracy: {accuracy_score(y_test, cat_y_pred):.4f}")
print(f"Precision: {precision_score(y_test, cat_y_pred):.4f}")
print(f"Recall: {recall_score(y_test, cat_y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, cat_y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, cat_y_prob):.4f}")
print("\n--- Transformed Training Data (First 5 rows) ---")


--- XGBoost Performance ---
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       796
           1       0.56      0.66      0.60       204

    accuracy                           0.82      1000
   macro avg       0.73      0.76      0.75      1000
weighted avg       0.84      0.82      0.83      1000

Accuracy: 0.8240
Precision: 0.5583
Recall: 0.6569
F1 Score: 0.6036
ROC AUC: 0.8392

--- CatBoost Performance ---
              precision    recall  f1-score   support

           0       0.92      0.83      0.88       796
           1       0.53      0.73      0.61       204

    accuracy                           0.81      1000
   macro avg       0.73      0.78      0.74      1000
weighted avg       0.84      0.81      0.82      1000

Accuracy: 0.8120
Precision: 0.5286
Recall: 0.7255
F1 Score: 0.6116
ROC AUC: 0.8547

--- Transformed Training Data (First 5 rows) ---


Inference: With high recall I woul be able to identify all the people who will churn. Catboost seems to perform good over this dataset

#### iii) Ensemble model performance (Catboost Boost)
Handling imbalance with scale_pos_weight

In [24]:
# --- 1. Define Preprocessing Steps ---

# --- 1.1.  Label Encoding ---
label_encode_processor = Pipeline(
    steps=[
         ("label_encoder", OrdinalEncoder()),
    ]
)

# --- 1.2. One-Hot Encoding ---
onehot_encode_processor = Pipeline(
    steps=[
        ("onehot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# --- 1.3. Log Transformation ---
def log_transform(x):
    # Handle non-positive values by adding a small constant
    x_adjusted = np.where(x <= 0, 1e-6, x)
    return np.log(x_adjusted)

log_transform_processor = Pipeline(
    steps=[
        ("log_transformer", FunctionTransformer(log_transform)),
    ]
)
# --- 2.  Column Transformer ---
# Apply the different transformations to the specified columns
preprocessor = ColumnTransformer(
    transformers=[
        ("label_encode", label_encode_processor, labels_endcode),
        ("onehot_encode", onehot_encode_processor, one_hot_labels),
        ("log_transform", log_transform_processor, log_transform_cols),
    ],
    remainder="passthrough",  # Keep the remaining columns as is
)

# --- 3.  Pipeline ---
# Combine the preprocessor with optional missing value handling

positive_class_count = np.sum(y_train == 1)
negative_class_count = np.sum(y_train == 0)
scale_pos_weight = negative_class_count / positive_class_count

model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("model",CatBoostClassifier())])



In [26]:
model_pipeline.fit(x_train, y_train)

# --- 11. Get Best Model and Predictions ---
# best_model = model_pipeline.best_estimator_
y_pred = model_pipeline.predict(x_test)
y_prob = model_pipeline.predict_proba(x_test)[:, 1]

# --- 12. Combine Primary Key with Predictions and Actuals ---
results_df = pd.DataFrame(
    {"customer_id": test_primary_key, "prediction": y_pred, "actual": y_test}
)

# --- 13. Calculate and Print Performance Metrics ---
print("\n--- CatBoost Performance ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")



Learning rate set to 0.025031
0:	learn: 0.6738073	total: 7.33ms	remaining: 7.32s
1:	learn: 0.6556375	total: 13.2ms	remaining: 6.59s
2:	learn: 0.6370100	total: 18.4ms	remaining: 6.11s
3:	learn: 0.6200884	total: 23.2ms	remaining: 5.77s
4:	learn: 0.6036148	total: 28ms	remaining: 5.58s
5:	learn: 0.5879113	total: 32.8ms	remaining: 5.43s
6:	learn: 0.5740236	total: 37.2ms	remaining: 5.28s
7:	learn: 0.5611483	total: 41ms	remaining: 5.09s
8:	learn: 0.5499641	total: 45.2ms	remaining: 4.97s
9:	learn: 0.5414537	total: 49ms	remaining: 4.85s
10:	learn: 0.5306282	total: 52.7ms	remaining: 4.74s
11:	learn: 0.5205658	total: 56.9ms	remaining: 4.68s
12:	learn: 0.5117097	total: 61.3ms	remaining: 4.65s
13:	learn: 0.5029038	total: 65.6ms	remaining: 4.62s
14:	learn: 0.4939518	total: 70.2ms	remaining: 4.61s
15:	learn: 0.4876547	total: 74.3ms	remaining: 4.57s
16:	learn: 0.4801719	total: 78.7ms	remaining: 4.55s
17:	learn: 0.4743804	total: 83.1ms	remaining: 4.53s
18:	learn: 0.4686454	total: 87.4ms	remaining: 4.51

### Hyperparameter Phase

### Hyper parameter tuning - Cat boost with numerical values


In [None]:
# --- 1. Define Preprocessing Steps ---

# --- 1.1.  Label Encoding ---
label_encode_processor = Pipeline(
    steps=[
         ("label_encoder", OrdinalEncoder()),
    ]
)

# --- 1.2. One-Hot Encoding ---
onehot_encode_processor = Pipeline(
    steps=[
        ("onehot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# --- 1.3. Log Transformation ---
def log_transform(x):
    # Handle non-positive values by adding a small constant
    x_adjusted = np.where(x <= 0, 1e-6, x)
    return np.log(x_adjusted)

log_transform_processor = Pipeline(
    steps=[
        ("log_transformer", FunctionTransformer(log_transform)),
    ]
)
# --- 2.  Column Transformer ---
# Apply the different transformations to the specified columns
preprocessor = ColumnTransformer(
    transformers=[
        # ("label_encode", label_encode_processor, labels_endcode),
        # ("onehot_encode", onehot_encode_processor, one_hot_labels),
        ("log_transform", log_transform_processor, log_transform_cols),
    ],
    remainder="passthrough",  # Keep the remaining columns as is
)

# --- 3.  Pipeline ---
# Combine the preprocessor with optional missing value handling

positive_class_count = np.sum(y_train == 1)
negative_class_count = np.sum(y_train == 0)
scale_pos_weight = negative_class_count / positive_class_count

model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("model",CatBoostClassifier())])



In [None]:
cat_variables

['Geography', 'Gender', 'CreditScore_Custom', 'Age_Group_Custom']

In [None]:
# x_train.columns

In [None]:
# cat_model = model_pipeline.fit(x_train,y_train)

In [None]:
# 'model__cat_features': cat_variables,
# cat_variables

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# --- 9. Define Parameter Grid for Tuning ---
param_grid = {
        'model__l2_leaf_reg': [1, 3, 5],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__iterations': [50, 100, 150],
        'model__scale_pos_weight': [scale_pos_weight],
        'model__border_count': [20, 32, 50],  
        'model__random_strength': [0.5, 1, 5],
    }

# --- 10. Perform Grid Search ---
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='precision', verbose=1)
grid_search.fit(x_train, y_train)

# --- 11. Get Best Model and Predictions ---
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
y_prob = best_model.predict_proba(x_test)[:, 1]

# --- 12. Combine Primary Key with Predictions and Actuals ---
results_df = pd.DataFrame(
    {"customer_id": test_primary_key, "prediction": y_pred, "actual": y_test}
)

# --- 13. Calculate and Print Performance Metrics ---
print("\n--- CatBoost Performance ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

print("\n--- Best Parameters ---")
print(grid_search.best_params_)


Fitting 3 folds for each of 243 candidates, totalling 729 fits
0:	learn: 0.6907856	total: 26.2ms	remaining: 1.28s
1:	learn: 0.6885034	total: 60.8ms	remaining: 1.46s
2:	learn: 0.6862295	total: 91.2ms	remaining: 1.43s
3:	learn: 0.6839963	total: 120ms	remaining: 1.38s
4:	learn: 0.6818290	total: 157ms	remaining: 1.41s
5:	learn: 0.6796605	total: 190ms	remaining: 1.39s
6:	learn: 0.6776891	total: 224ms	remaining: 1.37s
7:	learn: 0.6756389	total: 257ms	remaining: 1.35s
8:	learn: 0.6737057	total: 290ms	remaining: 1.32s
9:	learn: 0.6716766	total: 322ms	remaining: 1.29s
10:	learn: 0.6697209	total: 355ms	remaining: 1.26s
11:	learn: 0.6678065	total: 390ms	remaining: 1.24s
12:	learn: 0.6658796	total: 424ms	remaining: 1.21s
13:	learn: 0.6642430	total: 447ms	remaining: 1.15s
14:	learn: 0.6624305	total: 482ms	remaining: 1.12s
15:	learn: 0.6605723	total: 514ms	remaining: 1.09s
16:	learn: 0.6587386	total: 547ms	remaining: 1.06s
17:	learn: 0.6570146	total: 575ms	remaining: 1.02s
18:	learn: 0.6553237	total

### Hyper parameter tuning - Cat boost with Categorical features

In [None]:
x_train.columns

Index(['Geography', 'Gender', 'EstimatedSalary', 'CreditScore_Custom',
       'Age_Group_Custom', 'Balance', 'NumOfProducts', 'Tenure',
       'IsActiveMember'],
      dtype='object')

In [None]:
cat_variables =  [col for col in x_train.columns if x_train[col].dtype == 'O']
other_variables = list(set(x_train.columns.to_list()) - set(categorical_columns))

final_cols = cat_variables+other_variables
x_train = x_train[final_cols]
x_train.columns

Index(['Geography', 'Gender', 'CreditScore_Custom', 'Age_Group_Custom',
       'EstimatedSalary', 'Tenure', 'Balance', 'IsActiveMember',
       'NumOfProducts'],
      dtype='object')

In [None]:
# --- 1. Define Preprocessing Steps ---

# --- 1.1.  Label Encoding ---
label_encode_processor = Pipeline(
    steps=[
         ("label_encoder", OrdinalEncoder()),
    ]
)

# --- 1.2. One-Hot Encoding ---
onehot_encode_processor = Pipeline(
    steps=[
        ("onehot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# --- 1.3. Log Transformation ---
def log_transform(x):
    # Handle non-positive values by adding a small constant
    x_adjusted = np.where(x <= 0, 1e-6, x)
    return np.log(x_adjusted)

log_transform_processor = Pipeline(
    steps=[
        ("log_transformer", FunctionTransformer(log_transform)),
    ]
)
# --- 2.  Column Transformer ---
# Apply the different transformations to the specified columns
preprocessor = ColumnTransformer(
    transformers=[
        # ("label_encode", label_encode_processor, labels_endcode),
        # ("onehot_encode", onehot_encode_processor, one_hot_labels),
        ("log_transform", log_transform_processor, log_transform_cols),
    ],
    remainder="passthrough",  # Keep the remaining columns as is
)

# --- 3.  Pipeline ---
# Combine the preprocessor with optional missing value handling

positive_class_count = np.sum(y_train == 1)
negative_class_count = np.sum(y_train == 0)
scale_pos_weight = negative_class_count / positive_class_count

model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("model",CatBoostClassifier(cat_features=[2,3,4,5]))])




In [None]:
cat_model_2 = model_pipeline.fit(x_train,y_train)

Learning rate set to 0.025031
0:	learn: 0.6769137	total: 33.8ms	remaining: 33.7s
1:	learn: 0.6583311	total: 69ms	remaining: 34.4s
2:	learn: 0.6403773	total: 97.7ms	remaining: 32.5s
3:	learn: 0.6250052	total: 120ms	remaining: 30s
4:	learn: 0.6114915	total: 141ms	remaining: 28.1s
5:	learn: 0.5964122	total: 166ms	remaining: 27.4s
6:	learn: 0.5824357	total: 189ms	remaining: 26.9s
7:	learn: 0.5694416	total: 216ms	remaining: 26.8s
8:	learn: 0.5570821	total: 241ms	remaining: 26.5s
9:	learn: 0.5458768	total: 265ms	remaining: 26.2s
10:	learn: 0.5344774	total: 289ms	remaining: 26s
11:	learn: 0.5237179	total: 314ms	remaining: 25.9s
12:	learn: 0.5151085	total: 339ms	remaining: 25.7s
13:	learn: 0.5077071	total: 364ms	remaining: 25.6s
14:	learn: 0.5006855	total: 388ms	remaining: 25.5s
15:	learn: 0.4920538	total: 412ms	remaining: 25.3s
16:	learn: 0.4833416	total: 437ms	remaining: 25.2s
17:	learn: 0.4759791	total: 463ms	remaining: 25.3s
18:	learn: 0.4702970	total: 489ms	remaining: 25.2s
19:	learn: 0.4

In [None]:
# --- 11. Get Best Model and Predictions ---
best_model = grid_search.best_estimator_
y_pred = cat_model_2.predict(x_test[final_cols])
y_prob = cat_model_2.predict_proba(x_test[final_cols])[:, 1]

# --- 12. Combine Primary Key with Predictions and Actuals ---
results_df = pd.DataFrame(
    {"customer_id": test_primary_key, "prediction": y_pred, "actual": y_test}
)

# --- 13. Calculate and Print Performance Metrics ---
print("\n--- CatBoost Performance ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

print("\n--- Best Parameters ---")
print(grid_search.best_params_)



--- CatBoost Performance ---
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       796
           1       0.71      0.46      0.56       204

    accuracy                           0.85      1000
   macro avg       0.79      0.71      0.74      1000
weighted avg       0.84      0.85      0.84      1000

Accuracy: 0.8520
Precision: 0.7121
Recall: 0.4608
F1 Score: 0.5595
ROC AUC: 0.8609

--- Best Parameters ---
{'model__iterations': 150, 'model__l2_leaf_reg': 1, 'model__learning_rate': 0.1, 'model__scale_pos_weight': 3.90914671577655}


Inference: Choosing the catboost classifier and leaving the categorical variable as it is not helping much meaning no significant changes

