In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from catboost import CatBoostClassifier
import joblib

In [2]:
# Read the churn CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/churn-modeling-dataset/Churn_Modelling.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Remove the row counter, customer ID and surname columns before modelling.
identifier_cols = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=identifier_cols)

In [4]:
# Engineered features derived from the raw columns.
# The "+ 1" added to each denominator guards against division by zero.
df['BalanceSalaryRatio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
df['AgeTenureRatio'] = df['Age'] / (df['Tenure'] + 1)
df['ProductsPerTenure'] = df['NumOfProducts'] / (df['Tenure'] + 1)

# Binary indicator flags (stored as 0/1 integers).
df['HasBalanceFlag'] = (df['Balance'] > 0).astype(int)
df['IsSeniorCitizen'] = (df['Age'] > 60).astype(int)

# Bucket the credit score into three ordered bands labelled 0, 1, 2:
#   [300, 500], (500, 650], (650, 850]
# pd.cut builds right-closed intervals by default, which would leave the
# lowest edge (exactly 300) outside every bin and turn it into NaN.
# include_lowest=True closes the first interval on the left so that value
# lands in band 0. Scores outside 300-850 still become NaN.
df['CreditScoreBin'] = pd.cut(
    df['CreditScore'],
    bins=[300, 500, 650, 850],
    labels=[0, 1, 2],
    include_lowest=True,
)

In [5]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the remaining dummies are not redundant.
categorical_cols = ['Geography', 'Gender']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [6]:
# Report the (rows, columns) shape of the fully prepared frame.
final_shape = df.shape
print("Final Dataset Shape:", final_shape)

Final Dataset Shape: (10000, 18)


In [7]:
# Separate the target column from the predictors.
target_col = 'Exited'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing. stratify=y splits in a stratified
# fashion using the target as class labels; the fixed random_state makes the
# split repeatable.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Hyperparameters for the CatBoost churn classifier, collected in one place
# so they are easy to inspect and tweak.
catboost_params = {
    'iterations': 800,
    'depth': 8,
    'learning_rate': 0.03,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'auto_class_weights': 'Balanced',  # built-in class re-weighting option
    'random_seed': 42,                 # fixed seed for repeatable training
    'verbose': 0,                      # suppress per-iteration logging
}
cat = CatBoostClassifier(**catboost_params)

In [10]:
# Train on the scaled training rows, then take the model's default hard
# label predictions on the scaled test rows. fit() returns the fitted
# estimator, so the two calls can be chained.
y_pred_cat = cat.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [11]:
# Custom decision cut-off applied to the predicted probabilities.
final_threshold = 0.35

# Column 1 of predict_proba is taken as the positive-class probability.
class_probabilities = cat.predict_proba(X_test_scaled)
y_prob = class_probabilities[:, 1]

# Label a row 1 when its probability reaches the cut-off, else 0.
meets_cutoff = y_prob >= final_threshold
y_pred_final = meets_cutoff.astype(int)

In [12]:
# Compute the evaluation artefacts for the thresholded predictions first,
# then print them under a header that records the threshold used.
report_text = classification_report(y_test, y_pred_final)
conf_matrix = confusion_matrix(y_test, y_pred_final)

print(f"FINAL CATBOOST RESULTS (Threshold = {final_threshold})")
print(report_text)
print(conf_matrix)

FINAL CATBOOST RESULTS (Threshold = 0.35)
              precision    recall  f1-score   support

           0       0.93      0.79      0.85      1593
           1       0.48      0.76      0.58       407

    accuracy                           0.78      2000
   macro avg       0.70      0.77      0.72      2000
weighted avg       0.84      0.78      0.80      2000

[[1255  338]
 [  99  308]]


In [13]:
# Persist the fitted model and scaler with joblib, in that order.
artifacts_to_save = (
    (cat, "catboost_churn_model.pkl"),
    (scaler, "scaler.pkl"),
)
for artifact, filename in artifacts_to_save:
    joblib.dump(artifact, filename)

# Store the decision threshold as plain text next to the pickled artefacts.
with open("threshold.txt", "w") as threshold_file:
    threshold_file.write(str(final_threshold))

print("\nSaved: catboost_churn_model.pkl, scaler.pkl, threshold.txt")


Saved: catboost_churn_model.pkl, scaler.pkl, threshold.txt
