In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

# ---------------------------------------------------------------------------
# Load the CSV inputs: the submission template, the train/test tables, and
# Churn_Modelling.csv, which is appended to the training rows below.
# ---------------------------------------------------------------------------
samp = pd.read_csv('./sample_submission.csv')
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
main = pd.read_csv('./Churn_Modelling.csv')

# Remove the 'RowNumber' / 'id' columns (presumably plain row identifiers --
# confirm against the data) so they are not used as features.
main.drop(columns='RowNumber', inplace=True)
train.drop(columns='id', inplace=True)
test.drop(columns='id', inplace=True)

# Stack the extra table underneath the training rows (row-wise concat).
# Original row labels are kept, so index labels may repeat after this step.
train = pd.concat([train, main])

# Columns treated as categorical; every other column except 'Exited' is
# treated as numeric.
cat_cols = ['Geography', 'Gender', 'Surname']
num_cols = [name for name in train.columns if name not in (*cat_cols, 'Exited')]

# 'Exited' is the prediction target; the remaining columns are the features.
y = train['Exited']
X = train.drop(columns='Exited')

# Numeric branch: fill gaps with the column median, then apply RobustScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical branch: only replace missing entries with the literal 'missing'.
# No encoding happens here; the category values themselves are passed through.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
])

# Route each column group through its branch. The numeric group is listed
# first, so its columns come first in the transformed output, followed by the
# categorical columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Fit on the full training feature frame X (before any fold split), then reuse
# the fitted transformer on the test frame.
X_preprocessed = preprocessor.fit_transform(X)
X_test_preprocessed = preprocessor.transform(test)

# Column positions of the categorical features in the transformed matrix:
# they start immediately after the numeric block.
num_features_count = len(num_cols)
cat_features_indices = [num_features_count + offset for offset in range(len(cat_cols))]

# 5-fold stratified cross-validation; shuffling uses a fixed seed so the
# splits are the same on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []  # validation AUC of each fold
test_preds = []   # positive-class probabilities on the test set, one array per fold

for fold, (train_index, valid_index) in enumerate(kf.split(X_preprocessed, y)):

    # Positional row selection for this fold (array indexing for the features,
    # .iloc for the target Series).
    X_train_fold = X_preprocessed[train_index]
    X_valid_fold = X_preprocessed[valid_index]
    y_train_fold = y.iloc[train_index]
    y_valid_fold = y.iloc[valid_index]

    # A fresh model per fold; categorical columns are identified by position.
    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=3,
        eval_metric='AUC',
        cat_features=cat_features_indices,
        random_seed=42,
        verbose=100,
        early_stopping_rounds=100,
    )

    # The validation fold doubles as the early-stopping eval set, and the
    # best iteration on it is the one kept.
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=(X_valid_fold, y_valid_fold),
        use_best_model=True,
    )

    # Score this fold on the probability of the positive class (column 1).
    valid_pred = model.predict_proba(X_valid_fold)[:, 1]
    fold_auc = roc_auc_score(y_valid_fold, valid_pred)
    fold_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Keep this fold's test-set probabilities for averaging after the loop.
    test_pred = model.predict_proba(X_test_preprocessed)[:, 1]
    test_preds.append(test_pred)

print("\nFold AUC scores:", fold_scores)
print("Mean AUC:", np.mean(fold_scores))

# Average the per-fold test predictions element-wise across folds.
final_test_pred = np.asarray(test_preds).mean(axis=0)

# Write the averaged probabilities into the submission template.
# NOTE(review): this writes to the same filename the template was read from.
samp['Exited'] = final_test_pred
samp.to_csv('sample_submission.csv', index=False)


0:	test: 0.8702285	best: 0.8702285 (0)	total: 882ms	remaining: 29m 23s
100:	test: 0.8904251	best: 0.8904251 (100)	total: 20.4s	remaining: 6m 23s
200:	test: 0.8916584	best: 0.8916584 (200)	total: 37.9s	remaining: 5m 39s
300:	test: 0.8920705	best: 0.8920705 (300)	total: 57.8s	remaining: 5m 26s
400:	test: 0.8922292	best: 0.8922546 (379)	total: 1m 16s	remaining: 5m 6s
500:	test: 0.8923180	best: 0.8923561 (447)	total: 1m 35s	remaining: 4m 45s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8923561055
bestIteration = 447

Shrink model to first 448 iterations.
Fold 1 AUC: 0.8924
0:	test: 0.8758031	best: 0.8758031 (0)	total: 209ms	remaining: 6m 58s
100:	test: 0.8915519	best: 0.8915519 (100)	total: 19.6s	remaining: 6m 9s
200:	test: 0.8928068	best: 0.8928115 (199)	total: 37.1s	remaining: 5m 31s
300:	test: 0.8932632	best: 0.8932677 (299)	total: 55.3s	remaining: 5m 12s
400:	test: 0.8935484	best: 0.8935489 (399)	total: 1m 16s	remaining: 5m 3s
500:	test: 0.8936322	best: 0.893642