In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier, Pool, metrics, cv
from tqdm.auto import tqdm
import optuna

tqdm.pandas()

SEED = 2007

In [2]:
df = pd.read_csv('../data/train_new.csv')
df_test = pd.read_csv('../data/test_new.csv')

df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,261415,B,L,C,A,E,BI,A,AN,BJ,...,0.371697,0.353221,0.786306,0.736841,0.51529,0.339148,0.388802,0.485555,0.309845,0
1,143439,A,F,A,A,J,BI,A,AX,H,...,0.928611,0.12855,0.21167,0.81057,0.615746,0.408094,0.67133,0.265355,0.542658,0
2,135092,A,K,A,A,F,BI,A,N,AT,...,0.71387,0.24326,0.594517,0.735582,0.198877,0.777833,0.690787,0.5506,0.492067,0
3,326379,B,L,D,A,E,BI,A,F,L,...,0.787636,0.884292,0.723356,0.554065,0.273801,0.896823,0.751721,0.944222,0.834986,0
4,295612,B,N,A,D,E,BI,A,I,K,...,0.457731,0.682554,0.265673,0.357985,0.399585,0.542616,0.53541,0.359659,0.392999,0


## EDA

In [16]:
df.shape

(201000, 32)

In [3]:
df.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4',
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'target'],
      dtype='object')

In [4]:
df.describe()

Unnamed: 0,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
count,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0,201000.0
mean,250373.250418,0.504651,0.494231,0.516508,0.474128,0.505071,0.502358,0.488467,0.501708,0.487961,0.469658,0.50805,0.264871
std,144307.234038,0.206863,0.213308,0.214926,0.21674,0.227584,0.241415,0.211027,0.20343,0.179157,0.194539,0.203405,0.441265
min,0.0,-0.049562,0.08448,0.099276,-0.044761,0.169495,-0.036379,0.011942,0.092014,0.030979,0.214866,0.097789,0.0
25%,125520.25,0.34501,0.317305,0.325797,0.29201,0.279503,0.276298,0.324549,0.353144,0.358761,0.31008,0.368246,0.0
50%,250390.0,0.479549,0.463075,0.471004,0.387194,0.479879,0.554774,0.51813,0.435524,0.410529,0.408217,0.446279,0.0
75%,375716.75,0.637544,0.694397,0.704677,0.64494,0.72615,0.735266,0.608594,0.640702,0.612416,0.587074,0.581383,1.0
max,499999.0,1.004559,1.009958,1.0166,0.944037,0.856975,0.853022,0.966553,1.035818,1.054257,1.005652,1.011331,1.0


In [5]:
df['target'].value_counts()

target
0    147761
1     53239
Name: count, dtype: int64

In [6]:
df.isna().sum()

id        0
cat0      0
cat1      0
cat2      0
cat3      0
cat4      0
cat5      0
cat6      0
cat7      0
cat8      0
cat9      0
cat10     0
cat11     0
cat12     0
cat13     0
cat14     0
cat15     0
cat16     0
cat17     0
cat18     0
cont0     0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
target    0
dtype: int64

In [7]:
df.dtypes

id          int64
cat0       object
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cat10      object
cat11      object
cat12      object
cat13      object
cat14      object
cat15      object
cat16      object
cat17      object
cat18      object
cont0     float64
cont1     float64
cont2     float64
cont3     float64
cont4     float64
cont5     float64
cont6     float64
cont7     float64
cont8     float64
cont9     float64
cont10    float64
target      int64
dtype: object

## Data preparing

In [8]:
target = "target"
features = df.drop(target, axis=1).columns.to_numpy()
cat_cols = df.select_dtypes(include=['object']).columns.to_numpy()

In [9]:
def make_subm(model: CatBoostClassifier, name="submission.csv"):
    df_test['target'] = model.predict_proba(df_test[features])[:, 1]
    df_test[['id', 'target']].to_csv(f"../submissions/{name}", index=False)

### Baseline

In [10]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, shuffle=True, stratify=y)

In [11]:
model = CatBoostClassifier(
    iterations=1000,
    random_seed=SEED,
    eval_metric='AUC',
    verbose=100,
    loss_function='Logloss',
    early_stopping_rounds=100,
    use_best_model=True,
    cat_features=cat_cols,
    auto_class_weights='Balanced'
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

Learning rate set to 0.111138
0:	test: 0.8545861	best: 0.8545861 (0)	total: 153ms	remaining: 2m 32s
100:	test: 0.8883716	best: 0.8883716 (100)	total: 7.3s	remaining: 1m 5s
200:	test: 0.8917422	best: 0.8917422 (200)	total: 14.5s	remaining: 57.8s
300:	test: 0.8929241	best: 0.8929241 (300)	total: 21.5s	remaining: 49.9s
400:	test: 0.8933364	best: 0.8933413 (397)	total: 28.4s	remaining: 42.5s
500:	test: 0.8933105	best: 0.8933443 (431)	total: 35.5s	remaining: 35.3s
600:	test: 0.8934700	best: 0.8934700 (600)	total: 42.6s	remaining: 28.3s
700:	test: 0.8937312	best: 0.8937312 (700)	total: 49.7s	remaining: 21.2s
800:	test: 0.8936652	best: 0.8937871 (738)	total: 57s	remaining: 14.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8937871221
bestIteration = 738

Shrink model to first 739 iterations.


<catboost.core.CatBoostClassifier at 0x7ffa382df830>

In [12]:
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

0.8937871221002482

In [13]:
make_subm(model, "base_submission1.csv")

### Stratified KFold

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

models = []
val_scores = []

for train_index, val_index in tqdm(skf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    model = CatBoostClassifier(
        iterations=1000,
        random_seed=SEED,
        eval_metric='AUC',
        loss_function='Logloss',
        verbose=250,
        use_best_model=True,
        # learning_rate=0.01,
        early_stopping_rounds=100,
        auto_class_weights='Balanced'
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_cols
    )
    
    models.append(model)
    
    score = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    val_scores.append(score)
    
    print(f"Fold score: {score}")
    
print(f"Mean score: {np.mean(val_scores)}")

0it [00:00, ?it/s]

Learning rate set to 0.112264
0:	test: 0.8444682	best: 0.8444682 (0)	total: 86.5ms	remaining: 1m 26s
250:	test: 0.8912933	best: 0.8912933 (250)	total: 17.4s	remaining: 51.8s
500:	test: 0.8925569	best: 0.8925667 (477)	total: 34.7s	remaining: 34.6s
750:	test: 0.8928494	best: 0.8928812 (714)	total: 53s	remaining: 17.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.89294349
bestIteration = 882

Shrink model to first 883 iterations.
Fold score: 0.8929434900028155
Learning rate set to 0.112264
0:	test: 0.8462626	best: 0.8462626 (0)	total: 102ms	remaining: 1m 41s
250:	test: 0.8929435	best: 0.8929435 (250)	total: 18.4s	remaining: 55s
500:	test: 0.8942649	best: 0.8942823 (497)	total: 36.3s	remaining: 36.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8943786156
bestIteration = 569

Shrink model to first 570 iterations.
Fold score: 0.8943786155894726
Learning rate set to 0.112264
0:	test: 0.8467949	best: 0.8467949 (0)	total: 75ms	remaining: 1m 14s
2

In [28]:
def make_pred(X_test):
    preds = []
    for model in models:
        preds.append(model.predict_proba(X_test)[:, 1])
    
    preds = np.mean(preds, axis=0)
    
    # preds = [1 if i > 0.5 else 0 for i in preds]
    
    return preds

In [29]:
def make_subm_kfold(name="submission.csv"):
    df_test['target'] = make_pred(df_test[features])
    df_test[['id', 'target']].to_csv(f"../submissions/{name}", index=False)
    return df_test['target'].mean()

In [30]:
make_subm_kfold()

np.float64(0.38145185269026943)

### Optuna

In [36]:
# def objective(trial):
#     # Define hyperparameter search space
#     params = {
#         'iterations': trial.suggest_int('iterations', 500, 2000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
#         'depth': trial.suggest_int('depth', 4, 10),
#         # 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
#         # 'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
#         # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#         # 'border_count': trial.suggest_categorical('border_count', [32, 64, 128, 254]),
#         # 'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
#         # 'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
#         'random_seed': SEED,
#         'eval_metric': 'AUC',
#         'verbose': 250,
#         'loss_function': 'Logloss',
#         'auto_class_weights': 'Balanced',
#         'cat_features': cat_cols,
#         'early_stopping_rounds': 100,
#         'use_best_model': True
#     }
    
#     # # Optional: Add leaf estimation parameters
#     # if params['grow_policy'] == 'Lossguide':
#     #     params['max_leaves'] = trial.suggest_int('max_leaves', 16, 256)
    
#     # Choose fold strategy
#     n_splits = 5
#     cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
#     cv_scores = []
    
#     for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
#         # Split data
#         X_tr = X_train.iloc[train_idx]
#         X_val = X_train.iloc[val_idx]
#         y_tr = y_train.iloc[train_idx]
#         y_val = y_train.iloc[val_idx]
        
#         # Initialize model
#         model = CatBoostClassifier(**params)
        
#         # Train with early stopping
#         model.fit(
#             X_tr, y_tr,
#             eval_set=(X_val, y_val),
#             verbose=False
#         )
        
#         # Predict and evaluate
#         y_pred = model.predict_proba(X_val)[:, 1]
#         score = roc_auc_score(y_val, y_pred)
#         cv_scores.append(score)
    
#     # Return mean CV score
#     return np.mean(cv_scores)

# # Create Optuna study
# study = optuna.create_study(
#     direction='maximize',
#     study_name='catboost_optuna',
#     sampler=optuna.samplers.TPESampler(seed=SEED),
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
# )

# # Optimize
# study.optimize(
#     objective,
#     n_trials=100,
#     timeout=60 * 10,  # 1 hour timeout (optional)
#     show_progress_bar=True
# )

# # Print results
# print(f"Best trial:")
# print(f"  Value (AUC): {study.best_value:.5f}")
# print(f"  Params: ")
# for key, value in study.best_params.items():
#     print(f"    {key}: {value}")

# # Train final model with best parameters
# best_params = study.best_params.copy()

# # Add fixed parameters
# best_params.update({
#     'random_seed': SEED,
#     'eval_metric': 'AUC',
#     'verbose': 100,
#     'loss_function': 'Logloss',
#     'auto_class_weights': 'Balanced',
#     'cat_features': cat_cols,
#     'early_stopping_rounds': 100,
#     'use_best_model': True
# })

# final_model = CatBoostClassifier(**best_params)
# final_model.fit(X_train, y_train, eval_set=(X_test, y_test))

# import joblib
# joblib.dump(study, 'catboost_optuna_study.pkl')