In [62]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

🔹 What is CatBoost?

CatBoost is a gradient boosting algorithm developed by Yandex.

Specially optimized for categorical features (hence the name).

Handles categorical variables natively when they are declared via the `cat_features` argument → no need for manual OneHotEncoding or LabelEncoding of those columns.

Works well out of the box with minimal tuning.

In [63]:
%store -r X_train
X_train = X_train

%store -r X_val
X_val = X_val

%store -r y_train
y_train = y_train

%store -r y_val
y_val = y_val

%store -r X_test
X_test = X_test

%store -r test_passenger_id

In [64]:
# Configure the classifier only; training happens in the next cell.
cat = CatBoostClassifier(
    loss_function='Logloss',   # objective used for classification
    eval_metric='AUC',         # metric reported on the evaluation set
    iterations=500,            # upper bound on boosting rounds (trees)
    learning_rate=0.1,         # step size
    depth=6,                   # depth of each tree
    random_seed=42,            # fixed seed
    verbose=100,               # log training progress every 100 iterations
)

In [65]:
# Train on the training split while monitoring the validation split; the
# overfitting detector waits 50 iterations without improvement before stopping.
cat.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)


0:	test: 0.8257346	best: 0.8257346 (0)	total: 4.17ms	remaining: 2.08s
100:	test: 0.8892505	best: 0.8894736 (97)	total: 685ms	remaining: 2.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8906064111
bestIteration = 115

Shrink model to first 116 iterations.


<catboost.core.CatBoostClassifier at 0x2686d97e120>

In [66]:
# Class predictions for the training split and the validation split, in that order.
y_train_pred, y_val_pred = (cat.predict(split) for split in (X_train, X_val))

In [67]:
# Report accuracy on the training split and on the validation split.
# These are validation figures, not test figures: X_val/y_val are the split
# passed as eval_set for early stopping, so they are not a fully independent
# estimate of generalisation.
print("Train accuracy:", accuracy_score(y_train, y_train_pred))
print("Validation accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred))

Train accuracy: 0.8495879120879121
Test accuracy: 0.7988846287905194

Classification Report (Test):
               precision    recall  f1-score   support

       False       0.82      0.76      0.79      1424
        True       0.78      0.84      0.81      1445

    accuracy                           0.80      2869
   macro avg       0.80      0.80      0.80      2869
weighted avg       0.80      0.80      0.80      2869



In [75]:
# Predict the test split and label the predictions with X_test's index so they
# can be aligned with the passenger ids.
y_test_pred = pd.DataFrame(
    data=cat.predict(X_test),
    index=X_test.index,
    columns=['Transported'],
)

# Left join on the index: every row of test_passenger_id is kept and paired
# with the prediction that carries the same index label.
predicted_values = pd.merge(
    left=test_passenger_id,
    right=y_test_pred,
    how='left',
    left_index=True,
    right_index=True,
)
predicted_values

Unnamed: 0,PassengerId,Transported
8693,0013_01,True
8694,0018_01,False
8695,0019_01,True
8696,0021_01,True
8697,0023_01,True
...,...,...
12965,9266_02,True
12966,9269_01,False
12967,9271_01,True
12968,9273_01,True


In [77]:
# Write the submission table to disk without the index column.
predicted_values.to_csv(path_or_buf='../data/submission.csv', index=False)