#### Data is on Kaggle: https://www.kaggle.com/competitions/titanic/data

# Step 1: load libs & data

In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

train = pd.read_csv('../titanic/train.csv')
test = pd.read_csv('../titanic/test.csv')

# Step 2: preprocess

In [2]:
for df in (train, test):
    df['Age']= df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

features= ['Pclass','Sex','Age','Fare','Embarked','Title','FamilySize']
cat_features = ['Pclass','Sex','Embarked','Title']
X = train[features] 
y = train['Survived']
X_test = test[features]

# Step 3: split, train & eval

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = CatBoostClassifier(cat_features=cat_features, verbose=False)
model.fit(X_train, y_train, eval_set=(X_val, y_val))
pred = model.predict(X_val)
print('Acc:', accuracy_score(y_val, pred),
      'AUC:', roc_auc_score(y_val, model.predict_proba(X_val)[:,1]))

Acc: 0.8100558659217877 AUC: 0.8803088803088802


# Step 4: hyperparam tuning

In [5]:
param_grid = {
    'iterations':[100, 200, 300],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5]
}
rs = RandomizedSearchCV(
    CatBoostClassifier(cat_features=cat_features, verbose=False),
    param_grid, n_iter=10, cv=3,
    scoring='roc_auc', random_state=42, n_jobs=-1
)
rs.fit(X, y)
print(rs.best_params_, rs.best_score_)

{'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 100, 'depth': 3} 0.8709935129262135


# Step 5: final model & submission

In [6]:
best = rs.best_estimator_
best.fit(X, y)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived':best.predict(X_test)
})
submission.to_csv('../submission_cat.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
