In [21]:
# Dependency installation is disabled here: the %pip magics are wrapped in a
# string literal, so running this cell installs nothing and only echoes the string.
'''%pip install catboost
%pip install numpy
%pip install pandas'''

'%pip install catboost\n%pip install numpy\n%pip install pandas'

In [22]:
import numpy as np 
import pandas as pd 
import catboost
from catboost import CatBoostClassifier, cv, metrics, Pool
from catboost.datasets import titanic

# Data preprocessing

First of all, let's load the data.

In [23]:
# Load the train/test frames shipped with catboost.datasets.
train_data, test_data = titanic()

# Separate the target column from the remaining feature columns.
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

Now we need to explore the data.

In [24]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We can see that there are some missing values in our data; let's check how many there are.

In [25]:
# Count the missing entries per column, then display only the columns that have any.
nan_vals = X.isna().sum()
nan_vals[nan_vals.ne(0)]

Age         177
Cabin       687
Embarked      2
dtype: int64

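We need to do something with them, and the simplest and most effective way is to fill them. As a first variant, though, let's simply drop the rows that contain missing values. Besides the null data, I think that our model doesn't need to know the name and ID of each passenger, so those columns are dropped as well.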

In [28]:
# "D" variant: drop the identifier columns and 'Cabin' (too many null values in it),
# then discard every row that still contains a missing value.
dropped_cols_D = ['PassengerId', 'Name', 'Cabin']
train_D = train_data.drop(columns=dropped_cols_D).dropna()
test_D = test_data.drop(columns=dropped_cols_D).dropna()

# Separate the target from the features of the reduced training frame.
y_d = train_D['Survived']
X_d = train_D.drop(columns=['Survived'])

The second way is to fill them with some value.

In [29]:
# "F" variant: replace every missing cell with a sentinel value instead of dropping rows.
# The frames are rebound rather than mutated with inplace=True, so no object that an
# earlier cell created or displayed is changed behind the reader's back.
MISSING_SENTINEL = -999
X = X.fillna(MISSING_SENTINEL)
test_data = test_data.fillna(MISSING_SENTINEL)

Besides missing values, we have another problem: categorical data. CatBoost supports categorical features, so I just need to get the list of indices of the columns that contain them.

In [30]:
# Positional indices of every column whose dtype is not float; these columns are
# passed to CatBoost as categorical features for the "D" and "F" variants.
cat_features_D = np.flatnonzero((X_d.dtypes != float).to_numpy())
cat_features_F = np.flatnonzero((X.dtypes != float).to_numpy())

# Train the models

Now we need to define the model. I think that the default parameters are good enough, but later I will try parameter tuning.

In [56]:
# Classifier template: 300 iterations, fixed seed, and accuracy tracked as an extra metric.
# Its parameters are reused below as the base configuration for cross-validation.
model = CatBoostClassifier(
    iterations=300,
    random_seed=10,
    custom_loss=[metrics.Accuracy()],
)

I will use cross-validation for my models. It's more effective, and it's very easy with CatBoost.

In [59]:
# Cross-validate the fill-NA ("F") variant: start from the model's parameters and
# override the loss function, best-model selection and logging for the CV run.
cv_params_F = {
    **model.get_params(),
    'use_best_model': True,
    'loss_function': metrics.Logloss(),
    'logging_level': 'Silent',
}
pool_F = Pool(X, y, cat_features=cat_features_F)

# return_models=True makes cv() also hand back the fitted models, which are used later.
cv_data_F = cv(
    pool_F,
    cv_params_F,
    fold_count=5,
    early_stopping_rounds=10,
    return_models=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [60]:
# Cross-validate the drop-NA ("D") variant with the same overrides on top of the
# model's parameters: best-model selection, silent logging and Logloss as the loss.
cv_params_D = {
    **model.get_params(),
    'use_best_model': True,
    'logging_level': 'Silent',
    'loss_function': metrics.Logloss(),
}
pool_D = Pool(X_d, y_d, cat_features=cat_features_D)

# Same 5-fold setup, with early stopping after 10 rounds, and the models returned.
cv_data_D = cv(
    pool_D,
    cv_params_D,
    fold_count=5,
    early_stopping_rounds=10,
    return_models=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

# Test model

The "D" models give some good results, but the best result comes from "F" model 1: 87.5%/88.2% accuracy. That's not bad, and it's enough for now. Let's check the accuracy on the test data. We can also see that the "D" models weren't learning faster (1.7 seconds is within the margin of error), but dropping data has a bad effect on accuracy.

In [69]:
# The second item of the cv() result is presumably the list of per-fold models
# (cv was called with return_models=True).
fold_models_F = cv_data_F[1]
# NOTE(review): the fold index 1 is hard-coded; confirm it is still the best fold after a re-run.
best_model = fold_models_F[1]

In [70]:
# Preview the first five rows of the test frame that will be scored.
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,-999,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,-999,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,-999,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,-999,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,-999,S


In [83]:
# Pair each test passenger's id with the class label predicted by the selected model.
predicted_labels = best_model.predict(test_data, prediction_type='Class')
submisstion = test_data[['PassengerId']].assign(Survived=predicted_labels)

In [84]:
# Write the predictions to disk, leaving out the DataFrame index column.
submisstion.to_csv(path_or_buf='submission.csv', index=False)

And yes, that's it.