In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import catboost
from catboost import CatBoostClassifier, cv, metrics, Pool
import os

# Print the full path of every file found under the Kaggle input directory,
# one per line, in the order os.walk visits them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Data preprocessing

First of all, let's load the data.

In [2]:
# Read the Titanic train and test splits from the Kaggle input directory.
train_data = pd.read_csv("../input/titanic/train.csv")
test_data = pd.read_csv('../input/titanic/test.csv')

# Separate the target column from the remaining feature columns.
y = train_data['Survived']
X = train_data.drop(columns='Survived')

Now we need to explore the data.

In [3]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We see that there are some missing values in our data; let's find out how many there are.

In [4]:
# Count the missing entries in each column, then display only the columns
# that contain at least one missing value.
nan_vals = X.isna().sum()
nan_vals.loc[nan_vals > 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

We need to do something with them, and the simplest and most effective way is to fill them. Besides the null data, I think that our model doesn't need to know the name and ID of each passenger.

The second way is to fill them with some number (here, -999).

In [5]:
# Replace every missing value with the sentinel -999 in both the training
# features and the test set.
# The names are rebound to the filled copies instead of using inplace=True,
# so no existing frame is mutated behind the reader's back and the cell can
# be safely re-run.
X = X.fillna(-999)
test_data = test_data.fillna(-999)

Besides missing values, we have another problem: categorical data. CatBoost supports categorical data, so now I just need to get the list of indices of the columns that contain it.

In [6]:
# Positional indices of every column of X whose dtype is not float; these
# indices are later passed to CatBoost as the categorical features.
cat_features_F = np.flatnonzero(
    (X.dtypes != float).to_numpy()
)

# Train the model

Now we need to define the model. I think that the default parameters are good enough, but later I will try parameter tuning.

In [11]:
# CatBoost classifier left at its defaults, except that Logloss is tracked as
# a custom metric and the random seed is fixed.
model_params = {
    'custom_loss': [metrics.Logloss()],
    'random_seed': 10,
}
model = CatBoostClassifier(**model_params)

I will use cross-validation for my models. It's more effective and very easy with CatBoost.

In [15]:
# Cross-validation settings: start from the classifier's own parameters and
# layer the overrides on top.
# NOTE(review): confirm that 'use_best_model' has any effect inside cv().
cv_params = {
    **model.get_params(),
    'use_best_model': True,
    'loss_function': metrics.Logloss(),
    'logging_level': 'Silent',
}

# Run cross-validation on the full training set, with the non-float columns
# treated as categorical features, and draw the interactive metric plot.
cv_data = cv(
    pool=Pool(data=X, label=y, cat_features=cat_features_F),
    params=cv_params,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))