# Catboost classifier in titanic competition on kaggle

In [4]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [5]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

### Importing and overlooking on data

In [44]:
train = pd.read_csv('train.csv', header=0)
test = pd.read_csv('test.csv', header=0)
gender_submission = pd.read_csv('gender_submission.csv', header=0)
print(train.shape, test.shape)
gender_submission.head()

(891, 12) (418, 11)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [18]:
len(train)

891

In [16]:
train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,-99999,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,-99999,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,-99999,S


In [19]:
len(test)

418

In [17]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


### Data preprocessing

In [10]:
test['Survived'] = gender_submission['Survived']
X = train
label = X['Survived']
X.drop(['PassengerId', 'Survived'], axis=1, inplace=True)
X.fillna(-99999, inplace=True)
X['Cabin'] = X['Cabin'].apply(lambda x: x[0] if x != -99999 else x)

cat_features_index = np.where(X.dtypes != float)[0]

In [11]:
from sklearn.model_selection import train_test_split as split
X_train, X_test, y_train, y_test = split(X, label, test_size=0.05, shuffle=True)

In [12]:
train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,-99999,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,-99999,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,-99999,S


### Creating and fitting catboost classifier model

In [42]:
test.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [None]:
X = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
    'Cabin', 'Embarked']

y = []

In [46]:
model = CatBoostClassifier(eval_metric='Accuracy',
                           use_best_model=True,
                           early_stopping_rounds=200,
                           learning_rate = 0.1,
                           verbose=50)


# model.fit(train[X],train[y], cat_features=cat_features_index,  use_best_model=True, eval_set=(val[X],val[y])
model.fit(X_train, y_train, cat_features=cat_features_index,  use_best_model=True, eval_set=(X_test,y_test))

0:	learn: 0.8144208	test: 0.8222222	best: 0.8222222 (0)	total: 14.9ms	remaining: 14.9s
50:	learn: 0.8817967	test: 0.8666667	best: 0.8888889 (30)	total: 706ms	remaining: 13.1s
100:	learn: 0.9030733	test: 0.8888889	best: 0.8888889 (30)	total: 1.01s	remaining: 9.01s
150:	learn: 0.9172577	test: 0.8666667	best: 0.8888889 (30)	total: 1.35s	remaining: 7.57s
200:	learn: 0.9385343	test: 0.8666667	best: 0.8888889 (30)	total: 1.66s	remaining: 6.61s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8888888889
bestIteration = 30

Shrink model to first 31 iterations.


<catboost.core.CatBoostClassifier at 0x7e447f5f9ab0>

In [47]:
def accuracy(predict, real):
    if predict.shape != real.shape:
        return 0
    all_amount = predict.shape[0]
    res = 0
    for i, j in zip(predict, real):
        if int(i) == int(j):
            res +=1
    return res / all_amount

In [48]:
print('accuracy: {}'.format(accuracy(y_test, model.predict(X_test))))

accuracy: 0.8888888888888888


### Loading the data

In [49]:
test = test[list(X_train.columns.values)]
test.fillna(-99999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.fillna(-99999, inplace=True)


In [53]:
# result = gender_submission
result = pd.read_csv('/content/result.csv')
result['Survived'] = model.predict(test)
# result['Survived'] = result['Survived'].astype(np.int)

In [54]:
with open('result.csv', 'w') as f:
    f.write(result.to_csv(index=False))