In [199]:
import pandas as pd
from catboost import CatBoostClassifier
from catboost import Pool
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [200]:
# Read the training set, then discard the identifier / free-text columns
# that are not used as model features.
raw_train = pd.read_csv("data/train.csv")
data = raw_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
# Print column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


__Data Analysis__

In [201]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [202]:
# Keep only the passengers whose Age is known; rows with a missing Age are
# dropped (other columns are not checked here).
data = data[data['Age'].notna()]

In [203]:
# Mean survival rate per passenger class, highest first.
# Passenger class clearly affects the survival rate.
(
    data.groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.655914
1,2,0.479769
2,3,0.239437


In [204]:
# Mean survival rate per sex: gender affects the survival rate as well.
data.groupby('Sex', as_index=False)['Survived'].mean()

Unnamed: 0,Sex,Survived
0,female,0.754789
1,male,0.205298


In [205]:
# Mean survival rate by number of siblings/spouses aboard (SibSp), highest first.
(
    data.groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.530055
2,2,0.44
0,0,0.37155
3,3,0.333333
4,4,0.166667
5,5,0.0


In [206]:
# Mean survival rate by number of parents/children aboard (Parch), highest first.
(
    data.groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
2,2,0.573529
1,1,0.554545
0,0,0.357006
5,5,0.2
4,4,0.0
6,6,0.0


In [207]:
# Average fare paid by survivors vs. non-survivors (survivors listed first).
(
    data.groupby('Survived', as_index=False)['Fare']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Survived,Fare
1,1,51.843205
0,0,22.965456


In [208]:
# Mean survival rate per port of embarkation, highest first.
(
    data.groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.607692
2,S,0.362816
1,Q,0.285714


In [209]:
# Drop the rows whose Embarked value is missing, then preview the cleaned frame.
data = data[data['Embarked'].notna()]
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [210]:
# Columns CatBoost is told to treat as categorical (passed as cat_features at fit time).
categorial_features = ['Pclass', 'Sex', 'Embarked']

# Stratified 80/20 hold-out split: stratifying on the target keeps the
# survived/not-survived ratio the same in both parts; random_state makes it repeatable.
df_train, df_test = train_test_split(data, train_size=0.8, random_state=0, stratify=data["Survived"])

# Separate the feature matrix from the target column for each part.
X_train, Y_train = df_train.drop(columns='Survived'), df_train['Survived']
X_test, Y_test = df_test.drop(columns='Survived'), df_test['Survived']

__Model__

In [211]:
# Hyper-parameters collected in one mapping so they are easy to scan and tweak.
catboost_params = dict(
    iterations=3000,
    learning_rate=0.01,
    loss_function='Logloss',
    eval_metric="Accuracy",
    early_stopping_rounds=25,
    boosting_type='Ordered',
    one_hot_max_size=5,
    bagging_temperature=0.1,
    # TODO (translated from the original note): change this.
    leaf_estimation_method='Newton',
)
model = CatBoostClassifier(**catboost_params)

In [212]:
# TODO (translated from the original note): study fit() and its options.
# The hold-out split is passed as the evaluation set, and use_best_model=True
# is requested, so the same data drives model selection.
model.fit(
    X_train,
    Y_train,
    cat_features=categorial_features,
    eval_set=(X_test, Y_test),
    use_best_model=True,
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x20216eb3c50>

__Predicting__

In [213]:
# Load the competition test set without the free-text columns; PassengerId is
# kept because the submission file needs it.
raw_test = pd.read_csv("data/test.csv")
data_test = raw_test.drop(columns=['Name', 'Ticket', 'Cabin'])
# Start the submission frame with the passenger ids only.
newdf = data_test['PassengerId'].to_frame()
data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [214]:
# Predict on the test features (everything except the id column) and write
# the submission file.
# NOTE(review): training rows with missing Age/Embarked were dropped earlier;
# confirm how missing values in data_test are handled at prediction time.
features_test = data_test.drop(columns='PassengerId')
newdf['Survived'] = model.predict(features_test)
newdf.to_csv("solution.csv", index=False)

__Feature importance__

In [215]:
# Importance score of each feature in the fitted model, returned as a table (prettified=True).
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,Sex,66.90865
1,Pclass,22.133441
2,Age,7.343618
3,SibSp,1.837831
4,Fare,0.929427
5,Parch,0.747302
6,Embarked,0.099731
