In [54]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [55]:
data = pd.read_csv("data/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
print("Labels:{}".format(set(data['Survived'])))
print("Zero count = {}, One count = {}".format(len(data['Survived']) - sum(data['Survived']),sum(data['Survived']))) # исследуем таргет на дисбаланс

Labels:{0, 1}
Zero count = 549, One count = 342


In [57]:
columns = ["Survived", "Pclass", "Sex", "Age"]

df_train, df_test = train_test_split(data[columns], train_size=0.8, random_state=0, stratify=data["Survived"]) # stratify тречит и схораняет пропорции
X_train, Y_train = df_train.drop('Survived', axis=1), df_train['Survived']
X_test, Y_test = df_test.drop('Survived', axis=1), df_test["Survived"]
X_test.head()

Unnamed: 0,Pclass,Sex,Age
153,3,male,40.5
752,3,male,33.0
610,3,female,39.0
200,3,male,28.0
310,1,female,24.0


In [58]:
categorial_features = ['Pclass', 'Sex'] # определяем категориальные признаки


train_pool = Pool(data=X_train, label=Y_train, cat_features=categorial_features)
test_pool = Pool(data=X_test, label=Y_test, cat_features=categorial_features)

In [59]:
model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.01,
                           use_best_model=True,
                           eval_metric="Accuracy",
                           early_stopping_rounds=50,
                           )
model.fit(X_train, Y_train, cat_features=categorial_features, eval_set=(X_test, Y_test), verbose=True)

0:	learn: 0.7991573	test: 0.7877095	best: 0.7877095 (0)	total: 21.3ms	remaining: 21.2s
1:	learn: 0.7991573	test: 0.7877095	best: 0.7877095 (0)	total: 42.7ms	remaining: 21.3s
2:	learn: 0.8202247	test: 0.7765363	best: 0.7877095 (0)	total: 64.7ms	remaining: 21.5s
3:	learn: 0.8174157	test: 0.7709497	best: 0.7877095 (0)	total: 76.5ms	remaining: 19s
4:	learn: 0.8174157	test: 0.7709497	best: 0.7877095 (0)	total: 89.7ms	remaining: 17.9s
5:	learn: 0.8047753	test: 0.7709497	best: 0.7877095 (0)	total: 112ms	remaining: 18.5s
6:	learn: 0.8061798	test: 0.7709497	best: 0.7877095 (0)	total: 133ms	remaining: 18.9s
7:	learn: 0.8061798	test: 0.7709497	best: 0.7877095 (0)	total: 144ms	remaining: 17.8s
8:	learn: 0.8061798	test: 0.7709497	best: 0.7877095 (0)	total: 163ms	remaining: 17.9s
9:	learn: 0.8061798	test: 0.7709497	best: 0.7877095 (0)	total: 182ms	remaining: 18s
10:	learn: 0.8061798	test: 0.7709497	best: 0.7877095 (0)	total: 196ms	remaining: 17.6s
11:	learn: 0.8075843	test: 0.7709497	best: 0.7877095

<catboost.core.CatBoostClassifier at 0x1c682019d10>

In [60]:
data_test = pd.read_csv("data/test.csv")
columns_test = ["Pclass", "Sex", "Age"]


In [61]:
newdf = pd.DataFrame(data_test['PassengerId'])
newdf['Survived'] = model.predict(data_test[columns_test])
newdf.to_csv("solution.csv", index=False)