In [1]:
import numpy as np
import pandas as pd

# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the 'Survived' column (used as the prediction target when fitting)
# from the remaining training columns.
train_y = train['Survived']
train_x = train.drop(columns=['Survived'])

# Work on a copy so that later column edits on `test_x` leave the original
# `test` frame (whose 'PassengerId' is needed for the submission) intact.
test_x = test.copy()

In [2]:
from sklearn.preprocessing import LabelEncoder

# Columns removed from both feature frames before modelling.
# NOTE(review): presumably dropped because they are identifiers / free text
# that cannot be fed to the model as-is — confirm.
dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x = train_x.drop(columns=dropped_columns)
test_x = test_x.drop(columns=dropped_columns)

# Replace each listed column with integer codes. The encoder is fitted on the
# training column only and then applied to both frames; missing values are
# first replaced by the placeholder string 'NA' so they receive their own code.
for c in ['Sex', 'Embarked']:
    train_filled = train_x[c].fillna('NA')
    test_filled = test_x[c].fillna('NA')

    le = LabelEncoder()
    le.fit(train_filled)

    train_x[c] = le.transform(train_filled)
    test_x[c] = le.transform(test_filled)

In [3]:
from xgboost import XGBClassifier

# Fit the classifier on the full training set.
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

# Keep only column 1 of the predicted probabilities (the second class).
class_proba = model.predict_proba(test_x)
pred = class_proba[:, 1]

# Turn probabilities into hard 0/1 labels using a 0.5 threshold.
pred_label = (pred > 0.5).astype(int)

# Pair every test PassengerId with its predicted label and write the result
# to disk without the index column.
submission = test[['PassengerId']].assign(Survived=pred_label)
submission.to_csv('submission_first.csv', index=False)





In [1]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

# Per-fold validation metrics, averaged after the loop.
score_accuracy = []
score_logloss = []

# 4-fold cross-validation with shuffling; the fixed seed keeps the splits
# reproducible between runs.
kf = KFold(n_splits=4, shuffle=True, random_state=71)

for fold, (tr_idx, val_idx) in enumerate(kf.split(train_x), start=1):
    # Positional indices from KFold -> training / validation partitions.
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[val_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[val_idx]

    # Use a fold-local name so that `model` (the classifier fitted on the full
    # training set earlier in the notebook) is not silently overwritten by a
    # model trained on only part of the data.
    fold_model = XGBClassifier(n_estimators=20, random_state=71)
    fold_model.fit(tr_x, tr_y)

    # Column 1 of the predicted probabilities is scored against the labels.
    va_pred = fold_model.predict_proba(va_x)[:, 1]

    # Per-fold metrics get their own names; `logloss` / `accuracy` are
    # reserved for the cross-validated means computed below.
    fold_logloss = log_loss(va_y, va_pred)
    fold_accuracy = accuracy_score(va_y, va_pred > 0.5)

    score_logloss.append(fold_logloss)
    score_accuracy.append(fold_accuracy)
    # Report the actual fold metrics instead of placeholder debug messages.
    print(f'fold {fold}: logloss: {fold_logloss:.4f}, accuracy: {fold_accuracy:.4f}')

# Cross-validated scores: the mean over all folds.
logloss = np.mean(score_logloss)
accuracy = np.mean(score_accuracy)
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')