In [1]:
import numpy as np
import pandas as pd

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
gender_submission = pd.read_csv("../input/gender_submission.csv")

data = pd.concat([train, test], sort=False)

train = data[:len(train)]
test = data[len(train):]

In [2]:
# Previous feature engineering.
data["Sex"] = data["Sex"].replace(
    {
        "male": 0,
        "female": 1
    }
)
data["Embarked"] = data["Embarked"].fillna("S")
data["Embarked"] = data["Embarked"].map(
    {
        "S": 0,
        "C": 1,
        "Q": 2,
    }
).astype(int)
data["Fare"] = data["Fare"].fillna(np.mean(data["Fare"]))
data["Age"] = data["Age"].fillna(data["Age"].median())
data["FamilySize"] = data["Parch"] + data["SibSp"] + 1
data["IsAlone"]  = (data["FamilySize"] == 1).astype(int)

In [3]:
delete_columns = ["Name", "PassengerId", "SibSp", "Parch", "Ticket", "Cabin"]
data = data.drop(delete_columns, axis=1)

train = data[:len(train)]
test = data[len(train):]

y_train = train["Survived"]
X_train = train.drop("Survived", axis=1)
X_test = test.drop("Survived", axis=1)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,7.25,0,2,0
1,1,1,38.0,71.2833,1,2,0
2,3,1,26.0,7.925,0,1,1
3,1,1,35.0,53.1,0,2,0
4,3,0,35.0,8.05,0,1,1


## Random forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Add 'IsAlone' and 'FamilySize'.
# clf = LogisticRegression(penalty="l2", solver="sag", random_state=0)
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

sub = gender_submission.copy()
sub["Survived"] = y_pred.astype(int)
sub.to_csv("../output/submission_randomforest.csv", index=False)

## LightBGM

In [10]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.3,
     random_state=0, stratify=y_train
)

categorical_features = ['Embarked', 'Pclass', 'Sex']
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical_features)

params = {"objective": "binary"}

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

y_pred = (y_pred > 0.5).astype(int)

sub = gender_submission.copy()
sub["Survived"] = y_pred.astype(int)
sub.to_csv("../output/submission_lightgbm.csv", index=False)

[LightGBM] [Info] Number of positive: 82, number of negative: 131
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384977 -> initscore=-0.468478
[LightGBM] [Info] Start training from score -0.468478
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.438956	valid_1's binary_logloss: 0.519067
[20]	training's binary_logloss: 0.375668	valid_1's binary_logloss: 0.506661
[30]	training's binary_logloss: 0.326911	valid_1's binary_logloss: 0.504596
Early stopping, best iteration is:
[21]	training's binary_logloss: 0.368284	valid_1's binary_logloss: 0.50371


