In [19]:
import numpy as np
import pandas as pd
import lightgbm as lgb

# Read both Titanic CSV splits from the local `titanic/` directory; the
# right-hand side is evaluated left to right, so train is loaded first.
train, test = pd.read_csv('titanic/train.csv'), pd.read_csv('titanic/test.csv')

In [20]:
def process_first(data, fill_values=None):
    """Turn a raw Titanic passenger frame into a numeric feature frame.

    The input is copied, never modified. The returned frame:

    * gains a ``'Miss'`` column holding the number of null cells in each row
      of the input (counted before any column is dropped or imputed);
    * has ``'Sex'`` mapped to ``female -> 0`` / ``male -> 1`` (any other
      value becomes NaN);
    * has ``'Embarked'`` mapped to ``C -> 0`` / ``Q -> 1`` / ``S -> 2`` and
      stored as nullable ``Int64``;
    * no longer contains ``'PassengerId'``, ``'Name'``, ``'Ticket'`` or
      ``'Cabin'``;
    * has missing ``'Age'`` / ``'Fare'`` filled with a median and missing
      ``'Embarked'`` filled with the most frequent code.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns named above.
    fill_values : mapping, optional
        Imputation values keyed by column name (``'Age'``, ``'Fare'``,
        ``'Embarked'``). ``'Embarked'`` is given as its encoded integer code.
        Any column not listed is imputed from ``data`` itself, which is also
        what happens when this argument is omitted. Passing statistics
        computed on the training split lets another split be imputed
        consistently with it.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``data``.
    """
    overrides = {} if fill_values is None else dict(fill_values)

    data_processed = data.copy()

    # Count nulls first so that columns dropped below (e.g. 'Cabin') still
    # contribute to the per-row missing-value count.
    data_processed['Miss'] = data_processed.isnull().sum(axis=1)
    data_processed['Sex'] = data_processed['Sex'].map({'female':0, 'male':1})
    data_processed['Embarked'] = data_processed['Embarked'].map({'C':0, 'Q':1, 'S':2}).astype('Int64')

    # Reassign instead of using inplace=True; the result is the same frame
    # content without in-place mutation.
    data_processed = data_processed.drop(['PassengerId', 'Name','Ticket', 'Cabin'], axis=1)

    for column in ('Age', 'Fare'):
        if column in overrides:
            fill = overrides[column]
        else:
            fill = data_processed[column].median()
        data_processed[column] = data_processed[column].fillna(fill)

    if 'Embarked' in overrides:
        data_processed['Embarked'] = data_processed['Embarked'].fillna(overrides['Embarked'])
    else:
        embarked_modes = data_processed['Embarked'].mode()
        # mode() is empty when every value is missing; indexing it would
        # raise, so in that case the column is left unfilled.
        if not embarked_modes.empty:
            data_processed['Embarked'] = data_processed['Embarked'].fillna(embarked_modes.iloc[0])

    return data_processed

# The label is read from the raw training frame; the feature matrix is the
# processed training frame with the label column removed.
y_train = train['Survived']
train_processed = process_first(train)
x_train = train_processed.drop(columns=['Survived'])

# The test split goes through the same preprocessing function.
x_test = process_first(test)

In [21]:
# Number of boosting iterations handed to lgb.train.
num_round = 200

# Booster configuration for a binary classification objective.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 64,
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
}

# Wrap the training features and labels, fit the booster, then score the
# processed test features. With the binary objective these scores are the
# values thresholded into 0/1 labels further down.
train_data = lgb.Dataset(x_train, label=y_train)
bst = lgb.train(params, train_data, num_boost_round=num_round)
y_pred = bst.predict(x_test)

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [23]:
# Scores of at least 0.7 are labelled 1 (survived); everything else is 0.
y_pred_binary = np.greater_equal(y_pred, 0.7).astype(int)

# Pair each test passenger id with its predicted label and write the
# two-column result to CSV without the index.
result_gbm = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'].values,
        'Survived': y_pred_binary,
    }
)
result_gbm.to_csv('result_gbm.csv', index=False)