In [3]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np


In [8]:
def k_vote(input_train, input_test, lgbm_params, num_round=100, n_splits=5, threshold=0.5):
    """Cross-validate a soft-voting RandomForest + LightGBM ensemble and predict the test set.

    For every stratified fold a fresh ensemble is fitted on the fold's training
    rows, scored on its held-out rows, and used to predict the test set. The
    per-fold positive-class probabilities for the test set are averaged and
    then thresholded into 0/1 labels.

    Parameters
    ----------
    input_train : pandas.DataFrame
        Training data. Must contain a 'Survived' column, which is used as the
        target; every other column is used as a feature.
    input_test : pandas.DataFrame
        Test data. Must contain a 'PassengerId' column, which is excluded from
        the features and copied into the returned frame.
    lgbm_params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``. Must not contain
        ``n_estimators``: that keyword is always passed from ``num_round`` and
        supplying it twice raises ``TypeError``.
    num_round : int, default 100
        Number of trees used for both the LightGBM model and the random forest.
    n_splits : int, default 5
        Number of stratified folds.
    threshold : float, default 0.5
        Probability cut-off. A row is labelled 1 when its positive-class
        probability is ``>= threshold``. The same rule is applied to the fold
        validation predictions and to the final test predictions, so the
        printed accuracies describe the rule that produces the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'PassengerId' (taken from ``input_test``) and 'Survived'
        (predicted 0/1 labels).
    """
    features = input_train.drop(columns=['Survived'])
    target = input_train['Survived']
    test_features = input_test.drop(columns=['PassengerId'])

    fold_scores = []
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Running mean of the positive-class probability over all folds.
    test_proba = np.zeros(len(input_test))

    for fold, (train_idx, val_idx) in enumerate(splitter.split(features, target)):

        print(f"Fold {fold+1}/{n_splits}")

        x_train, x_val = features.iloc[train_idx], features.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        # Fresh estimators per fold so no fitted state carries over.
        lgbm = lgb.LGBMClassifier(**lgbm_params, n_estimators=num_round)
        rf = RandomForestClassifier(n_estimators=num_round, random_state=42)
        ensemble = VotingClassifier(estimators=[('rf', rf), ('lgbm', lgbm)], voting='soft')

        ensemble.fit(x_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        val_proba = ensemble.predict_proba(x_val)[:, 1]
        # Same ">= threshold" rule as the final test labels below; previously
        # this used a strict ">" and so disagreed with the final rule on ties.
        val_labels = (val_proba >= threshold).astype(int)

        acc = accuracy_score(y_val, val_labels)
        print(f"Fold {fold+1} Accuracy: {acc:.4f}")
        fold_scores.append(acc)

        # Each fold contributes 1/n_splits of the averaged test probability.
        test_proba += ensemble.predict_proba(test_features)[:, 1] / n_splits

    print(f"Mean Accuracy: {np.mean(fold_scores):.4f}")

    test_labels = (test_proba >= threshold).astype(int)
    result = pd.DataFrame({'PassengerId': input_test['PassengerId'], 'Survived': test_labels})

    return result

In [9]:
# Load the baseline train/test splits (paths are relative to the notebook).
baseline_train, baseline_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/baseline/train.csv', '../data/baseline/test.csv')
)

# LightGBM settings forwarded to lgb.LGBMClassifier inside k_vote.
params = dict(
    objective='binary',
    metric='binary_error',
    boosting_type='gbdt',
    num_leaves=32,
    learning_rate=0.05,
    feature_fraction=0.9,
    random_state=42,
)

# Run the 5-fold soft-voting ensemble and keep the test-set predictions.
result = k_vote(
    baseline_train,
    baseline_test,
    lgbm_params=params,
    num_round=100,
    n_splits=5,
)

Fold 1/5
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Fold 1 Accuracy: 0.8492
Fold 2/5
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292

In [10]:
# Preview the first five rows of the prediction frame returned by k_vote.
result.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
