In [3]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
def k_gbm(input_train, input_test, params, n_splits=5):
    """Train one LightGBM binary classifier per stratified fold and predict the test set.

    The training frame is split with a shuffled ``StratifiedKFold`` (fixed
    ``random_state=42``). For every fold a booster is trained with early
    stopping on the held-out part, scored with accuracy at a 0.5 threshold,
    and used to predict ``input_test``. The per-fold test predictions are
    averaged and thresholded at 0.5 to form the final labels.

    Args:
        input_train: DataFrame holding the feature columns plus the
            ``'Survived'`` label column.
        input_test: DataFrame holding the feature columns plus a
            ``'PassengerId'`` column (dropped before prediction and copied
            into the result).
        params: LightGBM training parameters. If neither ``objective`` nor
            one of its aliases is present, ``objective='binary'`` is added
            (on a copy) so that predictions are probabilities; without it
            LightGBM defaults to a regression objective. The caller's mapping
            is never modified. ``None`` is treated as an empty mapping.
        n_splits: Number of cross-validation folds.

    Returns:
        Tuple ``(result_gbm, models, scores)`` where ``result_gbm`` is a
        DataFrame with ``'PassengerId'`` and the predicted ``'Survived'``
        label, ``models`` is the list of trained boosters (one per fold) and
        ``scores`` is the list of per-fold validation accuracies.
    """
    X = input_train.drop(columns=['Survived'])
    y = input_train['Survived']
    x_test = input_test.drop(columns=['PassengerId'])

    # Work on a copy so the caller's dict is left untouched.
    train_params = dict(params or {})
    # The 0.5 thresholding below assumes probability outputs. Respect any
    # objective the caller supplied (under the primary name or an alias) and
    # only fill in 'binary' when none was given.
    objective_aliases = ('objective', 'objective_type', 'app', 'application', 'loss')
    if not any(alias in train_params for alias in objective_aliases):
        train_params['objective'] = 'binary'

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    models = []
    scores = []
    # Running average of the per-fold test-set predictions.
    test_preds = np.zeros(len(input_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):

        print(f"Fold {fold+1}/{n_splits}")

        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_data = lgb.Dataset(x_train, label=y_train)
        # Tie the validation set to the training set so both share the same binning.
        val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)

        model = lgb.train(
            train_params,
            train_data,
            valid_sets=[val_data],
            callbacks=[
                # Stop once the validation metric has not improved for 10 rounds.
                lgb.early_stopping(stopping_rounds=10),
                # Log the validation metric every 10 iterations.
                lgb.log_evaluation(10),
            ],
        )

        # Predict the held-out fold and convert scores to class labels.
        y_pred = model.predict(x_val)
        y_pred_binary = (y_pred > 0.5).astype(int)

        # Fold accuracy.
        acc = accuracy_score(y_val, y_pred_binary)
        print(f"Fold {fold+1} Accuracy: {acc:.4f}")

        # Keep the model and its score; add this fold's share of the test prediction.
        models.append(model)
        scores.append(acc)
        test_preds += model.predict(x_test) / n_splits

    # Report the mean accuracy across folds.
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    y_preds_binary = (test_preds > 0.5).astype(int)
    result_gbm = pd.DataFrame({'PassengerId': input_test['PassengerId'].values, 'Survived': y_preds_binary})
    return result_gbm, models, scores




In [None]:
# Load the baseline train/test CSVs (relative paths, resolved against the kernel's working directory).
baseline_train = pd.read_csv('../data/baseline/train.csv')
baseline_test = pd.read_csv('../data/baseline/test.csv')

best_params = {
    # Without an explicit objective LightGBM trains a regression model.
    # 'binary' / 'binary_error' match the training log recorded for this cell
    # and the 0.5 thresholding done inside k_gbm.
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.1109006773045451,
    'num_leaves': 64,
    'max_depth': 11,
    'min_child_samples': 9,
    # NOTE(review): per the LightGBM parameter docs, 'subsample' (bagging_fraction)
    # only takes effect when 'subsample_freq' (bagging_freq) is non-zero; it is not
    # set here, so confirm whether bagging was intended.
    'subsample': 0.5623226907045367,
    'colsample_bytree': 0.5737897762821246,
    'lambda_l1': 0.15874498150345134,
    'lambda_l2': 0.0008756575871445052,
}

# Cross-validate on the training frame and build the averaged test-set prediction.
result, models, scores = k_gbm(baseline_train, baseline_test, params=best_params, n_splits=5)
result.head()

Fold 1/5
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	valid_0's binary_error: 0.212291
[20]	valid_0's binary_error: 0.150838
Early stopping, best iteration is:
[18]	valid_0's binary_error: 0.150838
x_val =      Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
4         3    1  35.0      0      0   8.0500         2
6         1    1  54.0      0      0  51.8625         2
13        3    1  39.0      1      5  31.2750         2

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [17]:
# Preview the first rows of the training frame to check its columns and values.
baseline_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2
