In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [16]:
load_train = pd.read_csv('../data/kaggle/train.csv')
load_test = pd.read_csv('../data/kaggle/test.csv')
n_splits = 6
params = {
     'learning_rate': 0.1109006773045451,
     'num_leaves': 64,
     'max_depth': 11,
     'min_child_samples': 9,
     'subsample': 0.5623226907045367,
     'colsample_bytree': 0.5737897762821246, 
     'lambda_l1': 0.15874498150345134, 
     'lambda_l2': 0.0008756575871445052
     }

list_embark= ['Embarked_C', 'Embarked_Q', 'Embarked_S',]
result_list = [] 

In [6]:
def process(data, drop_embark):
    data_processed = data.copy()

    data_processed['Miss'] = data_processed['Age'].isnull().astype(int)
    data_processed['Sex'] = data_processed['Sex'].map({'female':0, 'male':1})
    data_processed['Embarked'] = data_processed['Embarked'].fillna(data_processed['Embarked'].mode()[0])
    data_processed = pd.get_dummies(data_processed, columns=['Embarked'], prefix='Embarked', dtype=int)
    data_processed.drop(['Name','Ticket', 'Cabin', drop_embark], axis=1, inplace=True)
    data_processed['Age'] = data_processed['Age'].fillna(data_processed["Age"].median())
    data_processed['Fare'] = data_processed['Fare'].fillna(data_processed['Fare'].median())
    return data_processed



In [17]:
for drop_embark in list_embark:
    
    input_train = process(load_train, drop_embark).drop(['PassengerId'], axis=1)
    input_test = process(load_test, drop_embark)

    X = input_train.drop(['Survived'], axis=1, inplace=False)
    y = input_train['Survived']
    x_test = input_test.drop(['PassengerId'], axis=1, inplace=False)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    models = []
    scores = []
    test_preds = np.zeros(len(input_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        
        print(f"Fold {fold+1}/{n_splits}")
        
        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] 
        
        train_data = lgb.Dataset(x_train, label=y_train)
        val_data = lgb.Dataset(x_val, label=y_val)
        
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=10),
                       lgb.log_evaluation(10)
                       ]
            
            )
       
    
        # 予測
        y_pred = model.predict(x_val)
        y_pred_binary = (y_pred > 0.5).astype(int)

        # 精度計算
        acc = accuracy_score(y_val, y_pred_binary)
        print(f"Fold {fold+1} Accuracy: {acc:.4f}")

        # モデルとスコアを保存
        models.append(model)
        scores.append(acc)
        test_preds += model.predict(x_test) / n_splits 

    # 平均スコアを表示
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    result_list.append(np.mean(scores))
    y_preds_binary = (test_preds > 0.5).astype(int)
    result_gbm = pd.DataFrame({'PassengerId': input_test['PassengerId'].values, 'Survived': y_preds_binary})

Fold 1/6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 742, number of used features: 9
[LightGBM] [Info] Start training from score 0.384097
Training until validation scores don't improve for 10 rounds
[10]	valid_0's l2: 0.128662
[20]	valid_0's l2: 0.112397
[30]	valid_0's l2: 0.108458
[40]	valid_0's l2: 0.105919
[50]	valid_0's l2: 0.105691
[60]	valid_0's l2: 0.107725
Early stopping, best iteration is:
[51]	valid_0's l2: 0.105527
Fold 1 Accuracy: 0.8859
Fold 2/6
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 742, number

In [18]:

rounded_list = [round(num, 4) for num in result_list]

In [19]:
rounded_list

[0.8473, 0.835, 0.8339]