In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from optuna import create_study
from optuna.samplers import TPESampler

study = create_study(direction="maximize", sampler=TPESampler())


[I 2024-11-10 23:13:57,032] A new study created in memory with name: no-name-75d07e5e-fb5b-408b-b4e0-1ac4e11c11d3


In [4]:
# Load and preprocess the dataset
data = pd.read_csv('/kaggle/input/covid-dataset/Covid Data.csv')
data.replace({97: np.nan, 99: np.nan, '9999-99-99': np.nan}, inplace=True)


In [5]:

# Binary transformation for Boolean fields
binary_cols = [
    'SEX', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 
    'HIPERTENSION', 'CARDIOVASCULAR', 'RENAL_CHRONIC', 'OTHER_DISEASE', 
    'OBESITY', 'TOBACCO', 'INTUBED', 'ICU'
]
data[binary_cols] = data[binary_cols].applymap(lambda x: 1 if x == 1 else 0)

# Convert 'CLASIFFICATION_FINAL' to binary (COVID-19 diagnosis: 1 for positive, 0 for negative)
data['CLASIFFICATION_FINAL'] = data['CLASIFFICATION_FINAL'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

# Create binary mortality indicator and drop 'DATE_DIED'
data['died'] = data['DATE_DIED'].notna().astype(int)
data.drop(columns=['DATE_DIED'], inplace=True)

  data[binary_cols] = data[binary_cols].applymap(lambda x: 1 if x == 1 else 0)


In [6]:
# Strip spaces in column names (if necessary)
data.columns = data.columns.str.strip()

# Rename target column
data['target'] = data['CLASIFFICATION_FINAL'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

# Check if 'DATE_DIED' exists, then drop columns safely
columns_to_drop = ['target', 'CLASIFFICATION_FINAL', 'DATE_DIED']
existing_columns = [col for col in columns_to_drop if col in data.columns]
X = data.drop(existing_columns, axis=1)
y = data['target']

# Feature engineering - Interaction features
X['diabetes_hypertension'] = X['DIABETES'] * X['HIPERTENSION']
X['age_obesity'] = X['AGE'] * X['OBESITY']
X['copd_asthma'] = X['COPD'] * X['ASTHMA']

# Define numerical and categorical features
numeric_features = ['AGE']
categorical_features = [
    'SEX', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
    'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'INTUBED', 'ICU',
    'USMER', 'MEDICAL_UNIT', 'PATIENT_TYPE'
]

In [7]:
# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse=False))
])

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [8]:
# Preprocess data
X_preprocessed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)



In [9]:
import optuna
# Model hyperparameter tuning with Optuna
def objective(trial):
    # Define models with trial parameters for tuning
    model = StackingClassifier(
        estimators=[
            ('xgb', XGBClassifier(
                n_estimators=trial.suggest_int('xgb_n_estimators', 50, 500),
                max_depth=trial.suggest_int('xgb_max_depth', 3, 10),
                learning_rate=trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
                subsample=trial.suggest_float('xgb_subsample', 0.5, 1.0),
                random_state=42)),
            ('lgbm', LGBMClassifier(
                n_estimators=trial.suggest_int('lgbm_n_estimators', 50, 500),
                max_depth=trial.suggest_int('lgbm_max_depth', 3, 10),
                learning_rate=trial.suggest_float('lgbm_learning_rate', 0.01, 0.3),
                random_state=42)),
            ('catboost', CatBoostClassifier(
                iterations=trial.suggest_int('catboost_iterations', 50, 500),
                depth=trial.suggest_int('catboost_depth', 3, 10),
                learning_rate=trial.suggest_float('catboost_learning_rate', 0.01, 0.3),
                silent=True))
        ],
        final_estimator=RandomForestClassifier(
            n_estimators=100, random_state=42, max_depth=5),
        stack_method='predict_proba',
        passthrough=True
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

study = optuna.create_study(direction="maximize", sampler=TPESampler())  # Use TPESampler here
study.optimize(objective, n_trials=10)

# Best hyperparameters found
best_params = study.best_params
print("Best parameters found by Optuna:", best_params)

# Training the final model with best parameters
final_model = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(
            n_estimators=best_params['xgb_n_estimators'],
            max_depth=best_params['xgb_max_depth'],
            learning_rate=best_params['xgb_learning_rate'],
            subsample=best_params['xgb_subsample'],
            random_state=42)),
        ('lgbm', LGBMClassifier(
            n_estimators=best_params['lgbm_n_estimators'],
            max_depth=best_params['lgbm_max_depth'],
            learning_rate=best_params['lgbm_learning_rate'],
            random_state=42)),
        ('catboost', CatBoostClassifier(
            iterations=best_params['catboost_iterations'],
            depth=best_params['catboost_depth'],
            learning_rate=best_params['catboost_learning_rate'],
            silent=True))
    ],
    final_estimator=RandomForestClassifier(
        n_estimators=100, random_state=42, max_depth=5),
    stack_method='predict_proba',
    passthrough=True
)

[I 2024-11-10 23:15:46,895] A new study created in memory with name: no-name-698cbcc7-1597-47ae-8b8e-a899c1a61429


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:22:13,189] Trial 0 finished with value: 0.6697777622364284 and parameters: {'xgb_n_estimators': 463, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.23762905020949074, 'xgb_subsample': 0.5139665901142949, 'lgbm_n_estimators': 418, 'lgbm_max_depth': 4, 'lgbm_learning_rate': 0.2783435983474958, 'catboost_iterations': 175, 'catboost_depth': 5, 'catboost_learning_rate': 0.22067860736996606}. Best is trial 0 with value: 0.6697777622364284.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:27:30,467] Trial 1 finished with value: 0.6700606854370845 and parameters: {'xgb_n_estimators': 229, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.28343083347049514, 'xgb_subsample': 0.5202816946124216, 'lgbm_n_estimators': 168, 'lgbm_max_depth': 10, 'lgbm_learning_rate': 0.18431868213365563, 'catboost_iterations': 278, 'catboost_depth': 7, 'catboost_learning_rate': 0.19907983237855154}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:33:00,405] Trial 2 finished with value: 0.6695806696696792 and parameters: {'xgb_n_estimators': 226, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.14463457056378365, 'xgb_subsample': 0.6069728018608966, 'lgbm_n_estimators': 315, 'lgbm_max_depth': 4, 'lgbm_learning_rate': 0.017456318745188604, 'catboost_iterations': 348, 'catboost_depth': 3, 'catboost_learning_rate': 0.17691814781730764}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:39:02,838] Trial 3 finished with value: 0.670000286102113 and parameters: {'xgb_n_estimators': 404, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.1995068500123994, 'xgb_subsample': 0.7607546692986373, 'lgbm_n_estimators': 184, 'lgbm_max_depth': 3, 'lgbm_learning_rate': 0.14618428980450401, 'catboost_iterations': 133, 'catboost_depth': 10, 'catboost_learning_rate': 0.2154015789349215}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:43:59,430] Trial 4 finished with value: 0.6698063724477308 and parameters: {'xgb_n_estimators': 268, 'xgb_max_depth': 3, 'xgb_learning_rate': 0.18108004838005567, 'xgb_subsample': 0.8510925276199633, 'lgbm_n_estimators': 182, 'lgbm_max_depth': 4, 'lgbm_learning_rate': 0.05624407104053772, 'catboost_iterations': 179, 'catboost_depth': 9, 'catboost_learning_rate': 0.10354378776299995}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:48:40,322] Trial 5 finished with value: 0.6696219955304492 and parameters: {'xgb_n_estimators': 280, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.21789000682052603, 'xgb_subsample': 0.803046941800752, 'lgbm_n_estimators': 339, 'lgbm_max_depth': 4, 'lgbm_learning_rate': 0.09551237556084882, 'catboost_iterations': 100, 'catboost_depth': 3, 'catboost_learning_rate': 0.036174950729554595}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-10 23:54:55,689] Trial 6 finished with value: 0.6694694077368369 and parameters: {'xgb_n_estimators': 424, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.157631328767355, 'xgb_subsample': 0.6343358168435769, 'lgbm_n_estimators': 292, 'lgbm_max_depth': 5, 'lgbm_learning_rate': 0.014709495320670625, 'catboost_iterations': 260, 'catboost_depth': 5, 'catboost_learning_rate': 0.035659316446468484}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-11 00:01:20,183] Trial 7 finished with value: 0.6698031935353638 and parameters: {'xgb_n_estimators': 80, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.06269802561360677, 'xgb_subsample': 0.6027427773471501, 'lgbm_n_estimators': 458, 'lgbm_max_depth': 9, 'lgbm_learning_rate': 0.2982355122458652, 'catboost_iterations': 371, 'catboost_depth': 8, 'catboost_learning_rate': 0.18409817917407126}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-11 00:05:59,135] Trial 8 finished with value: 0.6697523309374931 and parameters: {'xgb_n_estimators': 272, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.25801732357675894, 'xgb_subsample': 0.6678394922644215, 'lgbm_n_estimators': 350, 'lgbm_max_depth': 6, 'lgbm_learning_rate': 0.2417193489901762, 'catboost_iterations': 111, 'catboost_depth': 3, 'catboost_learning_rate': 0.20221330729353118}. Best is trial 1 with value: 0.6700606854370845.


[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

[I 2024-11-11 00:11:07,274] Trial 9 finished with value: 0.6695329859841753 and parameters: {'xgb_n_estimators': 170, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.04161284012202062, 'xgb_subsample': 0.7669552854240695, 'lgbm_n_estimators': 70, 'lgbm_max_depth': 5, 'lgbm_learning_rate': 0.22263845915328623, 'catboost_iterations': 318, 'catboost_depth': 7, 'catboost_learning_rate': 0.028025525564083206}. Best is trial 1 with value: 0.6700606854370845.


Best parameters found by Optuna: {'xgb_n_estimators': 229, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.28343083347049514, 'xgb_subsample': 0.5202816946124216, 'lgbm_n_estimators': 168, 'lgbm_max_depth': 10, 'lgbm_learning_rate': 0.18431868213365563, 'catboost_iterations': 278, 'catboost_depth': 7, 'catboost_learning_rate': 0.19907983237855154}


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

try:
    # Fit the final model
    final_model.fit(X_train, y_train)
    final_preds = final_model.predict(X_test)

    # Evaluate performance
    print("\nFinal Model Performance:")
    print("Accuracy:", accuracy_score(y_test, final_preds))
    print("Precision:", precision_score(y_test, final_preds, average='weighted'))  # Use average if multi-class
    print("Recall:", recall_score(y_test, final_preds, average='weighted'))
    print("F1 Score:", f1_score(y_test, final_preds, average='weighted'))

except KeyError as e:
    print("Parameter error:", e)
except ValueError as e:
    print("Value error during model fitting or evaluation:", e)
except Exception as e:
    print("Unexpected error:", e)

[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069733 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit the final model
final_model.fit(X_train, y_train)
final_preds = final_model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, final_preds)
precision = precision_score(y_test, final_preds, average='weighted')
recall = recall_score(y_test, final_preds, average='weighted')
f1 = f1_score(y_test, final_preds, average='weighted')

# Save metrics to CSV
output_data = {
    "Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1 Score": [f1]
}

output_df = pd.DataFrame(output_data)
output_df.to_csv("final_model_performance.csv", index=False)

print("Final model performance saved to final_model_performance.csv")

[LightGBM] [Info] Number of positive: 274608, number of negative: 459394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374124 -> initscore=-0.514564
[LightGBM] [Info] Start training from score -0.514564
[LightGBM] [Info] Number of positive: 219686, number of negative: 367515
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 587201, number of used features: 30
[LightGBM] [In