In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import gc


train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')

target_cols = [c for c in train.columns if c not in test.columns]
TARGET = target_cols[0] 

print(f" Auto-detected Target Column: {TARGET}")


# Drop 'id' and the target to isolate features
features = [c for c in train.columns if c not in ['id', TARGET]]

# Define Categorical Columns for LightGBM
cat_features = train[features].select_dtypes(include=['object']).columns.tolist()
for col in cat_features:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')


# Using 5-Fold CV to ensure our model doesn't overfit
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Arrays to store Out-of-Fold (OOF) and Test predictions
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

# Use log-transform if the target is skewed (common in insurance/pricing)
y = np.log1p(train[TARGET])


for fold, (train_idx, val_idx) in enumerate(kf.split(train[features], y)):
    X_train, X_val = train[features].iloc[train_idx], train[features].iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Professional Hyperparameters for Tabular Data
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        reg_alpha=0.1,
        reg_lambda=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=-1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ]
    )
    
    # Store predictions
    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(test[features]) / kf.n_splits
    
    print(f"--- Fold {fold + 1} Finished ---")
    gc.collect()


# Reverting the log transformation for final score
final_oof = np.expm1(oof_preds)
final_test = np.expm1(test_preds)

score = root_mean_squared_error(train[TARGET], final_oof)
print(f"\n Final CV RMSE Score: {score:.5f}")

# Save the results
submission[TARGET] = final_test
submission.to_csv('submission.csv', index=False)
print(" Submission file saved as 'submission.csv'")

 Auto-detected Target Column: diagnosed_diabetes
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l2: 0.096835
[400]	valid_0's l2: 0.0964981
[600]	valid_0's l2: 0.0963303
[800]	valid_0's l2: 0.0962471
[1000]	valid_0's l2: 0.0961513
[1200]	valid_0's l2: 0.0961121
[1400]	valid_0's l2: 0.0961041
[1600]	valid_0's l2: 0.0960917
[1800]	valid_0's l2: 0.0960908
Early stopping, best iteration is:
[1700]	valid_0's l2: 0.0960875
--- Fold 1 Finished ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l2: 0.0966523
[400]	valid_0's l2: 0.0963111
[600]	valid_0's l2: 0.0961433
[800]	valid_0's l2: 0.0960452
[1000]	valid_0's l2: 0.0959967
[1200]	valid_0's l2: 0.0959776
[1400]	valid_0's l2: 0.095947
[1600]	valid_0's l2: 0.0959425
Early stopping, best iteration is:
[1553]	valid_0's l2: 0.095936
--- Fold 2 Finished ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l2: 0.0969494
[400]	valid_0's l2: 0.0966309
[600]	val