In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

histgradientboostingclassifier

In [3]:
import pandas as pd
import numpy as np
# No xgboost needed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import gc
import warnings
import os
# Optuna is no longer used for tuning, but keep import if it's potentially used elsewhere in the notebook
# import optuna 
# Import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
# log_loss is not explicitly used for HGB evaluation here, so it's not strictly needed.
# from sklearn.metrics import log_loss 

warnings.filterwarnings('ignore')

# --- Reusable Functions ---
def calculate_map3(y_true, y_pred_proba):
    """Calculates the Mean Average Precision @ 3 score."""
    # Get the indices of the top 3 predictions for each sample
    top3_preds_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :3]
    
    scores = []
    for i, true_label in enumerate(y_true):
        top3 = top3_preds_indices[i]
        score = 0.0
        # Check if the true label is in the top 3 predictions
        if true_label in top3:
            # Find the rank (1, 2, or 3)
            rank = np.where(top3 == true_label)[0][0] + 1
            if rank == 1:
                score = 1.0
            elif rank == 2:
                score = 0.5
            elif rank == 3:
                score = 1/3
        scores.append(score)
    return np.mean(scores)

# --- Feature Engineering Function (Converts Numerical to Binned Categorical) ---
def feature_eng(df, target_col=None):
    # Identify numerical columns to be binned
    numerical_features = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                            if col != 'id'] # Exclude 'id'
    if target_col and target_col in numerical_features: # Exclude target if it's numerical
        numerical_features.remove(target_col)

    for col in numerical_features:
        # Create a new column with '_Binned' suffix
        # Convert numerical values to string, then to 'category' dtype
        if f'{col}_Binned' not in df.columns: # Avoid recreating if already exists
            df[f'{col}_Binned'] = df[col].astype(str).astype('category')
    return df

# --- Main Execution ---
if __name__ == "__main__":
    # Define target column name
    TARGET_COL = 'Fertilizer Name'

    # --- 1. Load Data ---
    print("Loading datasets...")
    train_synthetic_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    train_original_df = pd.read_csv("/kaggle/input/original/Fertilizer Prediction .csv")
    test_df_raw = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    submission_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

    # Drop 'id' columns
    train_synthetic_df = train_synthetic_df.drop(columns=['id'])
    if 'id' in test_df_raw.columns:
        test_df_raw = test_df_raw.drop(columns=['id'])

    # --- Apply Feature Engineering to make all numerical features categorical ---
    print("Applying feature engineering (binning numerical features into categories)...")
    train_synthetic_df = feature_eng(train_synthetic_df, target_col=TARGET_COL)
    train_original_df = feature_eng(train_original_df, target_col=TARGET_COL)
    test_df_raw = feature_eng(test_df_raw)

    # --- 2. Preprocessing and Feature Preparation ---
    print("Applying preprocessing...")
    # Target Encoding
    le_fertilizer = LabelEncoder()
    # Fit on combined labels to ensure all possible classes are covered
    le_fertilizer.fit(pd.concat([train_synthetic_df[TARGET_COL], train_original_df[TARGET_COL]]))
    y_synth = le_fertilizer.transform(train_synthetic_df[TARGET_COL])
    y_orig = le_fertilizer.transform(train_original_df[TARGET_COL])
    num_classes = len(le_fertilizer.classes_)

    # Dynamically determine the common feature columns after feature engineering
    base_categorical_features = ['Soil Type', 'Crop Type']
    
    # Collect all binned columns that are present across all three dataframes
    all_processed_dfs = [train_synthetic_df, train_original_df, test_df_raw]
    common_binned_cols = set()

    # Get binned columns from the first dataframe
    if all_processed_dfs:
        first_df_binned = [col for col in all_processed_dfs[0].columns if col.endswith('_Binned')]
        common_binned_cols.update(first_df_binned)

        # Intersect with binned columns from subsequent dataframes
        for df_item in all_processed_dfs[1:]:
            current_df_binned = [col for col in df_item.columns if col.endswith('_Binned')]
            common_binned_cols.intersection_update(current_df_binned)

    # The final feature_cols will be the base categorical plus the common binned ones
    feature_cols = base_categorical_features + sorted(list(common_binned_cols))
    
    print(f"Selected feature columns (all categorical): {feature_cols}")

    # Feature Sets - now containing only categorical/binned features
    X_synth = train_synthetic_df[feature_cols].copy()
    X_orig = train_original_df[feature_cols].copy()
    test_df = test_df_raw[feature_cols].copy() # Ensure test_df also has only selected features

    # Categorical Feature Encoding (Using LabelEncoder for consistent integer mapping)
    for col in feature_cols:
        # Concatenate all categories from all datasets for consistent mapping
        full_vocab = pd.concat([X_synth[col].astype(str), X_orig[col].astype(str), test_df[col].astype(str)], axis=0).unique()
        
        label_enc = LabelEncoder()
        label_enc.fit(full_vocab.astype(str)) # Fit on the full vocabulary of strings
        
        X_synth[col] = label_enc.transform(X_synth[col].astype(str))
        X_orig[col] = label_enc.transform(X_orig[col].astype(str))
        test_df[col] = label_enc.transform(test_df[col].astype(str))
        
        # Cast to 'category' dtype after encoding. This is crucial for HGB's native categorical handling.
        X_synth[col] = X_synth[col].astype("category")
        X_orig[col] = X_orig[col].astype("category")
        test_df[col] = test_df[col].astype("category")

    # --- 3. Define Fixed Hyperparameters for HistGradientBoostingClassifier ---
    # These parameters are provided by the user.
    final_params = {
        'max_iter': 1236,
        'learning_rate': 0.07913270952323785,
        'max_depth': 4,
        'min_samples_leaf': 55,
        'l2_regularization': 0.045227288910538066,
        'max_leaf_nodes': 39,
        'random_state': 42, # Keep seed fixed
        'early_stopping': True, # Enable early stopping
        'n_iter_no_change': 50, # Number of iterations with no improvement to trigger early stopping
        'verbose': 0, # Suppress verbose output during training, or set to 1 for progress bar
    }
    
    print("\nUsing fixed HistGradientBoostingClassifier parameters:")
    for key, value in final_params.items():
        print(f"  {key}: {value}")

    # --- 4. Train HistGradientBoostingClassifier Model with 5-Fold CV & 5x Original Data Augmentation ---
    print("\nTraining HistGradientBoostingClassifier with 5x original data (5 Folds)...")
    NFOLDS = 5 # Fixed to 5 folds for final training and prediction
    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(X_synth), num_classes))
    test_preds = np.zeros((len(test_df), num_classes))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_synth, y_synth)):
        print(f"--- Fold {fold+1}/{NFOLDS} ---")
        X_synth_train, y_synth_train = X_synth.iloc[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth.iloc[val_idx], y_synth[val_idx]

        # Octuple the original data to give it more weight (5x)
        X_orig_oct = pd.concat([X_orig] * 5, ignore_index=True)
        y_orig_oct = np.concatenate([y_orig] * 5)

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = pd.concat([X_synth_train, X_orig_oct], ignore_index=True)
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct])

        # Initialize HGB with fixed parameters
        model = HistGradientBoostingClassifier(**final_params)
        model.fit(X_train_aug, y_train_aug)

        oof_preds[val_idx] = model.predict_proba(X_synth_val)
        test_preds += model.predict_proba(test_df) / NFOLDS
        gc.collect()

    # --- 5. Evaluate and Save ---
    map3_score = calculate_map3(y_synth, oof_preds)
    print(f"\n--- Final Model CV MAP@3 (5x Original Data, Fixed Params): {map3_score:.5f} ---\n")

    # Save OOF and Test Predictions
    output_dir = './' # Save in current directory
    os.makedirs(output_dir, exist_ok=True) # Ensure directory exists

    oof_filename = os.path.join(output_dir, 'oof_preds_hgb_fixed_params.npy')
    test_filename = os.path.join(output_dir, 'test_preds_hgb_fixed_params.npy')

    np.save(oof_filename, oof_preds)
    np.save(test_filename, test_preds)
    print(f"OOF predictions saved to: {oof_filename}")
    print(f"Test predictions saved to: {test_filename}")

    print("Generating final submission file...")
    top3_preds_indices = np.argsort(test_preds, axis=1)[:, ::-1][:, :3]
    top3_preds_labels = le_fertilizer.inverse_transform(top3_preds_indices.flatten()).reshape(top3_preds_indices.shape)

    submission_df['Fertilizer Name'] = [' '.join(row) for row in top3_preds_labels]
    submission_df.to_csv('submission_hgb_fixed_params.csv', index=False)

    print("Submission file 'submission_hgb_fixed_params.csv' created successfully.")


Loading datasets...
Applying feature engineering (binning numerical features into categories)...
Applying preprocessing...
Selected feature columns (all categorical): ['Soil Type', 'Crop Type', 'Humidity_Binned', 'Moisture_Binned', 'Nitrogen_Binned', 'Phosphorous_Binned', 'Potassium_Binned', 'Temparature_Binned']

Using fixed HistGradientBoostingClassifier parameters:
  max_iter: 1236
  learning_rate: 0.07913270952323785
  max_depth: 4
  min_samples_leaf: 55
  l2_regularization: 0.045227288910538066
  max_leaf_nodes: 39
  random_state: 42
  early_stopping: True
  n_iter_no_change: 50
  verbose: 0

Training HistGradientBoostingClassifier with 5x original data (5 Folds)...
--- Fold 1/5 ---
--- Fold 2/5 ---
--- Fold 3/5 ---
--- Fold 4/5 ---
--- Fold 5/5 ---

--- Final Model CV MAP@3 (5x Original Data, Fixed Params): 0.34283 ---

OOF predictions saved to: ./oof_preds_hgb_fixed_params.npy
Test predictions saved to: ./test_preds_hgb_fixed_params.npy
Generating final submission file...
Submis

GaussianNB


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # Added OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
import gc
import warnings
import os
import optuna # Added for hyperparameter optimization
from sklearn.naive_bayes import GaussianNB # Added for Naive Bayes classifier

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Reusable Functions ---
def calculate_map3(y_true, y_pred_proba):
    """Calculates the Mean Average Precision @ 3 score."""
    # Get the indices of the top 3 predictions for each sample
    top3_preds_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :3]
    
    scores = []
    for i, true_label in enumerate(y_true):
        top3 = top3_preds_indices[i]
        score = 0.0
        # Check if the true label is in the top 3 predictions
        if true_label in top3:
            # Find the rank (1, 2, or 3)
            rank = np.where(top3 == true_label)[0][0] + 1
            if rank == 1:
                score = 1.0
            elif rank == 2:
                score = 0.5
            elif rank == 3:
                score = 1/3
        scores.append(score)
    return np.mean(scores)

# --- Feature Engineering Function (Converts Numerical to Binned Categorical) ---
def feature_eng(df, target_col=None):
    # Identify numerical columns to be binned
    numerical_features = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                            if col != 'id'] # Exclude 'id'
    if target_col and target_col in numerical_features: # Exclude target if it's numerical
        numerical_features.remove(target_col)

    for col in numerical_features:
        # Create a new column with '_Binned' suffix
        # Convert numerical values to string, then to 'category' dtype
        if f'{col}_Binned' not in df.columns: # Avoid recreating if already exists
            df[f'{col}_Binned'] = df[col].astype(str).astype('category')
    return df

# --- Optuna Objective Function for Naive Bayes Hyperparameter Tuning ---
def nb_objective(trial, X_synth_processed, y_synth, X_orig_processed, y_orig, num_classes, calculate_map3_func):
    """
    Objective function for Optuna to optimize Naive Bayes hyperparameters.
    Accepts already processed (scaled/encoded) dataframes.
    """
    # GaussianNB has 'var_smoothing' as its main hyperparameter
    var_smoothing = trial.suggest_float('var_smoothing', 1e-10, 1e-5, log=True)
    
    NFOLDS_OPTUNA = 5 # Fixed to 5 folds for Optuna
    skf_optuna = StratifiedKFold(n_splits=NFOLDS_OPTUNA, shuffle=True, random_state=42)

    oof_preds_trial = np.zeros((len(X_synth_processed), num_classes))
    
    # Octuple the original data for augmentation, consistent with previous models
    # Note: X_orig_processed is already a NumPy array here.
    X_orig_oct_obj = np.vstack([X_orig_processed] * 5)
    y_orig_oct_obj = np.concatenate([y_orig] * 5)

    for fold_num, (train_idx, val_idx) in enumerate(skf_optuna.split(X_synth_processed, y_synth)):
        # Ensure that indexing on NumPy arrays works correctly (direct slicing)
        X_synth_train, y_synth_train = X_synth_processed[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth_processed[val_idx], y_synth[val_idx]

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = np.vstack([X_synth_train, X_orig_oct_obj])
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct_obj])

        model = GaussianNB(var_smoothing=var_smoothing)
        model.fit(X_train_aug, y_train_aug)

        oof_preds_trial[val_idx] = model.predict_proba(X_synth_val)
        
        del model
        gc.collect()

    map3_score_trial = calculate_map3_func(y_synth, oof_preds_trial)
    return map3_score_trial


# --- Main Execution ---
if __name__ == "__main__":
    # Define target column name
    TARGET_COL = 'Fertilizer Name'

    # --- 1. Load Data ---
    print("Loading datasets...")
    train_synthetic_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    train_original_df = pd.read_csv("/kaggle/input/original/Fertilizer Prediction .csv") 
    test_df_raw = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    submission_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

    # Drop 'id' columns
    train_synthetic_df = train_synthetic_df.drop(columns=['id'])
    if 'id' in test_df_raw.columns:
        test_df_raw = test_df_raw.drop(columns=['id'])

    # --- Apply Feature Engineering to make all numerical features categorical ---
    print("Applying feature engineering (binning numerical features into categories)...")
    train_synthetic_df = feature_eng(train_synthetic_df, target_col=TARGET_COL)
    train_original_df = feature_eng(train_original_df, target_col=TARGET_COL)
    test_df_raw = feature_eng(test_df_raw)

    # --- 2. Preprocessing and Feature Preparation ---
    print("Applying preprocessing (Label Encoding target, One-Hot Encoding categorical features, Scaling numerical features)...")
    
    # Target Encoding
    le_fertilizer = LabelEncoder()
    # Fit on combined labels to ensure all possible classes are covered
    le_fertilizer.fit(pd.concat([train_synthetic_df[TARGET_COL], train_original_df[TARGET_COL]]))
    y_synth = le_fertilizer.transform(train_synthetic_df[TARGET_COL])
    y_orig = le_fertilizer.transform(train_original_df[TARGET_COL])
    num_classes = len(le_fertilizer.classes_)

    # Separate feature columns into numerical and categorical
    # Use the original numerical columns before binning for potential scaling
    original_numerical_features = [col for col in train_synthetic_df.select_dtypes(include=['int64', 'float64']).columns
                                   if col != 'id' and col != TARGET_COL]
    
    # The categorical features are the base_categorical_features PLUS all the '_Binned' columns
    base_categorical_features = ['Soil Type', 'Crop Type']
    binned_categorical_features = [col for col in train_synthetic_df.columns if col.endswith('_Binned')]
    all_categorical_features = base_categorical_features + binned_categorical_features
    
    print(f"Original numerical features (for scaling): {original_numerical_features}")
    print(f"All categorical features (for one-hot encoding): {all_categorical_features}")

    # Prepare dataframes for processing
    # Drop the original numerical columns that are now binned if you don't want them alongside binned
    # For Naive Bayes, often it's better to keep numerical if they are scaled.
    # We will keep both the original numericals (scaled) and the binned categoricals (one-hot encoded).
    
    # --- Apply Scaling to Original Numerical Features ---
    scaler = StandardScaler()
    # Fit on combined numerical data for consistency
    scaler.fit(pd.concat([train_synthetic_df[original_numerical_features], 
                          train_original_df[original_numerical_features], 
                          test_df_raw[original_numerical_features]], ignore_index=True))
    
    X_synth_numerical_scaled = scaler.transform(train_synthetic_df[original_numerical_features])
    X_orig_numerical_scaled = scaler.transform(train_original_df[original_numerical_features])
    X_test_numerical_scaled = scaler.transform(test_df_raw[original_numerical_features])

    # --- Apply One-Hot Encoding to Categorical Features ---
    # Concatenate all categorical data to fit the encoder for consistent columns
    all_cat_data = pd.concat([train_synthetic_df[all_categorical_features].astype(str), 
                              train_original_df[all_categorical_features].astype(str), 
                              test_df_raw[all_categorical_features].astype(str)], ignore_index=True)
    
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse_output=False for dense array
    one_hot_encoder.fit(all_cat_data)
    
    X_synth_cat_encoded = one_hot_encoder.transform(train_synthetic_df[all_categorical_features].astype(str))
    X_orig_cat_encoded = one_hot_encoder.transform(train_original_df[all_categorical_features].astype(str))
    X_test_cat_encoded = one_hot_encoder.transform(test_df_raw[all_categorical_features].astype(str))

    # --- Combine Scaled Numerical and One-Hot Encoded Categorical Features ---
    # Concatenate horizontally
    X_synth_processed = np.hstack([X_synth_numerical_scaled, X_synth_cat_encoded])
    X_orig_processed = np.hstack([X_orig_numerical_scaled, X_orig_cat_encoded])
    X_test_processed = np.hstack([X_test_numerical_scaled, X_test_cat_encoded])

    print(f"Processed synthetic training features shape: {X_synth_processed.shape}")
    print(f"Processed original training features shape: {X_orig_processed.shape}")
    print(f"Processed test features shape: {X_test_processed.shape}")

    # --- 3. Naive Bayes Model Training ---
    # --- Optuna Hyperparameter Optimization for Naive Bayes ---
    print("\nStarting Optuna hyperparameter optimization for Naive Bayes (5-Fold CV)...")
    nb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
    nb_study.optimize(
        lambda trial: nb_objective(trial, X_synth_processed, y_synth, X_orig_processed, y_orig, num_classes, calculate_map3),
        n_trials=20, # Fewer trials for Naive Bayes as it has fewer hyperparameters
        show_progress_bar=True
    )

    print("\nNaive Bayes Optuna optimization finished.")
    print(f"Best Naive Bayes trial: {nb_study.best_trial.value:.5f} (MAP@3)")
    print("Best Naive Bayes hyperparameters found:")
    for key, value in nb_study.best_params.items():
        print(f"  {key}: {value}")

    # --- Define Final Naive Bayes Hyperparameters using Optuna's Best ---
    final_nb_params = {
        'var_smoothing': nb_study.best_params['var_smoothing']
    }

    # --- Train Final Naive Bayes Model with 5-Fold CV & 5x Original Data Augmentation (using Best Params) ---
    print("\nTraining final Naive Bayes model with best hyperparameters (5-Fold CV & 5x original data augmentation)...")
    NFOLDS_NB = 5
    skf_nb = StratifiedKFold(n_splits=NFOLDS_NB, shuffle=True, random_state=42)

    oof_preds_nb = np.zeros((len(X_synth_processed), num_classes))
    test_preds_nb = np.zeros((len(X_test_processed), num_classes)) # Use X_test_processed here

    for fold, (train_idx, val_idx) in enumerate(skf_nb.split(X_synth_processed, y_synth)):
        print(f"--- Naive Bayes Fold {fold+1}/{NFOLDS_NB} ---")
        X_synth_train, y_synth_train = X_synth_processed[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth_processed[val_idx], y_synth[val_idx]

        # Octuple the original data to give it more weight (5x)
        X_orig_oct = np.vstack([X_orig_processed] * 5)
        y_orig_oct = np.concatenate([y_orig] * 5)

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = np.vstack([X_synth_train, X_orig_oct])
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct])

        model_nb = GaussianNB(**final_nb_params)
        model_nb.fit(X_train_aug, y_train_aug)

        oof_preds_nb[val_idx] = model_nb.predict_proba(X_synth_val)
        test_preds_nb += model_nb.predict_proba(X_test_processed) / NFOLDS_NB # Use X_test_processed here
        
        del model_nb
        gc.collect()

    map3_score_nb = calculate_map3(y_synth, oof_preds_nb)
    print(f"\n--- Final Naive Bayes CV MAP@3 (5x Original Data, Best Params): {map3_score_nb:.5f} ---\n")

    # Save OOF and Test Predictions for Naive Bayes
    output_dir = './' 
    os.makedirs(output_dir, exist_ok=True) 

    oof_filename_nb = os.path.join(output_dir, 'oof_preds_nb_onehot.npy')
    test_filename_nb = os.path.join(output_dir, 'test_preds_nb_onehot.npy')

    np.save(oof_filename_nb, oof_preds_nb)
    np.save(test_filename_nb, test_preds_nb)
    print(f"Naive Bayes OOF predictions saved to: {oof_filename_nb}")
    print(f"Naive Bayes Test predictions saved to: {test_filename_nb}")

    # --- 4. Generate Final Submission File ---
    print("\nGenerating final submission file (using Naive Bayes predictions)...")
    top3_preds_indices = np.argsort(test_preds_nb, axis=1)[:, ::-1][:, :3]
    top3_preds_labels = le_fertilizer.inverse_transform(top3_preds_indices.flatten()).reshape(top3_preds_indices.shape)

    submission_df['Fertilizer Name'] = [' '.join(row) for row in top3_preds_labels]
    submission_df.to_csv('submission_naive_bayes_onehot.csv', index=False)

    print("Submission file 'submission_naive_bayes_onehot.csv' created successfully.")


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import gc
import warnings
import os # Added for saving files

warnings.filterwarnings('ignore')

# --- Reusable Functions ---
def calculate_map3(y_true, y_pred_proba):
    """Calculates the Mean Average Precision @ 3 score."""
    top3_preds_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :3]
    
    scores = []
    for i, true_label in enumerate(y_true):
        top3 = top3_preds_indices[i]
        score = 0.0
        # Check if the true label is in the top 3 predictions
        if true_label in top3:
            # Find the rank (1, 2, or 3)
            rank = np.where(top3 == true_label)[0][0] + 1
            if rank == 1:
                score = 1.0
            elif rank == 2:
                score = 0.5
            elif rank == 3:
                score = 1/3
        scores.append(score)
    return np.mean(scores)

# --- Feature Engineering Function (Converts Numerical to Binned Categorical) ---
def feature_eng(df, target_col=None):
    # Identify numerical columns to be binned
    numerical_features = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                            if col != 'id'] # Exclude 'id'
    if target_col and target_col in numerical_features: # Exclude target if it's numerical
        numerical_features.remove(target_col)

    for col in numerical_features:
        # Create a new column with '_Binned' suffix
        # Convert numerical values to string, then to 'category' dtype
        if f'{col}_Binned' not in df.columns: # Avoid recreating if already exists
            df[f'{col}_Binned'] = df[col].astype(str).astype('category')
    return df

# --- Main Execution ---
if __name__ == "__main__":
    # Define target column name
    TARGET_COL = 'Fertilizer Name'

    # --- 1. Load Data ---
    print("Loading datasets...")
    train_synthetic_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    train_original_df = pd.read_csv("/kaggle/input/original/Fertilizer Prediction .csv") # Corrected path
    test_df_raw = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    submission_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

    # Drop 'id' columns
    train_synthetic_df = train_synthetic_df.drop(columns=['id'])
    if 'id' in test_df_raw.columns:
        test_df_raw = test_df_raw.drop(columns=['id'])

    # --- Apply Feature Engineering to make all numerical features categorical ---
    print("Applying feature engineering (binning numerical features into categories)...")
    train_synthetic_df = feature_eng(train_synthetic_df, target_col=TARGET_COL)
    train_original_df = feature_eng(train_original_df, target_col=TARGET_COL)
    test_df_raw = feature_eng(test_df_raw)

    # --- 2. Preprocessing and Feature Preparation ---
    print("Applying preprocessing...")
    # Target Encoding
    le_fertilizer = LabelEncoder()
    # Fit on combined labels to ensure all possible classes are covered
    le_fertilizer.fit(pd.concat([train_synthetic_df[TARGET_COL], train_original_df[TARGET_COL]]))
    y_synth = le_fertilizer.transform(train_synthetic_df[TARGET_COL])
    y_orig = le_fertilizer.transform(train_original_df[TARGET_COL])
    num_classes = len(le_fertilizer.classes_)

    # Dynamically determine the common feature columns after feature engineering
    base_categorical_features = ['Soil Type', 'Crop Type']
    
    # Collect all binned columns that are present across all three dataframes
    all_processed_dfs = [train_synthetic_df, train_original_df, test_df_raw]
    common_binned_cols = set()

    # Get binned columns from the first dataframe
    if all_processed_dfs:
        first_df_binned = [col for col in all_processed_dfs[0].columns if col.endswith('_Binned')]
        common_binned_cols.update(first_df_binned)

        # Intersect with binned columns from subsequent dataframes
        for df_item in all_processed_dfs[1:]:
            current_df_binned = [col for col in df_item.columns if col.endswith('_Binned')]
            common_binned_cols.intersection_update(current_df_binned)

    # The final feature_cols will be the base categorical plus the common binned ones
    feature_cols = base_categorical_features + sorted(list(common_binned_cols))
    
    print(f"Selected feature columns (all categorical): {feature_cols}")

    # Feature Sets - now containing only categorical/binned features
    X_synth = train_synthetic_df[feature_cols].copy()
    X_orig = train_original_df[feature_cols].copy()
    test_df = test_df_raw[feature_cols].copy()

    # Categorical Feature Encoding (Using LabelEncoder for consistent integer mapping)
    for col in feature_cols:
        # Concatenate all categories from all datasets for consistent mapping
        # Convert to string first to handle potential mixed types from .cat.codes or original data
        full_vocab = pd.concat([X_synth[col].astype(str), X_orig[col].astype(str), test_df[col].astype(str)], axis=0).unique()
        
        label_enc = LabelEncoder()
        label_enc.fit(full_vocab.astype(str)) # Fit on the full vocabulary of strings
        
        X_synth[col] = label_enc.transform(X_synth[col].astype(str))
        X_orig[col] = label_enc.transform(X_orig[col].astype(str))
        test_df[col] = label_enc.transform(test_df[col].astype(str))
        
        # Cast to 'category' dtype after encoding, as XGBoost's enable_categorical can leverage this
        X_synth[col] = X_synth[col].astype("category")
        X_orig[col] = X_orig[col].astype("category")
        test_df[col] = test_df[col].astype("category")

    # --- 3. Define Final Hyperparameters (Corrected syntax and 'num_class') ---
    params = {
        'objective': 'multi:softprob',
        'num_class': num_classes, # CORRECTED: Use 'num_classes' variable
        'max_depth': 15,
        'learning_rate': 0.010181857193698362,
        'min_child_weight': 0.6343101283603367,
        'colsample_bytree': 0.40213679386048695,
        'subsample': 0.6936417386806593,
        'gamma': 0.23727358231785983,
        'reg_alpha': 9.642216011708818,
        'reg_lambda': 1.1393721704644078,
        'eval_metric': 'mlogloss',
        'device': "cuda",
        'seed': 42,
        'tree_method': 'hist',
        'enable_categorical': True,
        # 'n_estimators' and 'early_stopping_rounds' are parameters for xgb.train, not part of params dict directly.
        # They are correctly passed to xgb.train function call.
    }

    # --- 4. Train Model with 10-Fold CV & 5x Original Data Augmentation ---
    print("\nTraining XGBoost with 5x original data (5 Folds)...")
    NFOLDS = 5
    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(X_synth), num_classes))
    test_preds = np.zeros((len(test_df), num_classes))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_synth, y_synth)):
        print(f"--- Fold {fold+1}/{NFOLDS} ---")
        X_synth_train, y_synth_train = X_synth.iloc[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth.iloc[val_idx], y_synth[val_idx]

        # Octuple the original data to give it more weight (5x)
        X_orig_oct = pd.concat([X_orig] * 5, ignore_index=True)
        y_orig_oct = np.concatenate([y_orig] * 5)

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = pd.concat([X_synth_train, X_orig_oct], ignore_index=True)
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct])

        # Use X_synth.columns.tolist() for feature_names to ensure consistency with trained features
        dtrain = xgb.DMatrix(X_train_aug, label=y_train_aug, feature_names=X_synth.columns.tolist(), enable_categorical=True)
        dval = xgb.DMatrix(X_synth_val, label=y_synth_val, feature_names=X_synth.columns.tolist(), enable_categorical=True)
        dtest = xgb.DMatrix(test_df, feature_names=X_synth.columns.tolist(), enable_categorical=True) # Ensure test_df also uses these names

        model = xgb.train(params, dtrain, num_boost_round=5000, evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=None)

        oof_preds[val_idx] = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        test_preds += model.predict(dtest, iteration_range=(0, model.best_iteration + 1)) / NFOLDS
        gc.collect()

    # --- 5. Evaluate and Save ---
    map3_score = calculate_map3(y_synth, oof_preds)
    print(f"\n--- Final Model CV MAP@3 (5x Original Data): {map3_score:.5f} ---\n")

    # Save OOF and Test Predictions
    output_dir = './' # Save in current directory
    os.makedirs(output_dir, exist_ok=True) # Ensure directory exists

    oof_filename = os.path.join(output_dir, 'oof_preds.npy')
    test_filename = os.path.join(output_dir, 'test_preds.npy')

    np.save(oof_filename, oof_preds)
    np.save(test_filename, test_preds)
    print(f"OOF predictions saved to: {oof_filename}")
    print(f"Test predictions saved to: {test_filename}")

    print("Generating final submission file...")
    top3_preds_indices = np.argsort(test_preds, axis=1)[:, ::-1][:, :3]
    top3_preds_labels = le_fertilizer.inverse_transform(top3_preds_indices.flatten()).reshape(top3_preds_indices.shape)

    submission_df['Fertilizer Name'] = [' '.join(row) for row in top3_preds_labels]
    submission_df.to_csv('submission.csv', index=False)

    print("Submission file 'submission.csv' created successfully.")


Loading datasets...
Applying feature engineering (binning numerical features into categories)...
Applying preprocessing...
Selected feature columns (all categorical): ['Soil Type', 'Crop Type', 'Humidity_Binned', 'Moisture_Binned', 'Nitrogen_Binned', 'Phosphorous_Binned', 'Potassium_Binned', 'Temparature_Binned']

Training XGBoost with 5x original data (5 Folds)...
--- Fold 1/5 ---
--- Fold 2/5 ---
--- Fold 3/5 ---
--- Fold 4/5 ---
--- Fold 5/5 ---

--- Final Model CV MAP@3 (5x Original Data): 0.37911 ---

OOF predictions saved to: ./oof_preds.npy
Test predictions saved to: ./test_preds.npy
Generating final submission file...
Submission file 'submission.csv' created successfully.


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import gc
import warnings
import os
import optuna # Added for hyperparameter optimization

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Reusable Functions ---
def calculate_map3(y_true, y_pred_proba):
    """Calculates the Mean Average Precision @ 3 score."""
    # Get the indices of the top 3 predictions for each sample
    top3_preds_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :3]
    
    scores = []
    for i, true_label in enumerate(y_true):
        top3 = top3_preds_indices[i]
        score = 0.0
        # Check if the true label is in the top 3 predictions
        if true_label in top3:
            # Find the rank (1, 2, or 3)
            rank = np.where(top3 == true_label)[0][0] + 1
            if rank == 1:
                score = 1.0
            elif rank == 2:
                score = 0.5
            elif rank == 3:
                score = 1/3
        scores.append(score)
    return np.mean(scores)

# --- Feature Engineering Function (Converts Numerical to Binned Categorical) ---
def feature_eng(df, target_col=None):
    # Identify numerical columns to be binned
    numerical_features = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                            if col != 'id'] # Exclude 'id'
    if target_col and target_col in numerical_features: # Exclude target if it's numerical
        numerical_features.remove(target_col)

    for col in numerical_features:
        # Create a new column with '_Binned' suffix
        # Convert numerical values to string, then to 'category' dtype
        if f'{col}_Binned' not in df.columns: # Avoid recreating if already exists
            df[f'{col}_Binned'] = df[col].astype(str).astype('category')
    return df

# --- Optuna Objective Function for XGBoost Hyperparameter Tuning ---
def objective(trial, X_synth, y_synth, X_orig, y_orig, test_df_feature_names, num_classes, calculate_map3_func):
    """
    Objective function for Optuna to optimize XGBoost hyperparameters.
    """
    # Define hyperparameter search space
    params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'max_depth': trial.suggest_int('max_depth', 3, 15), # Reduced max_depth range based on previous discussion
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0, log=True), # Changed to float for more granular tuning
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'eval_metric': 'mlogloss',
        'device': "cuda", # Keep as fixed, assuming GPU environment
        'seed': 42, # Keep seed fixed for reproducibility of trials
        'tree_method': 'hist',
        'enable_categorical': True,
        # No 'n_estimators' or 'early_stopping_rounds' directly in params for xgb.train,
        # these are passed to xgb.train call.
    }

    NFOLDS_OPTUNA = 5 # Fixed to 5 folds for Optuna
    skf_optuna = StratifiedKFold(n_splits=NFOLDS_OPTUNA, shuffle=True, random_state=42)

    oof_preds_trial = np.zeros((len(X_synth), num_classes))
    
    # Octuple the original data once for the objective function.
    X_orig_oct_obj = pd.concat([X_orig] * 5, ignore_index=True)
    y_orig_oct_obj = np.concatenate([y_orig] * 5)

    for fold_num, (train_idx, val_idx) in enumerate(skf_optuna.split(X_synth, y_synth)):
        X_synth_train, y_synth_train = X_synth.iloc[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth.iloc[val_idx], y_synth[val_idx]

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = pd.concat([X_synth_train, X_orig_oct_obj], ignore_index=True)
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct_obj])

        dtrain = xgb.DMatrix(X_train_aug, label=y_train_aug, feature_names=test_df_feature_names, enable_categorical=True)
        dval = xgb.DMatrix(X_synth_val, label=y_synth_val, feature_names=test_df_feature_names, enable_categorical=True)

        model = xgb.train(params, dtrain, num_boost_round=5000, evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=None)

        oof_preds_trial[val_idx] = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        
        del model, dtrain, dval
        gc.collect()

    map3_score_trial = calculate_map3_func(y_synth, oof_preds_trial)
    return map3_score_trial


# --- Main Execution ---
if __name__ == "__main__":
    # Define target column name
    TARGET_COL = 'Fertilizer Name'

    # --- 1. Load Data ---
    print("Loading datasets...")
    train_synthetic_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    train_original_df = pd.read_csv("/kaggle/input/original/Fertilizer Prediction .csv")
    test_df_raw = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    submission_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

    # Drop 'id' columns
    train_synthetic_df = train_synthetic_df.drop(columns=['id'])
    if 'id' in test_df_raw.columns:
        test_df_raw = test_df_raw.drop(columns=['id'])

    # --- Apply Feature Engineering to make all numerical features categorical ---
    print("Applying feature engineering (binning numerical features into categories)...")
    train_synthetic_df = feature_eng(train_synthetic_df, target_col=TARGET_COL)
    train_original_df = feature_eng(train_original_df, target_col=TARGET_COL)
    test_df_raw = feature_eng(test_df_raw)


    # --- 2. Preprocessing and Feature Preparation ---
    print("Applying preprocessing...")
    # Target Encoding
    le_fertilizer = LabelEncoder()
    # Fit on combined labels to ensure all possible classes are covered
    le_fertilizer.fit(pd.concat([train_synthetic_df[TARGET_COL], train_original_df[TARGET_COL]]))
    y_synth = le_fertilizer.transform(train_synthetic_df[TARGET_COL])
    y_orig = le_fertilizer.transform(train_original_df[TARGET_COL])
    num_classes = len(le_fertilizer.classes_)

    # Dynamically determine the common feature columns after feature engineering
    base_categorical_features = ['Soil Type', 'Crop Type']
    
    # Collect all binned columns that are present across all three dataframes
    all_processed_dfs = [train_synthetic_df, train_original_df, test_df_raw]
    common_binned_cols = set()

    # Get binned columns from the first dataframe
    if all_processed_dfs:
        first_df_binned = [col for col in all_processed_dfs[0].columns if col.endswith('_Binned')]
        common_binned_cols.update(first_df_binned)

        # Intersect with binned columns from subsequent dataframes
        for df_item in all_processed_dfs[1:]:
            current_df_binned = [col for col in df_item.columns if col.endswith('_Binned')]
            common_binned_cols.intersection_update(current_df_binned)

    # The final feature_cols will be the base categorical plus the common binned ones
    feature_cols = base_categorical_features + sorted(list(common_binned_cols))
    
    print(f"Selected feature columns (all categorical): {feature_cols}")

    # Feature Sets - now containing only categorical/binned features
    X_synth = train_synthetic_df[feature_cols].copy()
    X_orig = train_original_df[feature_cols].copy()
    test_df = test_df_raw[feature_cols].copy() # Ensure test_df also has only selected features

    # Categorical Feature Encoding (Using LabelEncoder for consistent integer mapping)
    for col in feature_cols:
        # Concatenate all categories from all datasets for consistent mapping
        full_vocab = pd.concat([X_synth[col].astype(str), X_orig[col].astype(str), test_df[col].astype(str)], axis=0).unique()
        
        label_enc = LabelEncoder()
        label_enc.fit(full_vocab.astype(str)) # Fit on the full vocabulary of strings
        
        X_synth[col] = label_enc.transform(X_synth[col].astype(str))
        X_orig[col] = label_enc.transform(X_orig[col].astype(str))
        test_df[col] = label_enc.transform(test_df[col].astype(str))
        
        # Cast to 'category' dtype after encoding, as XGBoost's enable_categorical can leverage this
        X_synth[col] = X_synth[col].astype("category")
        X_orig[col] = X_orig[col].astype("category")
        test_df[col] = test_df[col].astype("category")


    # --- 3. Optuna Hyperparameter Optimization ---
    print("\nStarting Optuna hyperparameter optimization (5-Fold CV)...")
    # Using TPESampler for more efficient exploration
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(
        lambda trial: objective(trial, X_synth, y_synth, X_orig, y_orig, X_synth.columns.tolist(), num_classes, calculate_map3),
        n_trials=50, # Number of trials for Optuna to run. Adjust as needed.
        show_progress_bar=True
    )

    print("\nOptuna optimization finished.")
    print(f"Best trial: {study.best_trial.value:.5f} (MAP@3)")
    print("Best hyperparameters found:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    # --- 4. Define Final Hyperparameters using Optuna's Best ---
    # Start with fixed parameters, then override with best params from Optuna
    final_params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'device': "cuda",
        'seed': 42,
        'tree_method': 'hist',
        'enable_categorical': True,
        # These will be set by Optuna's best params
        'max_depth': study.best_params['max_depth'],
        'learning_rate': study.best_params['learning_rate'],
        'min_child_weight': study.best_params['min_child_weight'],
        'colsample_bytree': study.best_params['colsample_bytree'],
        'subsample': study.best_params['subsample'],
        'gamma': study.best_params['gamma'],
        'reg_alpha': study.best_params['reg_alpha'],
        'reg_lambda': study.best_params['reg_lambda'],
        # 'n_estimators' and 'early_stopping_rounds' are for xgb.train call, not params dict
    }

    # --- 5. Train Final Model with 5-Fold CV & 5x Original Data Augmentation (using Best Params) ---
    print("\nTraining final XGBoost model with best hyperparameters (5-Fold CV & 5x original data augmentation)...")
    NFOLDS = 5 # Fixed to 5 folds for final training and prediction
    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(X_synth), num_classes))
    test_preds = np.zeros((len(test_df), num_classes))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_synth, y_synth)):
        print(f"--- Fold {fold+1}/{NFOLDS} ---")
        X_synth_train, y_synth_train = X_synth.iloc[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth.iloc[val_idx], y_synth[val_idx]

        # Octuple the original data to give it more weight (5x as per user's earlier snippet)
        X_orig_oct = pd.concat([X_orig] * 5, ignore_index=True)
        y_orig_oct = np.concatenate([y_orig] * 5)

        # Augment the training data with the OCTUPLED original dataset
        X_train_aug = pd.concat([X_synth_train, X_orig_oct], ignore_index=True)
        y_train_aug = np.concatenate([y_synth_train, y_orig_oct])

        dtrain = xgb.DMatrix(X_train_aug, label=y_train_aug, feature_names=X_synth.columns.tolist(), enable_categorical=True)
        dval = xgb.DMatrix(X_synth_val, label=y_synth_val, feature_names=X_synth.columns.tolist(), enable_categorical=True)
        dtest = xgb.DMatrix(test_df, feature_names=X_synth.columns.tolist(), enable_categorical=True)

        model = xgb.train(final_params, dtrain, num_boost_round=5000, evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=None)

        oof_preds[val_idx] = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        test_preds += model.predict(dtest, iteration_range=(0, model.best_iteration + 1)) / NFOLDS
        gc.collect()

    # --- 6. Evaluate and Save ---
    map3_score = calculate_map3(y_synth, oof_preds)
    print(f"\n--- Final Model CV MAP@3 (5x Original Data, Best Params): {map3_score:.5f} ---\n")

    # Save OOF and Test Predictions
    output_dir = './' # Save in current directory
    os.makedirs(output_dir, exist_ok=True) # Ensure directory exists

    oof_filename = os.path.join(output_dir, 'oof_preds.npy')
    test_filename = os.path.join(output_dir, 'test_preds.npy')

    np.save(oof_filename, oof_preds)
    np.save(test_filename, test_preds)
    print(f"OOF predictions saved to: {oof_filename}")
    print(f"Test predictions saved to: {test_filename}")

    print("Generating final submission file...")
    top3_preds_indices = np.argsort(test_preds, axis=1)[:, ::-1][:, :3]
    top3_preds_labels = le_fertilizer.inverse_transform(top3_preds_indices.flatten()).reshape(top3_preds_indices.shape)

    submission_df['Fertilizer Name'] = [' '.join(row) for row in top3_preds_labels]
    submission_df.to_csv('submission.csv', index=False)

    print("Submission file 'submission.csv' created successfully.")


Loading datasets...
Applying feature engineering (binning numerical features into categories)...
Applying preprocessing...
Selected feature columns (all categorical): ['Soil Type', 'Crop Type', 'Humidity_Binned', 'Moisture_Binned', 'Nitrogen_Binned', 'Phosphorous_Binned', 'Potassium_Binned', 'Temparature_Binned']


[I 2025-06-20 10:29:32,951] A new study created in memory with name: no-name-97ca8832-00cd-4a30-ab25-67296a4ba27e



Starting Optuna hyperparameter optimization (5-Fold CV)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-06-20 10:33:44,049] Trial 0 finished with value: 0.361878 and parameters: {'max_depth': 7, 'learning_rate': 0.08927180304353628, 'min_child_weight': 2.9106359131330697, 'colsample_bytree': 0.759195090518222, 'subsample': 0.4936111842654619, 'gamma': 0.07799726016810132, 'reg_alpha': 0.014936568554617643, 'reg_lambda': 3.9676050770529883}. Best is trial 0 with value: 0.361878.
[I 2025-06-20 10:42:47,776] Trial 1 finished with value: 0.3604271111111111 and parameters: {'max_depth': 10, 'learning_rate': 0.051059032093947576, 'min_child_weight': 0.10994335574766201, 'colsample_bytree': 0.9819459112971965, 'subsample': 0.899465584480253, 'gamma': 0.10616955533913808, 'reg_alpha': 0.035113563139704075, 'reg_lambda': 0.03549878832196503}. Best is trial 0 with value: 0.361878.
[I 2025-06-20 10:59:39,368] Trial 2 finished with value: 0.37477555555555553 and parameters: {'max_depth': 6, 'learning_rate': 0.03347776308515933, 'min_child_weight': 0.7309539835912913, 'colsample_bytree': 0.57