In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
/kaggle/input/firstlayer/oof_preds_hgb_fixed_params.npy
/kaggle/input/firstlayer/oof_preds_nn_keras.npy
/kaggle/input/firstlayer/oof_preds_nb_onehot.npy
/kaggle/input/firstlayer/oof_preds.npy
/kaggle/input/firstlayer/test_preds.npy
/kaggle/input/firstlayer/test_preds_nb_onehot.npy
/kaggle/input/firstlayer/test_preds_nn_keras.npy
/kaggle/input/firstlayer/test_preds_hgb_fixed_params.npy
/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv


In [None]:
import pandas as pd
import numpy as np
# No xgboost needed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import gc
import warnings
import os
import optuna # Added for hyperparameter optimization
import lightgbm as lgb # Changed from xgboost to lightgbm

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Reusable Functions ---
def calculate_map3(y_true, y_pred_proba):
    """Calculates the Mean Average Precision @ 3 score.

    Each sample contributes 1, 1/2 or 1/3 when its true label is ranked first,
    second or third among the predicted classes, and 0 when it is not in the
    top three. The returned value is the mean of those per-sample scores.

    Args:
        y_true: 1-D sequence of class indices, one per sample.
        y_pred_proba: 2-D array-like of per-class scores, one row per sample.
            Column indices are compared against the values in ``y_true``.

    Returns:
        The mean score as returned by ``np.mean`` (``nan`` when there are no
        samples, exactly as ``np.mean`` of an empty array).

    Raises:
        ValueError: if ``y_true`` and ``y_pred_proba`` have different lengths.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    if len(y_true) != len(y_pred_proba):
        raise ValueError(
            f"y_true has {len(y_true)} samples but y_pred_proba has {len(y_pred_proba)} rows"
        )

    # Indices of the (up to) three highest-scoring classes per row, best first.
    top3_preds_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :3]

    # Boolean mask marking the position (if any) where the true label appears.
    # argsort indices are unique per row, so each row has at most one hit.
    hits = top3_preds_indices == y_true[:, None]

    # Reciprocal-rank weights: 1 for rank 1, 1/2 for rank 2, 1/3 for rank 3.
    # Sized from the array so inputs with fewer than three classes still work.
    rank_weights = 1.0 / np.arange(1, top3_preds_indices.shape[1] + 1)

    scores = (hits * rank_weights).sum(axis=1)
    return np.mean(scores)

# --- Feature Engineering Function (Converts Numerical to Binned Categorical) ---
def feature_eng(df, target_col=None):
    """Add a categorical ``<col>_Binned`` twin for every int64/float64 column.

    No interval binning is performed: each numeric value is converted to its
    string form and the resulting column is cast to ``category`` dtype, so every
    distinct value becomes its own category.

    The ``'id'`` column and ``target_col`` (when given) are skipped, and an
    existing ``<col>_Binned`` column is never overwritten.

    Args:
        df: DataFrame to extend. It is modified in place.
        target_col: Optional name of a column to leave out of the conversion.

    Returns:
        The same ``df`` object, with the new columns appended.
    """
    excluded = {'id'}
    if target_col:
        excluded.add(target_col)

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    for name in numeric_cols:
        if name in excluded:
            continue
        binned_name = f'{name}_Binned'
        # Leave a pre-existing binned column untouched.
        if binned_name in df.columns:
            continue
        as_text = df[name].astype(str)
        df[binned_name] = as_text.astype('category')
    return df

# --- Optuna Objective Function for LightGBM Hyperparameter Tuning ---
def objective(trial, X_synth, y_synth, X_orig, y_orig, test_df_feature_names, num_classes, calculate_map3_func):
    """
    Optuna objective: out-of-fold MAP@3 of a LightGBM multiclass model.

    For one set of sampled hyperparameters, runs a 5-fold stratified CV over the
    synthetic data. Each training fold is extended with five copies of the
    original dataset; validation folds contain synthetic rows only.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_synth: Synthetic feature frame (indexed positionally with ``.iloc``).
        y_synth: Synthetic labels, indexed with integer index arrays
            (presumably a NumPy array - confirm against the caller).
        X_orig: Original-dataset feature frame appended to every training fold.
        y_orig: Labels for ``X_orig``.
        test_df_feature_names: Not referenced in this function.
        num_classes: Number of target classes.
        calculate_map3_func: Callable ``(y_true, y_pred_proba) -> score`` used
            to score the out-of-fold predictions.

    Returns:
        The value returned by ``calculate_map3_func`` for the full OOF matrix.
    """
    # Sample the tunable values first. The call order is kept fixed because the
    # seeded sampler draws them sequentially.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 20, 200)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero; no bagging frequency is set here, so
    # `subsample` may be a no-op - confirm.
    subsample = trial.suggest_float('subsample', 0.4, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 0.01, 10.0, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 0.01, 10.0, log=True)

    # NOTE(review): lightgbm does not appear to expose a `GPU_DEVICE` attribute,
    # so this most likely always resolves to 'cpu' - confirm.
    device = 'gpu' if hasattr(lgb, 'GPU_DEVICE') else 'cpu'

    params = {
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': 5000,  # upper bound; early stopping picks the actual count
        'learning_rate': learning_rate,
        'num_leaves': num_leaves,
        'max_depth': max_depth,
        'min_child_samples': min_child_samples,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,  # L1 regularization
        'reg_lambda': reg_lambda,  # L2 regularization
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
        'device': device,
    }

    n_splits = 5
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_matrix = np.zeros((len(X_synth), num_classes))

    # Build the 5x replicated original data once; it is identical for every fold.
    n_copies = 5
    X_orig_rep = pd.concat([X_orig] * n_copies, ignore_index=True)
    y_orig_rep = np.concatenate([y_orig] * n_copies)

    for fit_rows, holdout_rows in splitter.split(X_synth, y_synth):
        X_holdout = X_synth.iloc[holdout_rows]
        y_holdout = y_synth[holdout_rows]

        # Training fold = synthetic training rows + replicated original rows.
        X_fit = pd.concat([X_synth.iloc[fit_rows], X_orig_rep], ignore_index=True)
        y_fit = np.concatenate([y_synth[fit_rows], y_orig_rep])

        train_set = lgb.Dataset(X_fit, y_fit, categorical_feature='auto')
        valid_set = lgb.Dataset(X_holdout, y_holdout, reference=train_set, categorical_feature='auto')

        booster = lgb.train(
            params,
            train_set,
            valid_sets=[valid_set],
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )

        oof_matrix[holdout_rows] = booster.predict(X_holdout, num_iteration=booster.best_iteration)

        # Free the per-fold objects before the next fold is built.
        del booster, train_set, valid_set
        gc.collect()

    return calculate_map3_func(y_synth, oof_matrix)


# --- Main Execution ---
if __name__ == "__main__":
    # Define target column name
    TARGET_COL = 'Fertilizer Name'
    # Number of copies of the original dataset appended to every training fold.
    # The Optuna objective uses the same 5x replication.
    ORIG_REPEATS = 5

    # --- 1. Load Data ---
    print("Loading datasets...")
    train_synthetic_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    train_original_df = pd.read_csv("/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv")
    test_df_raw = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    submission_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

    # Drop 'id' columns
    train_synthetic_df = train_synthetic_df.drop(columns=['id'])
    if 'id' in test_df_raw.columns:
        test_df_raw = test_df_raw.drop(columns=['id'])

    # --- Apply Feature Engineering to make all numerical features categorical ---
    print("Applying feature engineering (binning numerical features into categories)...")
    train_synthetic_df = feature_eng(train_synthetic_df, target_col=TARGET_COL)
    train_original_df = feature_eng(train_original_df, target_col=TARGET_COL)
    test_df_raw = feature_eng(test_df_raw)

    # --- 2. Preprocessing and Feature Preparation ---
    print("Applying preprocessing...")
    # Encode the target as integer class indices. The encoder is fitted on the
    # labels of both training sources so every class gets an index.
    le_fertilizer = LabelEncoder()
    le_fertilizer.fit(pd.concat([train_synthetic_df[TARGET_COL], train_original_df[TARGET_COL]]))
    y_synth = le_fertilizer.transform(train_synthetic_df[TARGET_COL])
    y_orig = le_fertilizer.transform(train_original_df[TARGET_COL])
    num_classes = len(le_fertilizer.classes_)

    # Features = the two raw categorical columns plus every '_Binned' column
    # that exists in all three dataframes.
    base_categorical_features = ['Soil Type', 'Crop Type']
    all_processed_dfs = [train_synthetic_df, train_original_df, test_df_raw]
    binned_per_df = [
        {col for col in df_item.columns if col.endswith('_Binned')}
        for df_item in all_processed_dfs
    ]
    common_binned_cols = set.intersection(*binned_per_df)

    feature_cols = base_categorical_features + sorted(common_binned_cols)

    print(f"Selected feature columns (all categorical): {feature_cols}")

    # Feature Sets - now containing only categorical/binned features
    X_synth = train_synthetic_df[feature_cols].copy()
    X_orig = train_original_df[feature_cols].copy()
    test_df = test_df_raw[feature_cols].copy()  # Ensure test_df also has only selected features

    # Categorical Feature Encoding (LabelEncoder for a consistent integer mapping)
    for col in feature_cols:
        # Fit on the union of values from all three frames so the mapping is shared.
        full_vocab = pd.concat(
            [X_synth[col].astype(str), X_orig[col].astype(str), test_df[col].astype(str)],
            axis=0,
        ).unique()

        label_enc = LabelEncoder()
        label_enc.fit(full_vocab.astype(str))

        # Use ONE categorical dtype (covering the whole vocabulary) for all
        # frames. If each frame were cast with a bare astype("category"), a
        # value missing from one frame would give it a different category set;
        # pd.concat of such frames drops the categorical dtype, and the
        # training and validation Datasets would then disagree on which
        # columns are categorical.
        shared_dtype = pd.CategoricalDtype(categories=np.arange(len(label_enc.classes_)))

        X_synth[col] = label_enc.transform(X_synth[col].astype(str))
        X_orig[col] = label_enc.transform(X_orig[col].astype(str))
        test_df[col] = label_enc.transform(test_df[col].astype(str))

        X_synth[col] = X_synth[col].astype(shared_dtype)
        X_orig[col] = X_orig[col].astype(shared_dtype)
        test_df[col] = test_df[col].astype(shared_dtype)

    # --- 3. Optuna Hyperparameter Optimization ---
    print("\nStarting Optuna hyperparameter optimization (5-Fold CV)...")
    # Using TPESampler for more efficient exploration
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
    # NOTE(review): the recorded run logged roughly 1-2 hours per trial, so 50
    # trials may not fit in a single session - consider lowering n_trials or
    # passing a `timeout` to study.optimize.
    study.optimize(
        lambda trial: objective(trial, X_synth, y_synth, X_orig, y_orig, X_synth.columns.tolist(), num_classes, calculate_map3),
        n_trials=50,  # Number of trials for Optuna to run. Adjust as needed.
        show_progress_bar=True
    )

    print("\nOptuna optimization finished.")
    print(f"Best trial: {study.best_trial.value:.5f} (MAP@3)")
    print("Best hyperparameters found:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    # --- 4. Define Final Hyperparameters using Optuna's Best ---
    # Fixed settings mirror the objective; tuned settings come from the best trial.
    final_params = {
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': 5000,  # Max number of boosting rounds for final model
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,  # Suppress verbose output
        # NOTE(review): lightgbm does not appear to expose `GPU_DEVICE`, so this
        # most likely always resolves to 'cpu' - confirm.
        'device': 'gpu' if hasattr(lgb, 'GPU_DEVICE') else 'cpu',
        # These are set from Optuna's best params
        'learning_rate': study.best_params['learning_rate'],
        'num_leaves': study.best_params['num_leaves'],
        'max_depth': study.best_params['max_depth'],
        'min_child_samples': study.best_params['min_child_samples'],
        'subsample': study.best_params['subsample'],
        'colsample_bytree': study.best_params['colsample_bytree'],
        'reg_alpha': study.best_params['reg_alpha'],
        'reg_lambda': study.best_params['reg_lambda'],
    }

    # --- 5. Train Final Model with 5-Fold CV & 5x Original Data Augmentation (using Best Params) ---
    print("\nTraining final LightGBM model with best hyperparameters (5-Fold CV & 5x original data augmentation)...")
    NFOLDS = 5  # Fixed to 5 folds for final training and prediction
    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(X_synth), num_classes))
    test_preds = np.zeros((len(test_df), num_classes))

    # Replicate the original data once (5x) to give it more weight; the result
    # is the same for every fold, so it is built outside the loop.
    X_orig_rep = pd.concat([X_orig] * ORIG_REPEATS, ignore_index=True)
    y_orig_rep = np.concatenate([y_orig] * ORIG_REPEATS)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_synth, y_synth)):
        print(f"--- Fold {fold+1}/{NFOLDS} ---")
        X_synth_train, y_synth_train = X_synth.iloc[train_idx], y_synth[train_idx]
        X_synth_val, y_synth_val = X_synth.iloc[val_idx], y_synth[val_idx]

        # Augment the training fold with the replicated original dataset
        X_train_aug = pd.concat([X_synth_train, X_orig_rep], ignore_index=True)
        y_train_aug = np.concatenate([y_synth_train, y_orig_rep])

        # LightGBM Dataset for training and validation
        lgb_train = lgb.Dataset(X_train_aug, y_train_aug, categorical_feature='auto')
        lgb_eval = lgb.Dataset(X_synth_val, y_synth_val, reference=lgb_train, categorical_feature='auto')

        # Train the LightGBM model. `verbose_eval` is intentionally not passed:
        # lgb.train no longer accepts it (removed in LightGBM 4.0) and would
        # raise TypeError here, after the whole Optuna study has already run.
        # Output stays quiet through 'verbose': -1 and early_stopping(verbose=False).
        model = lgb.train(
            final_params,
            lgb_train,
            valid_sets=[lgb_eval],
            callbacks=[lgb.early_stopping(50, verbose=False)],  # Early stopping
        )

        oof_preds[val_idx] = model.predict(X_synth_val, num_iteration=model.best_iteration)
        # Average the test predictions over the folds.
        test_preds += model.predict(test_df, num_iteration=model.best_iteration) / NFOLDS

        # Release per-fold objects before building the next fold.
        del model, lgb_train, lgb_eval, X_train_aug, y_train_aug
        gc.collect()

    # --- 6. Evaluate and Save ---
    map3_score = calculate_map3(y_synth, oof_preds)
    print(f"\n--- Final Model CV MAP@3 (5x Original Data, Best Params): {map3_score:.5f} ---\n")

    # Save OOF and Test Predictions
    output_dir = './'  # Save in current directory
    os.makedirs(output_dir, exist_ok=True)  # Ensure directory exists

    oof_filename = os.path.join(output_dir, 'oof_preds_lgbm.npy')
    test_filename = os.path.join(output_dir, 'test_preds_lgbm.npy')

    np.save(oof_filename, oof_preds)
    np.save(test_filename, test_preds)
    print(f"OOF predictions saved to: {oof_filename}")
    print(f"Test predictions saved to: {test_filename}")

    print("Generating final submission file...")
    # Top-3 class indices per test row (best first), mapped back to label names.
    top3_preds_indices = np.argsort(test_preds, axis=1)[:, ::-1][:, :3]
    top3_preds_labels = le_fertilizer.inverse_transform(top3_preds_indices.flatten()).reshape(top3_preds_indices.shape)

    submission_df[TARGET_COL] = [' '.join(row) for row in top3_preds_labels]
    submission_df.to_csv('submission_lgbm.csv', index=False)

    print("Submission file 'submission_lgbm.csv' created successfully.")


Loading datasets...
Applying feature engineering (binning numerical features into categories)...
Applying preprocessing...
Selected feature columns (all categorical): ['Soil Type', 'Crop Type', 'Humidity_Binned', 'Moisture_Binned', 'Nitrogen_Binned', 'Phosphorous_Binned', 'Potassium_Binned', 'Temparature_Binned']


[I 2025-06-23 10:00:28,067] A new study created in memory with name: no-name-b6f2e14a-356e-4fb4-b065-60a74d288197



Starting Optuna hyperparameter optimization (5-Fold CV)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-06-23 10:54:28,535] Trial 0 finished with value: 0.36666422222222217 and parameters: {'learning_rate': 0.023688639503640783, 'num_leaves': 192, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.4936111842654619, 'colsample_bytree': 0.49359671220172163, 'reg_alpha': 0.014936568554617643, 'reg_lambda': 3.9676050770529883}. Best is trial 0 with value: 0.36666422222222217.
[I 2025-06-23 12:40:24,512] Trial 1 finished with value: 0.3648224444444444 and parameters: {'learning_rate': 0.039913058785616795, 'num_leaves': 148, 'max_depth': 3, 'min_child_samples': 98, 'subsample': 0.899465584480253, 'colsample_bytree': 0.5274034664069657, 'reg_alpha': 0.035113563139704075, 'reg_lambda': 0.03549878832196503}. Best is trial 0 with value: 0.36666422222222217.
[I 2025-06-23 13:54:26,194] Trial 2 finished with value: 0.3676902222222222 and parameters: {'learning_rate': 0.02014847788415866, 'num_leaves': 114, 'max_depth': 8, 'min_child_samples': 32, 'subsample': 0.7671117368334277, 'cols