In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ensemble3/oof_preds_xgb_custom.npy
/kaggle/input/ensemble3/test_preds_xgb_optimized.npy
/kaggle/input/ensemble3/oof_preds_xgb_optimized.npy
/kaggle/input/ensemble3/test_preds_xgb_custom.npy
/kaggle/input/ensemble6/pred_test_log_proba.csv
/kaggle/input/ensemble6/xgb_repeat_train_oof.npy
/kaggle/input/ensemble6/pred_oof_log_proba.csv
/kaggle/input/ensemble6/xgb_repeat_test_oof.npy
/kaggle/input/ensemble4/test_preds_gaussian_nb.npy
/kaggle/input/ensemble4/test_preds_lda_base.npy
/kaggle/input/ensemble4/oof_preds_gaussian_nb.npy
/kaggle/input/ensemble4/oof_preds_lda_base.npy
/kaggle/input/ensemble5/test_preds_hgb.npy
/kaggle/input/ensemble5/oof_preds_hgb.npy
/kaggle/input/ensemble5/oof_preds_ydf.npy
/kaggle/input/ensemble5/test_preds_ydf.npy
/kaggle/input/boosting-output/test_preds_xgb_sklearn.npy
/kaggle/input/boosting-output/oof_preds_xgb_sklearn.npy
/kaggle/input/boosting-output/test_preds_lgbm.npy
/kaggle/input/boosting-output/oof_preds_xgb_train_api2.npy
/kaggle/input/bo

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime # Import datetime for timestamps
import optuna # Still imported but not used directly in the provided snippets
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import warnings
import gc
import xgboost as xgb
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB # Import Gaussian Naive Bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # Import Linear Discriminant Analysis
from sklearn.ensemble import HistGradientBoostingClassifier # Import HistGradientBoostingClassifier
import ydf # NEW: Import Yggdrasil Decision Forests


# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- 1. Configuration and Global Random State ---
class CFG:
    seed = 42
    target = 'Fertilizer Name'
    n_splits = 5 # Number of folds for cross-validation
    learning_rate = 0.03
    num_boost_round = 5000
    early_stopping_rounds = 50
    verbose_eval = 200

    # --- Output Directory for current session (will be cleared on session end) ---
    OUTPUT_DIR = '/kaggle/working/outputs/' 
    
    # --- Directory for User Uploaded Input Files (e.g., from a Kaggle Dataset) ---
    # Set to /kaggle/input/ensemble2/ as the primary uploaded input dir for new models
    UPLOADED_INPUT_DIR = '/kaggle/input/ensemble2/' 

    # Filenames for base model predictions (ONLY INCLUDING ACTIVE MODELS)
    FNAME_XGB_API1_OOF = 'oof_preds_xgb_train_api1.npy'
    FNAME_XGB_API1_TEST = 'test_preds_xgb_train_api1.npy'
    
    FNAME_XGB_SKLEARN_OOF = 'oof_preds_xgb_sklearn.npy'
    FNAME_XGB_SKLEARN_TEST = 'test_preds_xgb_sklearn.npy'
    
    FNAME_LGBM_OOF = 'oof_preds_lgbm.npy'
    FNAME_LGBM_TEST = 'test_preds_lgbm.npy'
    
    FNAME_LR_OOF = 'oof_preds_lr.npy'
    FNAME_LR_TEST = 'test_preds_lr.npy'

    FNAME_LGBM_OPTIMIZED_OOF = 'oof_preds_lgbm_optimized.npy'
    FNAME_LGBM_OPTIMIZED_TEST = 'test_preds_lgbm_optimized.npy'

    FNAME_VOTING_OOF = 'oof_preds_voting.npy'
    FNAME_VOTING_TEST = 'test_preds_voting.npy'

    FNAME_GAUSSIAN_NB_OOF = 'oof_preds_gaussian_nb.npy'
    FNAME_GAUSSIAN_NB_TEST = 'test_preds_gaussian_nb.npy'

    FNAME_HGB_OOF = 'oof_preds_hgb.npy'
    FNAME_HGB_TEST = 'test_preds_hgb.npy'

    FNAME_YDF_OOF = 'oof_preds_ydf.npy'
    FNAME_YDF_TEST = 'test_preds_ydf.npy'

    # NEW: Filenames for the new custom XGB model
    FNAME_XGB_CUSTOM_NEW_OOF = 'oof_preds_xgb_custom_new.npy'
    FNAME_XGB_CUSTOM_NEW_TEST = 'test_preds_xgb_custom_new.npy'


# Create the output directory if it doesn't exist. This is essential for saving.
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

# Define FOLDS based on CFG for consistency
FOLDS = CFG.n_splits
GLOBAL_RANDOM_STATE = CFG.seed
np.random.seed(GLOBAL_RANDOM_STATE)

# --- Helper function to check and load predictions from either source ---
def load_predictions_if_exist(oof_filename, test_filename, X_shape, X_test_shape, num_classes):
    oof_path_output = os.path.join(CFG.OUTPUT_DIR, oof_filename)
    test_path_output = os.path.join(CFG.OUTPUT_DIR, test_filename)
    
    uploaded_input_dir_exists = hasattr(CFG, 'UPLOADED_INPUT_DIR') and CFG.UPLOADED_INPUT_DIR
    
    oof_path_primary_input = None
    test_path_primary_input = None
    if uploaded_input_dir_exists:
        oof_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, oof_filename) # /kaggle/input/ensemble2/
        test_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, test_filename)

    # Specific path for /kaggle/input/ensemble3/
    oof_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', oof_filename)
    test_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', test_filename)

    # Specific path for /kaggle/input/ensemble4/
    oof_path_ensemble4_input = os.path.join('/kaggle/input/ensemble4/', oof_filename)
    test_path_ensemble4_input = os.path.join('/kaggle/input/ensemble4/', test_filename)

    # NEW: Specific path for /kaggle/input/ensemble5/ (for HGB and YDF typically)
    oof_path_ensemble5_input = os.path.join('/kaggle/input/ensemble5/', oof_filename)
    test_path_ensemble5_input = os.path.join('/kaggle/input/ensemble5/', test_filename)

    # Original secondary input directory for boosting-output files
    oof_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', oof_filename)
    test_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', test_filename)


    # Priority 1: Check in the current session's output directory
    if os.path.exists(oof_path_output) and os.path.exists(test_path_output):
        print(f"Loading predictions from current session's OUTPUT_DIR: {oof_filename}, {test_filename}")
        return np.load(oof_path_output), np.load(test_path_output), True
    # Priority 2: Check in the primary user-uploaded input directory (e.g., /kaggle/input/ensemble2/)
    elif uploaded_input_dir_exists and os.path.exists(oof_path_primary_input) and os.path.exists(test_path_primary_input):
        print(f"Loading predictions from primary UPLOADED_INPUT_DIR ({CFG.UPLOADED_INPUT_DIR}): {oof_filename}, {test_filename}")
        return np.load(oof_path_primary_input), np.load(test_path_primary_input), True
    # Priority 3: Check in the /kaggle/input/ensemble3/ directory
    elif os.path.exists(oof_path_ensemble3_input) and os.path.exists(test_path_ensemble3_input):
        print(f"Loading predictions from /kaggle/input/ensemble3/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble3_input), np.load(test_path_ensemble3_input), True
    # Priority 4: Check in the /kaggle/input/ensemble4/ directory
    elif os.path.exists(oof_path_ensemble4_input) and os.path.exists(test_path_ensemble4_input):
        print(f"Loading predictions from /kaggle/input/ensemble4/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble4_input), np.load(test_path_ensemble4_input), True
    # NEW Priority 5: Check in the /kaggle/input/ensemble5/ directory
    elif os.path.exists(oof_path_ensemble5_input) and os.path.exists(test_path_ensemble5_input):
        print(f"Loading predictions from /kaggle/input/ensemble5/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble5_input), np.load(test_path_ensemble5_input), True
    # Priority 6: Check in the secondary user-uploaded input directory (e.g., /kaggle/input/boosting-output/)
    elif os.path.exists(oof_path_boosting_output_input) and os.path.exists(test_path_boosting_output_input):
        print(f"Loading predictions from secondary /kaggle/input/boosting-output/: {oof_filename}, {test_filename}")
        return np.load(oof_path_boosting_output_input), np.load(test_path_boosting_output_input), True
    else:
        # If files are not found, return zero-initialized arrays
        print(f"No existing predictions found for {oof_filename} or {test_filename}. Will initialize as zeros.")
        return np.zeros((X_shape[0], num_classes)), np.zeros((X_test_shape[0], num_classes)), False


# --- 2. Data Loading and Initial Preprocessing ---
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Starting Data Loading and Initial Preprocessing ---")
start_time_data_load = datetime.now()

# Ensure these paths are correct for your environment
df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s5e6/sample_submission.csv')
df_original = pd.read_csv('/kaggle/input/original/Fertilizer Prediction .csv')

# Drop 'id' columns if they exist in train/test sets as per original notebook
df_train = df_train.drop(columns=['id'])
if 'id' in df_test.columns:
    df_test = df_test.drop(columns=['id'])

# Concatenate original dataset to the training data
df_train = pd.concat([df_train, df_original], axis=0, ignore_index=True)

# --- 3. Ordinal and Label Encoding ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Starting Ordinal and Label Encoding ---")
cat_cols_for_ordinal = df_train.select_dtypes(include='object').columns.tolist()
if 'Fertilizer Name' in cat_cols_for_ordinal:
    cat_cols_for_ordinal.remove('Fertilizer Name')

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_train[cat_cols_for_ordinal] = ordinal_encoder.fit_transform(df_train[cat_cols_for_ordinal].astype(str)).astype(int)

cat_cols_for_test = [col for col in cat_cols_for_ordinal if col in df_test.columns]
df_test[cat_cols_for_test] = ordinal_encoder.transform(df_test[cat_cols_for_test].astype(str)).astype(int)

le = LabelEncoder()
df_train['Fertilizer Name'] = le.fit_transform(df_train['Fertilizer Name'])
num_classes = len(np.unique(df_train['Fertilizer Name']))

y_encoded = df_train['Fertilizer Name'] # Target for training
X = df_train.drop(columns=['Fertilizer Name']) # Features for training
X_test = df_test # Features for final test prediction

# Define numerical columns for scaling
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
# Filter out any columns that might have been ordinal encoded but are numeric-like
numerical_cols = [col for col in numerical_cols if col not in cat_cols_for_ordinal]

# Get indices of categorical features for HistGradientBoostingClassifier
# Since we ordinal encoded them to integers, we need to explicitly tell HGB which ones are categorical
categorical_feature_indices_hgb = [X.columns.get_loc(col) for col in cat_cols_for_ordinal]
print(f"HistGradientBoostingClassifier will treat columns at indices {categorical_feature_indices_hgb} as categorical.")

end_time_data_load = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Data Loading and Preprocessing Finished. Elapsed: {end_time_data_load - start_time_data_load} ---\n")

# Define mapk function
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
            if hits == k: # Optimized: if we have found 'k' items, no need to continue
                break
        return score / min(len(a), k) if min(len(a), k) > 0 else 0.0 # Return 0.0 if actual has 0 elements to avoid division by zero

    if not isinstance(actual[0], (list, np.ndarray)):
        actual = [[a] for a in actual]

    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Stacking Ensemble Setup Started ---")

# --- Initialize OOF and Test Prediction Arrays for Each Active Base Model (10 Models Now) ---
oof_preds_xgb_train_api1 = np.empty(0)
test_preds_xgb_train_api1 = np.empty(0)

oof_preds_xgb_sklearn = np.empty(0)
test_preds_xgb_sklearn = np.empty(0)

oof_preds_lgbm = np.empty(0)
test_preds_lgbm = np.empty(0)

oof_preds_lr = np.empty(0)
test_preds_lr = np.empty(0)

oof_preds_lgbm_optimized = np.empty(0) # Model 7
test_preds_lgbm_optimized = np.empty(0) # Model 7

oof_preds_voting = np.empty(0)          # Model 8
test_preds_voting = np.empty(0)         # Model 8

oof_preds_gaussian_nb = np.empty(0) # Model 11 (Gaussian Naive Bayes)
test_preds_gaussian_nb = np.empty(0) # Model 11 (Gaussian Naive Bayes)

oof_preds_hgb = np.empty(0) # Model 13 (HistGradientBoostingClassifier)
test_preds_hgb = np.empty(0) # Model 13 (HistGradientBoostingClassifier)

oof_preds_ydf = np.empty(0) # NEW Model 14 (YDF)
test_preds_ydf = np.empty(0) # NEW Model 14 (YDF)

oof_preds_xgb_custom_new = np.empty(0) # NEW Model (custom XGB)
test_preds_xgb_custom_new = np.empty(0) # NEW Model (custom XGB)


# --- Base Model 1: XGBoost (using xgb.train API - original block 1) ---
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 1: XGBoost (xgb.train API - Original Block 1) ---")
start_time_model1 = datetime.now()
oof_preds_xgb_train_api1, test_preds_xgb_train_api1, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_API1_OOF, CFG.FNAME_XGB_API1_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 1.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 1. This model will be zero-initialized.")
end_time_model1 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 1 Handling Finished. Elapsed: {end_time_model1 - start_time_model1} ---\n")


# --- Base Model 2: XGBoost (using XGBClassifier API) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 2: XGBoost (XGBClassifier API) ---")
start_time_model2 = datetime.now()
oof_preds_xgb_sklearn, test_preds_xgb_sklearn, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_SKLEARN_OOF, CFG.FNAME_XGB_SKLEARN_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 2.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 2. This model will be zero-initialized.")
end_time_model2 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 2 Handling Finished. Elapsed: {end_time_model2 - start_time_model2} ---\n")


# --- Base Model 3: LightGBM Model ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 3: LightGBM ---")
start_time_model3 = datetime.now()
oof_preds_lgbm, test_preds_lgbm, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LGBM_OOF, CFG.FNAME_LGBM_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 3.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 3. This model will be zero-initialized.")
end_time_model3 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 3 Handling Finished. Elapsed: {end_time_model3 - start_time_model3} ---\n")


# --- Base Model 6: Logistic Regression ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 6: Logistic Regression ---")
start_time_model6 = datetime.now()
oof_preds_lr, test_preds_lr, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LR_OOF, CFG.FNAME_LR_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 6.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 6. This model will be zero-initialized.")
end_time_model6 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 6 Handling Finished. Elapsed: {end_time_model6 - start_time_model6} ---\n")


# --- Base Model 7: LightGBM (Optimized Hyperparameters) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 7: LightGBM (Optimized Hyperparameters) ---")
start_time_model7 = datetime.now()

oof_preds_lgbm_optimized, test_preds_lgbm_optimized, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LGBM_OPTIMIZED_OOF, CFG.FNAME_LGBM_OPTIMIZED_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 7.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 7. This model will be zero-initialized.")
end_time_model7 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 7 Handling Finished. Elapsed: {end_time_model7 - start_time_model7} ---\n")


# --- Base Model 8: VotingClassifier ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 8: VotingClassifier ---")
start_time_model8 = datetime.now()

oof_preds_voting, test_preds_voting, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_VOTING_OOF, CFG.FNAME_VOTING_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 8.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 8. This model will be zero-initialized.")
end_time_model8 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 8 Handling Finished. Elapsed: {end_time_model8 - start_time_model8} ---\n")
    
# --- Base Model 11: Gaussian Naive Bayes ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 11: Gaussian Naive Bayes ---")
start_time_model11 = datetime.now()

oof_preds_gaussian_nb, test_preds_gaussian_nb, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_GAUSSIAN_NB_OOF, CFG.FNAME_GAUSSIAN_NB_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 11.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 11. This model will be zero-initialized.")
end_time_model11 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 11 Handling Finished. Elapsed: {end_time_model11 - start_time_model11} ---\n")


# --- Base Model 13: HistGradientBoostingClassifier ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 13: HistGradientBoostingClassifier ---")
start_time_model13 = datetime.now()

oof_preds_hgb, test_preds_hgb, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_HGB_OOF, CFG.FNAME_HGB_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 13 (HGB).")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 13 (HGB). This model will be zero-initialized.")
end_time_model13 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 13 Handling Finished. Elapsed: {end_time_model13 - start_time_model13} ---\n")

# --- Base Model 14: Yggdrasil Decision Forests (RandomForest) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling Base Model 14: Yggdrasil Decision Forests (RandomForest) ---")
start_time_model14 = datetime.now()

oof_preds_ydf, test_preds_ydf, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_YDF_OOF, CFG.FNAME_YDF_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 14 (YDF).")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 14 (YDF). This model will be zero-initialized.")
end_time_model14 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 14 Handling Finished. Elapsed: {end_time_model14 - start_time_model14} ---\n")

# --- NEW Base Model: Custom XGBoost ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Handling NEW Base Model: Custom XGBoost ---")
start_time_model_new_xgb = datetime.now()

oof_preds_xgb_custom_new, test_preds_xgb_custom_new, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_CUSTOM_NEW_OOF, CFG.FNAME_XGB_CUSTOM_NEW_TEST, X.shape, X_test.shape, num_classes
)

if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for NEW Custom XGBoost Model.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Training NEW Custom XGBoost Model...")
    xgb_custom_new_params = {
        'objective': 'multi:softprob', 
        'num_class': num_classes, 
        'max_depth': 7,
        'learning_rate': 0.03,
        'subsample': 0.8,
        'max_bin': 128,
        'colsample_bytree': 0.3, 
        'colsample_bylevel': 1, 
        'colsample_bynode': 1, 
        'tree_method': 'hist', 
        'random_state': 42,
        'eval_metric': 'mlogloss',
        'enable_categorical':True,
        'n_estimators': 10000, # Use CFG value for consistency
        'early_stopping_rounds': 50, # Use CFG value for consistency
    }

    kf_xgb_new = KFold(n_splits=FOLDS, shuffle=True, random_state=GLOBAL_RANDOM_STATE)
    model_new_xgb_logloss_scores = []
    model_new_xgb_map3_scores = [] # NEW: To store MAP@3 scores per fold
    model_new_xgb_test_pred_sum = np.zeros((len(X_test), num_classes))

    for i, (train_idx, valid_idx) in enumerate(kf_xgb_new.split(X, y_encoded)):
        fold_start_time = datetime.now()
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] {'#'*10} Training Fold {i+1}/{FOLDS} (NEW Custom XGBoost) {'#'*10}")

        x_train_fold, y_train_fold = X.iloc[train_idx].copy(), y_encoded.iloc[train_idx]
        x_valid_fold, y_valid_fold = X.iloc[valid_idx].copy(), y_encoded.iloc[valid_idx]

        dtrain = xgb.DMatrix(x_train_fold, label=y_train_fold, enable_categorical=True)
        dvalid = xgb.DMatrix(x_valid_fold, label=y_valid_fold, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)

        ES_callback = xgb.callback.EarlyStopping(
            rounds=xgb_custom_new_params['early_stopping_rounds'],
            maximize=False,
            save_best=True,
        )

        model_new_xgb_instance = xgb.train(
            xgb_custom_new_params,
            dtrain,
            num_boost_round=xgb_custom_new_params['n_estimators'],
            evals=[(dvalid, 'validation')],
            callbacks=[ES_callback],
            verbose_eval=CFG.verbose_eval
        )

        oof_preds_xgb_custom_new[valid_idx] = model_new_xgb_instance.predict(dvalid, iteration_range=(0, model_new_xgb_instance.best_iteration + 1))
        model_new_xgb_test_pred_sum += model_new_xgb_instance.predict(dtest, iteration_range=(0, model_new_xgb_instance.best_iteration + 1))

        log_loss_value = log_loss(y_valid_fold, oof_preds_xgb_custom_new[valid_idx])
        model_new_xgb_logloss_scores.append(log_loss_value)

        # NEW: Calculate MAP@3 for the current fold
        top_3_oof_preds_fold = np.argsort(oof_preds_xgb_custom_new[valid_idx], axis=1)[:, -3:][:, ::-1]
        map3_score_fold = mapk(y_valid_fold.values, top_3_oof_preds_fold)
        model_new_xgb_map3_scores.append(map3_score_fold)

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {i+1} log_loss: {log_loss_value:.4f}, MAP@3: {map3_score_fold:.5f}. Elapsed for fold: {datetime.now() - fold_start_time}")
        
        del model_new_xgb_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_custom_new = model_new_xgb_test_pred_sum / FOLDS
    avg_log_loss_xgb_new = np.mean(model_new_xgb_logloss_scores)
    avg_map3_xgb_new = np.mean(model_new_xgb_map3_scores) # NEW: Average MAP@3
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] NEW Custom XGBoost Model Final CV log_loss: {avg_log_loss_xgb_new:.4f}, Avg MAP@3: {avg_map3_xgb_new:.5f}") # NEW: Print Avg MAP@3

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_CUSTOM_NEW_OOF), oof_preds_xgb_custom_new)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_CUSTOM_NEW_TEST), test_preds_xgb_custom_new)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Saved NEW Custom XGBoost Model predictions to {CFG.OUTPUT_DIR}")

end_time_model_new_xgb = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- NEW Custom XGBoost Model Handling Finished. Elapsed: {end_time_model_new_xgb - start_time_model_new_xgb} ---\n")


# --- Prepare Meta-Features for the Single-Layer Logistic Regression (10 Active Base Models) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Preparing Meta-Features for Single-Layer Logistic Regression (10 Active Base Models) ---")
start_time_meta_features = datetime.now()

X_meta_train = np.hstack([
    oof_preds_xgb_train_api1, # Model 1
    oof_preds_xgb_sklearn, # Model 2
    oof_preds_lgbm, # Model 3
    oof_preds_lr, # Model 6
    oof_preds_lgbm_optimized, # Model 7
    oof_preds_voting, # Model 8
    oof_preds_gaussian_nb, # Model 11
    oof_preds_hgb, # Model 13
    oof_preds_ydf, # Model 14
    oof_preds_xgb_custom_new # NEW Custom XGBoost Model
])
X_meta_test = np.hstack([
    test_preds_xgb_train_api1, # Model 1
    test_preds_xgb_sklearn, # Model 2
    test_preds_lgbm, # Model 3
    test_preds_lr, # Model 6
    test_preds_lgbm_optimized, # Model 7
    test_preds_voting, # Model 8
    test_preds_gaussian_nb, # Model 11
    test_preds_hgb, # Model 13
    test_preds_ydf, # Model 14
    test_preds_xgb_custom_new # NEW Custom XGBoost Model
])

# Scale inputs for the final Logistic Regression meta-model
scaler_final_meta = StandardScaler()
X_meta_train_scaled = scaler_final_meta.fit_transform(X_meta_train)
X_meta_test_scaled = scaler_final_meta.transform(X_meta_test)

print(f"[{datetime.now().strftime('%H:%M:%S')}] Meta-training set shape for Logistic Regression: {X_meta_train_scaled.shape}")
print(f"[{datetime.now().strftime('%H:%M:%S')}] Meta-test set shape for Logistic Regression: {X_meta_test_scaled.shape}")

end_time_meta_features = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Meta-Features for Logistic Regression Prepared. Elapsed: {end_time_meta_features - start_time_meta_features} ---\n")

# --- NO Hill Climbing Optimization for Logistic Regression C parameter ---
# Using a fixed C value as requested.
FIXED_C_VALUE = 0.1
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Using a fixed C value for Logistic Regression: C={FIXED_C_VALUE} ---")


# --- 8. Train the Final Meta-Model (Logistic Regression) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Final Logistic Regression Meta-Model (Single Layer, 10 Active Base Models) ---")
start_time_final_model = datetime.now()

final_meta_model = LogisticRegression(
    solver='liblinear',
    C=FIXED_C_VALUE, # Using the fixed C value
    random_state=GLOBAL_RANDOM_STATE,
    n_jobs=-1,
    multi_class='ovr'
)
final_meta_model.fit(X_meta_train_scaled, y_encoded)
print(f"[{datetime.now().strftime('%H:%M:%S')}] Final Logistic Regression Meta-Model training complete with fixed C={FIXED_C_VALUE:.6f}.")

# --- Evaluate Meta-Model's OOF MAP@3 Score ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Evaluating Final Meta-Model's OOF MAP@3 Score ---")
# Get OOF predictions from the trained meta-model
meta_oof_preds = final_meta_model.predict_proba(X_meta_train_scaled)

# Get the top 3 predicted class indices for the OOF predictions
top_3_meta_oof_preds = np.argsort(meta_oof_preds, axis=1)[:, -3:][:, ::-1]

# Calculate MAP@3 score using the original encoded labels
meta_map3_score = mapk(y_encoded.values, top_3_meta_oof_preds)
print(f"[{datetime.now().strftime('%H:%M:%S')}] Final Meta-Model OOF MAP@3 Score: {meta_map3_score:.5f}")


end_time_final_model = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Final Logistic Regression Meta-Model Training Finished. Elapsed: {end_time_final_model - start_time_final_model} ---\n")


# --- 9. Generate Final Ensemble Predictions and Submission ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Generating Final Stacked Ensemble Predictions (Single Layer, 10 Active Base Models) ---")
start_time_submission = datetime.now()

final_ensemble_test_probs = final_meta_model.predict_proba(X_meta_test_scaled)
top_3_preds_ensemble = np.argsort(final_ensemble_test_probs, axis=1)[:, -3:][:, ::-1]

# Inverse transform to get original fertilizer names (strings)
top_3_labels_ensemble = le.inverse_transform(top_3_preds_ensemble.ravel()).reshape(top_3_preds_ensemble.shape)

# Create submission DataFrame
submission_filename = "submission_stacked_ensemble_10_models.csv" # Updated filename
submission_ensemble = pd.DataFrame({
    "id": df_sub["id"],
    "Fertilizer Name": [' '.join(label for label in row) for row in top_3_labels_ensemble]
})

submission_ensemble.to_csv(submission_filename, index=False)

print(f"📁 Final single-layer stacked ensemble submission saved to '{submission_filename}'")

# --- Display the head of the submission file (for verification) ---
print("\nFirst 5 rows of the final submission DataFrame (for display):")
with pd.option_context('display.max_colwidth', None, 'display.width', 1000):
    print(submission_ensemble.head())

end_time_submission = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Final Submission Generation Finished. Elapsed: {end_time_submission - start_time_submission} ---\n")

print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Single-Layer Stacked Ensemble Process Finished ---")



[03:38:50] --- Starting Data Loading and Initial Preprocessing ---
[03:38:52] --- Starting Ordinal and Label Encoding ---
HistGradientBoostingClassifier will treat columns at indices [3, 4] as categorical.
[03:38:53] --- Data Loading and Preprocessing Finished. Elapsed: 0:00:02.718159 ---

[03:38:53] --- Stacking Ensemble Setup Started ---

[03:38:53] --- Handling Base Model 1: XGBoost (xgb.train API - Original Block 1) ---
Loading predictions from secondary /kaggle/input/boosting-output/: oof_preds_xgb_train_api1.npy, test_preds_xgb_train_api1.npy
[03:38:54] Loaded existing predictions for Model 1.
[03:38:54] --- Model 1 Handling Finished. Elapsed: 0:00:00.353391 ---

[03:38:54] --- Handling Base Model 2: XGBoost (XGBClassifier API) ---
Loading predictions from secondary /kaggle/input/boosting-output/: oof_preds_xgb_sklearn.npy, test_preds_xgb_sklearn.npy
[03:38:54] Loaded existing predictions for Model 2.
[03:38:54] --- Model 2 Handling Finished. Elapsed: 0:00:00.332317 ---

[03:38:

In [5]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # Import for data preprocessing

# --- Configuration and Global Random State ---
class CFG:
    seed = 42
    target = 'Fertilizer Name'
    n_splits = 5 # Number of folds for cross-validation
    
    # Output Directory for current session
    OUTPUT_DIR = '/kaggle/working/outputs/' 
    
    # --- Filenames for base model predictions ---
    # Boosting-output files
    FNAME_XGB_API1_OOF = 'oof_preds_xgb_train_api1.npy'
    FNAME_XGB_API1_TEST = 'test_preds_xgb_train_api1.npy'
    
    FNAME_XGB_SKLEARN_OOF = 'oof_preds_xgb_sklearn.npy'
    FNAME_XGB_SKLEARN_TEST = 'test_preds_xgb_sklearn.npy'
    
    FNAME_LGBM_OOF = 'oof_preds_lgbm.npy'
    FNAME_LGBM_TEST = 'test_preds_lgbm.npy'
    
    FNAME_XGB_API2_OOF = 'oof_preds_xgb_train_api2.npy'
    FNAME_XGB_API2_TEST = 'test_preds_xgb_train_api2.npy'
    
    FNAME_LGBM_GOSS_OOF = 'oof_preds_lgbm_goss.npy'
    FNAME_LGBM_GOSS_TEST = 'test_preds_lgbm_goss.npy'

    # Ensemble2 files
    FNAME_LR_OOF = 'oof_preds_lr.npy'
    FNAME_LR_TEST = 'test_preds_lr.npy'

    FNAME_LGBM_OPTIMIZED_OOF = 'oof_preds_lgbm_optimized.npy'
    FNAME_LGBM_OPTIMIZED_TEST = 'test_preds_lgbm_optimized.npy'

    FNAME_VOTING_OOF = 'oof_preds_voting.npy'
    FNAME_VOTING_TEST = 'test_preds_voting.npy'

    # Ensemble3 files
    FNAME_XGB_OPTIMIZED_ENSEMB3_OOF = 'oof_preds_xgb_optimized.npy' 
    FNAME_XGB_OPTIMIZED_ENSEMB3_TEST = 'test_preds_xgb_optimized.npy'

    FNAME_XGB_CUSTOM_OOF = 'oof_preds_xgb_custom.npy'
    FNAME_XGB_CUSTOM_TEST = 'test_preds_xgb_custom.npy'

    # Ensemble4 files
    FNAME_GAUSSIAN_NB_OOF = 'oof_preds_gaussian_nb.npy'
    FNAME_GAUSSIAN_NB_TEST = 'test_preds_gaussian_nb.npy'

    FNAME_LDA_OOF = 'oof_preds_lda_base.npy'
    FNAME_LDA_TEST = 'test_preds_lda_base.npy'

    # Ensemble5 files
    FNAME_HGB_OOF = 'oof_preds_hgb.npy'
    FNAME_HGB_TEST = 'test_preds_hgb.npy'

    FNAME_YDF_OOF = 'oof_preds_ydf.npy'
    FNAME_YDF_TEST = 'test_preds_ydf.npy'

    # Ensemble6 files (New models)
    FNAME_XGB_REPEAT_OOF = 'xgb_repeat_train_oof.npy' # Renamed from xgb_repeat_train_oof.npy for clarity
    FNAME_XGB_REPEAT_TEST = 'xgb_repeat_test_oof.npy' # Renamed from xgb_repeat_test_oof.npy for clarity
    FNAME_LOG_PROBA_OOF = 'pred_oof_log_proba.csv'
    FNAME_LOG_PROBA_TEST = 'pred_test_log_proba.csv'


# Create the output directory if it doesn't exist.
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

# --- 2. Data Loading and Initial Preprocessing to get num_classes and X_shape ---
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Starting Data Loading for shape and class info ---")
df_train_raw = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
df_original_raw = pd.read_csv('/kaggle/input/original/Fertilizer Prediction .csv')
df_test_raw = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')

# Concatenate original dataset to the training data
df_train_full = pd.concat([df_train_raw.drop(columns=['id']), df_original_raw], axis=0, ignore_index=True)
if 'id' in df_test_raw.columns:
    df_test_raw = df_test_raw.drop(columns=['id'])

# Encoding for class count
le = LabelEncoder()
le.fit(df_train_full[CFG.target])
num_classes = len(le.classes_)
X_train_len = len(df_train_full)
X_test_len = len(df_test_raw)

print(f"[{datetime.now().strftime('%H:%M:%S')}] Detected number of classes: {num_classes}")
print(f"[{datetime.now().strftime('%H:%M:%S')}] Training data length: {X_train_len}")
print(f"[{datetime.now().strftime('%H:%M:%S')}] Test data length: {X_test_len}\n")


# --- Helper function to check and load predictions from various sources ---
def load_predictions_if_exist(oof_filename, test_filename, x_len, x_test_len, n_classes):
    """
    Checks if prediction files exist in the current session's OUTPUT_DIR or
    various specified Kaggle input directories, and loads them.
    Handles both .npy and .csv files.

    Args:
        oof_filename (str): The filename for the OOF predictions.
        test_filename (str): The filename for the test predictions.
        x_len (int): Expected number of rows for OOF predictions (training data size).
        x_test_len (int): Expected number of rows for test predictions (test data size).
        n_classes (int): Expected number of columns (classes) for predictions.

    Returns:
        tuple: (oof_preds_array, test_preds_array, loaded_from_disk_flag).
               Returns (zero-initialized arrays, False) if files are not found.
    """
    # Define potential directories in order of priority (newer/specific first)
    search_dirs = [
        CFG.OUTPUT_DIR,
        '/kaggle/input/ensemble6/',
        '/kaggle/input/ensemble5/',
        '/kaggle/input/ensemble4/',
        '/kaggle/input/ensemble3/',
        '/kaggle/input/ensemble2/',
        '/kaggle/input/boosting-output/',
    ]
    
    oof_path = None
    test_path = None
    
    for d in search_dirs:
        current_oof_path = os.path.join(d, oof_filename)
        current_test_path = os.path.join(d, test_filename)
        if os.path.exists(current_oof_path) and os.path.exists(current_test_path):
            oof_path = current_oof_path
            test_path = current_test_path
            print(f"Found predictions in: {d} for {oof_filename}, {test_filename}")
            break # Found files, stop searching

    if oof_path and test_path:
        try:
            # Handle .npy files
            if oof_path.endswith('.npy'):
                oof_preds = np.load(oof_path)
            elif oof_path.endswith('.csv'):
                oof_preds = pd.read_csv(oof_path).values # Assuming CSV is already probabilities/scores
            else:
                raise ValueError(f"Unsupported OOF file format for {oof_filename}")

            if test_path.endswith('.npy'):
                test_preds = np.load(test_path)
            elif test_path.endswith('.csv'):
                test_preds = pd.read_csv(test_path).values
            else:
                raise ValueError(f"Unsupported test file format for {test_filename}")

            # Ensure the loaded arrays have the expected number of classes
            if oof_preds.shape[1] != n_classes or test_preds.shape[1] != n_classes:
                print(f"Warning: Loaded predictions for {oof_filename} have {oof_preds.shape[1]} classes, expected {n_classes}. Reshaping or zero-initializing.")
                # If shapes don't match, it's safer to zero-initialize than force-reshape
                # unless a specific logic for different number of classes is known.
                # For this correlation task, it's better to align or skip.
                return np.zeros((x_len, n_classes)), np.zeros((x_test_len, n_classes)), False

            # Ensure the loaded arrays have the expected number of samples
            if oof_preds.shape[0] != x_len or test_preds.shape[0] != x_test_len:
                print(f"Warning: Loaded predictions for {oof_filename} have {oof_preds.shape[0]} samples, expected {x_len}. Reshaping or zero-initializing.")
                return np.zeros((x_len, n_classes)), np.zeros((x_test_len, n_classes)), False


            return oof_preds, test_preds, True
        except Exception as e:
            print(f"Error loading {oof_filename} or {test_filename}: {e}. Initializing as zeros.")
            return np.zeros((x_len, n_classes)), np.zeros((x_test_len, n_classes)), False
    else:
        print(f"No existing predictions found for {oof_filename} or {test_filename}. Will initialize as zeros.")
        return np.zeros((x_len, n_classes)), np.zeros((x_test_len, n_classes)), False


print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Starting Base Model OOF Prediction Loading for Correlation ---")

# List of (OOF_filename, Test_filename, Model_Name) for all base models
model_files = [
    (CFG.FNAME_XGB_API1_OOF, CFG.FNAME_XGB_API1_TEST, 'XGB_API1'),
    (CFG.FNAME_XGB_SKLEARN_OOF, CFG.FNAME_XGB_SKLEARN_TEST, 'XGB_SKLEARN'),
    (CFG.FNAME_LGBM_OOF, CFG.FNAME_LGBM_TEST, 'LGBM'),
    (CFG.FNAME_XGB_API2_OOF, CFG.FNAME_XGB_API2_TEST, 'XGB_API2'),
    (CFG.FNAME_LGBM_GOSS_OOF, CFG.FNAME_LGBM_GOSS_TEST, 'LGBM_GOSS'),
    (CFG.FNAME_LR_OOF, CFG.FNAME_LR_TEST, 'LR'),
    (CFG.FNAME_LGBM_OPTIMIZED_OOF, CFG.FNAME_LGBM_OPTIMIZED_TEST, 'LGBM_OPT'),
    (CFG.FNAME_VOTING_OOF, CFG.FNAME_VOTING_TEST, 'VOTING'),
    (CFG.FNAME_XGB_OPTIMIZED_ENSEMB3_OOF, CFG.FNAME_XGB_OPTIMIZED_ENSEMB3_TEST, 'XGB_OPT_E3'),
    (CFG.FNAME_XGB_CUSTOM_OOF, CFG.FNAME_XGB_CUSTOM_TEST, 'XGB_CUSTOM_E3'),
    (CFG.FNAME_GAUSSIAN_NB_OOF, CFG.FNAME_GAUSSIAN_NB_TEST, 'GAUSSIAN_NB'),
    (CFG.FNAME_LDA_OOF, CFG.FNAME_LDA_TEST, 'LDA'),
    (CFG.FNAME_HGB_OOF, CFG.FNAME_HGB_TEST, 'HGB'),
    (CFG.FNAME_YDF_OOF, CFG.FNAME_YDF_TEST, 'YDF'),
    (CFG.FNAME_XGB_REPEAT_OOF, CFG.FNAME_XGB_REPEAT_TEST, 'XGB_REPEAT'), # New model from ensemble6
    (CFG.FNAME_LOG_PROBA_OOF, CFG.FNAME_LOG_PROBA_TEST, 'LOG_PROBA') # New model from ensemble6
]

oof_predictions_flattened = {}

for oof_fname, test_fname, model_name in model_files:
    oof_preds, _, loaded_flag = load_predictions_if_exist(
        oof_fname, test_fname, X_train_len, X_test_len, num_classes
    )
    
    # Only proceed if predictions were actually loaded and are not entirely zero (unless they genuinely are all zeros)
    # Check if the loaded array is mostly non-zero. A simple sum check can suffice for this purpose.
    if loaded_flag and np.sum(oof_preds) > 1e-9: # Check for non-trivial loaded data
        # Flatten the OOF predictions (N, num_classes) to (N * num_classes,)
        oof_predictions_flattened[model_name] = oof_preds.flatten()
    else:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] WARNING: Skipping {model_name} for correlation as its OOF predictions were not found, loaded as zeros, or contain only zeros.")

if not oof_predictions_flattened:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No base model OOF predictions were successfully loaded or contained meaningful data. Cannot calculate correlation.")
else:
    # Ensure all arrays have the same length for correlation calculation.
    # We take the minimum length to avoid issues if files have slight discrepancies.
    min_len = min(len(arr) for arr in oof_predictions_flattened.values())
    
    correlation_df = pd.DataFrame({
        model_name: preds[:min_len] for model_name, preds in oof_predictions_flattened.items()
    })

    # Calculate the correlation matrix
    correlation_matrix = correlation_df.corr()

    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Correlation Matrix of Base Model OOF Predictions ---")
    # Display the correlation matrix with 3 decimal places for readability
    print(correlation_matrix.round(3))

print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Correlation Analysis Finished ---")



[15:34:07] --- Starting Data Loading for shape and class info ---
[15:34:09] Detected number of classes: 7
[15:34:09] Training data length: 850000
[15:34:09] Test data length: 250000

[15:34:09] --- Starting Base Model OOF Prediction Loading for Correlation ---
Found predictions in: /kaggle/input/boosting-output/ for oof_preds_xgb_train_api1.npy, test_preds_xgb_train_api1.npy
Found predictions in: /kaggle/input/boosting-output/ for oof_preds_xgb_sklearn.npy, test_preds_xgb_sklearn.npy
Found predictions in: /kaggle/input/boosting-output/ for oof_preds_lgbm.npy, test_preds_lgbm.npy
Found predictions in: /kaggle/input/boosting-output/ for oof_preds_xgb_train_api2.npy, test_preds_xgb_train_api2.npy
Found predictions in: /kaggle/input/boosting-output/ for oof_preds_lgbm_goss.npy, test_preds_lgbm_goss.npy
Found predictions in: /kaggle/input/ensemble2/ for oof_preds_lr.npy, test_preds_lr.npy
Found predictions in: /kaggle/input/ensemble2/ for oof_preds_lgbm_optimized.npy, test_preds_lgbm_opti

In [15]:
import pandas as pd
import numpy as np
import os
from datetime import datetime # Import datetime for timestamps
import optuna
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import warnings
import gc
import xgboost as xgb
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB # Import Gaussian Naive Bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # Import Linear Discriminant Analysis
# Removed: from catboost import CatBoostClassifier
# Removed: import torch # No longer needed without CatBoost's GPU check
from sklearn.ensemble import HistGradientBoostingClassifier # Import HistGradientBoostingClassifier
import ydf # NEW: Import Yggdrasil Decision Forests


# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- 1. Configuration and Global Random State ---
class CFG:
    seed = 42
    target = 'Fertilizer Name'
    n_splits = 5 # Number of folds for cross-validation
    learning_rate = 0.03
    num_boost_round = 5000
    early_stopping_rounds = 50
    verbose_eval = 200

    # --- Output Directory for current session (will be cleared on session end) ---
    OUTPUT_DIR = '/kaggle/working/outputs/'
    
    # --- Directory for User Uploaded Input Files (e.g., from a Kaggle Dataset) ---
    # Set to /kaggle/input/ensemble2/ as the primary uploaded input dir for new models
    UPLOADED_INPUT_DIR = '/kaggle/input/ensemble2/' 

    # Filenames for base model predictions
    FNAME_XGB_API1_OOF = 'oof_preds_xgb_train_api1.npy'
    FNAME_XGB_API1_TEST = 'test_preds_xgb_train_api1.npy'
    
    FNAME_XGB_SKLEARN_OOF = 'oof_preds_xgb_sklearn.npy'
    FNAME_XGB_SKLEARN_TEST = 'test_preds_xgb_sklearn.npy'
    
    FNAME_LGBM_OOF = 'oof_preds_lgbm.npy'
    FNAME_LGBM_TEST = 'test_preds_lgbm.npy'
    
    # Updated filenames for Model 4
    FNAME_XGB_API2_OOF = 'oof_preds_xgb_train_api2.npy'
    FNAME_XGB_API2_TEST = 'test_preds_xgb_train_api2.npy'
    
    FNAME_LGBM_GOSS_OOF = 'oof_preds_lgbm_goss.npy'
    FNAME_LGBM_GOSS_TEST = 'test_preds_lgbm_goss.npy'

    FNAME_LR_OOF = 'oof_preds_lr.npy'
    FNAME_LR_TEST = 'test_preds_lr.npy'

    FNAME_LGBM_OPTIMIZED_OOF = 'oof_preds_lgbm_optimized.npy'
    FNAME_LGBM_OPTIMIZED_TEST = 'test_preds_lgbm_optimized.npy'

    FNAME_VOTING_OOF = 'oof_preds_voting.npy'
    FNAME_VOTING_TEST = 'test_preds_voting.npy'

    # --- Filenames for the XGBoost models from /kaggle/input/ensemble3/ ---
    FNAME_XGB_OPTIMIZED_ENSEMBLE3_OOF = 'oof_preds_xgb_optimized.npy' 
    FNAME_XGB_OPTIMIZED_ENSEMBLE3_TEST = 'test_preds_xgb_optimized.npy'

    FNAME_XGB_CUSTOM_OOF = 'oof_preds_xgb_custom.npy'
    FNAME_XGB_CUSTOM_TEST = 'test_preds_xgb_custom.npy'

    # --- Filenames for the Gaussian Naive Bayes Model ---
    FNAME_GAUSSIAN_NB_OOF = 'oof_preds_gaussian_nb.npy'
    FNAME_GAUSSIAN_NB_TEST = 'test_preds_gaussian_nb.npy'

    # --- Filenames for the LDA Base Model ---
    FNAME_LDA_OOF = 'oof_preds_lda_base.npy'
    FNAME_LDA_TEST = 'test_preds_lda_base.npy'

    # NEW: Filenames for HistGradientBoostingClassifier Model
    FNAME_HGB_OOF = 'oof_preds_hgb.npy'
    FNAME_HGB_TEST = 'test_preds_hgb.npy'

    # NEW: Filenames for YDF Model (Model 14)
    FNAME_YDF_OOF = 'oof_preds_ydf.npy'
    FNAME_YDF_TEST = 'test_preds_ydf.npy'


# Create the output directory if it doesn't exist. This is essential for saving.
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

# Define FOLDS based on CFG for consistency
FOLDS = CFG.n_splits
GLOBAL_RANDOM_STATE = CFG.seed
np.random.seed(GLOBAL_RANDOM_STATE)

# --- Helper function to check and load predictions from either source ---
def load_predictions_if_exist(oof_filename, test_filename, X_shape, X_test_shape, num_classes):
    """
    Checks if prediction files exist in the current session's OUTPUT_DIR,
    the primary UPLOADED_INPUT_DIR (CFG.UPLOADED_INPUT_DIR),
    the /kaggle/input/ensemble3/ directory,
    the /kaggle/input/ensemble4/ directory (NEW), or
    the secondary /kaggle/input/boosting-output/ directory, and loads them.
    If files are not found, it returns zero-initialized NumPy arrays of the correct shape.

    Args:
        oof_filename (str): The filename for the OOF predictions.
        test_filename (str): The filename for the test predictions.
        X_shape (tuple): Shape of the full training features (e.g., (num_samples, num_features)).
        X_test_shape (tuple): Shape of the full test features (e.g., (num_test_samples, num_features)).
        num_classes (int): Number of target classes.

    Returns:
        tuple: (oof_preds_array, test_preds_array, loaded_from_disk_flag).
               Returns (zero_initialized_oof_array, zero_initialized_test_array, False)
               if files are not found.
    """
    oof_path_output = os.path.join(CFG.OUTPUT_DIR, oof_filename)
    test_path_output = os.path.join(CFG.OUTPUT_DIR, test_filename)
    
    uploaded_input_dir_exists = hasattr(CFG, 'UPLOADED_INPUT_DIR') and CFG.UPLOADED_INPUT_DIR
    
    oof_path_primary_input = None
    test_path_primary_input = None
    if uploaded_input_dir_exists:
        oof_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, oof_filename) # /kaggle/input/ensemble2/
        test_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, test_filename)

    # Specific path for /kaggle/input/ensemble3/
    oof_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', oof_filename)
    test_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', test_filename)

    # NEW: Specific path for /kaggle/input/ensemble4/
    oof_path_ensemble4_input = os.path.join('/kaggle/input/ensemble4/', oof_filename)
    test_path_ensemble4_input = os.path.join('/kaggle/input/ensemble4/', test_filename)

    # Original secondary input directory for boosting-output files
    oof_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', oof_filename)
    test_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', test_filename)


    # Priority 1: Check in the current session's output directory
    if os.path.exists(oof_path_output) and os.path.exists(test_path_output):
        print(f"Loading predictions from current session's OUTPUT_DIR: {oof_filename}, {test_filename}")
        return np.load(oof_path_output), np.load(test_path_output), True
    # Priority 2: Check in the primary user-uploaded input directory (e.g., /kaggle/input/ensemble2/)
    elif uploaded_input_dir_exists and os.path.exists(oof_path_primary_input) and os.path.exists(test_path_primary_input):
        print(f"Loading predictions from primary UPLOADED_INPUT_DIR ({CFG.UPLOADED_INPUT_DIR}): {oof_filename}, {test_filename}")
        return np.load(oof_path_primary_input), np.load(test_path_primary_input), True
    # Priority 3: Check in the /kaggle/input/ensemble3/ directory
    elif os.path.exists(oof_path_ensemble3_input) and os.path.exists(test_path_ensemble3_input):
        print(f"Loading predictions from /kaggle/input/ensemble3/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble3_input), np.load(test_path_ensemble3_input), True
    # NEW Priority 4: Check in the /kaggle/input/ensemble4/ directory
    elif os.path.exists(oof_path_ensemble4_input) and os.path.exists(test_path_ensemble4_input):
        print(f"Loading predictions from /kaggle/input/ensemble4/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble4_input), np.load(test_path_ensemble4_input), True
    # Priority 5: Check in the secondary user-uploaded input directory (e.g., /kaggle/input/boosting-output/)
    elif os.path.exists(oof_path_boosting_output_input) and os.path.exists(test_path_boosting_output_input):
        print(f"Loading predictions from secondary /kaggle/input/boosting-output/: {oof_filename}, {test_filename}")
        return np.load(oof_path_boosting_output_input), np.load(test_path_boosting_output_input), True
    else:
        # If files are not found, return zero-initialized arrays
        print(f"No existing predictions found for {oof_filename} or {test_filename}. Will initialize as zeros.")
        return np.zeros((X_shape[0], num_classes)), np.zeros((X_test_shape[0], num_classes)), False


# --- 2. Data Loading and Initial Preprocessing ---
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Starting Data Loading and Initial Preprocessing ---")
start_time_data_load = datetime.now()

# Ensure these paths are correct for your environment
df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s5e6/sample_submission.csv')
df_original = pd.read_csv('/kaggle/input/original/Fertilizer Prediction .csv')

# Drop 'id' columns if they exist in train/test sets as per original notebook
df_train = df_train.drop(columns=['id'])
if 'id' in df_test.columns:
    df_test = df_test.drop(columns=['id'])

# Concatenate original dataset to the training data
df_train = pd.concat([df_train, df_original], axis=0, ignore_index=True)

# --- 3. Ordinal and Label Encoding ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Starting Ordinal and Label Encoding ---")
cat_cols_for_ordinal = df_train.select_dtypes(include='object').columns.tolist()
if 'Fertilizer Name' in cat_cols_for_ordinal:
    cat_cols_for_ordinal.remove('Fertilizer Name')

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_train[cat_cols_for_ordinal] = ordinal_encoder.fit_transform(df_train[cat_cols_for_ordinal].astype(str)).astype(int)

cat_cols_for_test = [col for col in cat_cols_for_ordinal if col in df_test.columns]
df_test[cat_cols_for_test] = ordinal_encoder.transform(df_test[cat_cols_for_test].astype(str)).astype(int)

le = LabelEncoder()
df_train['Fertilizer Name'] = le.fit_transform(df_train['Fertilizer Name'])
num_classes = len(np.unique(df_train['Fertilizer Name']))

y_encoded = df_train['Fertilizer Name'] # Target for training
X = df_train.drop(columns=['Fertilizer Name']) # Features for training
X_test = df_test # Features for final test prediction

# Define numerical columns for scaling
# Moved this definition here to ensure it's available globally before any model training or helper functions use it.
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
# Filter out any columns that might have been ordinal encoded but are numeric-like
numerical_cols = [col for col in numerical_cols if col not in cat_cols_for_ordinal]

# Get indices of categorical features for HistGradientBoostingClassifier
# Since we ordinal encoded them to integers, we need to explicitly tell HGB which ones are categorical
categorical_feature_indices_hgb = [X.columns.get_loc(col) for col in cat_cols_for_ordinal]
print(f"HistGradientBoostingClassifier will treat columns at indices {categorical_feature_indices_hgb} as categorical.")

end_time_data_load = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Data Loading and Preprocessing Finished. Elapsed: {end_time_data_load - start_time_data_load} ---\n")

# Define mapk function
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
            if hits == k: # Optimized: if we have found 'k' items, no need to continue
                break
        return score / min(len(a), k) if min(len(a), k) > 0 else 0.0 # Return 0.0 if actual has 0 elements to avoid division by zero

    if not isinstance(actual[0], (list, np.ndarray)):
        actual = [[a] for a in actual]

    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Stacking Ensemble Setup Started ---")

# --- Initialize OOF and Test Prediction Arrays for Each Base Model (14 Models Now) ---
# These variables will be assigned directly by load_predictions_if_exist,
# which now guarantees returning NumPy arrays.
oof_preds_xgb_train_api1 = np.empty(0)
test_preds_xgb_train_api1 = np.empty(0)

oof_preds_xgb_sklearn = np.empty(0)
test_preds_xgb_sklearn = np.empty(0)

oof_preds_lgbm = np.empty(0)
test_preds_lgbm = np.empty(0)

oof_preds_xgb_api2 = np.empty(0)
test_preds_xgb_api2 = np.empty(0)

oof_preds_lgbm_goss = np.empty(0)
test_preds_lgbm_goss = np.empty(0)

oof_preds_lr = np.empty(0)
test_preds_lr = np.empty(0)

oof_preds_lgbm_optimized = np.empty(0) # Model 7
test_preds_lgbm_optimized = np.empty(0) # Model 7

oof_preds_voting = np.empty(0)          # Model 8
test_preds_voting = np.empty(0)         # Model 8

oof_preds_xgb_optimized_ensemble3 = np.empty(0) # Model 9 (from ensemble3 optimized)
test_preds_xgb_optimized_ensemble3 = np.empty(0) # Model 9 (from ensemble3 optimized)

oof_preds_xgb_custom = np.empty(0) # Model 10 (from ensemble3 custom)
test_preds_xgb_custom = np.empty(0) # Model 10 (from ensemble3 custom)

oof_preds_gaussian_nb = np.empty(0) # Model 11 (Gaussian Naive Bayes)
test_preds_gaussian_nb = np.empty(0) # Model 11 (Gaussian Naive Bayes)

oof_preds_lda_base = np.empty(0) # Model 12 (LDA)
test_preds_lda_base = np.empty(0) # Model 12 (LDA)

oof_preds_hgb = np.empty(0) # Model 13 (HistGradientBoostingClassifier)
test_preds_hgb = np.empty(0) # Model 13 (HistGradientBoostingClassifier)

oof_preds_ydf = np.empty(0) # NEW Model 14 (YDF)
test_preds_ydf = np.empty(0) # NEW Model 14 (YDF)


# --- Base Model 1: XGBoost (using xgb.train API - original block 1) ---
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 1: XGBoost (xgb.train API - Original Block 1) ---")
start_time_model1 = datetime.now()
oof_preds_xgb_train_api1, test_preds_xgb_train_api1, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_API1_OOF, CFG.FNAME_XGB_API1_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 1.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 1. Starting training.")
    # oof_preds_xgb_train_api1 and test_preds_xgb_train_api1 are already zero-initialized from load_predictions_if_exist
    kf_model1 = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    model1_logloss_scores = []
    model1_test_pred_sum = np.zeros((len(X_test), num_classes))

    for i, (train_idx, valid_idx) in enumerate(kf_model1.split(X, y_encoded)):
        print(f"\n{'#'*10} Fold {i+1} (Model 1) {'#'*10}")

        x_train_fold, y_train_fold = X.iloc[train_idx].copy(), y_encoded.iloc[train_idx]
        x_valid_fold, y_valid_fold = X.iloc[valid_idx].copy(), y_encoded.iloc[valid_idx]

        dtrain = xgb.DMatrix(x_train_fold, label=y_train_fold)
        dvalid = xgb.DMatrix(x_valid_fold, label=y_valid_fold)
        dtest = xgb.DMatrix(X_test)

        params_model1 = {
            'objective': 'multi:softprob',
            'num_class': num_classes,
            'max_depth': 16,
            'learning_rate': 0.03,
            'min_child_weight': 2,
            'alpha': 0.8,
            'reg_lambda': 4.0,
            'colsample_bytree': 0.3,
            'subsample': 0.8,
            'max_bin': 128,
            'colsample_bylevel': 1,
            'colsample_bynode': 1,
            'tree_method': 'hist',
            'random_state': 42,
            'eval_metric': 'mlogloss'
        }

        model1_instance = xgb.train(
            params_model1,
            dtrain,
            num_boost_round=CFG.num_boost_round, # Use CFG value
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=CFG.early_stopping_rounds, # Use CFG value
            verbose_eval=CFG.verbose_eval
        )

        oof_preds_xgb_train_api1[valid_idx] = model1_instance.predict(dvalid, iteration_range=(0, model1_instance.best_iteration + 1))
        model1_test_pred_sum += model1_instance.predict(dtest, iteration_range=(0, model1_instance.best_iteration + 1))

        log_loss_value = log_loss(y_valid_fold, oof_preds_xgb_train_api1[valid_idx])
        print(f"Fold {i+1} log_loss: {log_loss_value:.4f}")
        model1_logloss_scores.append(log_loss_value)

        del model1_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_train_api1 = model1_test_pred_sum / FOLDS
    avg_log_loss_model1 = np.mean(model1_logloss_scores)
    print(f"\nModel 1 (xgb.train API 1) Final CV log_loss: {avg_log_loss_model1:.4f}")
    top_3_oof_preds_model1 = np.argsort(oof_preds_xgb_train_api1, axis=1)[:, -3:][:, ::-1]
    map3_score_model1 = mapk(y_encoded.values, top_3_oof_preds_model1)
    print(f"Model 1 (xgb.train API 1) OOF MAP@3 Score: {map3_score_model1:.5f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_API1_OOF), oof_preds_xgb_train_api1)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_API1_TEST), test_preds_xgb_train_api1)
    print(f"Saved Model 1 predictions to {CFG.OUTPUT_DIR}")


# --- Base Model 2: XGBoost (using XGBClassifier API) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 2: XGBoost (XGBClassifier API) ---")
start_time_model2 = datetime.now()
oof_preds_xgb_sklearn, test_preds_xgb_sklearn, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_SKLEARN_OOF, CFG.FNAME_XGB_SKLEARN_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 2.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 2. Starting training.")
    # oof_preds_xgb_sklearn and test_preds_xgb_sklearn are already zero-initialized from load_predictions_if_exist
    fixed_xgb_params_model2 = {
        'max_depth': 12,
        'colsample_bytree': 0.467,
        'subsample': 0.86,
        'n_estimators': 4000,
        'learning_rate': 0.03,
        'gamma': 0.26,
        'max_delta_step': 4,
        'reg_alpha': 2.7,
        'reg_lambda': 1.4,
        'objective': 'multi:softprob',
        'random_state': 13,
        'enable_categorical': True,
        'tree_method': 'hist',
        'early_stopping_rounds': 100,
        'eval_metric': 'mlogloss',
        'num_class': num_classes,
        'n_jobs': -1
    }
    n_splits_cv_model2 = FOLDS # Use CFG.n_splits
    model2_fold_scores = []
    model2_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model2 = StratifiedKFold(n_splits=n_splits_cv_model2, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model2.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 2) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model2_instance = XGBClassifier(**fixed_xgb_params_model2)
        model2_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            verbose=False)

        best_iteration = model2_instance.best_iteration if hasattr(model2_instance, 'best_iteration') else fixed_xgb_params_model2['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_xgb_sklearn[valid_idx] = model2_instance.predict_proba(fold_valid_X, iteration_range=(0, best_iteration))
        model2_test_pred_sum += model2_instance.predict_proba(X_test, iteration_range=(0, best_iteration))

        top_3_preds = np.argsort(oof_preds_xgb_sklearn[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model2_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model2_instance
        gc.collect()

    test_preds_xgb_sklearn = model2_test_pred_sum / n_splits_cv_model2
    avg_cv_score_model2 = np.mean(model2_fold_scores)
    print(f"\nModel 2 (XGBClassifier) Average MAP@3 across {n_splits_cv_model2} folds: {avg_cv_score_model2:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_SKLEARN_OOF), oof_preds_xgb_sklearn)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_SKLEARN_TEST), test_preds_xgb_sklearn)
    print(f"Saved Model 2 predictions to {CFG.OUTPUT_DIR}")

# --- Base Model 3: LightGBM Model ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 3: LightGBM ---")
start_time_model3 = datetime.now()
oof_preds_lgbm, test_preds_lgbm, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LGBM_OOF, CFG.FNAME_LGBM_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 3.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 3. Starting training.")
    # oof_preds_lgbm and test_preds_lgbm are already zero-initialized from load_predictions_if_exist
    fixed_lgbm_params = {
        "objective": "multiclass",
        "num_class": num_classes,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "n_estimators": 1214,
        "learning_rate": 0.064080948,
        "num_leaves": 169,
        "max_depth": 10,
        "subsample": 0.642,
        "min_child_samples": 19,
        "colsample_bytree": 0.7,
        "reg_alpha": 6.2941,
        "reg_lambda": 5.556,
        "random_state": GLOBAL_RANDOM_STATE,
        "n_jobs": -1,
        "verbose": -1,
    }
    n_splits_cv_model3 = FOLDS # Use CFG.n_splits
    early_stopping_rounds_cv_model3 = 100
    model3_fold_scores = []
    model3_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model3 = StratifiedKFold(n_splits=n_splits_cv_model3, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model3.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 3) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model3_instance = lgb.LGBMClassifier(**fixed_lgbm_params)

        model3_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model3, verbose=False)])

        best_iteration = model3_instance.best_iteration_ if hasattr(model3_instance, 'best_iteration_') else fixed_lgbm_params['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm[valid_idx] = model3_instance.predict_proba(fold_valid_X)
        model3_test_pred_sum += model3_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model3_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model3_instance
        gc.collect()

    test_preds_lgbm = model3_test_pred_sum / n_splits_cv_model3
    avg_cv_score_model3 = np.mean(model3_fold_scores)
    print(f"\nModel 3 (LightGBM) Average MAP@3 across {n_splits_cv_model3} folds: {avg_cv_score_model3:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_OOF), oof_preds_lgbm)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_TEST), test_preds_lgbm)
    print(f"Saved Model 3 predictions to {CFG.OUTPUT_DIR}")

# --- Base Model 4: XGBoost (using xgb.train API with CFG parameters) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 4: XGBoost (xgb.train API with CFG parameters) ---")
start_time_model4 = datetime.now()
oof_preds_xgb_api2, test_preds_xgb_api2, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_API2_OOF, CFG.FNAME_XGB_API2_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 4.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 4. Starting training.")
    # oof_preds_xgb_api2 and test_preds_xgb_api2 are already zero-initialized from load_predictions_if_exist
    params_model4 = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'seed': CFG.seed,
        'max_depth': 32,
        'learning_rate': CFG.learning_rate,
        'min_child_weight': 2,
        'alpha': 5.6,
        'reg_lambda': 0.06,
        'subsample': 0.8,
        'colsample_bytree': 0.3,
        'colsample_bylevel': 1,
        'colsample_bynode': 1,
        'tree_method': 'hist',
        'device': "cuda" if os.environ.get('ACCELERATOR_TYPE') == 'GPU' else 'cpu'
    }
    model4_fold_scores = []
    model4_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model4 = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

    for fold, (trn_idx, val_idx) in enumerate(kf_model4.split(X, y_encoded)):
        X_train_fold = X.iloc[trn_idx]
        y_train_fold = y_encoded.iloc[trn_idx]
        X_valid_fold = X.iloc[val_idx]
        y_valid_fold = y_encoded.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
        dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_fold, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)

        ES = xgb.callback.EarlyStopping(
            rounds=CFG.early_stopping_rounds,
            maximize=False,
            save_best=True,
        )

        model4_instance = xgb.train(
            params_model4,
            dtrain,
            num_boost_round=CFG.num_boost_round,
            evals=[(dtrain, 'train'), (dvalid, 'validation')],
            verbose_eval=CFG.verbose_eval,
            callbacks=[ES]
        )

        oof_preds_xgb_api2[val_idx] = model4_instance.predict(dvalid, iteration_range=(0, model4_instance.best_iteration + 1))
        model4_test_pred_sum += model4_instance.predict(dtest, iteration_range=(0, model4_instance.best_iteration + 1))

        top3_preds = np.argsort(oof_preds_xgb_api2[val_idx], axis=1)[:, -3:][:, ::-1]
        actual = [[label] for label in y_valid_fold.values]
        map3_score_fold = mapk(actual, top3_preds)
        model4_fold_scores.append(map3_score_fold)
        print("----------------------------------------------------------------")
        print(f"fold: {fold:02d}, map@3: {map3_score_fold:.6f}, best iteration: {model4_instance.best_iteration}, best score: {model4_instance.best_score: .6f}\n")

        del model4_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_api2 = model4_test_pred_sum / CFG.n_splits
    avg_map3_score_model4 = np.mean(model4_fold_scores)
    print("----------------------------------------------------------------")
    print(f"Model 4 (xgb.train CFG) Average MAP@3: {avg_map3_score_model4:.6f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_API2_OOF), oof_preds_xgb_api2)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_API2_TEST), test_preds_xgb_api2)
    print(f"Saved Model 4 predictions to {CFG.OUTPUT_DIR}")

# --- Base Model 5: LightGBM (GOSS Boosting Type) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 5: LightGBM (GOSS) ---")
start_time_model5 = datetime.now()
oof_preds_lgbm_goss, test_preds_lgbm_goss, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LGBM_GOSS_OOF, CFG.FNAME_LGBM_GOSS_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 5.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 5. Starting training.")
    # oof_preds_lgbm_goss and test_preds_lgbm_goss are already zero-initialized from load_predictions_if_exist
    lgbm_goss_params = {
        "objective": "multiclass",
        "num_class": num_classes,
        "metric": "multi_logloss",
        "boosting_type": "goss", # GOSS boosting type
        "colsample_bytree": 0.39736332491996407,
        "learning_rate": 0.008033740989500222,
        "min_child_samples": 29,
        "min_child_weight": 0.6732469853333759,
        "n_estimators": 10000,
        "n_jobs": -1,
        "num_leaves": 89,
        "random_state": GLOBAL_RANDOM_STATE,
        "reg_alpha": 15.595856670965969,
        "reg_lambda": 51.43625034648377,
        "subsample": 0.07846482736630467,
        "verbose": -1,
    }
    n_splits_cv_model5 = FOLDS # Use CFG.n_splits
    early_stopping_rounds_cv_model5 = 100
    model5_fold_scores = []
    model5_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model5 = StratifiedKFold(n_splits=n_splits_cv_model5, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model5.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 5) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model5_instance = lgb.LGBMClassifier(**lgbm_goss_params)

        model5_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model5, verbose=False)])

        best_iteration = model5_instance.best_iteration_ if hasattr(model5_instance, 'best_iteration_') else lgbm_goss_params['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm_goss[valid_idx] = model5_instance.predict_proba(fold_valid_X)
        model5_test_pred_sum += model5_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm_goss[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model5_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model5_instance
        gc.collect()

    test_preds_lgbm_goss = model5_test_pred_sum / n_splits_cv_model5
    avg_cv_score_model5 = np.mean(model5_fold_scores)
    print(f"\nModel 5 (LightGBM GOSS) Average MAP@3 across {n_splits_cv_model5} folds: {avg_cv_score_model5:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_GOSS_OOF), oof_preds_lgbm_goss)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_TEST), test_preds_lgbm_goss)
    print(f"Saved Model 5 predictions to {CFG.OUTPUT_DIR}")

# --- Base Model 6: Logistic Regression ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 6: Logistic Regression ---")
start_time_model6 = datetime.now()
oof_preds_lr, test_preds_lr, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LR_OOF, CFG.FNAME_LR_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 6.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 6. Starting training.")
    # oof_preds_lr and test_preds_lr are already zero-initialized from load_predictions_if_exist
    lr_params = {
        'solver': 'liblinear',
        'C': 0.1,
        'random_state': GLOBAL_RANDOM_STATE,
        'n_jobs': -1,
        'multi_class': 'ovr'
    }
    n_splits_cv_model6 = FOLDS
    model6_fold_scores = []
    model6_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model6 = StratifiedKFold(n_splits=n_splits_cv_model6, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model6.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 6) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        # Scale numerical features for Logistic Regression
        scaler_lr = StandardScaler()
        X_train_scaled_lr = fold_train_X.copy()
        X_valid_scaled_lr = fold_valid_X.copy()
        X_test_scaled_lr = X_test.copy()

        X_train_scaled_lr[numerical_cols] = scaler_lr.fit_transform(X_train_scaled_lr[numerical_cols])
        X_valid_scaled_lr[numerical_cols] = scaler_lr.transform(X_valid_scaled_lr[numerical_cols])
        X_test_scaled_lr[numerical_cols] = scaler_lr.transform(X_test_scaled_lr[numerical_cols])


        model6_instance = LogisticRegression(**lr_params)
        model6_instance.fit(X_train_scaled_lr, y_train_fold)

        oof_preds_lr[valid_idx] = model6_instance.predict_proba(X_valid_scaled_lr)
        model6_test_pred_sum += model6_instance.predict_proba(X_test_scaled_lr)

        top_3_preds = np.argsort(oof_preds_lr[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model6_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model6_instance, scaler_lr
        gc.collect()

    test_preds_lr = model6_test_pred_sum / n_splits_cv_model6
    avg_cv_score_model6 = np.mean(model6_fold_scores)
    print(f"\nModel 6 (Logistic Regression) Average MAP@3 across {n_splits_cv_model6} folds: {avg_cv_score_model6:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LR_OOF), oof_preds_lr)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LR_TEST), test_preds_lr)
    print(f"Saved Model 6 predictions to {CFG.OUTPUT_DIR}")

# --- NEW Base Model 7: LightGBM (Optimized Hyperparameters) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 7: LightGBM (Optimized Hyperparameters) ---")
start_time_model7 = datetime.now()

oof_preds_lgbm_optimized, test_preds_lgbm_optimized, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LGBM_OPTIMIZED_OOF, CFG.FNAME_LGBM_OPTIMIZED_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 7.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 7. Starting training.")
    # oof_preds_lgbm_optimized and test_preds_lgbm_optimized are already zero-initialized from load_predictions_if_exist
    lgbm_optimized_params = {
        "objective": "multiclass",
        "num_class": num_classes,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "n_estimators": 2000, # Increased n_estimators
        "learning_rate": 0.01, # Adjusted learning rate
        "num_leaves": 60,
        "max_depth": 12,
        "subsample": 0.7,
        "min_child_samples": 25,
        "colsample_bytree": 0.6,
        "reg_alpha": 0.5,
        "reg_lambda": 0.5,
        "random_state": GLOBAL_RANDOM_STATE,
        "n_jobs": -1,
        "verbose": -1,
    }
    n_splits_cv_model7 = FOLDS
    early_stopping_rounds_cv_model7 = 150 # Adjusted early stopping
    model7_fold_scores = []
    model7_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model7 = StratifiedKFold(n_splits=n_splits_cv_model7, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model7.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 7) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model7_instance = lgb.LGBMClassifier(**lgbm_optimized_params)

        model7_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model7, verbose=False)])

        best_iteration = model7_instance.best_iteration_ if hasattr(model7_instance, 'best_iteration_') else lgbm_optimized_params['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm_optimized[valid_idx] = model7_instance.predict_proba(fold_valid_X)
        model7_test_pred_sum += model7_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm_optimized[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model7_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model7_instance
        gc.collect()

    test_preds_lgbm_optimized = model7_test_pred_sum / n_splits_cv_model7
    avg_cv_score_model7 = np.mean(model7_fold_scores)
    print(f"\nModel 7 (LightGBM Optimized) Average MAP@3 across {n_splits_cv_model7} folds: {avg_cv_score_model7:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_OPTIMIZED_OOF), oof_preds_lgbm_optimized)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_OPTIMIZED_TEST), test_preds_lgbm_optimized)
    print(f"Saved Model 7 predictions to {CFG.OUTPUT_DIR}")


# --- NEW Base Model 8: VotingClassifier ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 8: VotingClassifier ---")
start_time_model8 = datetime.now()

oof_preds_voting, test_preds_voting, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_VOTING_OOF, CFG.FNAME_VOTING_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 8.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 8. Starting training.")
    # oof_preds_voting and test_preds_voting are already zero-initialized from load_predictions_if_exist

    # Create dummy classifiers for the VotingClassifier.
    # These are placeholders and should ideally be trained or loaded actual models.
    # For a real VotingClassifier, you'd integrate the actual best-performing base models.
    clf1 = LogisticRegression(random_state=GLOBAL_RANDOM_STATE, solver='liblinear')
    clf2 = lgb.LGBMClassifier(random_state=GLOBAL_RANDOM_STATE, verbose=-1)
    clf3 = XGBClassifier(random_state=GLOBAL_RANDOM_STATE, use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1)

    voting_classifier = VotingClassifier(
        estimators=[('lr', clf1), ('lgbm', clf2), ('xgb', clf3)],
        voting='soft', # 'soft' voting uses predicted probabilities
        weights=[0.1, 0.45, 0.45], # Example weights, should be tuned
        n_jobs=-1 # Use all available cores
    )

    n_splits_cv_model8 = FOLDS
    model8_fold_scores = []
    model8_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model8 = StratifiedKFold(n_splits=n_splits_cv_model8, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model8.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 8: VotingClassifier) ---")
        from sklearn.base import clone
        current_voting_classifier = clone(voting_classifier)

        current_voting_classifier.fit(X.iloc[train_idx], y_encoded.iloc[train_idx]) # Use X.iloc here
        
        oof_preds_voting[valid_idx] = current_voting_classifier.predict_proba(X.iloc[valid_idx]) # Use X.iloc here
        model8_test_pred_sum += current_voting_classifier.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_voting[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_encoded.iloc[valid_idx].values, top_3_preds) # Use y_encoded.iloc here
        model8_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del current_voting_classifier
        gc.collect()

    test_preds_voting = model8_test_pred_sum / n_splits_cv_model8
    avg_cv_score_model8 = np.mean(model8_fold_scores)
    print(f"\nModel 8 (VotingClassifier) Average MAP@3 across {n_splits_cv_model8} folds: {avg_cv_score_model8:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_VOTING_OOF), oof_preds_voting)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_VOTING_TEST), test_preds_voting)
    print(f"Saved Model 8 predictions to {CFG.OUTPUT_DIR}")
    
# --- NEW Base Model 9: XGBoost Optimized (from /kaggle/input/ensemble3/) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 9: XGBoost Optimized (from /kaggle/input/ensemble3/) ---")
start_time_model9 = datetime.now()
oof_preds_xgb_optimized_ensemble3, test_preds_xgb_optimized_ensemble3, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_OPTIMIZED_ENSEMBLE3_OOF, CFG.FNAME_XGB_OPTIMIZED_ENSEMBLE3_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 9. Skipping training.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 9. This model will be zero-initialized.")
    # Add training logic for Model 9 here if it's not expected to be loaded.
    # For now, it will remain zero-initialized if not found from the provided paths.


# --- NEW Base Model 10: XGBoost Custom (from /kaggle/input/ensemble3/) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 10: XGBoost Custom (from /kaggle/input/ensemble3/) ---")
start_time_model10 = datetime.now()
oof_preds_xgb_custom, test_preds_xgb_custom, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_XGB_CUSTOM_OOF, CFG.FNAME_XGB_CUSTOM_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Loaded existing predictions for Model 10. Skipping training.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 10. This model will be zero-initialized.")
    # Add training logic for Model 10 here if it's not expected to be loaded.
    # For now, it will remain zero-initialized if not found from the provided paths.


# --- NEW Base Model 11: Gaussian Naive Bayes ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 11: Gaussian Naive Bayes ---")
start_time_model11 = datetime.now()

oof_preds_gaussian_nb, test_preds_gaussian_nb, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_GAUSSIAN_NB_OOF, CFG.FNAME_GAUSSIAN_NB_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 11.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 11. Starting training.")
    
    n_splits_cv_model11 = FOLDS
    model11_fold_scores = []
    model11_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model11 = StratifiedKFold(n_splits=n_splits_cv_model11, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model11.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 11: Gaussian Naive Bayes) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model11_instance = GaussianNB()
        model11_instance.fit(fold_train_X, fold_train_y)

        # Naive Bayes predict_proba outputs probabilities
        oof_preds_gaussian_nb[valid_idx] = model11_instance.predict_proba(fold_valid_X)
        model11_test_pred_sum += model11_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_gaussian_nb[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model11_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model11_instance
        gc.collect()

    test_preds_gaussian_nb = model11_test_pred_sum / n_splits_cv_model11
    avg_cv_score_model11 = np.mean(model11_fold_scores)
    print(f"\nModel 11 (Gaussian Naive Bayes) Average MAP@3 across {n_splits_cv_model11} folds: {avg_cv_score_model11:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_GAUSSIAN_NB_OOF), oof_preds_gaussian_nb)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_GAUSSIAN_NB_TEST), test_preds_gaussian_nb)
    print(f"Saved Model 11 predictions to {CFG.OUTPUT_DIR}")


# --- NEW Base Model 12: Linear Discriminant Analysis (LDA) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 12: Linear Discriminant Analysis (LDA) ---")
start_time_model12 = datetime.now()

oof_preds_lda_base, test_preds_lda_base, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_LDA_OOF, CFG.FNAME_LDA_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 12 (LDA).")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 12 (LDA). Starting training.")
    
    n_splits_cv_model12 = FOLDS
    model12_fold_scores = []
    model12_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model12 = StratifiedKFold(n_splits=n_splits_cv_model12, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model12.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 12: LDA) ---")
        
        X_train_fold, y_train_fold = X.iloc[train_idx], y_encoded.iloc[train_idx]
        X_valid_fold, y_valid_fold = X.iloc[valid_idx], y_encoded.iloc[valid_idx]
        
        # Apply StandardScaler specifically to numerical columns for LDA
        scaler_lda = StandardScaler()
        X_train_scaled_lda = X_train_fold.copy()
        X_valid_scaled_lda = X_valid_fold.copy()
        X_test_scaled_lda = X_test.copy()

        # Scale only the numerical columns. LDA is sensitive to scaling.
        X_train_scaled_lda[numerical_cols] = scaler_lda.fit_transform(X_train_scaled_lda[numerical_cols])
        X_valid_scaled_lda[numerical_cols] = scaler_lda.transform(X_valid_scaled_lda[numerical_cols])
        X_test_scaled_lda[numerical_cols] = scaler_lda.transform(X_test_scaled_lda[numerical_cols])

        # Initialize LDA model
        # 'lsqr' solver is generally good for large datasets, 'eigen' for more complex covariance.
        # 'svd' is default and robust.
        # shrinkage can be used for regularization, but start without it or use 'auto'.
        lda_model = LinearDiscriminantAnalysis(solver='svd', n_components=min(num_classes - 1, X_train_scaled_lda.shape[1])) # n_components <= num_classes - 1
        
        lda_model.fit(X_train_scaled_lda, y_train_fold)
        
        # Predict probabilities
        oof_preds_lda_base[valid_idx] = lda_model.predict_proba(X_valid_scaled_lda)
        model12_test_pred_sum += lda_model.predict_proba(X_test_scaled_lda)

        top_3_preds = np.argsort(oof_preds_lda_base[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_valid_fold.values, top_3_preds)
        model12_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del scaler_lda, lda_model
        gc.collect()

    test_preds_lda_base = model12_test_pred_sum / n_splits_cv_model12
    avg_cv_score_model12 = np.mean(model12_fold_scores)
    print(f"\nModel 12 (LDA Base) Average MAP@3 across {n_splits_cv_model12} folds: {avg_cv_score_model12:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LDA_OOF), oof_preds_lda_base)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LDA_TEST), test_preds_lda_base)
    print(f"Saved Model 12 predictions to {CFG.OUTPUT_DIR}")


# --- NEW Base Model 13: HistGradientBoostingClassifier ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 13: HistGradientBoostingClassifier ---")
start_time_model13 = datetime.now()

oof_preds_hgb, test_preds_hgb, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_HGB_OOF, CFG.FNAME_HGB_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 13 (HGB).")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 13 (HGB). Starting training.")
    
    # HistGradientBoostingClassifier parameters provided by the user
    hgb_params = {
        'max_iter': 5000,
        'random_state': GLOBAL_RANDOM_STATE, # Using GLOBAL_RANDOM_STATE for consistency
        'early_stopping': True,
        'learning_rate': 0.1,
        'loss': 'log_loss',
        'l2_regularization': 2.786601939402124e-08,
        'max_depth': 5,
        'max_leaf_nodes': 37,
        'min_samples_leaf': 75,
        'n_iter_no_change': CFG.early_stopping_rounds # Using CFG for early stopping rounds
    }

    n_splits_cv_model13 = FOLDS
    model13_fold_scores = []
    model13_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model13 = StratifiedKFold(n_splits=n_splits_cv_model13, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model13.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 13: HistGradientBoostingClassifier) ---")
        
        X_train_fold, y_train_fold = X.iloc[train_idx], y_encoded.iloc[train_idx]
        X_valid_fold, y_valid_fold = X.iloc[valid_idx], y_encoded.iloc[valid_idx]
        
        hgb_model = HistGradientBoostingClassifier(
            **hgb_params,
            # Pass the indices of categorical features.
            # "from_dtype" would not work correctly as OrdinalEncoder converts them to int.
            categorical_features=categorical_feature_indices_hgb 
        )
        
        hgb_model.fit(X_train_fold, y_train_fold)
        
        oof_preds_hgb[valid_idx] = hgb_model.predict_proba(X_valid_fold)
        model13_test_pred_sum += hgb_model.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_hgb[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_valid_fold.values, top_3_preds)
        model13_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del hgb_model
        gc.collect()

    test_preds_hgb = model13_test_pred_sum / n_splits_cv_model13
    avg_cv_score_model13 = np.mean(model13_fold_scores)
    print(f"\nModel 13 (HistGradientBoostingClassifier) Average MAP@3 across {n_splits_cv_model13} folds: {avg_cv_score_model13:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_HGB_OOF), oof_preds_hgb)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_HGB_TEST), test_preds_hgb)
    print(f"Saved Model 13 predictions to {CFG.OUTPUT_DIR}")

# --- NEW Base Model 14: Yggdrasil Decision Forests (RandomForest) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 14: Yggdrasil Decision Forests (RandomForest) ---")
start_time_model14 = datetime.now()

oof_preds_ydf, test_preds_ydf, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_YDF_OOF, CFG.FNAME_YDF_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 14 (YDF).")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 14 (YDF). Starting training.")
    
    # YDF hyperparameters for training
    ydf_training_params = {
        'num_trees': 200,
        'max_depth': 15,
        'random_seed': GLOBAL_RANDOM_STATE, # Map Config.state to GLOBAL_RANDOM_STATE
        'growing_strategy': 'BEST_FIRST_GLOBAL',
    }

    n_splits_cv_model14 = FOLDS
    model14_fold_scores = []
    model14_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model14 = StratifiedKFold(n_splits=n_splits_cv_model14, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model14.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 14: YDF) ---")
        
        # Create a combined DataFrame for YDF with a 'label' column
        X_train_fold_ydf = X.iloc[train_idx].copy()
        X_valid_fold_ydf = X.iloc[valid_idx].copy()

        # YDF requires the label to be named 'label' in the DataFrame passed to train_model
        train_df_ydf = X_train_fold_ydf.copy()
        train_df_ydf['label'] = y_encoded.iloc[train_idx].values

        # For validation and test, no 'label' column is needed for predict_proba
        valid_df_ydf = X_valid_fold_ydf.copy()
        test_df_ydf = X_test.copy()

        # Initialize YDF learner for training, passing label and hyperparameters to the constructor
        ydf_learner = ydf.RandomForestLearner(label="label", **ydf_training_params)
        
        # Train YDF model
        ydf_trained_model = ydf_learner.train(train_df_ydf)
        
        # Predict probabilities (using .predict() as it returns probabilities for classification)
        oof_preds_ydf[valid_idx] = ydf_trained_model.predict(valid_df_ydf)
        model14_test_pred_sum += ydf_trained_model.predict(test_df_ydf)

        top_3_preds = np.argsort(oof_preds_ydf[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_encoded.iloc[valid_idx].values, top_3_preds)
        model14_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del ydf_learner, ydf_trained_model, train_df_ydf, valid_df_ydf
        gc.collect()

    test_preds_ydf = model14_test_pred_sum / n_splits_cv_model14
    avg_cv_score_model14 = np.mean(model14_fold_scores)
    print(f"\nModel 14 (YDF) Average MAP@3 across {n_splits_cv_model14} folds: {avg_cv_score_model14:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_YDF_OOF), oof_preds_ydf)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_YDF_TEST), test_preds_ydf)
    print(f"Saved Model 14 predictions to {CFG.OUTPUT_DIR}")


# --- Prepare Meta-Features for the Single-Layer Logistic Regression ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Preparing Meta-Features for Single-Layer Logistic Regression (14 Base Models) ---")
start_time_meta_features = datetime.now()

# At this point, oof_preds_* and test_preds_* variables should contain either
# loaded data or newly computed data (zero-initialized if not found and not computed).
# The load_predictions_if_exist function ensures they are proper NumPy arrays.

# X_meta_train for the Logistic Regression is directly the concatenated OOF predictions
X_meta_train = np.hstack([
    oof_preds_xgb_train_api1,
    oof_preds_xgb_sklearn,
    oof_preds_lgbm,
    oof_preds_xgb_api2,
    oof_preds_lgbm_goss,
    oof_preds_lr,
    oof_preds_lgbm_optimized,
    oof_preds_voting,
    oof_preds_xgb_optimized_ensemble3, # Model 9
    oof_preds_xgb_custom, # Model 10
    oof_preds_gaussian_nb, # Model 11
    oof_preds_lda_base, # Model 12 (LDA)
    oof_preds_hgb, # Model 13 (HistGradientBoostingClassifier)
    oof_preds_ydf # NEW Model 14 (YDF)
])
# X_meta_test for the Logistic Regression is directly the concatenated test predictions
X_meta_test = np.hstack([
    test_preds_xgb_train_api1,
    test_preds_xgb_sklearn,
    test_preds_lgbm,
    test_preds_xgb_api2,
    test_preds_lgbm_goss,
    test_preds_lr,
    test_preds_lgbm_optimized,
    test_preds_voting,
    test_preds_xgb_optimized_ensemble3, # Model 9
    test_preds_xgb_custom, # Model 10
    test_preds_gaussian_nb, # Model 11
    test_preds_lda_base, # Model 12 (LDA)
    test_preds_hgb, # Model 13 (HistGradientBoostingClassifier)
    test_preds_ydf # NEW Model 14 (YDF)
])

# Scale inputs for the final Logistic Regression meta-model
scaler_final_meta = StandardScaler()
X_meta_train_scaled = scaler_final_meta.fit_transform(X_meta_train)
X_meta_test_scaled = scaler_final_meta.transform(X_meta_test)

print(f"[{datetime.now().strftime('%H:%M:%S')}] Meta-training set shape for Logistic Regression: {X_meta_train_scaled.shape}")
print(f"[{datetime.now().strftime('%H:%M:%S')}] Meta-test set shape for Logistic Regression: {X_meta_test_scaled.shape}")

end_time_meta_features = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Meta-Features for Logistic Regression Prepared. Elapsed: {end_time_meta_features - start_time_meta_features} ---\n")


# --- 8. Train the Final Meta-Model (Logistic Regression) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Final Logistic Regression Meta-Model (Single Layer, 14 Base Models) ---")
start_time_final_model = datetime.now()

final_meta_model = LogisticRegression(
    solver='liblinear',
    C=0.1,
    random_state=GLOBAL_RANDOM_STATE,
    n_jobs=-1,
    multi_class='ovr'
)
final_meta_model.fit(X_meta_train_scaled, y_encoded)
print(f"[{datetime.now().strftime('%H:%M:%S')}] Final Logistic Regression Meta-Model training complete.")

end_time_final_model = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Final Logistic Regression Meta-Model Training Finished. Elapsed: {end_time_final_model - start_time_final_model} ---\n")


# --- 9. Generate Final Ensemble Predictions and Submission ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Generating Final Stacked Ensemble Predictions (Single Layer, 14 Base Models) ---")
start_time_submission = datetime.now()

final_ensemble_test_probs = final_meta_model.predict_proba(X_meta_test_scaled)
top_3_preds_ensemble = np.argsort(final_ensemble_test_probs, axis=1)[:, -3:][:, ::-1]

# Inverse transform to get original fertilizer names (strings)
top_3_labels_ensemble = le.inverse_transform(top_3_preds_ensemble.ravel()).reshape(top_3_preds_ensemble.shape)

# Create submission DataFrame
submission_ensemble = pd.DataFrame({
    "id": df_sub["id"],
    "Fertilizer Name": [' '.join(label for label in row) for row in top_3_labels_ensemble]
})

submission_filename = "submission_stacked_ensemble_14_models.csv"
submission_ensemble.to_csv(submission_filename, index=False)

print(f"📁 Final single-layer stacked ensemble submission saved to '{submission_filename}'")

# --- Display the head of the submission file (for verification) ---
print("\nFirst 5 rows of the final submission DataFrame (for display):")
with pd.option_context('display.max_colwidth', None, 'display.width', 1000):
    print(submission_ensemble.head())

end_time_submission = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Final Submission Generation Finished. Elapsed: {end_time_submission - start_time_submission} ---\n")

print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Single-Layer Stacked Ensemble Process Finished ---")



[16:58:10] --- Starting Data Loading and Initial Preprocessing ---
[16:58:11] --- Starting Ordinal and Label Encoding ---
HistGradientBoostingClassifier will treat columns at indices [3, 4] as categorical.
[16:58:12] --- Data Loading and Preprocessing Finished. Elapsed: 0:00:01.791497 ---

[16:58:12] --- Stacking Ensemble Setup Started ---

[16:58:12] --- Training Base Model 1: XGBoost (xgb.train API - Original Block 1) ---
Loading predictions from secondary /kaggle/input/boosting-output/: oof_preds_xgb_train_api1.npy, test_preds_xgb_train_api1.npy
[16:58:12] Skipping training for Model 1.
[16:58:12] --- Training Base Model 2: XGBoost (XGBClassifier API) ---
Loading predictions from secondary /kaggle/input/boosting-output/: oof_preds_xgb_sklearn.npy, test_preds_xgb_sklearn.npy
[16:58:12] Skipping training for Model 2.
[16:58:12] --- Training Base Model 3: LightGBM ---
Loading predictions from secondary /kaggle/input/boosting-output/: oof_preds_lgbm.npy, test_preds_lgbm.npy
[16:58:12] 

In [7]:
import pandas as pd
import numpy as np
import os
import optuna # Still imported but not used directly in the provided snippets
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Still imported but not used
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # For SVM in intermediate layer
from sklearn.naive_bayes import GaussianNB # For Naive Bayes in intermediate layer
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import warnings
import gc
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [3]:
# --- 1. Configuration and Global Random State ---
class CFG:
    seed = 42
    target = 'Fertilizer Name'
    n_splits = 5 # Number of folds for cross-validation
    learning_rate = 0.03
    num_boost_round = 5000
    early_stopping_rounds = 50
    verbose_eval = 200

    # --- Output Directory for current session (will be cleared on session end) ---
    OUTPUT_DIR = '/kaggle/working/outputs/'
    
    # --- Directory for User Uploaded Input Files (e.g., from a Kaggle Dataset) ---
    # Set to /kaggle/input/ensemble2/ as the primary uploaded input dir for new models
    UPLOADED_INPUT_DIR = '/kaggle/input/ensemble2/' 

    # Filenames for base model predictions
    FNAME_XGB_API1_OOF = 'oof_preds_xgb_train_api1.npy'
    FNAME_XGB_API1_TEST = 'test_preds_xgb_train_api1.npy'
    
    FNAME_XGB_SKLEARN_OOF = 'oof_preds_xgb_sklearn.npy'
    FNAME_XGB_SKLEARN_TEST = 'test_preds_xgb_sklearn.npy'
    
    FNAME_LGBM_OOF = 'oof_preds_lgbm.npy'
    FNAME_LGBM_TEST = 'test_preds_lgbm.npy'
    
    # Updated filenames for Model 4
    FNAME_XGB_API2_OOF = 'oof_preds_xgb_train_api2.npy'
    FNAME_XGB_API2_TEST = 'test_preds_xgb_train_api2.npy'
    
    FNAME_LGBM_GOSS_OOF = 'oof_preds_lgbm_goss.npy'
    FNAME_LGBM_GOSS_TEST = 'test_preds_lgbm_goss.npy'

    FNAME_LR_OOF = 'oof_preds_lr.npy'
    FNAME_LR_TEST = 'test_preds_lr.npy'

    FNAME_LGBM_OPTIMIZED_OOF = 'oof_preds_lgbm_optimized.npy'
    FNAME_LGBM_OPTIMIZED_TEST = 'test_preds_lgbm_optimized.npy'

    FNAME_VOTING_OOF = 'oof_preds_voting.npy'
    FNAME_VOTING_TEST = 'test_preds_voting.npy'

    # --- Filenames for the XGBoost models from /kaggle/input/ensemble3/ ---
    FNAME_XGB_OPTIMIZED_ENSEMBLE3_OOF = 'oof_preds_xgb_optimized.npy' 
    FNAME_XGB_OPTIMIZED_ENSEMBLE3_TEST = 'test_preds_xgb_optimized.npy'

    FNAME_XGB_CUSTOM_OOF = 'oof_preds_xgb_custom.npy'
    FNAME_XGB_CUSTOM_TEST = 'test_preds_xgb_custom.npy'

    # --- Filenames for the Gaussian Naive Bayes Model ---
    FNAME_GAUSSIAN_NB_OOF = 'oof_preds_gaussian_nb.npy'
    FNAME_GAUSSIAN_NB_TEST = 'test_preds_gaussian_nb.npy'

    # --- NEW: Filenames for the KNN Base Model ---
    FNAME_KNN_OOF = 'oof_preds_knn_base.npy'
    FNAME_KNN_TEST = 'test_preds_knn_base.npy'


# Create the output directory if it doesn't exist. This is essential for saving.
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

# Define FOLDS based on CFG for consistency
FOLDS = CFG.n_splits
GLOBAL_RANDOM_STATE = CFG.seed
np.random.seed(GLOBAL_RANDOM_STATE)

# --- Helper function to check and load predictions from either source ---
def load_predictions_if_exist(oof_filename, test_filename, X_shape, X_test_shape, num_classes):
    """
    Checks if prediction files exist in the current session's OUTPUT_DIR,
    the primary UPLOADED_INPUT_DIR (CFG.UPLOADED_INPUT_DIR),
    the /kaggle/input/ensemble3/ directory, or
    the secondary /kaggle/input/boosting-output/ directory, and loads them.
    If files are not found, it returns zero-initialized NumPy arrays of the correct shape.

    Args:
        oof_filename (str): The filename for the OOF predictions.
        test_filename (str): The filename for the test predictions.
        X_shape (tuple): Shape of the full training features (e.g., (num_samples, num_features)).
        X_test_shape (tuple): Shape of the full test features (e.g., (num_test_samples, num_features)).
        num_classes (int): Number of target classes.

    Returns:
        tuple: (oof_preds_array, test_preds_array, loaded_from_disk_flag).
               Returns (zero_initialized_oof_array, zero_initialized_test_array, False)
               if files are not found.
    """
    oof_path_output = os.path.join(CFG.OUTPUT_DIR, oof_filename)
    test_path_output = os.path.join(CFG.OUTPUT_DIR, test_filename)
    
    uploaded_input_dir_exists = hasattr(CFG, 'UPLOADED_INPUT_DIR') and CFG.UPLOADED_INPUT_DIR
    
    oof_path_primary_input = None
    test_path_primary_input = None
    if uploaded_input_dir_exists:
        oof_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, oof_filename) # /kaggle/input/ensemble2/
        test_path_primary_input = os.path.join(CFG.UPLOADED_INPUT_DIR, test_filename)

    # NEW: Specific path for /kaggle/input/ensemble3/
    oof_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', oof_filename)
    test_path_ensemble3_input = os.path.join('/kaggle/input/ensemble3/', test_filename)

    # Original secondary input directory for boosting-output files
    oof_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', oof_filename)
    test_path_boosting_output_input = os.path.join('/kaggle/input/boosting-output/', test_filename)


    # Priority 1: Check in the current session's output directory
    if os.path.exists(oof_path_output) and os.path.exists(test_path_output):
        print(f"Loading predictions from current session's OUTPUT_DIR: {oof_filename}, {test_filename}")
        return np.load(oof_path_output), np.load(test_path_output), True
    # Priority 2: Check in the primary user-uploaded input directory (e.g., /kaggle/input/ensemble2/)
    elif uploaded_input_dir_exists and os.path.exists(oof_path_primary_input) and os.path.exists(test_path_primary_input):
        print(f"Loading predictions from primary UPLOADED_INPUT_DIR ({CFG.UPLOADED_INPUT_DIR}): {oof_filename}, {test_filename}")
        return np.load(oof_path_primary_input), np.load(test_path_primary_input), True
    # Priority 3: Check in the /kaggle/input/ensemble3/ directory
    elif os.path.exists(oof_path_ensemble3_input) and os.path.exists(test_path_ensemble3_input):
        print(f"Loading predictions from /kaggle/input/ensemble3/: {oof_filename}, {test_filename}")
        return np.load(oof_path_ensemble3_input), np.load(test_path_ensemble3_input), True
    # Priority 4: Check in the secondary user-uploaded input directory (e.g., /kaggle/input/boosting-output/)
    elif os.path.exists(oof_path_boosting_output_input) and os.path.exists(test_path_boosting_output_input):
        print(f"Loading predictions from secondary /kaggle/input/boosting-output/: {oof_filename}, {test_filename}")
        return np.load(oof_path_boosting_output_input), np.load(test_path_boosting_output_input), True
    else:
        # If files are not found, return zero-initialized arrays
        print(f"No existing predictions found for {oof_filename} or {test_filename}. Will initialize as zeros.")
        return np.zeros((X_shape[0], num_classes)), np.zeros((X_test_shape[0], num_classes)), False



In [4]:
# --- 2. Data Loading and Initial Preprocessing ---
print("--- Data Loading and Initial Preprocessing ---")
df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
df_sub = pd.read_csv('/kaggle/input/playground-series-s5e6/sample_submission.csv')
df_original = pd.read_csv('/kaggle/input/original/Fertilizer Prediction .csv')

# Drop 'id' columns if they exist in train/test sets as per original notebook
df_train = df_train.drop(columns=['id'])
if 'id' in df_test.columns:
    df_test = df_test.drop(columns=['id'])

# Concatenate original dataset to the training data
df_train = pd.concat([df_train, df_original], axis=0, ignore_index=True)


--- Data Loading and Initial Preprocessing ---


In [5]:
# --- 3. Ordinal and Label Encoding ---
print("--- Ordinal and Label Encoding ---")
cat_cols_for_ordinal = df_train.select_dtypes(include='object').columns.tolist()
if 'Fertilizer Name' in cat_cols_for_ordinal:
    cat_cols_for_ordinal.remove('Fertilizer Name')

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_train[cat_cols_for_ordinal] = ordinal_encoder.fit_transform(df_train[cat_cols_for_ordinal].astype(str)).astype(int)

cat_cols_for_test = [col for col in cat_cols_for_ordinal if col in df_test.columns]
df_test[cat_cols_for_test] = ordinal_encoder.transform(df_test[cat_cols_for_test].astype(str)).astype(int)

le = LabelEncoder()
df_train['Fertilizer Name'] = le.fit_transform(df_train['Fertilizer Name'])
num_classes = len(np.unique(df_train['Fertilizer Name']))

y_encoded = df_train['Fertilizer Name'] # Target for training
X = df_train.drop(columns=['Fertilizer Name']) # Features for training
X_test = df_test # Features for final test prediction
# Define numerical columns for scaling
# Moved this definition here to ensure it's available globally before any model training or helper functions use it.
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
# Filter out any columns that might have been ordinal encoded but are numeric-like
numerical_cols = [col for col in numerical_cols if col not in cat_cols_for_ordinal]


end_time_data_load = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Data Loading and Preprocessing Finished. Elapsed: {end_time_data_load - start_time_data_load} ---\n")

# Define mapk function
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
        return score / min(len(a), k) if min(len(a), k) > 0 else 0.0

    if not isinstance(actual[0], (list, np.ndarray)):
        actual = [[a] for a in actual]

    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

print("--- Stacking Ensemble Setup Started ---")

# --- Initialize OOF and Test Prediction Arrays for Each Base Model ---
# These will be initialized to zeros, then potentially loaded or filled during training
oof_preds_xgb_train_api1 = np.zeros((len(X), num_classes))
oof_preds_xgb_sklearn = np.zeros((len(X), num_classes))
oof_preds_lgbm = np.zeros((len(X), num_classes))
oof_preds_xgb_train_api2 = np.zeros((len(X), num_classes))
oof_preds_lgbm_goss = np.zeros((len(X), num_classes))

test_preds_xgb_train_api1 = np.zeros((len(X_test), num_classes))
test_preds_xgb_sklearn = np.zeros((len(X_test), num_classes))
test_preds_lgbm = np.zeros((len(X_test), num_classes))
test_preds_xgb_train_api2 = np.zeros((len(X_test), num_classes))
test_preds_lgbm_goss = np.zeros((len(X_test), num_classes))



--- Ordinal and Label Encoding ---


NameError: name 'start_time_data_load' is not defined

In [None]:
# --- Base Model 1: XGBoost (using xgb.train API - first block in original notebook) ---
print("\n--- Training Base Model 1: XGBoost (xgb.train API - Original Block 1) ---")
if os.path.exists(CFG.FNAME_XGB_API1_OOF) and os.path.exists(CFG.FNAME_XGB_API1_TEST):
    oof_preds_xgb_train_api1 = np.load(CFG.FNAME_XGB_API1_OOF)
    test_preds_xgb_train_api1 = np.load(CFG.FNAME_XGB_API1_TEST)
    print(f"Loaded existing predictions for Model 1 from {CFG.OUTPUT_DIR}. Skipping training.")
else:
    kf_model1 = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    model1_logloss_scores = []
    model1_test_pred_sum = np.zeros((len(X_test), num_classes))

    for i, (train_idx, valid_idx) in enumerate(kf_model1.split(X, y_encoded)):
        print(f"\n{'#'*10} Fold {i+1} (Model 1) {'#'*10}")

        x_train_fold, y_train_fold = X.iloc[train_idx].copy(), y_encoded.iloc[train_idx]
        x_valid_fold, y_valid_fold = X.iloc[valid_idx].copy(), y_encoded.iloc[valid_idx]

        dtrain = xgb.DMatrix(x_train_fold, label=y_train_fold)
        dvalid = xgb.DMatrix(x_valid_fold, label=y_valid_fold)
        dtest = xgb.DMatrix(X_test)

        params_model1 = {
            'objective': 'multi:softprob',
            'num_class': num_classes,
            'max_depth': 16,
            'learning_rate': 0.03,
            'min_child_weight': 2,
            'alpha': 0.8,
            'reg_lambda': 4.0,
            'colsample_bytree': 0.3,
            'subsample': 0.8,
            'max_bin': 128,
            'colsample_bylevel': 1,
            'colsample_bynode': 1,
            'tree_method': 'hist',
            'random_state': 42,
            'eval_metric': 'mlogloss'
        }

        model1_instance = xgb.train(
            params_model1,
            dtrain,
            num_boost_round=CFG.num_boost_round, # Use CFG value
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=CFG.early_stopping_rounds, # Use CFG value
            verbose_eval=CFG.verbose_eval
        )

        oof_preds_xgb_train_api1[valid_idx] = model1_instance.predict(dvalid, iteration_range=(0, model1_instance.best_iteration + 1))
        model1_test_pred_sum += model1_instance.predict(dtest, iteration_range=(0, model1_instance.best_iteration + 1))

        log_loss_value = log_loss(y_valid_fold, oof_preds_xgb_train_api1[valid_idx])
        print(f"Fold {i+1} log_loss: {log_loss_value:.4f}")
        model1_logloss_scores.append(log_loss_value)

        del model1_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_train_api1 = model1_test_pred_sum / FOLDS
    avg_log_loss_model1 = np.mean(model1_logloss_scores)
    print(f"\nModel 1 (xgb.train API 1) Final CV log_loss: {avg_log_loss_model1:.4f}")
    top_3_oof_preds_model1 = np.argsort(oof_preds_xgb_train_api1, axis=1)[:, -3:][:, ::-1]
    map3_score_model1 = mapk(y_encoded.values, top_3_oof_preds_model1)
    print(f"Model 1 (xgb.train API 1) OOF MAP@3 Score: {map3_score_model1:.5f}")

    np.save(CFG.FNAME_XGB_API1_OOF, oof_preds_xgb_train_api1)
    np.save(CFG.FNAME_XGB_API1_TEST, test_preds_xgb_train_api1)
    print(f"Saved Model 1 predictions to {CFG.OUTPUT_DIR}")




--- Training Base Model 1: XGBoost (xgb.train API - Original Block 1) ---

########## Fold 1 (Model 1) ##########
[0]	valid-mlogloss:1.94564


In [None]:
# --- Base Model 2: XGBoost (using XGBClassifier API - second block in original notebook) ---
print("\n--- Training Base Model 2: XGBoost (XGBClassifier API) ---")
if os.path.exists(CFG.FNAME_XGB_SKLEARN_OOF) and os.path.exists(CFG.FNAME_XGB_SKLEARN_TEST):
    oof_preds_xgb_sklearn = np.load(CFG.FNAME_XGB_SKLEARN_OOF)
    test_preds_xgb_sklearn = np.load(CFG.FNAME_XGB_SKLEARN_TEST)
    print(f"Loaded existing predictions for Model 2 from {CFG.OUTPUT_DIR}. Skipping training.")
else:
    fixed_xgb_params_model2 = {
        'max_depth': 12,
        'colsample_bytree': 0.467,
        'subsample': 0.86,
        'n_estimators': 4000,
        'learning_rate': 0.03,
        'gamma': 0.26,
        'max_delta_step': 4,
        'reg_alpha': 2.7,
        'reg_lambda': 1.4,
        'objective': 'multi:softprob',
        'random_state': 13,
        'enable_categorical': True,
        'tree_method': 'hist',
        'early_stopping_rounds': 100,
        'eval_metric': 'mlogloss',
        'num_class': num_classes,
        'n_jobs': -1
    }
    n_splits_cv_model2 = FOLDS # Use CFG.n_splits
    model2_fold_scores = []
    model2_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model2 = StratifiedKFold(n_splits=n_splits_cv_model2, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model2.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 2) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model2_instance = XGBClassifier(**fixed_xgb_params_model2)
        model2_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            verbose=False)

        best_iteration = model2_instance.best_iteration if hasattr(model2_instance, 'best_iteration') else fixed_xgb_params_model2['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_xgb_sklearn[valid_idx] = model2_instance.predict_proba(fold_valid_X, iteration_range=(0, best_iteration))
        model2_test_pred_sum += model2_instance.predict_proba(X_test, iteration_range=(0, best_iteration))

        top_3_preds = np.argsort(oof_preds_xgb_sklearn[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model2_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model2_instance
        gc.collect()

    test_preds_xgb_sklearn = model2_test_pred_sum / n_splits_cv_model2
    avg_cv_score_model2 = np.mean(model2_fold_scores)
    print(f"\nModel 2 (XGBClassifier) Average MAP@3 across {n_splits_cv_model2} folds: {avg_cv_score_model2:.4f}")

    np.save(CFG.FNAME_XGB_SKLEARN_OOF, oof_preds_xgb_sklearn)
    np.save(CFG.FNAME_XGB_SKLEARN_TEST, test_preds_xgb_sklearn)
    print(f"Saved Model 2 predictions to {CFG.OUTPUT_DIR}")


In [None]:
# --- Base Model 3: LightGBM Model ---
print("\n--- Training Base Model 3: LightGBM ---")
if os.path.exists(CFG.FNAME_LGBM_OOF) and os.path.exists(CFG.FNAME_LGBM_TEST):
    oof_preds_lgbm = np.load(CFG.FNAME_LGBM_OOF)
    test_preds_lgbm = np.load(CFG.FNAME_LGBM_TEST)
    print(f"Loaded existing predictions for Model 3 from {CFG.OUTPUT_DIR}. Skipping training.")
else:
    fixed_lgbm_params = {
        "objective": "multiclass",
        "num_class": num_classes,
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "n_estimators": 1214,
        "learning_rate": 0.064080948,
        "num_leaves": 169,
        "max_depth": 10,
        "subsample": 0.642,
        "min_child_samples": 19,
        "colsample_bytree": 0.7,
        "reg_alpha": 6.2941,
        "reg_lambda": 5.556,
        "random_state": GLOBAL_RANDOM_STATE,
        "n_jobs": -1,
        "verbose": -1,
    }
    n_splits_cv_model3 = FOLDS # Use CFG.n_splits
    early_stopping_rounds_cv_model3 = 100
    model3_fold_scores = []
    model3_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model3 = StratifiedKFold(n_splits=n_splits_cv_model3, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model3.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 3) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model3_instance = lgb.LGBMClassifier(**fixed_lgbm_params)

        model3_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model3, verbose=False)])

        best_iteration = model3_instance.best_iteration_ if hasattr(model3_instance, 'best_iteration_') else fixed_lgbm_params['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm[valid_idx] = model3_instance.predict_proba(fold_valid_X)
        model3_test_pred_sum += model3_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model3_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model3_instance
        gc.collect()

    test_preds_lgbm = model3_test_pred_sum / n_splits_cv_model3
    avg_cv_score_model3 = np.mean(model3_fold_scores)
    print(f"\nModel 3 (LightGBM) Average MAP@3 across {n_splits_cv_model3} folds: {avg_cv_score_model3:.4f}")

    np.save(CFG.FNAME_LGBM_OOF, oof_preds_lgbm)
    np.save(CFG.FNAME_LGBM_TEST, test_preds_lgbm)
    print(f"Saved Model 3 predictions to {CFG.OUTPUT_DIR}")



In [None]:
# --- Base Model 4: XGBoost (using xgb.train API with CFG parameters) ---
print("\n--- Training Base Model 4: XGBoost (xgb.train API with CFG parameters) ---")
if os.path.exists(CFG.FNAME_XGB_API2_OOF) and os.path.exists(CFG.FNAME_XGB_API2_TEST):
    oof_preds_xgb_train_api2 = np.load(CFG.FNAME_XGB_API2_OOF)
    test_preds_xgb_train_api2 = np.load(CFG.FNAME_XGB_API2_TEST)
    print(f"Loaded existing predictions for Model 4 from {CFG.OUTPUT_DIR}. Skipping training.")
else:
    params_model4 = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'seed': CFG.seed,
        'max_depth': 32,
        'learning_rate': CFG.learning_rate,
        'min_child_weight': 2,
        'alpha': 5.6,
        'reg_lambda': 0.06,
        'subsample': 0.8,
        'colsample_bytree': 0.3,
        'colsample_bylevel': 1,
        'colsample_bynode': 1,
        'tree_method': 'hist',
        'device': "cuda" if os.environ.get('ACCELERATOR_TYPE') == 'GPU' else 'cpu'
    }
    model4_fold_scores = []
    model4_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model4 = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

    for fold, (trn_idx, val_idx) in enumerate(kf_model4.split(X, y_encoded)):
        X_train_fold = X.iloc[trn_idx]
        y_train_fold = y_encoded.iloc[trn_idx]
        X_valid_fold = X.iloc[val_idx]
        y_valid_fold = y_encoded.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
        dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_fold, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)

        ES = xgb.callback.EarlyStopping(
            rounds=CFG.early_stopping_rounds,
            maximize=False,
            save_best=True,
        )

        model4_instance = xgb.train(
            params_model4,
            dtrain,
            num_boost_round=CFG.num_boost_round,
            evals=[(dtrain, 'train'), (dvalid, 'validation')],
            verbose_eval=CFG.verbose_eval,
            callbacks=[ES]
        )

        oof_preds_xgb_train_api2[val_idx] = model4_instance.predict(dvalid, iteration_range=(0, model4_instance.best_iteration + 1))
        model4_test_pred_sum += model4_instance.predict(dtest, iteration_range=(0, model4_instance.best_iteration + 1))

        top3_preds = np.argsort(oof_preds_xgb_train_api2[val_idx], axis=1)[:, -3:][:, ::-1]
        actual = [[label] for label in y_valid_fold.values]
        map3_score_fold = mapk(actual, top3_preds)
        model4_fold_scores.append(map3_score_fold)
        print("----------------------------------------------------------------")
        print(f"fold: {fold:02d}, map@3: {map3_score_fold:.6f}, best iteration: {model4_instance.best_iteration}, best score: {model4_instance.best_score: .6f}\n")

        del model4_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_train_api2 = model4_test_pred_sum / CFG.n_splits
    avg_map3_score_model4 = np.mean(model4_fold_scores)
    print("----------------------------------------------------------------")
    print(f"Model 4 (xgb.train CFG) Average MAP@3: {avg_map3_score_model4:.6f}")

    np.save(CFG.FNAME_XGB_API2_OOF, oof_preds_xgb_train_api2)
    np.save(CFG.FNAME_XGB_API2_TEST, test_preds_xgb_train_api2)
    print(f"Saved Model 4 predictions to {CFG.OUTPUT_DIR}")


In [None]:
# --- NEW Base Model 5: LightGBM (GOSS Boosting Type) ---
print("\n--- Training Base Model 5: LightGBM (GOSS) ---")
if os.path.exists(CFG.FNAME_LGBM_GOSS_OOF) and os.path.exists(CFG.FNAME_LGBM_GOSS_TEST):
    oof_preds_lgbm_goss = np.load(CFG.FNAME_LGBM_GOSS_OOF)
    test_preds_lgbm_goss = np.load(CFG.FNAME_LGBM_GOSS_TEST)
    print(f"Loaded existing predictions for Model 5 from {CFG.OUTPUT_DIR}. Skipping training.")
else:
    lgbm_goss_params = {
        "objective": "multiclass",
        "num_class": num_classes,
        "metric": "multi_logloss",
        "boosting_type": "goss", # GOSS boosting type
        "colsample_bytree": 0.39736332491996407,
        "learning_rate": 0.008033740989500222,
        "min_child_samples": 29,
        "min_child_weight": 0.6732469853333759,
        "n_estimators": 10000,
        "n_jobs": -1,
        "num_leaves": 89,
        "random_state": GLOBAL_RANDOM_STATE,
        "reg_alpha": 15.595856670965969,
        "reg_lambda": 51.43625034648377,
        "subsample": 0.07846482736630467,
        "verbose": -1,
    }
    n_splits_cv_model5 = FOLDS # Use CFG.n_splits
    early_stopping_rounds_cv_model5 = 100
    model5_fold_scores = []
    model5_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model5 = StratifiedKFold(n_splits=n_splits_cv_model5, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model5.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 5) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model5_instance = lgb.LGBMClassifier(**lgbm_goss_params)

        model5_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model5, verbose=False)])

        best_iteration = model5_instance.best_iteration_ if hasattr(model5_instance, 'best_iteration_') else lgbm_goss_params['n_estimators']
        print(f"Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm_goss[valid_idx] = model5_instance.predict_proba(fold_valid_X)
        model5_test_pred_sum += model5_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm_goss[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model5_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model5_instance
        gc.collect()

    test_preds_lgbm_goss = model5_test_pred_sum / n_splits_cv_model5
    avg_cv_score_model5 = np.mean(model5_fold_scores)
    print(f"\nModel 5 (LightGBM GOSS) Average MAP@3 across {n_splits_cv_model5} folds: {avg_cv_score_model5:.4f}")

    np.save(CFG.FNAME_LGBM_GOSS_OOF, oof_preds_lgbm_goss)
    np.save(CFG.FNAME_LGBM_GOSS_TEST, test_preds_lgbm_goss)
    print(f"Saved Model 5 predictions to {CFG.OUTPUT_DIR}")



In [21]:
# --- Base Model 6: Logistic Regression (already added) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 6: Logistic Regression ---")
start_time_model6 = datetime.now()
loaded_oof, loaded_test, loaded_from_disk = load_predictions_if_exist(CFG.FNAME_LR_OOF, CFG.FNAME_LR_TEST)
if loaded_from_disk:
    oof_preds_lr = loaded_oof
    test_preds_lr = loaded_test
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 6.")
else:
    # --- IMPORTANT FIX: Initialize arrays if not loaded from disk ---
    oof_preds_lr = np.zeros((len(X), num_classes))
    test_preds_lr = np.zeros((len(X_test), num_classes))
    # ---------------------------------------------------------------

    lr_params = {
        'C': 0.3691232889729139, 'fit_intercept': True, 'max_iter': 10000, 'random_state': 42,
        'tol': 0.0021938847672756667, 'solver': "liblinear", 'penalty': "l2",
        'multi_class': 'ovr', 'n_jobs': -1
    }
    n_splits_cv_model6 = FOLDS
    model6_fold_scores = []
    model6_test_pred_sum = np.zeros((len(X_test), num_classes)) # This is correctly initialized for the sum

    kf_model6 = StratifiedKFold(n_splits=n_splits_cv_model6, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model6.split(X, y_encoded)):
        fold_start_time = datetime.now()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Fold {fold+1}/{FOLDS} (Model 6: Logistic Regression) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]
        model6_instance = LogisticRegression(**lr_params)
        model6_instance.fit(fold_train_X, fold_train_y)
        oof_preds_lr[valid_idx] = model6_instance.predict_proba(fold_valid_X)
        model6_test_pred_sum += model6_instance.predict_proba(X_test)
        fold_score = log_loss(fold_valid_y, oof_preds_lr[valid_idx])
        model6_fold_scores.append(fold_score)
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {fold+1} Logistic Regression LogLoss: {fold_score:.4f}. Elapsed for fold: {datetime.now() - fold_start_time}")
        del model6_instance; gc.collect()
    test_preds_lr = model6_test_pred_sum / n_splits_cv_model6
    avg_cv_score_model6 = np.mean(model6_fold_scores)
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Model 6 (Logistic Regression) Average LogLoss across {n_splits_cv_model6} folds: {avg_cv_score_model6:.4f}")
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LR_OOF), oof_preds_lr)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LR_TEST), test_preds_lr)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Saved Model 6 predictions to {CFG.OUTPUT_DIR}")
end_time_model6 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 6 Training Finished. Elapsed: {end_time_model6 - start_time_model6} ---\n")



[09:51:32] --- Training Base Model 6: Logistic Regression ---
[09:51:32] --- Training Fold 1/5 (Model 6: Logistic Regression) ---
[09:51:49] Fold 1 Logistic Regression LogLoss: 1.9427. Elapsed for fold: 0:00:16.759779
[09:51:50] --- Training Fold 2/5 (Model 6: Logistic Regression) ---
[09:52:06] Fold 2 Logistic Regression LogLoss: 1.9427. Elapsed for fold: 0:00:16.309629
[09:52:06] --- Training Fold 3/5 (Model 6: Logistic Regression) ---
[09:52:23] Fold 3 Logistic Regression LogLoss: 1.9428. Elapsed for fold: 0:00:16.702962
[09:52:23] --- Training Fold 4/5 (Model 6: Logistic Regression) ---
[09:52:41] Fold 4 Logistic Regression LogLoss: 1.9425. Elapsed for fold: 0:00:17.571211
[09:52:41] --- Training Fold 5/5 (Model 6: Logistic Regression) ---
[09:52:57] Fold 5 Logistic Regression LogLoss: 1.9428. Elapsed for fold: 0:00:16.477340

[09:52:57] Model 6 (Logistic Regression) Average LogLoss across 5 folds: 1.9427
[09:52:57] Saved Model 6 predictions to /kaggle/working/outputs/
[09:52:57] -

In [23]:
# --- NEW Base Model 7: LightGBM (Optimized Hyperparameters) ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 7: LightGBM (Optimized Hyperparameters) ---")
start_time_model7 = datetime.now()

loaded_oof, loaded_test, loaded_from_disk = load_predictions_if_exist(CFG.FNAME_LGBM_OPTIMIZED_OOF, CFG.FNAME_LGBM_OPTIMIZED_TEST)
if loaded_from_disk:
    oof_preds_lgbm_optimized = loaded_oof
    test_preds_lgbm_optimized = loaded_test
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 7.")
else:
    # --- IMPORTANT FIX: Initialize arrays if not loaded from disk ---
    oof_preds_lgbm_optimized = np.zeros((len(X), num_classes))
    test_preds_lgbm_optimized = np.zeros((len(X_test), num_classes))
    # ---------------------------------------------------------------

    lgbm_optimized_params = {
        'n_estimators': 1446, 'num_leaves': 26, 'min_child_samples': 4,
        'learning_rate': 0.14474325014552236, 'max_bin': 2**9 - 1, # log_max_bin 9 means 2^9-1
        'colsample_bytree': 0.41898958941402753, 'reg_alpha': 0.02619349902619489,
        'reg_lambda': 0.012988591514850135,
        "objective": "multiclass", "num_class": num_classes, "metric": "multi_logloss",
        "boosting_type": "gbdt", "n_jobs": -1, "random_state": GLOBAL_RANDOM_STATE, "verbose": -1,
    }
    n_splits_cv_model7 = FOLDS
    early_stopping_rounds_cv_model7 = 100
    model7_fold_scores = []
    model7_test_pred_sum = np.zeros((len(X_test), num_classes))

    for fold, (train_idx, valid_idx) in enumerate(kf_model7.split(X, y_encoded)):
        fold_start_time = datetime.now()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Fold {fold+1}/{FOLDS} (Model 7: LightGBM Optimized) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model7_instance = lgb.LGBMClassifier(**lgbm_optimized_params)
        model7_instance.fit(fold_train_X, fold_train_y,
                            eval_set=[(fold_valid_X, fold_valid_y)],
                            callbacks=[lgb.early_stopping(early_stopping_rounds_cv_model7, verbose=False)])

        best_iteration = model7_instance.best_iteration_ if hasattr(model7_instance, 'best_iteration_') else lgbm_optimized_params['n_estimators']
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {fold+1} best iteration: {best_iteration}")

        oof_preds_lgbm_optimized[valid_idx] = model7_instance.predict_proba(fold_valid_X)
        model7_test_pred_sum += model7_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_lgbm_optimized[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model7_fold_scores.append(fold_score)
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {fold+1} MAP@3: {fold_score:.4f}. Elapsed for fold: {datetime.now() - fold_start_time}")

        del model7_instance; gc.collect()

    test_preds_lgbm_optimized = model7_test_pred_sum / n_splits_cv_model7
    avg_cv_score_model7 = np.mean(model7_fold_scores)
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Model 7 (LightGBM Optimized) Average MAP@3 across {n_splits_cv_model7} folds: {avg_cv_score_model7:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_OPTIMIZED_OOF), oof_preds_lgbm_optimized)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_LGBM_OPTIMIZED_TEST), test_preds_lgbm_optimized)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Saved Model 7 predictions to {CFG.OUTPUT_DIR}")

end_time_model7 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 7 Training Finished. Elapsed: {end_time_model7 - start_time_model7} ---\n")


[10:13:46] --- Training Base Model 7: LightGBM (Optimized Hyperparameters) ---
[10:13:46] --- Training Fold 1/5 (Model 7: LightGBM Optimized) ---
[10:17:24] Fold 1 best iteration: 1446
[10:19:29] Fold 1 MAP@3: 0.3543. Elapsed for fold: 0:05:42.819257
[10:19:29] --- Training Fold 2/5 (Model 7: LightGBM Optimized) ---
[10:23:14] Fold 2 best iteration: 1421
[10:25:12] Fold 2 MAP@3: 0.3538. Elapsed for fold: 0:05:43.064845
[10:25:12] --- Training Fold 3/5 (Model 7: LightGBM Optimized) ---
[10:28:59] Fold 3 best iteration: 1441
[10:31:05] Fold 3 MAP@3: 0.3537. Elapsed for fold: 0:05:52.618210
[10:31:05] --- Training Fold 4/5 (Model 7: LightGBM Optimized) ---
[10:34:44] Fold 4 best iteration: 1439
[10:36:51] Fold 4 MAP@3: 0.3533. Elapsed for fold: 0:05:46.062700
[10:36:51] --- Training Fold 5/5 (Model 7: LightGBM Optimized) ---
[10:40:32] Fold 5 best iteration: 1444
[10:42:26] Fold 5 MAP@3: 0.3513. Elapsed for fold: 0:05:34.685056

[10:42:26] Model 7 (LightGBM Optimized) Average MAP@3 across

In [13]:
# --- NEW Base Model 8: VotingClassifier ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 8: VotingClassifier ---")
start_time_model8 = datetime.now()

loaded_oof, loaded_test, loaded_from_disk = load_predictions_if_exist(CFG.FNAME_VOTING_OOF, CFG.FNAME_VOTING_TEST)
if loaded_from_disk:
    oof_preds_voting = loaded_oof
    test_preds_voting = loaded_test
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 8.")
else:
    # --- IMPORTANT FIX: Initialize arrays if not loaded from disk ---
    oof_preds_voting = np.zeros((len(X), num_classes))
    test_preds_voting = np.zeros((len(X_test), num_classes))
    # ---------------------------------------------------------------

    n_splits_cv_model8 = FOLDS
    model8_fold_scores = []
    model8_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model8 = StratifiedKFold(n_splits=n_splits_cv_model8, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model8.split(X, y_encoded)):
        fold_start_time = datetime.now()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Fold {fold+1}/{FOLDS} (Model 8: VotingClassifier) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        # Define the internal estimators for the VotingClassifier for this fold.
        # Using simplified parameters to keep them distinct from main base models but representative.
        estimators_for_voting = [
            ('xgb1', XGBClassifier(objective='multi:softprob', num_class=num_classes, n_estimators=500, learning_rate=0.05, max_depth=8, random_state=GLOBAL_RANDOM_STATE, n_jobs=-1, verbose=0, tree_method='hist')),
            ('lgbm1', lgb.LGBMClassifier(objective='multiclass', num_class=num_classes, n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=GLOBAL_RANDOM_STATE, n_jobs=-1, verbose=-1)),
            ('xgb2', XGBClassifier(objective='multi:softprob', num_class=num_classes, n_estimators=300, learning_rate=0.08, max_depth=6, random_state=GLOBAL_RANDOM_STATE + 1, n_jobs=-1, verbose=0, tree_method='hist')),
            ('lgbm_goss', lgb.LGBMClassifier(objective='multiclass', num_class=num_classes, boosting_type='goss', n_estimators=400, learning_rate=0.03, num_leaves=64, random_state=GLOBAL_RANDOM_STATE + 2, n_jobs=-1, verbose=-1)),
            ('lr_vote', LogisticRegression(solver='liblinear', C=0.5, random_state=GLOBAL_RANDOM_STATE, n_jobs=-1, multi_class='ovr', max_iter=2000)) # A simple LR in the mix
        ]
        
        model8_instance = VotingClassifier(estimators=estimators_for_voting, voting='soft', n_jobs=-1)
        model8_instance.fit(fold_train_X, fold_train_y)

        oof_preds_voting[valid_idx] = model8_instance.predict_proba(fold_valid_X)
        model8_test_pred_sum += model8_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_voting[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model8_fold_scores.append(fold_score)
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {fold+1} VotingClassifier MAP@3: {fold_score:.4f}. Elapsed for fold: {datetime.now() - fold_start_time}")

        del model8_instance; gc.collect()

    test_preds_voting = model8_test_pred_sum / n_splits_cv_model8
    avg_cv_score_model8 = np.mean(model8_fold_scores)
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Model 8 (VotingClassifier) Average MAP@3 across {n_splits_cv_model8} folds: {avg_cv_score_model8:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_VOTING_OOF), oof_preds_voting)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_VOTING_TEST), test_preds_voting)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Saved Model 8 predictions to {CFG.OUTPUT_DIR}")

end_time_model8 = datetime.now()
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Model 8 Training Finished. Elapsed: {end_time_model8 - start_time_model8} ---\n")



[11:50:29] --- Training Base Model 8: VotingClassifier ---
[11:50:29] --- Training Fold 1/5 (Model 8: VotingClassifier) ---


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



[12:22:17] Fold 1 VotingClassifier MAP@3: 0.3357. Elapsed for fold: 0:31:48.151599
[12:22:17] --- Training Fold 2/5 (Model 8: VotingClassifier) ---


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



[12:56:36] Fold 2 VotingClassifier MAP@3: 0.3342. Elapsed for fold: 0:34:19.297755
[12:56:37] --- Training Fold 3/5 (Model 8: VotingClassifier) ---


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



[13:31:46] Fold 3 VotingClassifier MAP@3: 0.3348. Elapsed for fold: 0:35:09.134209
[13:31:46] --- Training Fold 4/5 (Model 8: VotingClassifier) ---


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



[14:03:47] Fold 4 VotingClassifier MAP@3: 0.3343. Elapsed for fold: 0:32:01.144742
[14:03:47] --- Training Fold 5/5 (Model 8: VotingClassifier) ---


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



[14:38:43] Fold 5 VotingClassifier MAP@3: 0.3344. Elapsed for fold: 0:34:56.092369

[14:38:43] Model 8 (VotingClassifier) Average MAP@3 across 5 folds: 0.3347
[14:38:43] Saved Model 8 predictions to /kaggle/working/outputs/
[14:38:43] --- Model 8 Training Finished. Elapsed: 2:48:14.806256 ---



In [11]:
# --- NEW Base Model 6: XGBoost (Optimized Parameters) ---
print("\n--- Training Base Model 6: XGBoost (Optimized Parameters) ---")
oof_preds_xgb_optimized, test_preds_xgb_optimized, loaded_flag = load_predictions_if_exist(
    CFG.FNAME_XGB_OPTIMIZED_OOF, CFG.FNAME_XGB_OPTIMIZED_TEST, X.shape, X_test.shape, num_classes
)
if loaded_flag:
    print(f"Loaded existing predictions for Model 6. Skipping training.")
else:
    print(f"No existing predictions found for Model 6. Starting training.")
    # oof_preds_xgb_optimized and test_preds_xgb_optimized are already zero-initialized from load_predictions_if_exist
    params_model6 = {
        'objective': 'multi:softprob',
        'num_class': num_classes, # Use `num_classes` as derived from `y_encoded`
        'max_depth': 10,
        'learning_rate': 0.03,
        'min_child_weight' : 2,
        'n_estimators': 10000, # Use as num_boost_round
        'alpha': 0.8,
        'reg_lambda': 4.0,
        'colsample_bytree': 0.5,
        'subsample': 0.7,
        'max_bin': 128,
        'colsample_bylevel': 1,
        'colsample_bynode': 1,
        'verbose': 0, # Suppress verbose output within xgb.train
        'tree_method': 'hist',
        'random_state': 42,
        'eval_metric': 'mlogloss',
    }
    model6_fold_scores = []
    model6_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model6 = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

    for fold, (trn_idx, val_idx) in enumerate(kf_model6.split(X, y_encoded)):
        print(f"\n{'#'*10} Fold {fold+1} (Model 6) {'#'*10}")
        X_train_fold = X.iloc[trn_idx]
        y_train_fold = y_encoded.iloc[trn_idx]
        X_valid_fold = X.iloc[val_idx]
        y_valid_fold = y_encoded.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
        dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_fold, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)

        ES = xgb.callback.EarlyStopping(
            rounds=CFG.early_stopping_rounds, # Use CFG value for early stopping
            maximize=False,
            save_best=True,
        )

        model6_instance = xgb.train(
            params_model6,
            dtrain,
            num_boost_round=params_model6['n_estimators'], # Use n_estimators from params_model6 as num_boost_round
            evals=[(dtrain, 'train'), (dvalid, 'validation')],
            verbose_eval=CFG.verbose_eval, # Use CFG value for verbose_eval
            callbacks=[ES]
        )

        oof_preds_xgb_optimized[val_idx] = model6_instance.predict(dvalid, iteration_range=(0, model6_instance.best_iteration + 1))
        model6_test_pred_sum += model6_instance.predict(dtest, iteration_range=(0, model6_instance.best_iteration + 1))

        top3_preds = np.argsort(oof_preds_xgb_optimized[val_idx], axis=1)[:, -3:][:, ::-1]
        actual = [[label] for label in y_valid_fold.values]
        map3_score_fold = mapk(actual, top3_preds)
        model6_fold_scores.append(map3_score_fold)
        print("----------------------------------------------------------------")
        print(f"fold: {fold:02d}, map@3: {map3_score_fold:.6f}, best iteration: {model6_instance.best_iteration}, best score: {model6_instance.best_score: .6f}\n")

        del model6_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_optimized = model6_test_pred_sum / CFG.n_splits
    avg_map3_score_model6 = np.mean(model6_fold_scores)
    print("----------------------------------------------------------------")
    print(f"Model 6 (xgb.train Optimized) Average MAP@3: {avg_map3_score_model6:.6f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_OPTIMIZED_OOF), oof_preds_xgb_optimized)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_OPTIMIZED_TEST), test_preds_xgb_optimized)
    print(f"Saved Model 6 predictions to {CFG.OUTPUT_DIR}")



--- Training Base Model 6: XGBoost (Optimized Parameters) ---
No existing predictions found for oof_preds_xgb_optimized.npy or test_preds_xgb_optimized.npy. Will initialize as zeros.
No existing predictions found for Model 6. Starting training.

########## Fold 1 (Model 6) ##########
[0]	train-mlogloss:1.94531	validation-mlogloss:1.94559
[200]	train-mlogloss:1.86911	validation-mlogloss:1.91857
[400]	train-mlogloss:1.81506	validation-mlogloss:1.90824
[600]	train-mlogloss:1.76771	validation-mlogloss:1.90251
[800]	train-mlogloss:1.72459	validation-mlogloss:1.89919
[1000]	train-mlogloss:1.68494	validation-mlogloss:1.89745
[1200]	train-mlogloss:1.64787	validation-mlogloss:1.89685
[1242]	train-mlogloss:1.64027	validation-mlogloss:1.89686
----------------------------------------------------------------
fold: 00, map@3: 0.358741, best iteration: 1192, best score:  1.896845


########## Fold 2 (Model 6) ##########
[0]	train-mlogloss:1.94531	validation-mlogloss:1.94558
[200]	train-mlogloss:1.86

In [13]:
# --- NEW Base Model 7: XGBoost (Custom Parameters) ---
print("\n--- Training Base Model 7: XGBoost (Custom Parameters) ---")
oof_preds_xgb_custom, test_preds_xgb_custom, loaded_flag = load_predictions_if_exist(
    CFG.FNAME_XGB_CUSTOM_OOF, CFG.FNAME_XGB_CUSTOM_TEST, X.shape, X_test.shape, num_classes
)
if loaded_flag:
    print(f"Loaded existing predictions for Model 7. Skipping training.")
else:
    print(f"No existing predictions found for Model 7. Starting training.")
    # oof_preds_xgb_custom and test_preds_xgb_custom are already zero-initialized from load_predictions_if_exist
    params_model7 = {
        'objective': 'multi:softprob',
        'num_class': num_classes, # Dynamically set num_class based on data
        'max_depth': 16,
        'learning_rate': 0.01,
        'n_estimators': 100_000, # This will be passed as num_boost_round
        'reg_alpha': 3,
        'reg_lambda': 1.4,
        'gamma': 0.26,
        'max_delta_step': 5,
        'subsample': 0.86,
        'colsample_bytree': 0.4,
        'min_child_weight': 5,
        'random_state': 42,
        'n_jobs': -1,
        'eval_metric': 'mlogloss',
        'enable_categorical': True,
        'tree_method': 'hist', # Good for performance and categorical features
    }
    model7_fold_scores = []
    model7_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model7 = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

    for fold, (trn_idx, val_idx) in enumerate(kf_model7.split(X, y_encoded)):
        print(f"\n{'#'*10} Fold {fold+1} (Model 7) {'#'*10}")
        X_train_fold = X.iloc[trn_idx]
        y_train_fold = y_encoded.iloc[trn_idx]
        X_valid_fold = X.iloc[val_idx]
        y_valid_fold = y_encoded.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold, enable_categorical=True)
        dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_fold, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)

        ES = xgb.callback.EarlyStopping(
            rounds=CFG.early_stopping_rounds, # Use CFG value for early stopping
            maximize=False,
            save_best=True,
        )

        model7_instance = xgb.train(
            params_model7,
            dtrain,
            num_boost_round=params_model7['n_estimators'], # Use n_estimators from params as num_boost_round
            evals=[(dtrain, 'train'), (dvalid, 'validation')],
            verbose_eval=CFG.verbose_eval, # Use CFG value for verbose_eval
            callbacks=[ES]
        )

        oof_preds_xgb_custom[val_idx] = model7_instance.predict(dvalid, iteration_range=(0, model7_instance.best_iteration + 1))
        model7_test_pred_sum += model7_instance.predict(dtest, iteration_range=(0, model7_instance.best_iteration + 1))

        top3_preds = np.argsort(oof_preds_xgb_custom[val_idx], axis=1)[:, -3:][:, ::-1]
        actual = [[label] for label in y_valid_fold.values]
        map3_score_fold = mapk(actual, top3_preds)
        model7_fold_scores.append(map3_score_fold)
        print("----------------------------------------------------------------")
        print(f"fold: {fold:02d}, map@3: {map3_score_fold:.6f}, best iteration: {model7_instance.best_iteration}, best score: {model7_instance.best_score: .6f}\n")

        del model7_instance, dtrain, dvalid, dtest
        gc.collect()

    test_preds_xgb_custom = model7_test_pred_sum / CFG.n_splits
    avg_map3_score_model7 = np.mean(model7_fold_scores)
    print("----------------------------------------------------------------")
    print(f"Model 7 (xgb.train Custom) Average MAP@3: {avg_map3_score_model7:.6f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_CUSTOM_OOF), oof_preds_xgb_custom)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_XGB_CUSTOM_TEST), test_preds_xgb_custom)
    print(f"Saved Model 7 predictions to {CFG.OUTPUT_DIR}")




--- Training Base Model 7: XGBoost (Custom Parameters) ---
No existing predictions found for oof_preds_xgb_custom.npy or test_preds_xgb_custom.npy. Will initialize as zeros.
No existing predictions found for Model 7. Starting training.

########## Fold 1 (Model 7) ##########
[0]	train-mlogloss:1.94566	validation-mlogloss:1.94578
[200]	train-mlogloss:1.90013	validation-mlogloss:1.92738
[400]	train-mlogloss:1.86372	validation-mlogloss:1.91591
[600]	train-mlogloss:1.83239	validation-mlogloss:1.90780
[800]	train-mlogloss:1.80473	validation-mlogloss:1.90192
[1000]	train-mlogloss:1.78030	validation-mlogloss:1.89758
[1200]	train-mlogloss:1.75814	validation-mlogloss:1.89416
[1400]	train-mlogloss:1.73840	validation-mlogloss:1.89163
[1600]	train-mlogloss:1.72106	validation-mlogloss:1.88976
[1800]	train-mlogloss:1.70679	validation-mlogloss:1.88846
[2000]	train-mlogloss:1.69410	validation-mlogloss:1.88753
[2200]	train-mlogloss:1.68313	validation-mlogloss:1.88692
[2400]	train-mlogloss:1.67382	vali

In [7]:
# --- NEW Base Model 11: Gaussian Naive Bayes ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 11: Gaussian Naive Bayes ---")
start_time_model11 = datetime.now()

oof_preds_gaussian_nb, test_preds_gaussian_nb, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_GAUSSIAN_NB_OOF, CFG.FNAME_GAUSSIAN_NB_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 11.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 11. Starting training.")
    # oof_preds_gaussian_nb and test_preds_gaussian_nb are already zero-initialized from load_predictions_if_exist
    
    n_splits_cv_model11 = FOLDS
    model11_fold_scores = []
    model11_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model11 = StratifiedKFold(n_splits=n_splits_cv_model11, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    for fold, (train_idx, valid_idx) in enumerate(kf_model11.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 11: Gaussian Naive Bayes) ---")
        fold_train_X, fold_valid_X = X.iloc[train_idx], X.iloc[valid_idx]
        fold_train_y, fold_valid_y = y_encoded.iloc[train_idx], y_encoded.iloc[valid_idx]

        model11_instance = GaussianNB(var_smoothing=1e-9)
        model11_instance.fit(fold_train_X, fold_train_y)

        # Naive Bayes predict_proba outputs probabilities
        oof_preds_gaussian_nb[valid_idx] = model11_instance.predict_proba(fold_valid_X)
        model11_test_pred_sum += model11_instance.predict_proba(X_test)

        top_3_preds = np.argsort(oof_preds_gaussian_nb[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(fold_valid_y.values, top_3_preds)
        model11_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del model11_instance
        gc.collect()

    test_preds_gaussian_nb = model11_test_pred_sum / n_splits_cv_model11
    avg_cv_score_model11 = np.mean(model11_fold_scores)
    print(f"\nModel 11 (Gaussian Naive Bayes) Average MAP@3 across {n_splits_cv_model11} folds: {avg_cv_score_model11:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_GAUSSIAN_NB_OOF), oof_preds_gaussian_nb)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_GAUSSIAN_NB_TEST), test_preds_gaussian_nb)
    print(f"Saved Model 11 predictions to {CFG.OUTPUT_DIR}")



[17:38:58] --- Training Base Model 11: Gaussian Naive Bayes ---
No existing predictions found for oof_preds_gaussian_nb.npy or test_preds_gaussian_nb.npy. Will initialize as zeros.
[17:38:58] No existing predictions found for Model 11. Starting training.
--- Training Fold 1 (Model 11: Gaussian Naive Bayes) ---
Fold 1 MAP@3: 0.2835
--- Training Fold 2 (Model 11: Gaussian Naive Bayes) ---
Fold 2 MAP@3: 0.2832
--- Training Fold 3 (Model 11: Gaussian Naive Bayes) ---
Fold 3 MAP@3: 0.2819
--- Training Fold 4 (Model 11: Gaussian Naive Bayes) ---
Fold 4 MAP@3: 0.2837
--- Training Fold 5 (Model 11: Gaussian Naive Bayes) ---
Fold 5 MAP@3: 0.2823

Model 11 (Gaussian Naive Bayes) Average MAP@3 across 5 folds: 0.2829
Saved Model 11 predictions to /kaggle/working/outputs/


In [8]:
# --- NEW Base Model 12: K-Nearest Neighbors (KNN) with Hyperparameter Tuning ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- Training Base Model 12: K-Nearest Neighbors (KNN) ---")
start_time_model12 = datetime.now()

oof_preds_knn_base, test_preds_knn_base, loaded_from_disk = load_predictions_if_exist(
    CFG.FNAME_KNN_OOF, CFG.FNAME_KNN_TEST, X.shape, X_test.shape, num_classes
)
if loaded_from_disk:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Skipping training for Model 12.")
else:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] No existing predictions found for Model 12. Starting training.")
    
    n_splits_cv_model12 = FOLDS
    model12_fold_scores = []
    model12_test_pred_sum = np.zeros((len(X_test), num_classes))
    kf_model12 = StratifiedKFold(n_splits=n_splits_cv_model12, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

    # Define hyperparameter grid for KNN tuning within each fold
    # This range is kept small for practical execution within an ensemble
    knn_param_grid = {
        'n_neighbors': [3, 5, 7, 9], # Common values for k
        'weights': ['uniform', 'distance'], # How to weight neighbors
        'metric': ['euclidean', 'manhattan'] # Distance metrics
    }

    for fold, (train_idx, valid_idx) in enumerate(kf_model12.split(X, y_encoded)):
        print(f"--- Training Fold {fold+1} (Model 12: KNN) ---")
        
        X_train_fold, y_train_fold = X.iloc[train_idx], y_encoded.iloc[train_idx]
        X_valid_fold, y_valid_fold = X.iloc[valid_idx], y_encoded.iloc[valid_idx]
        
        # Apply StandardScaler specifically to numerical columns for KNN
        scaler_knn = StandardScaler()
        X_train_scaled_knn = X_train_fold.copy()
        X_valid_scaled_knn = X_valid_fold.copy()
        X_test_scaled_knn = X_test.copy()

        # Scale only the numerical columns. KNN is highly sensitive to scaling.
        X_train_scaled_knn[numerical_cols] = scaler_knn.fit_transform(X_train_scaled_knn[numerical_cols])
        X_valid_scaled_knn[numerical_cols] = scaler_knn.transform(X_valid_scaled_knn[numerical_cols])
        X_test_scaled_knn[numerical_cols] = scaler_knn.transform(X_test_scaled_knn[numerical_cols])

        # Perform GridSearchCV for hyperparameter tuning for KNN in this fold
        grid_search_knn = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=knn_param_grid,
            scoring='log_loss', # Use log_loss as evaluation metric for probability models
            cv=3, # Smaller internal CV for speed, can be increased if needed
            n_jobs=-1, # Use all available cores
            verbose=0 # Suppress verbose output from GridSearchCV
        )
        
        grid_search_knn.fit(X_train_scaled_knn, y_train_fold)
        
        best_knn_model = grid_search_knn.best_estimator_
        print(f"  Fold {fold+1} KNN Best Params: {grid_search_knn.best_params_}")
        print(f"  Fold {fold+1} KNN Best CV LogLoss: {-grid_search_knn.best_score_:.4f}") # GridSearchCV returns negative score for loss metrics

        # Predict probabilities
        oof_preds_knn_base[valid_idx] = best_knn_model.predict_proba(X_valid_scaled_knn)
        model12_test_pred_sum += best_knn_model.predict_proba(X_test_scaled_knn)

        top_3_preds = np.argsort(oof_preds_knn_base[valid_idx], axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_valid_fold.values, top_3_preds)
        model12_fold_scores.append(fold_score)
        print(f"Fold {fold+1} MAP@3: {fold_score:.4f}")

        del scaler_knn, grid_search_knn, best_knn_model
        gc.collect()

    test_preds_knn_base = model12_test_pred_sum / n_splits_cv_model12
    avg_cv_score_model12 = np.mean(model12_fold_scores)
    print(f"\nModel 12 (KNN Base) Average MAP@3 across {n_splits_cv_model12} folds: {avg_cv_score_model12:.4f}")

    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_KNN_OOF), oof_preds_knn_base)
    np.save(os.path.join(CFG.OUTPUT_DIR, CFG.FNAME_KNN_TEST), test_preds_knn_base)
    print(f"Saved Model 12 predictions to {CFG.OUTPUT_DIR}")


[10:01:20] --- Training Base Model 12: K-Nearest Neighbors (KNN) ---
No existing predictions found for oof_preds_knn_base.npy or test_preds_knn_base.npy. Will initialize as zeros.
[10:01:20] No existing predictions found for Model 12. Starting training.
--- Training Fold 1 (Model 12: KNN) ---


ValueError: 'log_loss' is not a valid scoring value. Use sklearn.metrics.get_scorer_names() to get valid options.