In [1]:
from __future__ import annotations
# add at top with other imports
import sys
import argparse
import json
import os
import warnings
from dataclasses import dataclass
from typing import Callable, Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler

import lightgbm as lgb
import optuna
import xgboost as xgb
from optuna.samplers import TPESampler

from itertools import combinations
from typing import Dict, List, Sequence, Tuple, Optional, Union
from sklearn.linear_model import TweedieRegressor

ArrayLike = Union[np.ndarray, Sequence[np.ndarray], Dict[str, np.ndarray]]

# --- CatBoost import guard ---
# CatBoost is required by the "cb" base model. Nothing is installed here:
# when the package is missing, the ImportError is re-raised with an
# installation hint so the failure is explicit at import time.
try:
    from catboost import CatBoostRegressor
except ImportError as e:
    raise ImportError(
        "catboost is not installed. Install with:\n  pip install catboost"
    ) from e

# NOTE(review): this silences *every* Python warning for the whole session,
# including deprecation warnings emitted by pandas / xgboost / lightgbm.
warnings.filterwarnings("ignore")
# Only show Optuna messages at WARNING level or above (hides per-trial logs).
optuna.logging.set_verbosity(optuna.logging.WARNING)


In [2]:
# =====================================================================
# CONFIG
# =====================================================================

@dataclass
class RunConfig:
    """All configurable knobs for the run.

    ``seeds`` and ``models`` default to ``None`` and are resolved in
    ``__post_init__``: ``None`` becomes a fresh default list, and any
    caller-supplied sequence is copied into a new list so that mutating the
    caller's object afterwards cannot change the configuration.
    """
    # --- file locations ---
    train_path: str = "train_v9rqX0R.csv"
    test_path: str = "test_AbJTz2l.csv"
    param_cache_path: str = "tuned_params.json"
    submission_path: str = "submission_generic_ensemble.csv"
    # --- resampling ---
    seeds: Optional[List[int]] = None   # None -> default seed list (see __post_init__)
    n_folds: int = 10
    # --- hyper-parameter tuning ---
    tune: bool = True              # re-tune and overwrite cache for selected models
    xgb_trials: int = 50
    lgb_trials: int = 100
    cb_trials: int = 30
    early_stopping_rounds: int = 50
    # --- ensemble composition ---
    models: Optional[List[str]] = None  # which model keys from registry to use
    meta_type: str = "ridge"        # "linear" or "ridge"
    needs_scaling: bool = True

    def __post_init__(self):
        # Fresh list objects per instance: defaults are never shared between
        # instances and caller-owned sequences are never aliased.
        if self.seeds is None:
            self.seeds = [42, 2021, 7, 1337, 2025]  # larger pool: [42, 2021, 7, 1337, 2025, 999, 111, 8888, 2024, 3333]
        else:
            self.seeds = list(self.seeds)
        if self.models is None:
            self.models = ["xgb", "lgb", "cb"]  # "twlr" is registered but not enabled by default
        else:
            self.models = list(self.models)
            
            
# =====================================================================
# PARAM CACHING (GENERIC)
# =====================================================================

def load_cached_params(path: str) -> Dict[str, Dict]:
    """Read the tuned-parameter cache.

    Returns the parsed JSON content of ``path``; when no file exists at that
    location an empty dict is returned instead.
    """
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as cache_file:
        cached = json.load(cache_file)
    return cached


def save_cached_params(path: str, params_by_model: Dict[str, Dict]) -> None:
    """Write ``{model_name: best_params}`` to ``path`` as 2-space indented JSON.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(path, mode="w", encoding="utf-8") as cache_file:
        json.dump(params_by_model, cache_file, indent=2)

In [3]:
# =====================================================================
# DATA LOADING / PREPROCESSING / FEATURE ENGINEERING
# =====================================================================

def load_data(cfg: RunConfig) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSV files named in ``cfg``.

    Returns ``(train, test, ids)`` where ``ids`` is an independent copy of the
    test frame's ``Item_Identifier`` / ``Outlet_Identifier`` columns.
    """
    id_columns = ['Item_Identifier', 'Outlet_Identifier']
    train_frame = pd.read_csv(cfg.train_path)
    test_frame = pd.read_csv(cfg.test_path)
    test_ids = test_frame[id_columns].copy()
    return train_frame, test_frame, test_ids


def clean_and_engineer_features(train: pd.DataFrame, test: pd.DataFrame
                                ) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """
    Clean + feature engineer. Returns train_final, test_final, feature_cols.

    Train and test are stacked (tagged through a ``source`` column) so that
    imputations, bins, counts and encodings are computed on the combined
    frame, then split back. The input frames are copied and never modified.

    Parameters
    ----------
    train : DataFrame with the raw training rows, including ``Item_Outlet_Sales``.
    test  : DataFrame with the raw test rows (same columns, no target needed).

    Returns
    -------
    train_final, test_final : engineered copies of the two inputs.
    feature_cols : ordered list of model-input column names present in both.

    All fills are written back by assignment (``data[col] = data[col].fillna(...)``)
    rather than with ``inplace=True`` on a column selection, which does not
    update the parent frame when pandas Copy-on-Write is active.
    """
    train = train.copy()
    test = test.copy()
    train['source'] = 'train'
    test['source'] = 'test'
    data = pd.concat([train, test], ignore_index=True)

    # --- Label fixes: collapse spelling variants of the fat-content labels ---
    data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({
        'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'
    })

    # --- Item_Weight impute (mean per item, else global) ---
    item_avg_weight = data.groupby('Item_Identifier')['Item_Weight'].mean()
    missing_weight = data['Item_Weight'].isnull()
    data.loc[missing_weight, 'Item_Weight'] = (
        data.loc[missing_weight, 'Item_Identifier'].map(item_avg_weight)
    )
    # Items whose weight is never observed still have NaN here -> global mean.
    data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

    # --- Visibility zero fix (type-wise mean of the strictly positive values) ---
    visibility_avg = data[data['Item_Visibility'] > 0].groupby('Item_Type')['Item_Visibility'].mean()
    zero_visibility = data['Item_Visibility'] == 0
    data.loc[zero_visibility, 'Item_Visibility'] = (
        data.loc[zero_visibility, 'Item_Type'].map(visibility_avg)
    )
    data['Item_Visibility'] = data['Item_Visibility'].fillna(data['Item_Visibility'].mean())

    # --- Impute Outlet_Size by Outlet_Type mode ('Medium' when a type has no known size) ---
    def _most_common_size(sizes: pd.Series) -> str:
        modes = sizes.mode()
        return modes.iloc[0] if len(modes) > 0 else 'Medium'

    outlet_size_mode = data.groupby('Outlet_Type')['Outlet_Size'].agg(_most_common_size)
    missing_size = data['Outlet_Size'].isnull()
    data.loc[missing_size, 'Outlet_Size'] = data.loc[missing_size, 'Outlet_Type'].map(outlet_size_mode)

    # --- Core features ---
    data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']
    # The first two characters of the identifier give the broad item category.
    data['Item_Type_Combined'] = data['Item_Identifier'].str[:2].map(
        {'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'}
    )
    data.loc[data['Item_Type_Combined'] == 'Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'

    # --- Extra features ---
    data['Item_MRP_Bins'] = pd.cut(data['Item_MRP'], bins=4, labels=['Low', 'Medium', 'High', 'Very_High'])
    data['Item_Visibility_Bins'] = pd.qcut(
        data['Item_Visibility'], q=5, labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
    )
    data['Price_Per_Weight'] = data['Item_MRP'] / (data['Item_Weight'] + 0.01)  # +0.01 guards against zero weight
    data['Visibility_MRP_Ratio'] = data['Item_Visibility'] / data['Item_MRP']

    # --- Aggregations ---
    store_item_count = data.groupby('Outlet_Identifier')['Item_Identifier'].count().to_dict()
    data['Store_Item_Count'] = data['Outlet_Identifier'].map(store_item_count)

    # NOTE(review): this is a target mean computed over *all* training rows and
    # mapped back onto those same rows, so each training row's own target
    # contributes to its feature value.
    mean_sales_by_store = train.groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean().to_dict()
    data['Store_Avg_Sales'] = data['Outlet_Identifier'].map(mean_sales_by_store)
    # Outlets absent from train get the mean of the mapped column.
    data['Store_Avg_Sales'] = data['Store_Avg_Sales'].fillna(data['Store_Avg_Sales'].mean())

    # --- Ordinals ---
    data['Outlet_Type_Num'] = data['Outlet_Type'].map(
        {'Grocery Store': 0, 'Supermarket Type1': 1, 'Supermarket Type2': 2, 'Supermarket Type3': 3}
    )
    data['Outlet_Location_Type_Num'] = data['Outlet_Location_Type'].map({'Tier 3': 0, 'Tier 2': 1, 'Tier 1': 2})
    data['Outlet_Size_Num'] = data['Outlet_Size'].map({'Small': 0, 'Medium': 1, 'High': 2})

    # --- Encodings ---
    # Integer codes assigned in sorted label order (the same mapping a label
    # encoder fitted on these string labels produces), using pandas only.
    cat_cols = ['Item_Fat_Content', 'Item_Type', 'Item_Type_Combined', 'Item_MRP_Bins', 'Item_Visibility_Bins']
    for col in cat_cols:
        codes, _ = pd.factorize(data[col].astype(str), sort=True)
        data[col + '_Encoded'] = codes

    # --- One-hot outlets ---
    outlet_dummies = pd.get_dummies(data['Outlet_Identifier'], prefix='Outlet')
    data = pd.concat([data, outlet_dummies], axis=1)

    # --- Item frequency (occurrences across train + test) ---
    item_counts = data['Item_Identifier'].value_counts().to_dict()
    data['Item_Count'] = data['Item_Identifier'].map(item_counts)

    # --- Final splits ---
    train_final = data[data['source'] == 'train'].copy()
    test_final = data[data['source'] == 'test'].copy()

    feature_cols = [
        'Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Years',
        'Price_Per_Weight', 'Visibility_MRP_Ratio', 'Store_Item_Count',
        'Store_Avg_Sales', 'Item_Count',
        'Outlet_Type_Num', 'Outlet_Location_Type_Num', 'Outlet_Size_Num',
        'Item_Fat_Content_Encoded', 'Item_Type_Encoded',
        'Item_Type_Combined_Encoded', 'Item_MRP_Bins_Encoded',
        'Item_Visibility_Bins_Encoded'
    ]
    # One-hot outlet columns are named 'Outlet_' + identifier; identifiers are
    # expected to start with 'OUT'.
    outlet_cols = [c for c in data.columns if c.startswith('Outlet_OUT')]
    feature_cols.extend(outlet_cols)

    return train_final, test_final, feature_cols

# def clean_and_engineer_features(train: pd.DataFrame, test: pd.DataFrame
#                                 ) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
#     """
#     Clean + feature engineer. Returns train_final, test_final, feature_cols.
#     (Logic kept close to your original for comparability.)
#     """
#     train = train.copy(); test = test.copy()
#     train['source'] = 'train'; test['source'] = 'test'
#     data = pd.concat([train, test], ignore_index=True)

#     # --- Label fixes ---
#     data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({
#         'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'
#     })

#     # --- Item_Weight impute (mean per item, else global) ---
#     item_avg_weight = data.groupby('Item_Identifier')['Item_Weight'].mean()
#     missing_bool = data['Item_Weight'].isnull()
#     data.loc[missing_bool, 'Item_Weight'] = data.loc[missing_bool, 'Item_Identifier'].map(item_avg_weight)
#     data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)

#     # --- Visibility zero fix (type-wise mean) ---
#     visibility_avg = data[data['Item_Visibility'] > 0].groupby('Item_Type')['Item_Visibility'].mean()
#     zero_bool = data['Item_Visibility'] == 0
#     data.loc[zero_bool, 'Item_Visibility'] = data.loc[zero_bool, 'Item_Type'].map(visibility_avg)
#     data['Item_Visibility'].fillna(data['Item_Visibility'].mean(), inplace=True)

#     # --- Impute Outlet_Size by Outlet_Type mode ---
#     outlet_size_mode = data.groupby('Outlet_Type')['Outlet_Size'].agg(
#         lambda x: x.mode()[0] if len(x.mode()) > 0 else 'Medium'
#     )
#     missing_bool = data['Outlet_Size'].isnull()
#     data.loc[missing_bool, 'Outlet_Size'] = data[missing_bool]['Outlet_Type'].map(outlet_size_mode)

#     # --- Core features ---
#     data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']
#     data['Item_Type_Combined'] = data['Item_Identifier'].str[:2].map({'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'})
#     data.loc[data['Item_Type_Combined'] == 'Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'

#     # --- Extra features ---
#     import numpy as np
#     data['Item_MRP_Bins'] = pd.cut(data['Item_MRP'], bins=4, labels=['Low', 'Medium', 'High', 'Very_High'])
#     data['Item_Visibility_Bins'] = pd.qcut(
#         data['Item_Visibility'], q=5, labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
#     )
#     data['Price_Per_Weight'] = data['Item_MRP'] / (data['Item_Weight'] + 0.01)
#     data['Visibility_MRP_Ratio'] = data['Item_Visibility'] / data['Item_MRP']
#     # NEW: log transforms to reduce skew
#     #data['Log_Item_MRP'] = np.log1p(data['Item_MRP'])                 # NEW
#     #data['Log_Item_Visibility'] = np.log1p(data['Item_Visibility'])   # NEW
#     #data['Log_Price_Per_Weight'] = np.log1p(data['Price_Per_Weight']) # NEW

#     # --- Store- and category-aware counts (less collinear than raw totals) ---
#     outlet_total_items = data.groupby('Outlet_Identifier')['Item_Identifier'].transform('count')
#     outlet_cat_items = data.groupby(['Outlet_Identifier', 'Item_Type_Combined'])['Item_Identifier'].transform('count')
#     data['Outlet_Cat_Share'] = (outlet_cat_items / (outlet_total_items.replace(0, np.nan))).fillna(0)  # NEW
#     data['Outlet_Cat_Unique_Count'] = data.groupby('Outlet_Identifier')['Item_Type_Combined'].transform('nunique')  # NEW

#     # --- Relative pricing & visibility (within outlet) ---
#     outlet_cat_median_mrp = data.groupby(['Outlet_Identifier', 'Item_Type_Combined'])['Item_MRP'].transform('median')
#     data['MRP_Relative_to_Outlet'] = (data['Item_MRP'] - outlet_cat_median_mrp) / (outlet_cat_median_mrp + 1e-6)  # NEW

#     vis_median_by_outlet = data.loc[data['Item_Visibility'] > 0].groupby('Outlet_Identifier')['Item_Visibility'].median()
#     data['Outlet_Vis_Median'] = data['Outlet_Identifier'].map(vis_median_by_outlet)  # helper (not used as feature)
#     data['Outlet_Vis_Median'].fillna(data['Item_Visibility'].median(), inplace=True)
#     data['Visibility_Relative_to_Outlet'] = (data['Item_Visibility'] - data['Outlet_Vis_Median']) / (data['Outlet_Vis_Median'] + 1e-6)  # NEW

#     # --- Aggregations (keep non-target based; avoid leakage) ---
#     store_item_count = data.groupby('Outlet_Identifier')['Item_Identifier'].count().to_dict()
#     data['Store_Item_Count'] = data['Outlet_Identifier'].map(store_item_count)

#     # Historical sales encodings with leakage control (Leave-One-Out + smoothing)
#     global_mean = train['Item_Outlet_Sales'].mean()
#     prior = 5.0  # smoothing strength

#     # Use engineered TRAIN slice for group stats to avoid KeyError on engineered cols
#     train_ext = data[data['source'] == 'train']

#     def add_mean_encoding(group_col: str, new_col: str):
#         sums = train_ext.groupby(group_col)['Item_Outlet_Sales'].sum()
#         cnts = train_ext.groupby(group_col)['Item_Outlet_Sales'].count()
#         gsum = data[group_col].map(sums)
#         gcnt = data[group_col].map(cnts)
#         enc = pd.Series(index=data.index, dtype='float64')
#         tr_idx = data['source'] == 'train'
#         te_idx = data['source'] == 'test'
#         y_tr = data.loc[tr_idx, 'Item_Outlet_Sales']
#         enc.loc[tr_idx] = ((gsum.loc[tr_idx] - y_tr) + prior * global_mean) / (((gcnt.loc[tr_idx] - 1).clip(lower=0)) + prior)
#         enc.loc[te_idx] = ((gsum.loc[te_idx]) + prior * global_mean) / ((gcnt.loc[te_idx]) + prior)
#         data[new_col] = enc.fillna(global_mean)

#     # Powerful historical demand signals (item / outlet / category)
#     #add_mean_encoding('Item_Identifier', 'Enc_Item_Mean_Sales')          # NEW: target-mean encoding per item
#     #add_mean_encoding('Outlet_Identifier', 'Enc_Outlet_Mean_Sales')      # NEW: target-mean encoding per outlet
#     #add_mean_encoding('Item_Type_Combined', 'Enc_Cat_Mean_Sales')        # NEW: target-mean encoding per category

#     # (Legacy) Mean sales by store for completeness (not used in features to avoid leakage risk)
#     mean_sales_by_store = train.groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean().to_dict()
#     data['Store_Avg_Sales'] = data['Outlet_Identifier'].map(mean_sales_by_store)
#     data['Store_Avg_Sales'].fillna(data['Store_Avg_Sales'].mean(), inplace=True)

#     # --- Ordinals ---
#     data['Outlet_Type_Num'] = data['Outlet_Type'].map({'Grocery Store': 0, 'Supermarket Type1': 1, 'Supermarket Type2': 2, 'Supermarket Type3': 3})
#     data['Outlet_Location_Type_Num'] = data['Outlet_Location_Type'].map({'Tier 3': 0, 'Tier 2': 1, 'Tier 1': 2})
#     data['Outlet_Size_Num'] = data['Outlet_Size'].map({'Small': 0, 'Medium': 1, 'High': 2})

#     # --- Encodings ---
#     from sklearn.preprocessing import LabelEncoder
#     cat_cols = ['Item_Fat_Content', 'Item_Type', 'Item_Type_Combined', 'Item_MRP_Bins', 'Item_Visibility_Bins']
#     for col in cat_cols:
#         le = LabelEncoder()
#         data[col + '_Encoded'] = le.fit_transform(data[col].astype(str))

#     # --- One-hot outlets ---
#     outlet_dummies = pd.get_dummies(data['Outlet_Identifier'], prefix='Outlet')
#     data = pd.concat([data, outlet_dummies], axis=1)

#     # --- Item frequency ---
#     item_counts = data['Item_Identifier'].value_counts().to_dict()
#     data['Item_Count'] = data['Item_Identifier'].map(item_counts)

#     # --- Final splits ---
#     train_final = data[data['source'] == 'train'].copy()
#     test_final = data[data['source'] == 'test'].copy()

#     # --- Feature columns (AUTO SELECT + optional drop) ---
#     # Build from all numeric columns, then remove targets/helpers/known leakage.
#     non_feature_cols = {
#         'Item_Outlet_Sales',   # target (train only)
#         'Outlet_Vis_Median',   # helper column
#         #'Store_Avg_Sales',     # legacy mean target by store (leak-prone)
#         'Outlet_Establishment_Year'  # superseded by Outlet_Years
#     }
#     numeric_cols = [c for c in data.select_dtypes(include=[np.number]).columns]
#     feature_cols = [c for c in numeric_cols if c not in non_feature_cols]

#     # Optional user-defined drops via DataFrame attrs (no signature change)
#     user_drop = set()
#     if hasattr(train, 'attrs') and isinstance(train.attrs.get('drop_features', None), list):
#         user_drop |= set(train.attrs['drop_features'])
#     if hasattr(test, 'attrs') and isinstance(test.attrs.get('drop_features', None), list):
#         user_drop |= set(test.attrs['drop_features'])
#     if user_drop:
#         feature_cols = [c for c in feature_cols if c not in user_drop]
        
#     print(len(feature_cols), ":::",feature_cols)
#     return train_final, test_final, feature_cols


In [4]:
# =====================================================================
# MODEL SPEC INTERFACE (GENERIC)
# =====================================================================

class ModelSpec:
    """
    Bundle of callables describing one base model: how to tune it, build it,
    fit it and predict with it, plus a flag saying whether it wants scaled
    inputs. Registering a new model means supplying these callables.

    When no ``predict_fn`` is supplied, prediction falls back to calling
    ``model.predict(X)``.
    """
    def __init__(
        self,
        name: str,
        tuner: Callable[[pd.DataFrame, pd.Series, "RunConfig"], Dict],
        builder: Callable[[Dict, int], object],
        fit_fn: Callable[[object, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series, "RunConfig"], None],
        predict_fn: Callable[[object, pd.DataFrame], np.ndarray] = None,
        needs_scaling: bool = False,   # set True for models that require scaling
    ):
        def _predict_with_model(model, X):
            # Default prediction path: delegate to the estimator itself.
            return model.predict(X)

        self.name, self.tuner = name, tuner
        self.builder, self.fit_fn = builder, fit_fn
        self.predict_fn = predict_fn if predict_fn else _predict_with_model
        self.needs_scaling = needs_scaling


# =====================================================================
# TUNERS (RE-USED ACROSS MODELS) — NO GLOBAL SCALING
# =====================================================================

def tune_xgb(X: pd.DataFrame, y: pd.Series, cfg: RunConfig) -> Dict:
    """Optuna tuner for XGBoost (Tweedie).

    Runs ``cfg.xgb_trials`` TPE trials (sampler seed 42). Each trial is scored
    by the mean RMSE over a fixed 5-fold shuffled split, with early stopping
    evaluated on the held-out fold.

    Parameters
    ----------
    X, y : training features and target (indexed positionally via ``iloc``).
    cfg  : run configuration; ``xgb_trials`` and ``early_stopping_rounds`` are read.

    Returns
    -------
    dict : ``study.best_params`` -- the sampled values of the best trial.
    """
    def objective(trial):
        params = {
            'objective': trial.suggest_categorical('objective', ['reg:tweedie']),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42, 'verbosity': 0
        }
        if params['objective'] == 'reg:tweedie':
            params['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1.1, 1.9)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = []
        for tr, va in kf.split(X):
            X_tr, X_va = X.iloc[tr], X.iloc[va]
            y_tr, y_va = y.iloc[tr], y.iloc[va]
            # Early stopping is configured on the estimator: passing
            # `early_stopping_rounds` to fit() has been deprecated since
            # XGBoost 1.6 and is rejected by later releases.
            m = xgb.XGBRegressor(**params, early_stopping_rounds=cfg.early_stopping_rounds)
            m.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
            p = m.predict(X_va)
            scores.append(np.sqrt(mean_squared_error(y_va, p)))
        return float(np.mean(scores))
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=cfg.xgb_trials, show_progress_bar=True)
    return study.best_params


def tune_lgb(X: pd.DataFrame, y: pd.Series, cfg: RunConfig) -> Dict:
    """Optuna tuner for LightGBM (regression/Tweedie).

    Runs ``cfg.lgb_trials`` TPE trials (sampler seed 42); a trial's score is
    the mean RMSE over a fixed 5-fold shuffled split with early stopping on
    the held-out fold. Returns ``study.best_params``.
    """
    def _mean_cv_rmse(model_params):
        # Same deterministic split for every trial so scores are comparable.
        splitter = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_rmse = []
        for fit_idx, hold_idx in splitter.split(X):
            X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
            X_hold, y_hold = X.iloc[hold_idx], y.iloc[hold_idx]
            booster = lgb.LGBMRegressor(**model_params)
            booster.fit(
                X_fit, y_fit,
                eval_set=[(X_hold, y_hold)],
                callbacks=[lgb.early_stopping(cfg.early_stopping_rounds), lgb.log_evaluation(0)],
            )
            hold_pred = booster.predict(X_hold)
            fold_rmse.append(np.sqrt(mean_squared_error(y_hold, hold_pred)))
        return float(np.mean(fold_rmse))

    def objective(trial):
        # The loss is sampled first; the Tweedie power is only sampled for 'tweedie'.
        loss_name = trial.suggest_categorical('objective', ['regression', 'tweedie'])
        search_point = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'objective': loss_name, 'metric': 'rmse',
            'random_state': 42, 'verbosity': -1
        }
        if loss_name == 'tweedie':
            search_point['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1.1, 1.9)
        return _mean_cv_rmse(search_point)

    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=cfg.lgb_trials, show_progress_bar=True)
    return study.best_params


def tune_cb(X: pd.DataFrame, y: pd.Series, cfg: RunConfig) -> Dict:
    """Optuna tuner for CatBoost (RMSE/Tweedie).

    Runs ``cfg.cb_trials`` TPE trials (sampler seed 42), each scored by mean
    RMSE over a fixed 5-fold shuffled split. The returned dict is the best
    trial's parameters with the helper keys ``use_tweedie`` and
    ``tweedie_variance_power`` folded into a single ``loss_function`` entry.
    """
    def _mean_cv_rmse(model_params):
        splitter = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_rmse = []
        for fit_idx, hold_idx in splitter.split(X):
            X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
            X_hold, y_hold = X.iloc[hold_idx], y.iloc[hold_idx]
            regressor = CatBoostRegressor(**model_params)
            regressor.fit(X_fit, y_fit, eval_set=(X_hold, y_hold), use_best_model=True)
            hold_pred = regressor.predict(X_hold)
            fold_rmse.append(np.sqrt(mean_squared_error(y_hold, hold_pred)))
        return float(np.mean(fold_rmse))

    def objective(trial):
        bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
        # Only `True` is offered, so the Tweedie loss is always selected here.
        use_tweedie = trial.suggest_categorical('use_tweedie', [True])
        if use_tweedie:
            vp = trial.suggest_float('tweedie_variance_power', 1.1, 1.9)
            loss = f'Tweedie:variance_power={vp}'
        else:
            loss = 'RMSE'
        search_point = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
            'rsm': trial.suggest_float('rsm', 0.5, 1.0),
            'bootstrap_type': bootstrap_type,
            'loss_function': loss, 'eval_metric': 'RMSE',
            'random_state': 42, 'verbose': False, 'allow_writing_files': False, 'task_type': 'CPU'
        }
        # Each bootstrap type has its own sampling knob; MVS adds none here.
        if bootstrap_type == 'Bayesian':
            search_point['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 5.0)
        elif bootstrap_type == 'Bernoulli':
            search_point['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        return _mean_cv_rmse(search_point)

    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=cfg.cb_trials, show_progress_bar=True)

    # Normalize Tweedie flags for reuse: drop the two search-only keys and
    # express the choice as the `loss_function` string.
    tuned = study.best_params
    tweedie_selected = tuned.pop('use_tweedie', False)
    variance_power = tuned.pop('tweedie_variance_power', 1.5)
    tuned['loss_function'] = f'Tweedie:variance_power={variance_power}' if tweedie_selected else 'RMSE'
    return tuned

def tune_twlr(X: pd.DataFrame, y: pd.Series, cfg: RunConfig) -> Dict:
    """Optuna tuner for Tweedie GLM (linear regression).

    Searches ``power`` in [1.1, 1.9] and ``alpha`` (log scale) with TPE
    (sampler seed 42); each trial is scored by mean RMSE over a fixed 5-fold
    shuffled split. The number of trials is ``cfg.twlr_trials`` when that
    attribute exists, otherwise 15. Features are used as given (no scaling is
    applied inside this function). Returns ``study.best_params``.
    """
    def _mean_cv_rmse(glm_params):
        splitter = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_rmse = []
        for fit_idx, hold_idx in splitter.split(X):
            X_fit, X_hold = X.iloc[fit_idx], X.iloc[hold_idx]
            # Negative targets are clipped to 0 for both fitting and scoring.
            y_fit = np.clip(y.iloc[fit_idx], 0, None)
            y_hold = np.clip(y.iloc[hold_idx], 0, None)

            glm = TweedieRegressor(**glm_params)
            glm.fit(X_fit, y_fit)
            hold_pred = glm.predict(X_hold)
            fold_rmse.append(np.sqrt(mean_squared_error(y_hold, hold_pred)))
        return float(np.mean(fold_rmse))

    def objective(trial):
        search_point = {
            # same power range as the Tweedie searches of the boosted models
            'power': trial.suggest_float('power', 1.1, 1.9),
            'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
            # not searched: fit_intercept, max_iter, tol (estimator defaults are used)
            'link': 'auto',
        }
        return _mean_cv_rmse(search_point)

    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
    trial_budget = int(getattr(cfg, "twlr_trials", 15))  # falls back to 15 if not in RunConfig
    study.optimize(objective, n_trials=trial_budget, show_progress_bar=True)
    return study.best_params

# =====================================================================
# BUILDERS + FITTERS (PER-MODEL)
# =====================================================================

def build_xgb(params: Dict, seed: int):
    """Create an XGBRegressor from ``params``; ``random_state`` and ``verbosity`` are always overridden."""
    settings = dict(params)
    settings.update({'random_state': seed, 'verbosity': 0})
    return xgb.XGBRegressor(**settings)

def fit_xgb(model, X_tr, y_tr, X_va, y_va, cfg: RunConfig):
    """Fit an XGBoost regressor with early stopping on the validation fold.

    ``cfg.early_stopping_rounds`` is applied through ``set_params`` instead of
    the ``fit`` keyword, which has been deprecated since XGBoost 1.6 and is
    rejected by later releases. The model is fitted in place; nothing is returned.
    """
    model.set_params(early_stopping_rounds=cfg.early_stopping_rounds)
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)

def build_lgb(params: Dict, seed: int):
    """Create an LGBMRegressor from ``params`` with every seed field set to ``seed`` and logging silenced."""
    settings = dict(params)
    settings.update({
        'random_state': seed,
        'verbosity': -1,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
    })
    return lgb.LGBMRegressor(**settings)

def fit_lgb(model, X_tr, y_tr, X_va, y_va, cfg: RunConfig):
    """Fit a LightGBM regressor in place, early-stopping on the validation fold with evaluation logging disabled."""
    stop_early = lgb.early_stopping(cfg.early_stopping_rounds)
    quiet_eval = lgb.log_evaluation(0)
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], callbacks=[stop_early, quiet_eval])

def build_cb(params: Dict, seed: int):
    """Create a CatBoostRegressor from ``params``; metric, seed, threading and output settings are always overridden."""
    settings = dict(params)
    settings.update({
        'eval_metric': 'RMSE',
        'random_state': seed,
        'thread_count': -1,
        'verbose': False,
        'allow_writing_files': False,
    })
    return CatBoostRegressor(**settings)

def fit_cb(model, X_tr, y_tr, X_va, y_va, cfg: RunConfig):
    """Fit a CatBoost regressor in place, keeping the best iteration on the validation fold.

    ``cfg`` is accepted for interface parity with the other fitters and is not read.
    """
    validation_pair = (X_va, y_va)
    model.fit(X_tr, y_tr, eval_set=validation_pair, use_best_model=True)

def build_twlr(params: Dict, seed: int):
    """Create a TweedieRegressor from ``params``.

    ``seed`` is accepted for interface parity with the other builders and is
    not passed to the estimator.
    """
    glm = TweedieRegressor(**params)
    return glm

def fit_twlr(model, X_tr, y_tr, X_va, y_va, cfg: RunConfig):
    """Fit the Tweedie GLM in place on the training fold.

    Negative targets are clipped to 0 before fitting. No feature scaling is
    done here, so any scaling has to be applied by the caller. The validation
    fold and ``cfg`` are accepted for interface parity and are not used.
    """
    non_negative_target = np.clip(y_tr, 0, None)
    model.fit(X_tr, non_negative_target)

# =====================================================================
# MODEL REGISTRY (ADD NEW MODELS HERE)
# =====================================================================

# Maps the model keys usable in RunConfig.models to their ModelSpec.
# The tree ensembles take raw features; the Tweedie GLM is a linear model and
# is therefore flagged as needing scaled inputs.
MODEL_REGISTRY: Dict[str, ModelSpec] = {
    "xgb": ModelSpec("xgb", tuner=tune_xgb, builder=build_xgb, fit_fn=fit_xgb, needs_scaling=False),
    "lgb": ModelSpec("lgb", tuner=tune_lgb, builder=build_lgb, fit_fn=fit_lgb, needs_scaling=False),
    "cb":  ModelSpec("cb",  tuner=tune_cb,  builder=build_cb,  fit_fn=fit_cb,  needs_scaling=False),
    "twlr": ModelSpec("twlr", tuner=tune_twlr, builder=build_twlr, fit_fn=fit_twlr, needs_scaling=True),
}
# To add a new model (example sketch):
# def tune_enet(X, y, cfg): ...  # inside CV, fit a scaler on X_tr, transform X_tr/X_va
# def build_enet(params, seed): return ElasticNet(**params, random_state=seed)
# def fit_enet(model, X_tr, y_tr, X_va, y_va, cfg): model.fit(X_tr, y_tr)
# MODEL_REGISTRY["enet"] = ModelSpec("enet", tune_enet, build_enet, fit_enet, needs_scaling=True)

In [5]:
# =====================================================================
# META-LEARNER (GENERIC, RAW PREDICTIONS ONLY)
# =====================================================================

def create_meta_features(
    preds: ArrayLike,
    *,
    order: Optional[List[str]] = None,     # fixed model order (for dict input)
    include_raw: bool = True,
    include_squares: bool = True,
    include_pairwise: bool = True,         # pairwise interactions: xi * xj
    include_stats: Sequence[str] = ("max", "min", "mean", "std")
) -> Tuple[np.ndarray, List[str], List[str]]:
    """
    Build meta-features from an arbitrary number of base-model predictions.

    Parameters
    ----------
    preds : dict[str, 1D array] | list/tuple[1D array] | 2D array (n_samples, n_models)
        Base predictions (OOF or test). All arrays must have the same length.
        A 1D array is treated as a single model.
    order : list[str], optional
        If preds is a dict, enforce this column order (keeps train/test consistent).
        If None, dict keys are sorted alphabetically.
    include_raw : bool
        Include raw base predictions as features.
    include_squares : bool
        Include squared terms for each base prediction.
    include_pairwise : bool
        Include pairwise interaction terms for each unordered pair (i<j).
    include_stats : sequence of {"max","min","mean","std"}, a single such name, or None
        Row-wise reduction stats across models (case-insensitive). A bare
        string is treated as one statistic name, not as a sequence of
        characters. Names outside the four listed are ignored.

    Returns
    -------
    Xf : ndarray, shape (n_samples, n_features)
        Engineered meta-features, column blocks in the order
        raw, squares, pairwise, row_max, row_min, row_mean, row_std.
    feature_names : list[str]
        Names/descriptors for each column in Xf.
    model_names : list[str]
        The ordered model names corresponding to input columns (to reuse later).

    Raises
    ------
    KeyError
        If ``order`` names a model that is not a key of the ``preds`` dict.
    ValueError
        If no prediction column is given or the columns differ in length.
    """
    # ---- normalize input to matrix (n_samples, n_models) + names ----
    if isinstance(preds, dict):
        model_names = list(order) if order is not None else sorted(preds.keys())
        absent = [name for name in model_names if name not in preds]
        if absent:
            raise KeyError(f"Models listed in `order` but missing from `preds`: {absent}")
        cols = [np.asarray(preds[name]).reshape(-1) for name in model_names]
    elif isinstance(preds, (list, tuple)):
        cols = [np.asarray(a).reshape(-1) for a in preds]
        model_names = [f"m{i}" for i in range(len(cols))]
    else:
        X = np.asarray(preds)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        model_names = [f"m{i}" for i in range(X.shape[1])]
        cols = [X[:, i] for i in range(X.shape[1])]

    # length consistency check
    if len(cols) == 0:
        raise ValueError("No prediction columns provided.")
    n = len(cols[0])
    if any(len(c) != n for c in cols):
        raise ValueError("All prediction arrays must have the same length.")

    X = np.column_stack(cols).astype(np.float64, copy=False)
    m = X.shape[1]

    features: List[np.ndarray] = []
    names: List[str] = []

    # ---- raw ----
    if include_raw:
        features.append(X)
        names += [f"{name}" for name in model_names]

    # ---- squares ----
    if include_squares:
        features.append(X ** 2)
        names += [f"{name}^2" for name in model_names]

    # ---- pairwise interactions ----
    if include_pairwise and m >= 2:
        for i, j in combinations(range(m), 2):
            features.append((X[:, i] * X[:, j]).reshape(-1, 1))
            names.append(f"{model_names[i]}*{model_names[j]}")

    # ---- row-wise stats ----
    if not include_stats:
        requested = set()
    elif isinstance(include_stats, str):
        # set("max") would split into characters and silently match nothing.
        requested = {include_stats.lower()}
    else:
        requested = {s.lower() for s in include_stats}

    # Fixed emission order keeps train and test matrices column-aligned.
    reducers = (("max", np.max), ("min", np.min), ("mean", np.mean), ("std", np.std))
    for stat_name, reducer in reducers:
        if stat_name in requested:
            features.append(reducer(X, axis=1, keepdims=True))
            names.append(f"row_{stat_name}")

    Xf = np.column_stack(features) if features else np.empty((n, 0))
    return Xf, names, model_names


def create_meta_matrix(
    preds_by_model: Dict[str, np.ndarray],
    order: Optional[List[str]] = None,
    engineered: bool = False,
) -> Tuple[np.ndarray, List[str]]:
    """
    Build the meta-learner design matrix from a dict of base-model predictions.

    Thin wrapper around ``create_meta_features`` that always keeps the raw
    prediction columns and toggles the extra feature families as a group.

    Parameters
    ----------
    preds_by_model : mapping of model name -> prediction array.
    order          : optional explicit model order, forwarded unchanged to
                     ``create_meta_features``.
    engineered     : falsy -> raw predictions only;
                     truthy -> raw + squares + pairwise products
                               + row-wise max / min / std.

    Returns
    -------
    (matrix, model_order) : the feature matrix and the model-name order that
    ``create_meta_features`` reports for its columns. The per-feature names it
    also returns are discarded here.
    """
    # Both modes make the same call; only these three switches differ.
    if engineered:
        feature_flags = dict(
            include_squares=True,
            include_pairwise=True,
            include_stats=("max", "min", "std"),
        )
    else:
        feature_flags = dict(
            include_squares=False,
            include_pairwise=False,
            include_stats=(),
        )

    matrix, _feature_names, model_order = create_meta_features(
        preds_by_model,
        order=order,
        include_raw=True,
        **feature_flags,
    )
    return matrix, model_order


def fit_meta(
    oof_by_model: Dict[str, np.ndarray],
    y_true: np.ndarray,
    meta_type: str,
    *,
    engineered: bool = False
) -> Tuple[object, List[str], float]:
    """
    Fit an intercept-free linear meta-learner on N base-model predictions.

    Parameters
    ----------
    oof_by_model : mapping of model name -> prediction array used as features.
    y_true       : regression target aligned with those predictions.
    meta_type    : "ridge"  -> Ridge with alpha picked by a 5-fold grid search
                               (the search's refit best estimator is returned);
                   "linear" -> plain LinearRegression.
    engineered   : forwarded to ``create_meta_matrix`` to select the feature mode.

    Returns
    -------
    model      : the fitted meta model.
    base_order : list[str], base-model order used for the matrix columns.
    rmse       : float, RMSE of the zero-clipped predictions of ``model`` on the
                 very matrix it was fitted on.

    Raises
    ------
    ValueError : if ``meta_type`` is neither "linear" nor "ridge".
    """
    # Column order is left to create_meta_matrix (order=None).
    design, base_order = create_meta_matrix(oof_by_model, order=None, engineered=engineered)

    if meta_type == "linear":
        model = LinearRegression(fit_intercept=False)
        model.fit(design, y_true)
    elif meta_type == "ridge":
        alpha_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]}
        search = GridSearchCV(
            Ridge(fit_intercept=False),
            alpha_grid,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        search.fit(design, y_true)
        model = search.best_estimator_
    else:
        raise ValueError("meta_type must be 'linear' or 'ridge'")

    # Predictions are floored at zero before scoring.
    clipped_fit = np.maximum(model.predict(design), 0)
    squared_error = mean_squared_error(y_true, clipped_fit)
    return model, base_order, float(np.sqrt(squared_error))


def predict_meta(
    model: object,
    col_order: List[str],
    preds_by_model: Dict[str, np.ndarray],
    *,
    engineered: bool = False
) -> np.ndarray:
    """
    Apply a fitted meta-learner to a new set of base-model predictions.

    The design matrix is rebuilt through ``create_meta_matrix`` with
    ``order=col_order`` and the given ``engineered`` flag; callers are expected
    to pass the same values that were used when the model was fitted.

    Returns the model's predictions with negative values raised to zero.
    """
    features, _model_order = create_meta_matrix(
        preds_by_model, order=col_order, engineered=engineered
    )
    raw_output = model.predict(features)
    return np.maximum(raw_output, 0)

In [6]:
# =====================================================================
# TUNING OR LOADING PARAMS (ONLY FOR SELECTED MODELS)
# =====================================================================

def get_params_for_models(cfg: RunConfig, X_raw: pd.DataFrame, y: pd.Series
                          ) -> Dict[str, Dict]:
    """
    Return ``{model_name: params}`` for every model in ``cfg.models``.

    Params come from the JSON cache at ``cfg.param_cache_path``. A model is
    (re-)tuned via its registry ``tuner`` when ``cfg.tune`` is set or when the
    cache has no entry for it. The tuner receives the raw (unscaled) features.

    The cache is written back after each individual tuning run, so results
    already obtained survive a failure while tuning a later model.

    Raises
    ------
    KeyError : if any entry of ``cfg.models`` is not a key of ``MODEL_REGISTRY``.
               Raised before any tuning starts.
    """
    # Fail fast: an unknown key would otherwise only surface mid-loop, after
    # earlier models may already have gone through a long tuning run.
    unknown = [key for key in cfg.models if key not in MODEL_REGISTRY]
    if unknown:
        raise KeyError(
            f"Unknown model key(s) {unknown}; available: {sorted(MODEL_REGISTRY)}"
        )

    cache = load_cached_params(cfg.param_cache_path)
    changed = False
    out: Dict[str, Dict] = {}

    for key in cfg.models:
        if cfg.tune or key not in cache:
            print(f"[params] Tuning: {key}")
            cache[key] = MODEL_REGISTRY[key].tuner(X_raw, y, cfg)  # pass RAW X (no global scaling)
            # Persist right away so this result is not lost if a later tuner crashes.
            save_cached_params(cfg.param_cache_path, cache)
            changed = True
        out[key] = cache[key]

    if changed:
        print(f"[params] Saved tuned params -> {cfg.param_cache_path}")
    elif os.path.exists(cfg.param_cache_path):
        print(f"[params] Loaded tuned params from cache -> {cfg.param_cache_path}")
    else:
        # Reached only when nothing was tuned AND there is no cache file on disk.
        print("[params] No cache file found; no params were tuned this run.")

    return out


# =====================================================================
# TRAINING (GENERIC N-MODEL, MULTI-SEED, K-FOLD)
# =====================================================================

def train_generic_ensemble(
    cfg: RunConfig,
    params_by_model: Dict[str, Dict],
    X_train_raw: pd.DataFrame,
    y_train: pd.Series,
    X_test_raw: pd.DataFrame
) -> Tuple[np.ndarray, Dict]:
    """
    Train all chosen models generically with multi-seed K-Fold and blend them.

    For each seed in ``cfg.seeds``:
        - shuffled KFold split (``cfg.n_folds`` folds, seeded with that seed)
        - if ``cfg.needs_scaling``: MinMax scaling fitted on the fold's training
          rows only, then applied to the validation and test rows
        - every model in ``cfg.models`` is built/fitted through ``MODEL_REGISTRY``;
          OOF predictions are filled in and test predictions averaged over folds
        - a meta-learner (``cfg.meta_type``) is fitted on the ENGINEERED meta
          features of the OOF predictions
        - its RMSE is compared with the zero-clipped uniform average; the test
          predictions of the better one are kept (ties go to the meta-learner)

    The kept test predictions are averaged across seeds and clipped at zero.

    Finally the per-model OOFs are averaged across seeds and the same
    uniform-vs-meta comparison is repeated on them, using the same feature mode
    as the per-seed fits. That last comparison is reporting only: it fills
    ``report`` but does not influence the returned predictions.

    NOTE(review): the meta RMSE comes from ``fit_meta``, which scores the model
    on the rows it was fitted on, whereas the uniform average has no fitted
    parameters. The comparison therefore leans towards the meta-learner.

    Returns
    -------
    final_preds : np.ndarray, seed-averaged test predictions, floored at 0.
    report      : dict with keys ``per_seed_model_rmses``, ``final_method``,
                  ``uniform_oof_rmse`` and ``meta_oof_rmse``.

    Raises
    ------
    ValueError : if ``cfg.seeds`` or ``cfg.models`` is empty.
    """
    # Explicit guards instead of an opaque np.column_stack([]) failure later on.
    if not cfg.seeds:
        raise ValueError("cfg.seeds must contain at least one seed.")
    if not cfg.models:
        raise ValueError("cfg.models must contain at least one model key.")

    # Single switch for the meta-feature mode so that the per-seed fits, the
    # test-time prediction and the final reporting fit cannot drift apart.
    use_engineered_meta = True

    def _rmse(pred: np.ndarray) -> float:
        """RMSE of ``pred`` against the full training target."""
        return float(np.sqrt(mean_squared_error(y_train, pred)))

    per_seed_chosen_preds = []
    per_seed_oof_by_model = []      # list of dicts {model: oof}
    per_seed_model_rmses = []       # reporting rmse per model per seed

    any_needs_scaling = cfg.needs_scaling
    names = sorted(cfg.models)      # stable column order for the uniform-average stacks

    for seed in cfg.seeds:
        print(f"\n>>> Seed {seed}")
        kf = KFold(n_splits=cfg.n_folds, shuffle=True, random_state=seed)

        # init collectors for this seed
        oof_by_model = {m: np.zeros(len(X_train_raw)) for m in cfg.models}
        test_by_model = {m: np.zeros(len(X_test_raw)) for m in cfg.models}

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train_raw), 1):
            print(f"    Fold {fold}/{cfg.n_folds}")

            X_tr_raw = X_train_raw.iloc[tr_idx]
            y_tr = y_train.iloc[tr_idx]
            X_va_raw = X_train_raw.iloc[va_idx]
            y_va = y_train.iloc[va_idx]

            if any_needs_scaling:
                # Fit the scaler on this fold's training rows only, then reuse
                # it for the validation and test rows.
                scaler = MinMaxScaler()
                X_tr = pd.DataFrame(scaler.fit_transform(X_tr_raw), columns=X_tr_raw.columns, index=X_tr_raw.index)
                X_va = pd.DataFrame(scaler.transform(X_va_raw), columns=X_va_raw.columns, index=X_va_raw.index)
                X_te = pd.DataFrame(scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index)
            else:
                X_tr, X_va, X_te = X_tr_raw, X_va_raw, X_test_raw

            # run every selected model
            for key in cfg.models:
                spec = MODEL_REGISTRY[key]
                model = spec.builder(params_by_model[key], seed)
                spec.fit_fn(model, X_tr, y_tr, X_va, y_va, cfg)

                # fill OOF + accumulate the fold-averaged test preds
                oof_by_model[key][va_idx] = spec.predict_fn(model, X_va)
                test_by_model[key] += spec.predict_fn(model, X_te) / cfg.n_folds

        # Per-model OOF RMSEs (reporting only; predictions are not clipped here)
        model_rmses = {key: _rmse(oof_by_model[key]) for key in cfg.models}
        print("    OOF RMSE by model:", " | ".join(f"{k}: {v:.3f}" for k, v in model_rmses.items()))

        # Uniform average baseline (generic)
        oof_stack = np.column_stack([oof_by_model[n] for n in names])
        test_stack = np.column_stack([test_by_model[n] for n in names])
        uniform_oof = np.maximum(oof_stack.mean(axis=1), 0)
        uniform_test = np.maximum(test_stack.mean(axis=1), 0)
        uniform_rmse = _rmse(uniform_oof)
        print(f"    Uniform-average OOF RMSE: {uniform_rmse:.3f}")

        # Meta-learner on the engineered meta features of this seed's OOFs
        meta_model, base_order, meta_rmse = fit_meta(
            oof_by_model, y_train.values, cfg.meta_type, engineered=use_engineered_meta
        )
        meta_test = predict_meta(meta_model, base_order, test_by_model, engineered=use_engineered_meta)

        print(f"    Meta-learner ({cfg.meta_type}) OOF RMSE: {meta_rmse:.3f}")

        # pick better for this seed
        chosen = meta_test if meta_rmse <= uniform_rmse else uniform_test
        per_seed_chosen_preds.append(chosen)
        per_seed_oof_by_model.append(oof_by_model)
        per_seed_model_rmses.append(model_rmses)

    # ===== Average across seeds =====
    final_seed_avg = np.mean(np.column_stack(per_seed_chosen_preds), axis=1)

    # Averaged OOF across seeds. Building a fresh array here (rather than
    # accumulating into the first seed's array and dividing in place) leaves
    # the per-seed OOF arrays untouched.
    n_seeds = len(per_seed_oof_by_model)
    avg_oof_by_model: Dict[str, np.ndarray] = {
        k: sum(seed_oof[k] for seed_oof in per_seed_oof_by_model) / n_seeds
        for k in cfg.models
    }

    # final uniform vs final meta on averaged OOFs (reporting only)
    oof_stack = np.column_stack([avg_oof_by_model[n] for n in names])
    uniform_oof = np.maximum(oof_stack.mean(axis=1), 0)
    uniform_rmse = _rmse(uniform_oof)
    _, _, meta_rmse = fit_meta(
        avg_oof_by_model, y_train.values, cfg.meta_type, engineered=use_engineered_meta
    )
    final_method = f"Meta ({cfg.meta_type})" if meta_rmse <= uniform_rmse else "Uniform Average"

    report = {
        "per_seed_model_rmses": per_seed_model_rmses,
        "final_method": final_method,
        "uniform_oof_rmse": uniform_rmse,
        "meta_oof_rmse": meta_rmse
    }
    return np.maximum(final_seed_avg, 0), report

In [7]:
def parse_args(argv=None) -> RunConfig:
    """
    Parse CLI arguments into a ``RunConfig``.

    Plays nice with Jupyter/VSCode: unknown arguments (e.g. the kernel's ``-f``)
    are ignored instead of aborting.

    Parameters
    ----------
    argv : optional list of argument strings; ``None`` lets argparse read
           ``sys.argv``.

    Notes
    -----
    ``--tune`` and ``--scale-policy`` are honoured. With no arguments the
    result is ``tune=False`` and ``needs_scaling=True``.
    """
    def _parse_bool(text: str) -> bool:
        """Convert a command-line token such as 'true' / 'False' / '0' to bool."""
        lowered = text.strip().lower()
        if lowered in ("true", "t", "1", "yes", "y", "on"):
            return True
        if lowered in ("false", "f", "0", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean (true/false), got {text!r}")

    p = argparse.ArgumentParser(description="Generic Big Mart Ensemble with N-Model Meta-Learner (Leakage-Safe)")
    p.add_argument("--train", type=str, default="train_v9rqX0R.csv", help="Path to train CSV")
    p.add_argument("--test", type=str, default="test_AbJTz2l.csv", help="Path to test CSV")
    p.add_argument("--param-cache", type=str, default="tuned_params.json", help="Path to param cache JSON")
    p.add_argument("--submission", type=str, default="submission_generic_ensemble.csv", help="Path to output submission CSV")
    p.add_argument("--seeds", type=int, nargs="+", default=None, help="Seeds for multi-seed averaging")
    p.add_argument("--folds", type=int, default=10, help="Number of K folds")
    p.add_argument("--tune", action="store_true", help="Force tuning for the selected models")
    p.add_argument("--xgb-trials", type=int, default=50, help="XGB Optuna trials")
    p.add_argument("--lgb-trials", type=int, default=100, help="LGB Optuna trials")
    p.add_argument("--cb-trials", type=int, default=30, help="CB Optuna trials")
    p.add_argument("--esr", type=int, default=50, help="Early stopping rounds")
    p.add_argument("--models", nargs="+", default=None, help="Subset of models to use (registry keys), e.g. --models xgb lgb cb")
    p.add_argument("--meta", type=str, choices=["linear", "ridge"], default="ridge", help="Meta-learner type")
    # Command-line values arrive as strings, so bool `choices` without a
    # converter can never match; convert explicitly instead.
    p.add_argument("--scale-policy", type=_parse_bool, default=True, metavar="{true,false}",
                   help="Whether to scale features (sets RunConfig.needs_scaling)")

    # <-- swallow unknown args (e.g., Jupyter's -f)
    args, _unknown = p.parse_known_args(argv)

    return RunConfig(
        train_path=args.train,
        test_path=args.test,
        param_cache_path=args.param_cache,
        submission_path=args.submission,
        seeds=args.seeds,
        n_folds=args.folds,
        tune=args.tune,
        xgb_trials=args.xgb_trials,
        lgb_trials=args.lgb_trials,
        cb_trials=args.cb_trials,
        early_stopping_rounds=args.esr,
        models=args.models,
        meta_type=args.meta,
        needs_scaling=args.scale_policy,
    )


In [8]:
# Seed NumPy's global random state for this notebook run.
np.random.seed(42)
# Build the run configuration. No argv is passed, so parse_args hands None to
# argparse (which then reads sys.argv) and ignores any arguments it does not know.
cfg = parse_args()

In [9]:
# =====================================================================
# MAIN ORCHESTRATION
# =====================================================================
# Steps 1-4 of the pipeline: load the data, engineer features, build the
# model matrices and obtain per-model hyperparameters. The names assigned here
# (X_train_raw, X_test_raw, y_train, params_by_model, id_frame) are consumed by
# the training / submission cell that follows.

# NOTE: bare string expression; it is evaluated and discarded (no function owns it).
"""Orchestrate the full, generic pipeline."""
print("=" * 72)
print("BIG MART SALES — GENERIC ENSEMBLE (N-Models + N-Pred Meta-Learner)")
print("=" * 72)

print("\n[1/6] Loading data...")
# load_data returns the train frame, the test frame and a third frame that is
# later used for the submission's identifier columns.
train, test, id_frame = load_data(cfg)
print(f"Train: {train.shape}, Test: {test.shape}")

print("\n[2/6] Cleaning + feature engineering...")
train_final, test_final, feature_cols = clean_and_engineer_features(train, test)

print("\n[3/6] Preparing matrices (raw features; no global scaling)...")
# Keep only the engineered feature columns; remaining missing values become 0.
X_train_raw = train_final[feature_cols].fillna(0)
X_test_raw  = test_final[feature_cols].fillna(0)
# Regression target.
y_train = train_final['Item_Outlet_Sales']

print("\n[4/6] Load/Tune per-model hyperparameters (leakage-safe)...")
# Returns {model_name: params}, read from the param cache or freshly tuned.
params_by_model = get_params_for_models(cfg, X_train_raw, y_train)

BIG MART SALES — GENERIC ENSEMBLE (N-Models + N-Pred Meta-Learner)

[1/6] Loading data...
Train: (8523, 12), Test: (5681, 11)

[2/6] Cleaning + feature engineering...

[3/6] Preparing matrices (raw features; no global scaling)...

[4/6] Load/Tune per-model hyperparameters (leakage-safe)...
[params] Loaded tuned params from cache -> tuned_params.json


In [10]:
# Steps 5-6 of the pipeline: train the ensemble, then write the submission CSV.
# Relies on cfg, params_by_model, X_train_raw, y_train, X_test_raw and id_frame
# from the earlier cells.
print("\n[5/6] Train generic multi-seed K-Fold ensemble...")
# final_preds: test-set predictions; report: dict of diagnostics printed below.
final_preds, report = train_generic_ensemble(cfg, params_by_model, X_train_raw, y_train, X_test_raw)

print("\n[6/6] Write submission...")
# Two identifier columns from id_frame plus the predicted sales.
submission = pd.DataFrame({
    'Item_Identifier': id_frame['Item_Identifier'],
    'Outlet_Identifier': id_frame['Outlet_Identifier'],
    'Item_Outlet_Sales': final_preds
})
# index=False: do not write the DataFrame index as an extra CSV column.
submission.to_csv(cfg.submission_path, index=False)

print("\n" + "=" * 72)
print("SUBMISSION GENERATED")
print("=" * 72)
print(f"File: {cfg.submission_path}")
print(f"Models used: {cfg.models}")
# 'final_method' and both RMSE figures come from the report's averaged-OOF comparison.
print(f"Meta type: {cfg.meta_type} | Final method chosen: {report['final_method']}")
print(f"Uniform OOF RMSE: {report['uniform_oof_rmse']:.3f} | Meta OOF RMSE: {report['meta_oof_rmse']:.3f}")
print("=" * 72)



[5/6] Train generic multi-seed K-Fold ensemble...

>>> Seed 42
    Fold 1/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[294]	valid_0's tweedie: 311.326
    Fold 2/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[252]	valid_0's tweedie: 305.912
    Fold 3/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[200]	valid_0's tweedie: 315.753
    Fold 4/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[261]	valid_0's tweedie: 309.968
    Fold 5/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[211]	valid_0's tweedie: 320.315
    Fold 6/10
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[226]	valid_0's tweedie: 308.883
    Fold 7/10
Training until validation scores don't improve for 50 rounds
Ear