In [None]:
class Config:
    """Run-wide settings for the stacking pipeline: paths, CV setup, tuning budgets."""

    # Reproducibility and cross-validation layout.
    random_state = 42
    n_splits = 5

    # Optuna trial budgets. `base_trials` holds per-model budgets;
    # `n_trials_base` is the fallback for a base model not listed there.
    n_trials_base = 30
    n_trials_meta = 75
    base_trials = dict(lgbm=75, xgb=75, cat=50)

    # Input directory and output file locations.
    data_dir = "data"
    output_name = "submission/pipe_task3_predictionsv2.csv"
    summary_name = "pipe_task3_summaryv2.json"

# Input columns fed to the models. The order here is the column order of the
# feature matrix, so do not reorder entries without retraining.
FEATURE_COLUMNS = [
    "friend_count",
    "social_interactions",
    "guild_membership",
    "event_participation_rate",
    "daily_login_streak",
    "avg_session_length",
    "sessions_per_week",
    "total_playtime_hours",
    "days_since_last_login",
    "achievement_count",
    "achievement_completion_rate",
    "historical_spending",
    "prev_month_spending",
    "total_transactions",
    "avg_transaction_value",
    "account_age_days",
    "vip_status",
    "is_premium_member",
    "primary_game",
    "games_played",
    "cross_game_activity",
    "platform",
    "days_since_last_purchase",
    "purchase_frequency",
    "payment_methods_used",
    "purchases_on_discount",
    "discount_rate_used",
    "seasonal_spending_pattern",
    "owns_limited_edition",
    "competitive_rank",
    "segment",
]

# Regression target and row-identifier column names.
TARGET_COLUMN = "spending_30d"
ID_COLUMN = "id"

In [None]:
def json_serialize(obj):
    """Fallback encoder for ``json.dump(..., default=json_serialize)``.

    Args:
        obj: An object the standard JSON encoder could not serialize.

    Returns:
        The native Python value for NumPy scalars, a (nested) list for NumPy
        arrays, and ``str(obj)`` for anything else.
    """
    # np.generic is the base class of every NumPy scalar, so this also covers
    # np.bool_, which an integer/floating-only check would fall through and
    # write as the string "True"/"False" instead of a JSON boolean.
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Last resort: keep the dump from failing by storing a string form.
    return str(obj)

def prepare_features(train_df, test_df, features):
    """Turn the train/test frames into dense float32 feature matrices.

    Non-numeric feature columns are ordinal-encoded (missing values become
    the label "missing"), then remaining missing values are imputed with the
    per-column median of the training frame. The input frames are not
    modified.

    Args:
        train_df: Training DataFrame containing every column in ``features``.
        test_df: Test DataFrame containing every column in ``features``.
        features: Column names to use, in the order they should appear in the
            output matrices.

    Returns:
        Tuple ``(X_train, X_test)`` of ``np.float32`` arrays with one column
        per entry of ``features``.
    """
    train_df, test_df = train_df.copy(), test_df.copy()

    # Treat every non-numeric column as categorical. Checking only for the
    # "object" dtype would miss `category` and pandas string dtypes, which
    # would then reach median()/to_numpy(float32) unencoded and fail.
    categorical_cols = [
        col for col in features
        if not pd.api.types.is_numeric_dtype(train_df[col])
    ]
    if categorical_cols:
        def as_labels(frame):
            # Cast to object first so fillna can insert a label that is not
            # an existing category of a `category`-typed column.
            return frame[categorical_cols].astype(object).fillna("missing").astype(str)

        train_labels, test_labels = as_labels(train_df), as_labels(test_df)
        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        # The encoder is fitted on train and test labels together.
        encoder.fit(pd.concat([train_labels, test_labels], axis=0, ignore_index=True))
        train_df[categorical_cols] = encoder.transform(train_labels)
        test_df[categorical_cols] = encoder.transform(test_labels)

    # Imputation statistics come from the training frame only.
    fill_values = train_df[features].median()
    train_df[features] = train_df[features].fillna(fill_values)
    test_df[features] = test_df[features].fillna(fill_values)
    return train_df[features].to_numpy(np.float32), test_df[features].to_numpy(np.float32)

def mae_metric(y_true, y_pred):
    """Return the mean absolute error between `y_true` and flattened `y_pred`.

    `y_pred` must expose `.ravel()` (e.g. an (n, 1) NumPy array); it is
    flattened to one dimension before scoring.
    """
    flat_pred = y_pred.ravel()
    return mean_absolute_error(y_true, flat_pred)

def get_predictions(model, X):
    """Predict with `model` on `X` and return the result as an (n, 1) column array."""
    raw = model.predict(X)
    column = np.asarray(raw)
    return np.reshape(column, (-1, 1))

In [None]:
def build_model(model_name, params, random_state):
    """Instantiate an unfitted regressor for one of the supported model keys.

    Args:
        model_name: One of "rf", "lgbm", "xgb" or "cat".
        params: Hyper-parameters forwarded as keyword arguments to the
            estimator's constructor.
        random_state: Seed passed to the estimator.

    Returns:
        The configured estimator instance.

    Raises:
        ValueError: If `model_name` is not a supported key.
    """
    # Constructors are wrapped in lambdas so that only the requested
    # estimator is actually built.
    factories = {
        "rf": lambda: RandomForestRegressor(
            **params, n_jobs=-1, random_state=random_state
        ),
        "lgbm": lambda: lgb.LGBMRegressor(
            **params, objective="mae", n_jobs=-1, random_state=random_state, verbosity=-1
        ),
        "xgb": lambda: xgb.XGBRegressor(
            **params, objective="reg:squarederror", tree_method="hist", n_jobs=-1,
            random_state=random_state,
        ),
        "cat": lambda: CatBoostRegressor(
            **params, loss_function="MAE", verbose=False, allow_writing_files=False,
            random_seed=random_state,
        ),
    }
    make = factories.get(model_name)
    if make is None:
        raise ValueError(f"Unknown model {model_name}")
    return make()

def sample_params(model_name, trial):
    """Draw one hyper-parameter set for `model_name` from an Optuna-style trial.

    Args:
        model_name: One of "rf", "lgbm", "xgb" or "cat".
        trial: Object exposing `suggest_int(name, low, high)` and
            `suggest_float(name, low, high, log=...)`.

    Returns:
        Dict mapping parameter name to the suggested value, in the order the
        parameters are listed below.

    Raises:
        ValueError: If `model_name` has no search space defined.
    """
    # Each entry: (parameter name, kind, low, high). "log" means a float
    # sampled on a log scale; "float" is a uniformly sampled float.
    spaces = {
        "rf": (
            ("n_estimators", "int", 200, 800),
            ("max_depth", "int", 4, 10),
            ("min_samples_split", "int", 2, 10),
            ("min_samples_leaf", "int", 1, 5),
            ("max_features", "float", 0.5, 1.0),
        ),
        "lgbm": (
            ("n_estimators", "int", 500, 1500),
            ("num_leaves", "int", 32, 256),
            ("learning_rate", "log", 0.01, 0.2),
            ("max_depth", "int", 4, 15),
            ("min_child_samples", "int", 10, 60),
            ("subsample", "float", 0.6, 1.0),
            ("colsample_bytree", "float", 0.6, 1.0),
            ("reg_alpha", "log", 1e-4, 10.0),
            ("reg_lambda", "log", 1e-4, 10.0),
        ),
        "xgb": (
            ("n_estimators", "int", 500, 1500),
            ("learning_rate", "log", 0.01, 0.3),
            ("max_depth", "int", 4, 15),
            ("subsample", "float", 0.6, 1.0),
            ("colsample_bytree", "float", 0.6, 1.0),
            ("min_child_weight", "log", 1e-2, 10.0),
            ("gamma", "log", 1e-3, 5.0),
            ("reg_alpha", "log", 1e-4, 10.0),
            ("reg_lambda", "log", 1e-4, 10.0),
        ),
        "cat": (
            ("iterations", "int", 500, 1500),
            ("depth", "int", 4, 15),
            ("learning_rate", "log", 0.02, 0.3),
            ("l2_leaf_reg", "float", 1.0, 10.0),
        ),
    }
    if model_name not in spaces:
        raise ValueError(f"Unknown model {model_name}")

    drawn = {}
    for name, kind, low, high in spaces[model_name]:
        if kind == "int":
            drawn[name] = trial.suggest_int(name, low, high)
        elif kind == "log":
            drawn[name] = trial.suggest_float(name, low, high, log=True)
        else:
            drawn[name] = trial.suggest_float(name, low, high)
    return drawn

def optimize_model(model_name, X, y, cv, n_trials, random_state):
    """Tune one base model with Optuna, minimising mean cross-validated MAE.

    Args:
        model_name: Model key understood by `sample_params` / `build_model`.
        X: Feature matrix indexable by the integer index arrays from `cv`.
        y: Target vector aligned with `X`.
        cv: Splitter exposing `split(X, y)`.
        n_trials: Number of Optuna trials to run.
        random_state: Seed used for every candidate model and for the Optuna
            sampler.

    Returns:
        Tuple `(best_params, best_value)` from the finished study.
    """
    def objective(trial):
        params = sample_params(model_name, trial)
        fold_scores = []
        for train_idx, valid_idx in cv.split(X, y):
            model = build_model(model_name, params, random_state)
            model.fit(X[train_idx], y[train_idx])
            preds = get_predictions(model, X[valid_idx])
            fold_scores.append(mae_metric(y[valid_idx], preds))
        return float(np.mean(fold_scores))

    # Seed the sampler explicitly: without it the sequence of suggested
    # hyper-parameters changes between runs even though the models are seeded.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params, study.best_value

def generate_oof_predictions(model_name, params, X, y, X_test, cv, random_state):
    """Fit one model per CV fold and collect out-of-fold and test predictions.

    Each fold's model is seeded with `random_state + fold` (folds numbered
    from 1). Validation rows receive the prediction of the model that did not
    see them; test predictions are the average over the fold models.

    Returns:
        Tuple `(oof, test_preds)` of float32 arrays shaped (len(X), 1) and
        (len(X_test), 1).
    """
    n_folds = cv.get_n_splits()
    oof = np.zeros((len(X), 1), dtype=np.float32)
    test_preds = np.zeros((len(X_test), 1), dtype=np.float32)

    for fold_no, (fit_rows, holdout_rows) in enumerate(cv.split(X, y), start=1):
        fold_model = build_model(model_name, params, random_state + fold_no)
        fold_model.fit(X[fit_rows], y[fit_rows])
        oof[holdout_rows] = get_predictions(fold_model, X[holdout_rows])
        # Accumulate in place so the buffer keeps its float32 dtype.
        fold_test = get_predictions(fold_model, X_test)
        test_preds += fold_test / n_folds
    return oof, test_preds

In [None]:
def optimize_meta_model(meta_X, y, cv, n_trials, random_state):
    """Tune the LightGBM meta-model on stacked features, minimising mean CV MAE.

    Args:
        meta_X: Matrix of stacked base-model predictions, indexable by the
            integer index arrays from `cv`.
        y: Target vector aligned with `meta_X`.
        cv: Splitter exposing `split(meta_X, y)`.
        n_trials: Number of Optuna trials to run.
        random_state: Seed used for every candidate model and for the Optuna
            sampler.

    Returns:
        Tuple `(best_params, best_value)` from the finished study.
    """
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 1500),
            "num_leaves": trial.suggest_int("num_leaves", 16, 256),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 60),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True),
        }
        scores = []
        for train_idx, valid_idx in cv.split(meta_X, y):
            # verbosity=-1 matches the other LightGBM models in this file and
            # keeps per-fit warnings from flooding the output during tuning.
            model = lgb.LGBMRegressor(
                **params, objective="mae", n_jobs=-1, random_state=random_state, verbosity=-1
            )
            model.fit(meta_X[train_idx], y[train_idx])
            preds = get_predictions(model, meta_X[valid_idx])
            scores.append(mae_metric(y[valid_idx], preds))
        return float(np.mean(scores))

    # Seed the sampler explicitly so the search is repeatable for a given
    # random_state.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params, study.best_value

def train_meta(params, meta_X, y, meta_test, cv, random_state):
    """Fit the LightGBM meta-model per CV fold and return OOF and test predictions.

    Each fold's model is seeded with `random_state + fold` (folds numbered
    from 1). Test predictions are the average over the fold models.

    Returns:
        Tuple `(oof, test_preds)` of float32 arrays shaped (len(meta_X), 1)
        and (len(meta_test), 1).
    """
    n_folds = cv.get_n_splits()
    oof = np.zeros((len(meta_X), 1), dtype=np.float32)
    test_preds = np.zeros((len(meta_test), 1), dtype=np.float32)

    for fold_no, (fit_rows, holdout_rows) in enumerate(cv.split(meta_X, y), start=1):
        stacker = lgb.LGBMRegressor(
            **params,
            objective="mae",
            n_jobs=-1,
            random_state=random_state + fold_no,
            verbosity=-1,
        )
        stacker.fit(meta_X[fit_rows], y[fit_rows])
        oof[holdout_rows] = get_predictions(stacker, meta_X[holdout_rows])
        # Accumulate in place so the buffer keeps its float32 dtype.
        fold_test = get_predictions(stacker, meta_test)
        test_preds += fold_test / n_folds
    return oof, test_preds

In [None]:
# Only show Optuna warnings and errors; per-trial info logs are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load the raw train/test tables from the configured data directory.
train_df = pd.read_csv(f"{Config.data_dir}/train.csv")
test_df = pd.read_csv(f"{Config.data_dir}/test.csv")

# Target vector, encoded feature matrices and the shared CV splitter.
y = train_df[TARGET_COLUMN].to_numpy(np.float32)
X, X_test = prepare_features(train_df, test_df, FEATURE_COLUMNS)
cv = KFold(
    n_splits=Config.n_splits,
    shuffle=True,
    random_state=Config.random_state,
)

# Base learners to stack, plus the containers filled while training them.
base_models = ["lgbm", "xgb", "cat"]
meta_features = []
meta_test_features = []
meta_feature_names = []
base_scores = {}
base_params = {}

# Tune each base model, then produce its out-of-fold and test predictions,
# which become one column of the meta-model's input.
for model_name in tqdm(base_models, desc="Base models"):
    print(f"Optimizing {model_name}...")
    # Per-model trial budget, falling back to the generic base budget.
    n_trials = Config.base_trials.get(model_name, Config.n_trials_base)
    best_params, best_score = optimize_model(
        model_name, X, y, cv, n_trials, Config.random_state
    )
    base_params[model_name] = best_params
    base_scores[model_name] = best_score
    print(f"{model_name} Best CV MAE: {best_score:.5f}")

    oof, test_preds = generate_oof_predictions(
        model_name, best_params, X, y, X_test, cv, Config.random_state
    )
    meta_feature_names.append(model_name)
    meta_features.append(oof)
    meta_test_features.append(test_preds)

from pathlib import Path

# Stack the base models' predictions column-wise: one column per base model.
meta_X = np.column_stack(meta_features)
meta_test = np.column_stack(meta_test_features)

print("Optimizing Meta Model...")
meta_params, meta_score = optimize_meta_model(meta_X, y, cv, Config.n_trials_meta, Config.random_state)
print(f"Meta Model Best CV MAE: {meta_score:.5f}")

# Refit the tuned meta-model fold by fold to get OOF and averaged test predictions.
meta_oof, meta_test_preds = train_meta(meta_params, meta_X, y, meta_test, cv, Config.random_state)
tuned_score = mae_metric(y, meta_oof)
print(f"Meta OOF MAE (retrained): {tuned_score:.5f}")

output_df = pd.DataFrame({ID_COLUMN: test_df[ID_COLUMN], TARGET_COLUMN: meta_test_preds.ravel()})
# Create the target directory first: to_csv does not create missing parent
# directories, and failing here would discard the whole training run.
Path(Config.output_name).parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(Config.output_name, index=False)
print(f"Submission saved to {Config.output_name}")

summary = {
    "base_scores": base_scores, "base_params": base_params,
    "meta_score": meta_score, "meta_oof_score": tuned_score,
    "meta_params": meta_params
}
Path(Config.summary_name).parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the file does not depend on the platform default.
with open(Config.summary_name, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, default=json_serialize)
print(f"Summary saved to {Config.summary_name}")