In [None]:
class Config:
    """Run-wide settings for the stacking pipeline, read as class attributes."""

    # Seed and number of cross-validation folds.
    random_state = 42
    n_splits = 5

    # Optuna trial budgets. ``base_trials`` gives a budget per base model;
    # ``n_trials_base`` is the fallback for a model missing from that mapping.
    n_trials_base = 30
    n_trials_meta = 30
    base_trials = dict(lgbm=30, xgb=30, cat=30)

    # Directory holding train.csv / test.csv, and the paths of the two outputs.
    data_dir = "data"
    output_name = "submission/pipe_task2_predictionsv2.csv"
    summary_name = "pipe_task2_summaryv2.json"

# Row-identifier column copied into the submission, and the label column.
ID_COLUMN = "id"
TARGET_COLUMN = "segment"

# Columns that nothing in this file creates; presumably present as-is in the
# train/test CSVs (verify against the data).
_RAW_FEATURES = [
    "loyalty_tier",
    "engagement_level",
    "vip_tier",
    "days_since_last_login",
    "avg_monthly_spending",
    "spending_frequency",
    "total_playtime_hours",
    "friend_count",
    "rare_items_count",
    "avg_session_duration",
    "gifts_sent_received",
    "total_spending_thb",
    "friend_invites_sent",
    "peak_concurrent_hours",
    "play_frequency",
    "speed_of_progression",
    "chat_activity_score",
    "tournament_entries",
    "login_streak",
    "win_rate_ranked",
]

# Columns derived from "player_id" by add_player_features().
_PLAYER_FEATURES = [
    "player_num",
    "player_num_norm",
    "player_num_z",
    "player_bucket",
]

# Model inputs, in the column order of the feature matrix.
FEATURE_COLUMNS = _RAW_FEATURES + _PLAYER_FEATURES

In [None]:
def json_serialize(obj):
    """Fallback encoder for ``json.dump(..., default=json_serialize)``.

    Args:
        obj: A value the standard JSON encoder could not handle.

    Returns:
        The native Python scalar for NumPy integer, floating and boolean
        scalars, a (nested) list for NumPy arrays, and ``str(obj)`` for
        anything else.
    """
    # np.bool_ is neither np.integer nor np.floating; without it here a NumPy
    # boolean would fall through and be written as the string "True"/"False".
    if isinstance(obj, (np.integer, np.floating, np.bool_)):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Last resort: keep the dump from failing on an unexpected type.
    return str(obj)

def add_player_features(df, stats):
    """Add numeric features derived from the digits in ``player_id``.

    Writes four columns into ``df`` in place and returns the same frame:
    ``player_num`` (first run of digits in ``player_id`` as a float),
    ``player_num_norm`` (shifted by ``pmin`` and divided by the span),
    ``player_num_z`` (shifted by ``mean`` and divided by ``std``) and
    ``player_bucket`` (index of the quantile bin the number falls in).

    Args:
        df: Frame with a string ``player_id`` column; modified in place.
        stats: Mapping with keys ``pmin``, ``pmax``, ``mean``, ``std`` and
            ``quantiles`` (an iterable of interior bin edges).

    Returns:
        ``df`` with the four columns added.
    """
    # expand=False returns a Series directly. The previous
    # ``extract(...).squeeze()`` collapsed a one-row frame to a bare scalar,
    # which has no ``.astype``.
    player_num = df["player_id"].str.extract(r"(\d+)", expand=False).astype(float)
    df["player_num"] = player_num

    range_span = max(stats["pmax"] - stats["pmin"], 1.0)
    df["player_num_norm"] = (player_num - stats["pmin"]) / range_span

    # Guard the scale like the span above: a zero or NaN std (constant ids or
    # a single reference row) would otherwise yield inf/NaN z-scores.
    std = stats["std"]
    if not std > 0:
        std = 1.0
    df["player_num_z"] = (player_num - stats["mean"]) / std

    # Values outside [pmin - 1, pmax + 1] fall in no bin and get NaN.
    edges = [stats["pmin"] - 1.0, *stats["quantiles"], stats["pmax"] + 1.0]
    # duplicates="drop": repeated quantiles (few distinct ids) would otherwise
    # make pd.cut raise because bin edges must be unique.
    df["player_bucket"] = pd.cut(
        player_num,
        bins=edges,
        labels=False,
        include_lowest=True,
        duplicates="drop",
    )
    return df

def prepare_features(train_df, test_df, features):
    """Build float32 feature matrices for the train and test frames.

    Works on copies, so the caller's frames are left untouched. Steps:

    1. If the training frame has ``player_id``, compute its summary
       statistics on the training rows and add the derived player columns to
       both frames via ``add_player_features``.
    2. Ordinal-encode every text column listed in ``features``. The encoder
       is fitted on train and test together, with missing values replaced by
       the literal ``"missing"``.
    3. Fill remaining gaps with the per-column median of the training frame.

    Args:
        train_df: Training frame.
        test_df: Test frame with the same feature columns.
        features: Column names to keep, in output column order.

    Returns:
        ``(X_train, X_test)`` as ``float32`` NumPy arrays.
    """
    train_df, test_df = train_df.copy(), test_df.copy()

    if "player_id" in train_df.columns:
        # expand=False gives a Series even for a one-row frame, where
        # ``extract(...).squeeze()`` would return a scalar.
        train_player_nums = (
            train_df["player_id"].str.extract(r"(\d+)", expand=False).astype(float)
        )
        player_stats = {
            "pmin": float(train_player_nums.min()),
            "pmax": float(train_player_nums.max()),
            "mean": float(train_player_nums.mean()),
            "std": float(train_player_nums.std()),
            "quantiles": train_player_nums.quantile([0.25, 0.5, 0.75]).tolist(),
        }
        train_df = add_player_features(train_df, player_stats)
        test_df = add_player_features(test_df, player_stats)

    # ``dtype == "object"`` alone misses text columns stored with pandas'
    # dedicated string dtype; those would skip encoding and then break the
    # median and float32 conversion below.
    categorical_cols = [
        col
        for col in features
        if pd.api.types.is_object_dtype(train_df[col])
        or pd.api.types.is_string_dtype(train_df[col])
    ]
    if categorical_cols:
        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        combined = pd.concat(
            [train_df[categorical_cols], test_df[categorical_cols]],
            axis=0,
            ignore_index=True,
        )
        encoder.fit(combined.fillna("missing").astype(str))
        train_df[categorical_cols] = encoder.transform(
            train_df[categorical_cols].fillna("missing").astype(str)
        )
        test_df[categorical_cols] = encoder.transform(
            test_df[categorical_cols].fillna("missing").astype(str)
        )

    # Medians come from the training frame only and are reused for the test frame.
    fill_values = train_df[features].median()
    train_df[features] = train_df[features].fillna(fill_values)
    test_df[features] = test_df[features].fillna(fill_values)
    return (
        train_df[features].to_numpy(np.float32),
        test_df[features].to_numpy(np.float32),
    )

def f1_macro(y_true, probas):
    """Macro-averaged F1 of the arg-max column of ``probas`` against ``y_true``."""
    return f1_score(y_true, probas.argmax(axis=1), average="macro")

def get_probabilities(model, X, num_classes):
    """Return ``model.predict_proba(X)`` as an array with ``num_classes`` columns.

    Args:
        model: Object exposing ``predict_proba``; a ``classes_`` attribute is
            used when present.
        X: Input passed straight to ``predict_proba``.
        num_classes: Number of columns the caller expects.

    Returns:
        A 2-D array of shape ``(n_rows, num_classes)``. A 1-D result is first
        turned into a single column. Extra columns are truncated. Missing
        columns are zero-filled: when ``classes_`` holds one integer label in
        ``[0, num_classes)`` per returned column, each column is placed at
        its label's index; otherwise the returned columns are kept on the
        left.
    """
    proba = np.asarray(model.predict_proba(X))
    if proba.ndim == 1:
        proba = proba.reshape(-1, 1)

    n_cols = proba.shape[1]
    if n_cols == num_classes:
        return proba
    if n_cols > num_classes:
        return proba[:, :num_classes]

    # Fewer columns than expected. Returning the narrow array would either be
    # broadcast across all classes or raise when callers write it into an
    # (n_rows, num_classes) buffer, so pad it here instead.
    padded = np.zeros((proba.shape[0], num_classes), dtype=proba.dtype)
    labels = np.asarray(getattr(model, "classes_", ()))
    labels_usable = (
        labels.ndim == 1
        and labels.size == n_cols
        and labels.dtype.kind in "iu"
        and labels.min() >= 0
        and labels.max() < num_classes
    )
    if labels_usable:
        padded[:, labels] = proba
    else:
        padded[:, :n_cols] = proba
    return padded

In [None]:
def build_model(model_name, params, random_state, num_classes):
    """Construct an unfitted multiclass classifier for ``model_name``.

    Args:
        model_name: One of ``"rf"``, ``"lgbm"``, ``"xgb"`` or ``"cat"``.
        params: Hyper-parameters forwarded to the estimator constructor.
        random_state: Seed passed to the estimator.
        num_classes: Number of target classes (passed to LightGBM and XGBoost).

    Returns:
        The configured estimator.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "rf":
        return RandomForestClassifier(**params, n_jobs=-1, random_state=random_state)
    if model_name == "lgbm":
        lgbm_params = dict(params)
        if "subsample" in lgbm_params:
            # LightGBM only applies row subsampling when subsample_freq > 0;
            # without it a tuned "subsample" value has no effect. An explicit
            # subsample_freq in params still wins.
            lgbm_params.setdefault("subsample_freq", 1)
        return lgb.LGBMClassifier(
            **lgbm_params,
            objective="multiclass",
            num_class=num_classes,
            n_jobs=-1,
            random_state=random_state,
            verbosity=-1,
        )
    if model_name == "xgb":
        return xgb.XGBClassifier(
            **params,
            objective="multi:softprob",
            eval_metric="mlogloss",
            tree_method="hist",
            num_class=num_classes,
            n_jobs=-1,
            random_state=random_state,
        )
    if model_name == "cat":
        return CatBoostClassifier(
            **params,
            loss_function="MultiClass",
            verbose=False,
            allow_writing_files=False,
            random_seed=random_state,
        )
    raise ValueError(f"Unknown model {model_name}")

# Search space per base model as (name, kind, low, high) rows, listed in the
# order the values are requested from the trial. ``kind`` is "int", "float"
# (uniform) or "log" (float sampled with log=True).
_SEARCH_SPACES = {
    "rf": (
        ("n_estimators", "int", 200, 800),
        ("max_depth", "int", 4, 20),
        ("min_samples_split", "int", 2, 10),
        ("min_samples_leaf", "int", 1, 5),
        ("max_features", "float", 0.5, 1.0),
    ),
    "lgbm": (
        ("n_estimators", "int", 200, 800),
        ("num_leaves", "int", 16, 128),
        ("learning_rate", "log", 0.01, 0.2),
        ("max_depth", "int", 3, 10),
        ("min_child_samples", "int", 10, 60),
        ("subsample", "float", 0.6, 1.0),
        ("colsample_bytree", "float", 0.6, 1.0),
        ("reg_alpha", "log", 1e-4, 10.0),
        ("reg_lambda", "log", 1e-4, 10.0),
    ),
    "xgb": (
        ("n_estimators", "int", 200, 800),
        ("learning_rate", "log", 0.01, 0.3),
        ("max_depth", "int", 3, 10),
        ("subsample", "float", 0.6, 1.0),
        ("colsample_bytree", "float", 0.6, 1.0),
        ("min_child_weight", "log", 1e-2, 10.0),
        ("gamma", "log", 1e-3, 5.0),
        ("reg_alpha", "log", 1e-4, 10.0),
        ("reg_lambda", "log", 1e-4, 10.0),
    ),
    "cat": (
        ("iterations", "int", 200, 800),
        ("depth", "int", 4, 10),
        ("learning_rate", "log", 0.02, 0.3),
        ("l2_leaf_reg", "float", 1.0, 10.0),
    ),
}


def sample_params(model_name, trial):
    """Draw one hyper-parameter set for ``model_name`` from ``trial``.

    Args:
        model_name: One of ``"rf"``, ``"lgbm"``, ``"xgb"`` or ``"cat"``.
        trial: Object providing ``suggest_int`` and ``suggest_float``.

    Returns:
        Dict mapping each parameter name to the suggested value.

    Raises:
        ValueError: If ``model_name`` has no search space.
    """
    space = _SEARCH_SPACES.get(model_name)
    if space is None:
        raise ValueError(f"Unknown model {model_name}")

    drawn = {}
    for name, kind, low, high in space:
        if kind == "int":
            drawn[name] = trial.suggest_int(name, low, high)
        elif kind == "log":
            drawn[name] = trial.suggest_float(name, low, high, log=True)
        else:
            drawn[name] = trial.suggest_float(name, low, high)
    return drawn

def optimize_model(model_name, X, y, cv, n_trials, random_state, num_classes, feature_names):
    """Tune one base model with Optuna, maximising mean cross-validated macro F1.

    Args:
        model_name: Base model key understood by ``sample_params`` and
            ``build_model``.
        X: Feature matrix (NumPy array, row-indexable).
        y: Label array aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.
        n_trials: Number of Optuna trials.
        random_state: Seed for the models and for the Optuna sampler.
        num_classes: Number of target classes.
        feature_names: Column names used when the data is wrapped in a
            DataFrame (done for ``"lgbm"`` only).

    Returns:
        ``(best_params, best_value)`` from the finished study.
    """
    use_frames = model_name == "lgbm"
    # Build the named frame once instead of once per fold of every trial.
    X_data = pd.DataFrame(X, columns=feature_names) if use_frames else X

    def objective(trial):
        params = sample_params(model_name, trial)
        scores = []
        for train_idx, valid_idx in cv.split(X, y):
            model = build_model(model_name, params, random_state, num_classes)
            if use_frames:
                X_train, X_valid = X_data.iloc[train_idx], X_data.iloc[valid_idx]
            else:
                X_train, X_valid = X_data[train_idx], X_data[valid_idx]
            model.fit(X_train, y[train_idx])
            preds = get_probabilities(model, X_valid, num_classes)
            scores.append(f1_macro(y[valid_idx], preds))
        return float(np.mean(scores))

    # Seed the sampler so the sequence of suggested parameters is repeatable;
    # an unseeded study proposes different parameters on every run even
    # though random_state is supplied. TPE is Optuna's default sampler.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params, study.best_value

def generate_oof_predictions(model_name, params, X, y, X_test, cv, random_state, num_classes, feature_names):
    """Fit ``model_name`` once per fold and collect stacking inputs.

    Each fold's model is seeded with ``random_state`` plus the 1-based fold
    number. Its predictions on the held-out rows fill the out-of-fold array,
    and its predictions on ``X_test`` contribute an equal share to the test
    average. For ``"lgbm"`` the data is wrapped in DataFrames carrying
    ``feature_names``.

    Returns:
        ``(oof, test_preds)``, both ``float32`` with ``num_classes`` columns,
        sized to ``X`` and ``X_test`` respectively.
    """
    as_frame = model_name == "lgbm"
    if as_frame:
        train_source = pd.DataFrame(X, columns=feature_names)
        test_source = pd.DataFrame(X_test, columns=feature_names)
    else:
        train_source, test_source = X, X_test

    def take_rows(indices):
        # Positional row selection for either container type.
        return train_source.iloc[indices] if as_frame else train_source[indices]

    oof = np.zeros((len(X), num_classes), dtype=np.float32)
    test_preds = np.zeros((len(X_test), num_classes), dtype=np.float32)

    for fold_no, (fit_idx, hold_idx) in enumerate(cv.split(X, y), start=1):
        learner = build_model(model_name, params, random_state + fold_no, num_classes)
        learner.fit(take_rows(fit_idx), y[fit_idx])
        oof[hold_idx] = get_probabilities(learner, take_rows(hold_idx), num_classes)
        fold_test = get_probabilities(learner, test_source, num_classes)
        test_preds += fold_test / cv.get_n_splits()
    return oof, test_preds

In [None]:
def optimize_meta_model(meta_X, y, cv, n_trials, random_state, num_classes, feature_names):
    """Tune the LightGBM meta-learner with Optuna on the stacked features.

    Args:
        meta_X: Stacked base-model outputs, one row per training sample.
        y: Label array aligned with ``meta_X``.
        cv: Splitter exposing ``split(X, y)``.
        n_trials: Number of Optuna trials.
        random_state: Seed for the models and for the Optuna sampler.
        num_classes: Number of target classes.
        feature_names: Column names for the stacked features.

    Returns:
        ``(best_params, best_value)``, where the value is the best mean
        cross-validated macro F1.
    """
    meta_df = pd.DataFrame(meta_X, columns=feature_names)

    def objective(trial):
        # NOTE(review): per the LightGBM docs "subsample" only takes effect
        # when subsample_freq > 0, which is not set here or in train_meta.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 800),
            "num_leaves": trial.suggest_int("num_leaves", 16, 128),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 60),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, log=True),
        }
        scores = []
        for train_idx, valid_idx in cv.split(meta_df, y):
            model = lgb.LGBMClassifier(
                **params,
                objective="multiclass",
                num_class=num_classes,
                n_jobs=-1,
                random_state=random_state,
                verbosity=-1,
            )
            model.fit(meta_df.iloc[train_idx], y[train_idx])
            preds = get_probabilities(model, meta_df.iloc[valid_idx], num_classes)
            scores.append(f1_macro(y[valid_idx], preds))
        return float(np.mean(scores))

    # Seed the sampler so the sequence of suggested parameters is repeatable;
    # an unseeded study proposes different parameters on every run even
    # though random_state is supplied. TPE is Optuna's default sampler.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params, study.best_value

def train_meta(params, meta_X, y, meta_test, cv, random_state, num_classes, feature_names):
    """Fit the LightGBM meta-learner per fold on the stacked features.

    Each fold's model is seeded with ``random_state`` plus the 1-based fold
    number. Held-out rows receive that fold's predictions, and every fold
    contributes an equal share to the averaged test predictions.

    Returns:
        ``(oof, test_preds)``, both ``float32`` with ``num_classes`` columns,
        sized to ``meta_X`` and ``meta_test`` respectively.
    """
    stacked_train = pd.DataFrame(meta_X, columns=feature_names)
    stacked_test = pd.DataFrame(meta_test, columns=feature_names)
    oof = np.zeros((len(meta_X), num_classes), dtype=np.float32)
    test_preds = np.zeros((len(meta_test), num_classes), dtype=np.float32)

    # Constructor arguments that do not vary between folds.
    fixed_kwargs = dict(
        objective="multiclass",
        num_class=num_classes,
        n_jobs=-1,
        verbosity=-1,
    )
    for fold_no, (fit_idx, hold_idx) in enumerate(cv.split(stacked_train, y), start=1):
        learner = lgb.LGBMClassifier(
            **params, **fixed_kwargs, random_state=random_state + fold_no
        )
        learner.fit(stacked_train.iloc[fit_idx], y[fit_idx])
        oof[hold_idx] = get_probabilities(learner, stacked_train.iloc[hold_idx], num_classes)
        fold_test = get_probabilities(learner, stacked_test, num_classes)
        test_preds += fold_test / cv.get_n_splits()
    return oof, test_preds

In [None]:
# Driver: load data, tune and stack the base models, tune the meta-learner,
# then write the submission CSV and a JSON summary.
from pathlib import Path

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Create the output directories up front so a missing folder (for example
# "submission/") fails here rather than after the whole tuning run.
for _target in (Config.output_name, Config.summary_name):
    Path(_target).parent.mkdir(parents=True, exist_ok=True)

train_df = pd.read_csv(f"{Config.data_dir}/train.csv")
test_df = pd.read_csv(f"{Config.data_dir}/test.csv")

X, X_test = prepare_features(train_df, test_df, FEATURE_COLUMNS)
y = train_df[TARGET_COLUMN].astype(int).to_numpy()
num_classes = int(train_df[TARGET_COLUMN].nunique())

# Predictions are arg-max column indices written out directly as labels and
# scored against y, which is only valid when the labels are exactly
# 0..num_classes-1.
if set(np.unique(y).tolist()) != set(range(num_classes)):
    raise ValueError(
        f"{TARGET_COLUMN} labels must be the integers 0..{num_classes - 1}; "
        f"got {sorted(np.unique(y).tolist())}"
    )

cv = StratifiedKFold(n_splits=Config.n_splits, shuffle=True, random_state=Config.random_state)

base_models = ["lgbm", "xgb", "cat"]
meta_features, meta_test_features = [], []
meta_feature_names = []
base_scores, base_params = {}, {}

for model_name in tqdm(base_models, desc="Base models"):
    print(f"Optimizing {model_name}...")
    n_trials = Config.base_trials.get(model_name, Config.n_trials_base)
    best_params, best_score = optimize_model(
        model_name, X, y, cv, n_trials, Config.random_state, num_classes, FEATURE_COLUMNS
    )
    base_scores[model_name] = best_score
    base_params[model_name] = best_params
    print(f"{model_name} Best CV F1 Macro: {best_score:.5f}")

    # Out-of-fold and averaged test probabilities become the meta-learner's inputs.
    oof, test_preds = generate_oof_predictions(
        model_name, best_params, X, y, X_test, cv, Config.random_state, num_classes, FEATURE_COLUMNS
    )
    meta_features.append(oof)
    meta_test_features.append(test_preds)
    meta_feature_names.extend(f"{model_name}_class{cls}" for cls in range(num_classes))

meta_X = np.column_stack(meta_features)
meta_test = np.column_stack(meta_test_features)

print("Optimizing Meta Model...")
meta_params, meta_score = optimize_meta_model(
    meta_X, y, cv, Config.n_trials_meta, Config.random_state, num_classes, meta_feature_names
)
print(f"Meta Model Best CV F1 Macro: {meta_score:.5f}")

meta_oof, meta_test_preds = train_meta(
    meta_params, meta_X, y, meta_test, cv, Config.random_state, num_classes, meta_feature_names
)
tuned_score = f1_macro(y, meta_oof)
print(f"Meta OOF F1 Macro (retrained): {tuned_score:.5f}")

final_preds = meta_test_preds.argmax(axis=1)
output_df = pd.DataFrame({ID_COLUMN: test_df[ID_COLUMN], TARGET_COLUMN: final_preds})
output_df.to_csv(Config.output_name, index=False)
print(f"Submission saved to {Config.output_name}")

summary = {
    "base_scores": base_scores,
    "base_params": base_params,
    "meta_score": meta_score,
    "meta_oof_score": tuned_score,
    "meta_params": meta_params,
}
with open(Config.summary_name, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, default=json_serialize)
print(f"Summary saved to {Config.summary_name}")