## Prediction for test data

In [None]:
# ==== Apply pipeline for saved models (XGBoost etc.) ====
# Prerequisites (already available):
#   - xgb_final.pkl                (sklearn wrapper)      OR
#   - xgb_final_booster.json       (plain Booster fallback)
#   - xgb_columns.txt              (OHE column list from training)
#   - xgb_meta.json                (use_log, use_smearing, smear, optional: rare_levels_map)
#
# test.csv: contains new data; if a 'loss' column is present, metrics are computed.
#
# Usage: simply call run_apply("test.csv") at the end.

import json
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------- Paths & constants ----------------------
# Directory that holds the saved training artifacts; adjust if needed.
ARTIFACTS_DIR = Path(".")
MODEL_PKL = ARTIFACTS_DIR.joinpath("xgb_final.pkl")
MODEL_BOOSTER = ARTIFACTS_DIR.joinpath("xgb_final_booster.json")
COLUMNS_TXT = ARTIFACTS_DIR.joinpath("xgb_columns.txt")
META_JSON = ARTIFACTS_DIR.joinpath("xgb_meta.json")

# Names of the identifier and target columns in the input CSV.
ID_COL = "id"
TARGET_COL = "loss"

# Columns that are dropped explicitly (in addition to the ID column).
DROP_ALWAYS = ["cont12"]
DROP_CATS = [f"cat{suffix}" for suffix in (70, 15, 22, 64, 62, 63, 68, 55, 56)]

# Categorical columns whose rare levels are collapsed into a single label.
RARE_COLS = [
    f"cat{suffix}"
    for suffix in (
        20, 35, 58, 48, 59, 69, 21, 60, 34, 67,
        47, 61, 77, 46, 33, 18, 32, 51, 17, 42, 78,
    )
]

# Fallback count threshold used when the metadata carries no level mapping,
# and the label that replaces collapsed levels.
RARE_MIN_COUNT = 20
RARE_LABEL = "RARE"

# ---------------------- Hilfsfunktionen ----------------------
def load_meta(meta_path=META_JSON):
    """Read the model metadata JSON and fill in defaults for missing keys.

    Args:
        meta_path: Location of the metadata file.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist. In both cases "use_log" (False), "use_smearing" (False) and
        "smear" (1.0) are set if absent. An optional "rare_levels_map" entry,
        e.g. {"cat20": ["A", "B", "RARE"], ...}, is passed through untouched.
    """
    path = Path(meta_path)
    if path.exists():
        with path.open("r", encoding="utf-8") as fh:
            meta = json.load(fh)
    else:
        meta = {}

    defaults = (("use_log", False), ("use_smearing", False), ("smear", 1.0))
    for key, fallback in defaults:
        meta.setdefault(key, fallback)
    return meta

def load_columns(columns_path=COLUMNS_TXT):
    """Read the training-time column list, one column name per line.

    Surrounding whitespace is stripped and blank lines are skipped.

    Args:
        columns_path: Text file with the one-hot-encoded training columns.

    Returns:
        The column names in file order.

    Raises:
        RuntimeError: If the file contains no column names.
    """
    with open(columns_path, "r", encoding="utf-8") as fh:
        stripped = (line.strip() for line in fh)
        names = [name for name in stripped if name]
    if len(names) == 0:
        raise RuntimeError("xgb_columns.txt ist leer – ohne Spaltenliste kann nicht ausgerichtet werden.")
    return names

def compress_rare_levels(df, cols, rare_map=None, min_count=RARE_MIN_COUNT, rare_label=RARE_LABEL):
    """Collapse infrequent category levels into a single label.

    For every column in `cols` that exists in `df`:
      - if `rare_map` has an entry for the column, every value outside that
        allowed set is replaced by `rare_label`;
      - otherwise levels are counted in `df` itself (NaN included) and those
        occurring fewer than `min_count` times are replaced by `rare_label`.

    The columns are reassigned on the passed-in frame, which is also returned.

    Args:
        df: Frame to process (modified in place).
        cols: Candidate column names; names missing from `df` are skipped.
        rare_map: Optional mapping column -> iterable of allowed levels.
        min_count: Minimum count a level needs to be kept in fallback mode.
        rare_label: Replacement value for collapsed levels.

    Returns:
        The same frame object with the affected columns rewritten.
    """
    for col in (name for name in cols if name in df.columns):
        if rare_map and col in rare_map:
            # Level set fixed by the supplied mapping.
            kept_levels = set(rare_map[col])
        else:
            # No mapping available: derive the kept levels from this frame.
            counts = df[col].value_counts(dropna=False)
            kept_levels = set(counts[counts >= min_count].index)
        values = df[col]
        df[col] = values.where(values.isin(kept_levels), rare_label)
    return df

def preprocess_unseen(df_raw, columns_txt=COLUMNS_TXT, meta=None):
    """Turn a raw frame into a feature matrix aligned with the training columns.

    Steps, applied to a copy of `df_raw`:
      1) split off the target and ID columns,
      2) drop the ID and the fixed drop lists,
      3) collapse rare levels (mapping from `meta`, otherwise count fallback),
      4) one-hot encode object/category columns,
      5) reindex to the column list stored in `columns_txt`, filling
         columns absent from the data with 0.

    Args:
        df_raw: Input data; left unmodified.
        columns_txt: Path of the training column list.
        meta: Metadata dict; loaded via `load_meta()` when None.

    Returns:
        Tuple (X, y, ids): the aligned feature frame, the target series or
        None if the target column is absent, and the ID series (a running
        0..n-1 index when the ID column is absent).
    """
    meta = load_meta() if meta is None else meta
    frame = df_raw.copy()

    # 1) Separate target and ID.
    target = None
    if TARGET_COL in frame.columns:
        target = frame[TARGET_COL].copy()
        frame = frame.drop(columns=[TARGET_COL])
    if ID_COL in frame.columns:
        ids = frame[ID_COL].copy()
    else:
        ids = pd.Series(np.arange(len(frame)), name=ID_COL)

    # 2) Drop the ID plus the fixed drop lists, limited to present columns.
    to_drop = [c for c in [ID_COL, *DROP_ALWAYS, *DROP_CATS] if c in frame.columns]
    if to_drop:
        frame = frame.drop(columns=to_drop, errors="ignore")

    # 3) Rare-level compression on the columns that are actually present.
    rare_candidates = [c for c in RARE_COLS if c in frame.columns]
    frame = compress_rare_levels(
        frame,
        rare_candidates,
        rare_map=meta.get("rare_levels_map", None),
        min_count=RARE_MIN_COUNT,
        rare_label=RARE_LABEL,
    )

    # 4) One-hot encode every object/category column.
    categorical = frame.select_dtypes(include=["object", "category"]).columns.tolist()
    encoded = pd.get_dummies(frame, columns=categorical, drop_first=False, dtype=np.uint8)

    # 5) Align to the training layout; extra columns are discarded by reindex.
    X = encoded.reindex(columns=load_columns(columns_txt), fill_value=0)

    return X, target, ids

def load_model():
    """Load the saved model: the sklearn wrapper if present, else the Booster.

    Returns:
        Tuple (kind, model, meta) where kind is "sklearn" or "booster" and
        meta is the dict from `load_meta()`.

    Raises:
        FileNotFoundError: If neither model file exists.
    """
    meta = load_meta()

    # 1) Preferred: the pickled sklearn wrapper.
    if MODEL_PKL.exists():
        # NOTE(review): joblib/pickle execute code embedded in the file while
        # loading - only use artifacts from a trusted source.
        try:
            import joblib
            wrapper = joblib.load(MODEL_PKL)
        except Exception:
            # joblib unavailable or failed to read the file: try plain pickle.
            with open(MODEL_PKL, "rb") as fh:
                wrapper = pickle.load(fh)
        return ("sklearn", wrapper, meta)

    # 2) Fallback: the raw Booster dump.
    if not MODEL_BOOSTER.exists():
        raise FileNotFoundError("Kein Modell gefunden: weder xgb_final.pkl noch xgb_final_booster.json vorhanden.")

    import xgboost as xgb
    booster = xgb.Booster()
    booster.load_model(str(MODEL_BOOSTER))
    return ("booster", booster, meta)

def predict_with_loaded(model_kind, model_obj, X, meta):
    """Predict with a loaded model and return values on the original scale.

    Args:
        model_kind: "sklearn" to call `model_obj.predict(X)` directly; any
            other value treats `model_obj` as an xgboost Booster and wraps
            `X` in a DMatrix first.
        model_obj: The loaded model.
        X: Feature matrix.
        meta: Dict read for "use_log", "use_smearing" and "smear".

    Returns:
        The raw model output when "use_log" is falsy; otherwise expm1 of the
        output, additionally multiplied by "smear" when "use_smearing" is set.
    """
    log_target = bool(meta.get("use_log", False))
    apply_smear = bool(meta.get("use_smearing", False))
    smear_factor = float(meta.get("smear", 1.0))

    if model_kind != "sklearn":
        # Booster fallback: xgboost is imported only when it is needed.
        import xgboost as xgb
        raw = model_obj.predict(xgb.DMatrix(X))
    else:
        raw = model_obj.predict(X)

    # No back-transformation required.
    if not log_target:
        return raw

    restored = np.expm1(raw)
    return restored * smear_factor if apply_smear else restored

def evaluate_if_possible(y_true, y_pred):
    """Compute and print MAE, RMSE and R² when ground truth is available.

    Args:
        y_true: True target values, or None when the input had no target.
        y_pred: Predicted values on the same scale as `y_true`.

    Returns:
        A dict with keys "MAE", "RMSE" and "R2", or None (after printing a
        hint) when `y_true` is None.
    """
    if y_true is None:
        print("Hinweis: In test.csv ist keine Zielspalte 'loss' vorhanden – gebe nur Vorhersagen aus.")
        return None
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so relying on it raises TypeError on current releases.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    print(f"MAE={mae:.4f} | RMSE={rmse:.2f} | R²={r2:.4f}")
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

def run_apply(csv_path, save_preds=True, out_csv="predictions_xgb_test.csv"):
    """Run the full apply pipeline on a CSV file.

    Reads the CSV, preprocesses it, loads the saved model, predicts,
    evaluates against the target if present and optionally writes the
    predictions to disk.

    Args:
        csv_path: Input CSV with the data to score.
        save_preds: When true, write an ID/prediction table to `out_csv`.
        out_csv: Destination of the predictions file.

    Returns:
        Tuple (y_pred, metrics); metrics is None when no target was present.
    """
    # 1) Load the data.
    raw = pd.read_csv(csv_path)

    # 2) Replicate the training preprocessing.
    X, y, ids = preprocess_unseen(raw, columns_txt=COLUMNS_TXT, meta=load_meta())

    # 3) Load the model; its metadata is the one used for prediction.
    model_kind, model_obj, meta = load_model()

    # 4) Predict and 5) evaluate when ground truth exists.
    y_pred = predict_with_loaded(model_kind, model_obj, X, meta)
    metrics = evaluate_if_possible(y, y_pred)

    # 6) Persist the predictions if requested.
    if not save_preds:
        return y_pred, metrics

    table = pd.DataFrame({ID_COL: ids.values, "prediction": y_pred})
    table.to_csv(out_csv, index=False)
    print(f"Predictions gespeichert -> {out_csv}")
    return y_pred, metrics

# --------- Example invocation ----------
# Executed when this cell/module runs: scores data/test.csv with the saved
# model and writes the predictions to predictions_xgb_test.csv.
y_pred, metrics = run_apply("data/test.csv", save_preds=True, out_csv="predictions_xgb_test.csv")
