# 1、准备工作

In [1]:
# -*- coding: utf-8 -*-
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np

# Pin the working directory so the relative data paths below resolve against it.
WORKDIR = Path("/mnt/workspace")
os.chdir(WORKDIR)

# Locations of the trained model and of the input tables.
MODEL_DIR = WORKDIR.joinpath("model", "autogluon_model")
TRAIN_PATH = Path("data", "train.csv")  # relative to WORKDIR
TEST_PATH = Path("data", "test.csv")    # relative to WORKDIR

# Quick sanity report of what is actually present on disk.
print(f"CWD: {Path.cwd()}")
print(f"Model dir exists: {MODEL_DIR.exists()}")
print(f"Train exists: {TRAIN_PATH.exists()} | Test exists: {TEST_PATH.exists()}")

# ===== 稳健 CSV 读取器 =====
# 不同情况下读取CSV文件
def robust_load_csv(path_like):
    """
    Load a CSV file while tolerating unknown delimiters and encodings.

    For each candidate encoding (utf-8, utf-8-sig, gbk) the file is first read
    with pandas' delimiter sniffing (``sep=None``, python engine). If that
    yields exactly one column, the explicit separators ``,``, ``;``, tab and
    ``|`` are tried in turn. The first attempt that does not collapse the data
    into a single column is reported via ``print`` and returned.

    Args:
        path_like: Location of the CSV file (anything ``str()`` turns into a path).

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        ValueError: If no encoding/separator combination produced a usable
            table. The message carries the last failure that was observed.
    """
    path = str(path_like)
    last_failure = None
    for enc in ["utf-8", "utf-8-sig", "gbk"]:
        try:
            df = pd.read_csv(path, engine="python", sep=None, encoding=enc)
        except Exception as e:
            # Wrong encoding or unsniffable header: move on to the next encoding.
            last_failure = e
            continue

        if df.shape[1] != 1:
            print(f"[robust_load_csv] encoding={enc}, auto-sep, shape={df.shape}")
            return df

        # Sniffing collapsed everything into one column; retry explicit separators.
        # Record this so the final error is informative even if nothing raises.
        last_failure = f"encoding={enc}: only one column could be parsed"
        for sep in [",", ";", "\t", "|"]:
            try:
                df2 = pd.read_csv(path, engine="python", sep=sep, encoding=enc)
            except Exception as e:
                # A failure with one separator must not prevent trying the others.
                last_failure = e
                continue
            if df2.shape[1] > 1:
                print(f"[robust_load_csv] encoding={enc}, sep={repr(sep)}, shape={df2.shape}")
                return df2
    raise ValueError(f"无法解析CSV: {path}。最后一次异常: {last_failure}")

# ===== 评估指标工具 =====
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, r2_score, mean_squared_error

# 利用sklearn的工具反馈RMSE，用于回归任务
def regression_report(y_true, y_pred):
    """
    Compute RMSE and R^2 for a single-target regression task.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict: ``{"rmse": float, "r2": float}``.
    """
    # The `squared=False` flag of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
    # this works on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"rmse": float(rmse), "r2": float(r2)}

# 计算准确率、F1 以及（提供预测概率时的）ROC AUC，用于二分类任务
def binary_clf_report(y_true, y_pred, y_proba=None):
    """
    Compute accuracy, binary F1 and (optionally) ROC AUC for a binary task.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Optional predicted probabilities, either shape ``(n,)``
            (probability of the positive class) or ``(n, 2)`` (second column
            is used). May be a NumPy array, pandas Series/DataFrame or list.

    Returns:
        dict: ``{"accuracy": float, "f1_binary": float}`` plus ``"auc"`` when
        probabilities were given and the AUC could be computed.
    """
    acc = accuracy_score(y_true, y_pred)
    f1b = f1_score(y_true, y_pred, average="binary")
    out = {"accuracy": float(acc), "f1_binary": float(f1b)}

    if y_proba is not None:
        # Normalise to an ndarray first: positional `[:, 1]` indexing raises on
        # a pandas DataFrame, which is what the notebook passes in.
        proba = np.asarray(y_proba)
        prob1 = proba if proba.ndim == 1 else proba[:, 1]
        try:
            out["auc"] = float(roc_auc_score(y_true, prob1))
        except Exception as e:
            # AUC stays best-effort (e.g. only one class present in y_true),
            # but say why it is missing instead of dropping it silently.
            print(f"[binary_clf_report] AUC skipped: {e}")
    return out

# 多分类任务的准确率与宏平均 F1
def multiclass_clf_report(y_true, y_pred):
    """
    Summarise multiclass predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict: ``{"accuracy": float, "f1_macro": float}`` where the F1 score is
        macro-averaged over classes.
    """
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "f1_macro": float(f1_score(y_true, y_pred, average="macro")),
    }

CWD: /mnt/workspace
Model dir exists: True
Train exists: True | Test exists: True


# 2、提取模型信息

In [2]:
from autogluon.tabular import TabularPredictor

# Fail fast when the trained model directory is missing.
assert MODEL_DIR.exists(), f"主模型目录不存在: {MODEL_DIR}"
predictor = TabularPredictor.load(str(MODEL_DIR))

print("=== Predictor Info ===")
print("path:", predictor.path)
print("label:", predictor.label)
print("problem_type:", predictor.problem_type)
print("eval_metric:", predictor.eval_metric)

# Leaderboard: one row per trained model, with stack level and validation score.
lb = predictor.leaderboard(silent=True)
display(lb.head(20))


def _has_weighted_ensemble(names, level):
    """Return True if any name is a WeightedEnsemble model of the given stack level.

    Matches the plain form (``WeightedEnsemble_L2``), numbered variants such as
    ``WeightedEnsemble_15_L2`` that appear on this leaderboard, and names with
    a trailing suffix after the level tag (e.g. ``WeightedEnsemble_L2_FULL``).
    """
    tag = f"_L{level}"
    return any(
        n.startswith("WeightedEnsemble") and (n.endswith(tag) or f"{tag}_" in n)
        for n in names
    )


# Detect L2 / L3 weighted ensembles. A plain "WeightedEnsemble_L2" prefix test
# would miss the numbered ensembles, so match on prefix + level tag instead.
model_names = lb["model"].astype(str).tolist()
has_L2 = _has_weighted_ensemble(model_names, 2)
has_L3 = _has_weighted_ensemble(model_names, 3)
print(f"Has L2 ensemble? {has_L2}")
print(f"Has L3 ensemble? {has_L3}")

# Record the top leaderboard row as the best model.
# NOTE(review): this assumes the leaderboard is ordered best-first; with tied
# scores the first row may not be the model predictor.predict() uses - confirm.
best_row = lb.iloc[0]
best_model_name = str(best_row["model"])
best_val_score = float(best_row["score_val"]) if pd.notna(best_row.get("score_val", np.nan)) else None
print(f"Best model: {best_model_name}, score_val: {best_val_score}")


=== Predictor Info ===
path: /mnt/workspace/model/autogluon_model/
label: Premium Amount
problem_type: regression
eval_metric: root_mean_squared_error


  import pkg_resources


Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_15_L2,-1.044921,38.260312,845.927748,0.013803,11.939117,2,True,64
1,WeightedEnsemble_14_L2,-1.044921,38.473516,836.048459,0.013598,12.017005,2,True,55
2,WeightedEnsemble_13_L2,-1.044921,38.770473,834.416426,0.013576,11.914437,2,True,46
3,WeightedEnsemble_18_L2,-1.044921,38.916586,833.058394,0.013499,11.967577,2,True,91
4,WeightedEnsemble_17_L2,-1.044921,39.156264,831.91321,0.013863,12.049597,2,True,82
5,WeightedEnsemble_16_L2,-1.044921,39.231844,839.283582,0.019447,12.04935,2,True,73
6,WeightedEnsemble_22_L2,-1.044921,40.606216,932.33438,0.013808,12.152689,2,True,127
7,WeightedEnsemble_20_L2,-1.044921,40.94892,929.138671,0.013818,12.077396,2,True,109
8,WeightedEnsemble_24_L2,-1.044921,41.005375,947.061845,0.014307,12.010493,2,True,145
9,WeightedEnsemble_21_L2,-1.044921,41.05501,925.403579,0.013661,12.002737,2,True,118


Has L2 ensemble? True
Has L3 ensemble? True
Best model: WeightedEnsemble_15_L2, score_val: -1.0449213176465442


# 3、读取测试数据

In [3]:
# Test set: required.
assert TEST_PATH.exists(), f"找不到测试集: {TEST_PATH}"
test_df = robust_load_csv(TEST_PATH)
label = predictor.label

# Separate features from the target only when the test table carries the label.
has_test_label = label in test_df.columns
if has_test_label:
    X_test = test_df.drop(columns=[label])
    y_test = test_df[label]
else:
    X_test = test_df.copy()
    y_test = None

print(f"Test shape: {test_df.shape} | has_label: {has_test_label}")

# Training set: optional, only needed to fit the baseline model.
if not TRAIN_PATH.exists():
    train_df = None
    print("[Info] 未找到 data/train.csv，将跳过基线模型的训练。")
else:
    train_df = robust_load_csv(TRAIN_PATH)
    assert label in train_df.columns, f"训练集缺少标签列 `{label}`"
    print(f"Train shape: {train_df.shape}")


[robust_load_csv] encoding=utf-8, auto-sep, shape=(800000, 20)
Test shape: (800000, 20) | has_label: False
[robust_load_csv] encoding=utf-8, auto-sep, shape=(1200000, 21)
Train shape: (1200000, 21)


# 4、主模型预测，并保存结果

In [4]:
# Predict with the main model.
main_preds = predictor.predict(X_test)

# Export the original test columns together with the main model's prediction.
pred_main_path = WORKDIR / "predictions_main.csv"
main_out = test_df.copy()
main_out["main_pred"] = main_preds
main_out.to_csv(pred_main_path, index=False)
print("Saved:", pred_main_path)

# Test metrics for the main model; stays None when the test set has no labels.
if y_test is None:
    main_metrics = None
elif predictor.problem_type == "regression":
    main_metrics = regression_report(y_test, main_preds)
elif predictor.problem_type == "binary":
    # Probabilities are optional: fall back to label-only metrics on failure.
    try:
        proba = predictor.predict_proba(X_test)
    except Exception:
        proba = None
    main_metrics = binary_clf_report(y_test, main_preds, proba)
else:
    main_metrics = multiclass_clf_report(y_test, main_preds)

# Result block for the main model (results only, never the model object).
main_model = dict(
    test_metrics=main_metrics,  # None means the test set is unlabeled
    predictions_csv=str(pred_main_path),
)
print("main_model:", main_model)


Saved: /mnt/workspace/predictions_main.csv
main_model: {'test_metrics': None, 'predictions_csv': '/mnt/workspace/predictions_main.csv'}


# 5、沿用主模型的任务类型与评估指标，训练轻量基线模型

In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Defaults describing "no baseline trained"; overwritten below after training.
baseline_predictor = None
baseline_model = dict(
    trained=False,
    leaderboard_top=None,
    test_metrics=None,
    predictions_csv=None,
)

if train_df is None:
    print("未训练基线模型（缺少 data/train.csv）。")
else:
    # Wrap as an AutoGluon dataset (a plain DataFrame would work as well).
    ag_train = TabularDataset(train_df)

    # Lightweight baseline: three model families with default hyperparameters.
    baseline_hparams = {family: {} for family in ("RF", "XT", "LR")}

    BASELINE_DIR = WORKDIR.joinpath("model", "baseline_autogluon")
    BASELINE_DIR.mkdir(parents=True, exist_ok=True)

    # Same task type and metric as the main predictor, short time budget.
    # NOTE(review): the baseline is fit on the label exactly as stored in
    # train.csv. The recorded validation scores (main about -1.04, baseline
    # about -833) suggest the main model may use a transformed target; confirm
    # both are on the same scale before comparing them.
    baseline_predictor = TabularPredictor(
        label=label,
        path=str(BASELINE_DIR),
        problem_type=predictor.problem_type,
        eval_metric=predictor.eval_metric,
        verbosity=2,
    ).fit(
        train_data=ag_train,
        time_limit=60,
        hyperparameters=baseline_hparams,
    )

    # Baseline leaderboard.
    b_lb = baseline_predictor.leaderboard(silent=True)
    display(b_lb)

    # Baseline predictions, exported next to the original test columns.
    baseline_preds = baseline_predictor.predict(X_test)
    pred_baseline_path = WORKDIR / "predictions_baseline.csv"
    baseline_out = test_df.copy()
    baseline_out["baseline_pred"] = baseline_preds
    baseline_out.to_csv(pred_baseline_path, index=False)
    print("Saved:", pred_baseline_path)

    # Baseline metrics; stays None when the test set has no labels.
    if y_test is None:
        bm = None
    elif predictor.problem_type == "regression":
        bm = regression_report(y_test, baseline_preds)
    elif predictor.problem_type == "binary":
        # Probabilities are optional: fall back to label-only metrics on failure.
        try:
            b_proba = baseline_predictor.predict_proba(X_test)
        except Exception:
            b_proba = None
        bm = binary_clf_report(y_test, baseline_preds, b_proba)
    else:
        bm = multiclass_clf_report(y_test, baseline_preds)

    baseline_model = dict(
        trained=True,
        leaderboard_top=b_lb.head(3).to_dict(orient="list"),
        test_metrics=bm,
        predictions_csv=str(pred_baseline_path),
    )
    print("baseline_model:", baseline_model)


Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "/mnt/workspace/model/baseline_autogluon/"
AutoGluon Version:  0.8.2
Python Version:     3.10.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jul 17 11:00:10 CST 2025
Disk Space Avail:   53.07 GB / 105.09 GB (50.5%)
Train Data Rows:    1200000
Train Data Columns: 20
Label Column: Premium Amount
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    27110.46 MB
	Train Data (Original)  Memory Usage: 931.82 MB (3.4% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-833.117603,0.052069,38.688019,0.000434,0.119101,2,True,3
1,ExtraTrees,-833.225455,0.029383,34.300343,0.029383,34.300343,1,True,1
2,LinearModel,-854.074712,0.022253,4.268575,0.022253,4.268575,1,True,2


Saved: /mnt/workspace/predictions_baseline.csv
baseline_model: {'trained': True, 'leaderboard_top': {'model': ['WeightedEnsemble_L2', 'ExtraTrees', 'LinearModel'], 'score_val': [-833.1176032828895, -833.2254546001295, -854.0747116246549], 'pred_time_val': [0.05206942558288574, 0.029382944107055664, 0.022252798080444336], 'fit_time': [38.688018560409546, 34.30034279823303, 4.268575191497803], 'pred_time_val_marginal': [0.0004336833953857422, 0.029382944107055664, 0.022252798080444336], 'fit_time_marginal': [0.11910057067871094, 34.30034279823303, 4.268575191497803], 'stack_level': [2, 1, 1], 'can_infer': [True, True, True], 'fit_order': [3, 1, 2]}, 'test_metrics': None, 'predictions_csv': '/mnt/workspace/predictions_baseline.csv'}


# 6、汇总主模型与基线模型的结果

In [6]:
# Collect everything worth persisting: predictor metadata plus both result blocks.
summary = dict(
    predictor=dict(
        path=predictor.path,
        label=predictor.label,
        problem_type=predictor.problem_type,
        eval_metric=str(predictor.eval_metric),
        has_L2=bool(has_L2),
        has_L3=bool(has_L3),
        best_model=str(best_model_name),
        best_score_val=float(best_val_score) if best_val_score is not None else None,
    ),
    main_model=main_model,          # result block of the main model
    baseline_model=baseline_model,  # result block of the baseline model
    has_baseline_predictor=baseline_predictor is not None,
)

# Persist as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable.
out_path = WORKDIR / "model_analysis_summary.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("=== Summary JSON saved ===")
print(out_path)
summary


=== Summary JSON saved ===
/mnt/workspace/model_analysis_summary.json


{'predictor': {'path': '/mnt/workspace/model/autogluon_model/',
  'label': 'Premium Amount',
  'problem_type': 'regression',
  'eval_metric': 'root_mean_squared_error',
  'has_L2': True,
  'has_L3': True,
  'best_model': 'WeightedEnsemble_15_L2',
  'best_score_val': -1.0449213176465442},
 'main_model': {'test_metrics': None,
  'predictions_csv': '/mnt/workspace/predictions_main.csv'},
 'baseline_model': {'trained': True,
  'leaderboard_top': {'model': ['WeightedEnsemble_L2',
    'ExtraTrees',
    'LinearModel'],
   'score_val': [-833.1176032828895, -833.2254546001295, -854.0747116246549],
   'pred_time_val': [0.05206942558288574,
    0.029382944107055664,
    0.022252798080444336],
   'fit_time': [38.688018560409546, 34.30034279823303, 4.268575191497803],
   'pred_time_val_marginal': [0.0004336833953857422,
    0.029382944107055664,
    0.022252798080444336],
   'fit_time_marginal': [0.11910057067871094,
    34.30034279823303,
    4.268575191497803],
   'stack_level': [2, 1, 1],
   'ca

In [7]:
# Metric blocks of both models (None when the test set carried no labels).
mm = main_model.get("test_metrics")
bm = baseline_model.get("test_metrics")

# One comparison row per model: name, then any metrics, then the CSV path.
rows = []
for display_name, metrics, block in (
    ("AutoGluon Main", mm, main_model),
    ("Baseline (Light)", bm, baseline_model),
):
    row = {"model": display_name}
    if metrics is not None:
        row.update(metrics)
    row["pred_csv"] = block.get("predictions_csv")
    rows.append(row)

df_compare = pd.DataFrame(rows)
display(df_compare)


Unnamed: 0,model,pred_csv
0,AutoGluon Main,/mnt/workspace/predictions_main.csv
1,Baseline (Light),/mnt/workspace/predictions_baseline.csv


# 7、生成 Kaggle 提交文件（提交前将对数空间的预测值用 expm1/exp 还原）

In [8]:
# --- Kaggle 提交文件生成器（支持从对数空间还原） ---
import pandas as pd
import numpy as np
from pathlib import Path

# === Configuration: pick the value matching the target transform used in training ===
#   - "log1p" : trained on log1p(y)  ->  predictions are restored with expm1
#   - "log"   : trained on log(y)    ->  predictions are restored with exp
#   - None    : no log transform     ->  predictions are left as they are
TRANSFORM = "log1p"     # change to "log" or None as needed
CLIP_MIN_ZERO = True    # RMSLE needs non-negative values; clip negatives to 0

WORKDIR = Path("/mnt/workspace")
SAMPLE_PATH = WORKDIR.joinpath("data", "sample_submission.csv")
PRED_PATH = WORKDIR.joinpath("predictions_main.csv")  # or "predictions_baseline.csv" for a baseline submission
OUT_PATH = WORKDIR.joinpath("data", "submission.csv")

assert SAMPLE_PATH.exists(), f"找不到 sample_submission: {SAMPLE_PATH}"
assert PRED_PATH.exists(),   f"找不到预测文件: {PRED_PATH}"

sample = pd.read_csv(SAMPLE_PATH)
preds = pd.read_csv(PRED_PATH)

# 1) Identify the id column and the target column(s) of the sample file.
#    Prefer "id", then "ID", otherwise fall back to the first column.
sample_cols = list(sample.columns)
id_col = next((c for c in ("id", "ID") if c in sample_cols), sample_cols[0])

target_cols = [c for c in sample_cols if c != id_col]
assert target_cols, "sample_submission 里没找到目标列"
if len(target_cols) > 1:
    print("[Warn] sample_submission 中目标列>1，将全部保留。")

# 2) Identify the prediction column: "main_pred" first, then "prediction",
#    otherwise the last column that is not an id-like column.
candidate_pred_cols = [c for c in ("main_pred", "prediction") if c in preds.columns]
if not candidate_pred_cols:
    common_meta = {id_col, "id", "ID"}
    remaining = [c for c in preds.columns if c not in common_meta]
    pred_col = remaining[-1] if remaining else preds.columns[-1]
    print(f"[Info] 未找到 main_pred/prediction，改用列: {pred_col}")
else:
    pred_col = candidate_pred_cols[0]

# 2.5) 进行“从对数空间还原” + 合规性处理（RMSLE 需要非负）
def inverse_transform(x: pd.Series, mode: "str | None", clip_min_zero: "bool | None" = None):
    """
    Map predictions from the training target space back to the original scale.

    Args:
        x: Predictions to convert; cast to float before the transform.
        mode: ``"log1p"`` applies ``expm1``, ``"log"`` applies ``exp`` and
            ``None`` leaves the values unchanged.
        clip_min_zero: Whether to clip negative results to 0. When ``None``
            (the default) the module-level ``CLIP_MIN_ZERO`` setting is used.

    Returns:
        The converted values, one per element of ``x``.

    Raises:
        ValueError: If ``mode`` is not ``"log1p"``, ``"log"`` or ``None``.
            A misspelled mode would otherwise skip the inverse transform
            silently and leave the values in log space.
    """
    if mode not in ("log1p", "log", None):
        raise ValueError(f"Unsupported transform mode: {mode!r} (expected 'log1p', 'log' or None)")

    x = x.astype(float)
    if mode == "log1p":
        x = np.expm1(x)     # y = exp(pred_log1p) - 1
    elif mode == "log":
        x = np.exp(x)       # y = exp(pred_log)

    # RMSLE requires non-negative predictions, so clip negatives when enabled.
    if clip_min_zero is None:
        clip_min_zero = CLIP_MIN_ZERO
    if clip_min_zero:
        x = np.clip(x, 0, None)
    return x

# 3) Prefer joining on the id column; without one, fall back to row order.
if id_col in preds.columns:
    # Carry the restored prediction under a private column name so the merge
    # cannot collide with a sample column that has the same name as pred_col.
    preds_narrow = pd.DataFrame({
        id_col: preds[id_col],
        "__restored_pred__": inverse_transform(preds[pred_col], TRANSFORM),
    })

    sub = sample[[id_col] + target_cols].copy()
    for tcol in target_cols:
        sub[tcol] = None
    sub = sub.merge(preds_narrow, on=id_col, how="left")
    sub[target_cols[0]] = sub["__restored_pred__"]
    sub = sub[[id_col] + target_cols]

    # A left merge hides problems: unmatched ids become NaN and duplicated ids
    # multiply rows. Either makes the submission invalid, so report them.
    n_missing = int(sub[target_cols[0]].isna().sum())
    if n_missing > 0:
        print(f"[Warn] 有 {n_missing} 个 {id_col} 没有得到有效预测值（未匹配或预测为空），请检查预测文件。")
    if len(sub) != len(sample):
        print(f"[Warn] 合并后行数 {len(sub)} 与 sample 行数 {len(sample)} 不一致，预测文件中可能存在重复的 {id_col}。")
else:
    print(f"[Warn] 预测文件中缺少 {id_col}，按行序对齐（务必确保与 sample 同顺序）")
    assert len(preds) == len(sample), "无法按行对齐：行数不一致"
    restored = inverse_transform(preds[pred_col], TRANSFORM)
    sub = sample.copy()
    # .values drops the index so the assignment is purely positional.
    sub[target_cols[0]] = restored.values

# 4) Save the submission file.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(OUT_PATH, index=False)
print(f"✅ 已生成 Kaggle 提交文件: {OUT_PATH}")
print("列示例：")
print(sub.head(3))

# 5) Optional: summary statistics to sanity-check the submitted values.
col_preview = target_cols[0]
print("\n[Preview] 提交目标列统计：")
print(sub[col_preview].describe(percentiles=[0.01, 0.5, 0.99]))
num_neg = (sub[col_preview] < 0).sum()
if num_neg > 0:
    print(f"[Warn] 仍有 {num_neg} 个负值（RMSLE 不允许），请检查变换与裁剪逻辑。")


✅ 已生成 Kaggle 提交文件: /mnt/workspace/data/submission.csv
列示例：
        id  Premium Amount
0  1200000      759.853934
1  1200001      801.764744
2  1200002      794.337299

[Preview] 提交目标列统计：
count    800000.000000
mean        757.499200
std         153.447147
min         142.516872
1%          184.764305
50%         793.769949
99%         979.566096
max        1252.986556
Name: Premium Amount, dtype: float64
