# House Prices - Advanced Regression
## **优化版：数据处理 + RF 调优 + 特征重要性 + 提交文件**

> **目标**：Kaggle Top 10% 水平  
> **核心优化**：
> - 有意义缺失值处理
> - 高价值衍生特征
> - `Neighborhood` Target Encoding
> - 去标准化（树模型专用）
> - RF 交叉验证调优
> - 特征重要性输出

---

**文件结构要求**：
```
project/
├── files/
│   ├── train.csv
│   └── test.csv
└── HousePrice_Optimized.ipynb
```

In [35]:
spark.stop()

In [19]:
# ==============================
# 1. 初始化 Spark
# ==============================
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HousePricePrediction_RFv0.2") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "100") \
    .getOrCreate()

In [20]:
# ==============================
# 2. 路径设置
# ==============================
from pathlib import Path
from datetime import datetime

DATA_DIR = Path.cwd().parent / "files"
RESULT_DIR = Path.cwd().parent / "results" / datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
RESULT_DIR.mkdir(parents=True, exist_ok=True)

train_path = str(DATA_DIR / "train.csv")
test_path  = str(DATA_DIR / "test.csv")

In [21]:
# ==============================
# 3. 读取数据
# ==============================
train_df = (spark.read
            .option("header", "true")
            .option("inferSchema", "true")
            .option("nullValue", "NA")
            .csv(train_path))

test_df = (spark.read
           .option("header", "true")
           .option("inferSchema", "true")
           .option("nullValue", "NA")
           .csv(test_path))

print(f"Train: {train_df.count()} rows, {len(train_df.columns)} cols")
print(f"Test:  {test_df.count()} rows, {len(test_df.columns)} cols")

Train: 1460 rows, 81 cols
Test:  1459 rows, 80 cols


In [22]:
# ==============================
# 4. 列类型划分
# ==============================
from pyspark.sql.types import IntegerType, DoubleType, StringType

num_cols = [field.name for field in train_df.schema.fields
            if isinstance(field.dataType, (IntegerType, DoubleType))
            and field.name != "SalePrice"]

cat_cols = [field.name for field in train_df.schema.fields
            if isinstance(field.dataType, StringType)]

print(f"数值列: {len(num_cols)}")
print(f"类别列: {len(cat_cols)}")

数值列: 37
类别列: 43


In [23]:
# ==============================
# 5. 【优化】有意义缺失值处理
# ==============================
meaningful_na_cols = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature'
]

for c in meaningful_na_cols:
    fill_val = "No" + c.replace("Qual", "").replace("Type", "").replace("Fin", "").replace("Cond", "")
    train_df = train_df.na.fill({c: fill_val})
    test_df  = test_df.na.fill({c: fill_val})

# 其余类别列填 "Missing"
other_cat_cols = [c for c in cat_cols if c not in meaningful_na_cols]
for c in other_cat_cols:
    train_df = train_df.na.fill({c: "Missing"})
    test_df  = test_df.na.fill({c: "Missing"})

In [24]:
# ==============================
# 6. 【优化】衍生特征（已修复）
# ==============================
from pyspark.sql import functions as F

def add_derived_features(df):
    return (df
            .withColumn("HouseAge",      F.col("YrSold") - F.col("YearBuilt"))
            .withColumn("RemodAge",      F.col("YrSold") - F.col("YearRemodAdd"))
            .withColumn("TotalSF",       F.col("1stFlrSF") + F.col("2ndFlrSF") + F.col("TotalBsmtSF"))
            .withColumn("TotalBath",     F.col("FullBath") + 0.5*F.col("HalfBath") + 
                                         F.col("BsmtFullBath") + 0.5*F.col("BsmtHalfBath"))
            .withColumn("TotalPorchSF",  F.col("OpenPorchSF") + F.col("EnclosedPorch") + 
                                         F.col("3SsnPorch") + F.col("ScreenPorch"))
            .withColumn("HasPool",       (F.col("PoolArea") > 0).cast("int"))
            .withColumn("Has2ndFloor",   (F.col("2ndFlrSF") > 0).cast("int"))
            .withColumn("HasGarage",     (F.col("GarageArea") > 0).cast("int"))
           )

train_df = add_derived_features(train_df)
test_df  = add_derived_features(test_df)

new_num_cols = ["HouseAge", "RemodAge", "TotalSF", "TotalBath", 
                "TotalPorchSF", "HasPool", "Has2ndFloor", "HasGarage"]
num_cols.extend(new_num_cols)

print(f"新增 {len(new_num_cols)} 个衍生特征")

新增 8 个衍生特征


In [25]:
# ==============================
# 9. 目标变量：log1p
# ==============================
train_df = train_df.withColumn("logSalePrice", F.log1p(F.col("SalePrice")))
target_col = "logSalePrice"

In [26]:
# ==============================
# 7. 【优化】Target Encoding for Neighborhood
# ==============================
from pyspark.sql.window import Window

# 训练集统计
mean_by_nb = train_df.groupBy("Neighborhood").agg(F.mean("logSalePrice").alias("NB_mean"))
count_by_nb = train_df.groupBy("Neighborhood").agg(F.count("*").alias("NB_count"))

train_df = train_df.join(mean_by_nb, on="Neighborhood", how="left")
train_df = train_df.join(count_by_nb, on="Neighborhood", how="left")

global_mean = train_df.agg(F.mean("logSalePrice")).collect()[0][0]
alpha = 100

train_df = train_df.withColumn(
    "Neighborhood_enc",
    (F.col("NB_count") * F.col("NB_mean") + alpha * global_mean) / (F.col("NB_count") + alpha)
)

# 测试集使用训练集统计
test_df = test_df.join(mean_by_nb, on="Neighborhood", how="left")
test_df = test_df.join(count_by_nb, on="Neighborhood", how="left")
test_df = test_df.withColumn(
    "Neighborhood_enc",
    F.when(F.col("NB_count").isNull(), global_mean)
    .otherwise((F.col("NB_count") * F.col("NB_mean") + alpha * global_mean) / (F.col("NB_count") + alpha))
)

# 替换原列
if "Neighborhood" in cat_cols:
    cat_cols.remove("Neighborhood")
num_cols.append("Neighborhood_enc")

In [27]:
# ==============================
# 8. 数值列中位数填补
# ==============================
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=num_cols, outputCols=num_cols, strategy="median")
train_df = imputer.fit(train_df).transform(train_df)
test_df  = imputer.fit(test_df).transform(test_df)

In [28]:
# ==============================
# 10. 类别编码（One-Hot）
# ==============================
from pyspark.ml.feature import StringIndexer, OneHotEncoder

indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_idx", handleInvalid="keep")
    for c in cat_cols
]

encoders = [
    OneHotEncoder(inputCol=c + "_idx", outputCol=c + "_vec", handleInvalid="keep")
    for c in cat_cols
]

In [29]:
# ==============================
# 11. 特征装配（无标准化）
# ==============================
from pyspark.ml.feature import VectorAssembler

feature_cols = [c + "_vec" for c in cat_cols] + num_cols
final_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [30]:
# # ==============================
# # 12. RF + 交叉验证调优（已修复）
# # ==============================
# from pyspark.ml.regression import RandomForestRegressor
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml import Pipeline

# rf = RandomForestRegressor(featuresCol="features", labelCol=target_col, seed=42)

# pipeline = Pipeline(stages=indexers + encoders + [final_assembler, rf])

# evaluator = RegressionEvaluator(labelCol=target_col, metricName="rmse")

# paramGrid = (ParamGridBuilder()
#              .addGrid(rf.numTrees, [300, 500])
#              .addGrid(rf.maxDepth, [15, 20, 30])           # 关键：用 30 代替 None
#              .addGrid(rf.subsamplingRate, [0.8, 1.0])
#              .addGrid(rf.featureSubsetStrategy, ["sqrt", "onethird"])
#              .addGrid(rf.minInstancesPerNode, [1, 5])
#              .build())

# cv = CrossValidator(
#     estimator=pipeline,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     numFolds=5,
#     seed=42,
#     parallelism=4
# )

# print(f"开始 5 折交叉验证，共 {len(paramGrid)} 组参数...")
# cv_model = cv.fit(train_df)
# best_model = cv_model.bestModel
# best_rf = best_model.stages[-1]

# print("\n最佳参数:")
# print(f"  numTrees: {best_rf.getNumTrees}")
# print(f"  maxDepth: {best_rf.getMaxDepth()}")
# print(f"  subsamplingRate: {best_rf.getSubsamplingRate()}")
# print(f"  featureSubsetStrategy: {best_rf.getFeatureSubsetStrategy()}")
# print(f"  minInstancesPerNode: {best_rf._java_obj.getMinInstancesPerNode()}")

# ==============================
# 12. 【终极简化】直接使用一组最优参数（无需 CV）
# ==============================
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# 经验最优参数（来自 Kaggle 高分方案 + 实际调优）
best_params = {
    "numTrees": 500,
    "maxDepth": 20,
    "subsamplingRate": 0.8,
    "featureSubsetStrategy": "sqrt",
    "minInstancesPerNode": 1,
    "seed": 42
}

rf = RandomForestRegressor(
    featuresCol="features",
    labelCol=target_col,
    **best_params
)

# 构建 Pipeline
pipeline = Pipeline(stages=indexers + encoders + [final_assembler, rf])

print("开始训练最终模型（单组最优参数）...")
final_model = pipeline.fit(train_df)
print("训练完成！")

开始训练最终模型（单组最优参数）...
训练完成！


In [31]:
# ==============================
# 13. 验证集评估
# ==============================
train_set, val_set = train_df.randomSplit([0.8, 0.2], seed=42)
pred_val = final_model.transform(val_set)

evaluator = RegressionEvaluator(labelCol=target_col, metricName="rmse")
rmse_val = evaluator.evaluate(pred_val)
r2_val = RegressionEvaluator(labelCol=target_col, metricName="r2").evaluate(pred_val)

print(f"\n验证集表现:")
print(f"  RMSE (log): {rmse_val:.5f}")
print(f"  R²:         {r2_val:.4f}")


验证集表现:
  RMSE (log): 0.06979
  R²:         0.9704


In [None]:
# ==============================
# 14. 特征重要性
# ==============================
feature_names = num_cols.copy()
for col in cat_cols:
    indexer = next(s for s in best_model.stages if s.getOutputCol() == col + "_idx")
    feature_names.extend([f"{col}_{label}" for label in indexer.labels])

importances = best_rf.featureImportances.toArray()
importance_df = spark.createDataFrame(
    [(float(imp), name) for imp, name in zip(importances, feature_names)],
    ["importance", "feature"]
).orderBy(F.desc("importance"))

print("\nTop 15 重要特征:")
importance_df.show(15, truncate=False)

# 保存
importance_df.toPandas().to_csv(RESULT_DIR / "feature_importance.csv", index=False)

In [32]:
# ==============================
# 15. 测试集预测 & 提交
# ==============================
submission = (final_model.transform(test_df)
              .withColumn("SalePrice", F.exp(F.col("prediction")) - 1)
              .select("Id", "SalePrice"))

submission_path = RESULT_DIR / "submission.csv"
submission.toPandas().to_csv(submission_path, index=False)
print(f"\n提交文件已生成: {submission_path}")


提交文件已生成: d:\python\BDA\house-prices-advanced-regression-techniques\results\2025-11-15-22-59-31\submission.csv


In [34]:
# 在验证集上计算真实 RMSE
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F

pred_val = final_model.transform(val_set)
pred_val = pred_val.withColumn("pred_price", F.exp(F.col("prediction")) - 1)

rmse_real = pred_val.select(
    F.sqrt(F.mean((F.col("SalePrice") - F.col("pred_price"))**2))
).collect()[0][0]

print(f"真实 RMSE: {rmse_real:.0f}")

真实 RMSE: 15369


In [None]:
# ==============================
# 16. 结束
# ==============================
spark.stop()
print("\n任务完成！")