In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the rest of the notebook.
# The chain is wrapped in parentheses rather than joined with backslashes.
spark = (
    SparkSession.builder
    .appName("HousePricePrediction")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.default.parallelism", "100")
    .getOrCreate()
)

In [None]:
from pathlib import Path

# The input CSVs are expected in the `files` directory beside the current
# working directory.
DATA_DIR = Path.cwd().parent / 'files'
train_path, test_path = (str(DATA_DIR / name) for name in ("train.csv", "test.csv"))


def _load_csv(path):
    """Read one CSV file into a Spark DataFrame.

    The first row is taken as the header, column types are inferred from the
    data, and the literal string "NA" is read as null.
    """
    reader = spark.read
    for key, value in (("header", "true"), ("inferSchema", "true"), ("nullValue", "NA")):
        reader = reader.option(key, value)
    return reader.csv(path)


train_df = _load_csv(train_path)
test_df = _load_csv(test_path)

# Show what was inferred for each data set.
print("=== Train Schema ===")
train_df.printSchema()
print(f"Train rows: {train_df.count()}, cols: {len(train_df.columns)}")

print("\n=== Test Schema ===")
test_df.printSchema()

In [None]:
# === Split columns by type (based on the schema Spark inferred) ===

from pyspark.sql.types import IntegerType, DoubleType, StringType, NumericType

# Columns that must never become model inputs: "Id" is only a row identifier
# (it is carried through to the submission file), and "SalePrice" is the target.
non_feature_cols = {"Id", "SalePrice"}

# NumericType is the common base of Spark's numeric types, so a column inferred
# as e.g. LongType is no longer silently left out of both lists.
num_cols = [field.name for field in train_df.schema.fields
            if isinstance(field.dataType, NumericType)
            and field.name not in non_feature_cols]

cat_cols = [field.name for field in train_df.schema.fields
            if isinstance(field.dataType, StringType)
            and field.name not in non_feature_cols]

# Preview the first ten names of each group.
print(f"数值列 ({len(num_cols)}): {num_cols[:10]}...")
print(f"类别列 ({len(cat_cols)}): {cat_cols[:10]}...")

In [None]:
from pyspark.sql import functions as F
# === 数值列：中位数填补 ===
from pyspark.ml.feature import Imputer

# Median imputation for the numeric feature columns; results overwrite the
# original columns because input and output names are the same.
imputer = Imputer(inputCols=num_cols, outputCols=num_cols, strategy="median")

# Fit once on the training data and reuse those medians for the test data.
# Fitting a second imputer on test_df would fill test rows with statistics that
# differ from the ones the model is trained on.
# NOTE(review): the medians are computed before the train/validation split made
# later in the notebook, so validation rows contribute to them.
imputer_model = imputer.fit(train_df)
train_df = imputer_model.transform(train_df)
test_df = imputer_model.transform(test_df)

# === Categorical columns: replace nulls with the literal "Missing" ===
# A single fillna over the whole subset replaces the per-column loop.
train_df = train_df.fillna("Missing", subset=cat_cols)
test_df = test_df.fillna("Missing", subset=cat_cols)

# Sanity check: null count per numeric feature column in the training data.
print("数值列缺失统计:")
train_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in num_cols]).show()

In [None]:
# === 类别编码：StringIndexer + OneHotEncoder ===
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# For every categorical column build two stages: a StringIndexer writing
# "<col>_idx" and a OneHotEncoder turning that index into "<col>_vec".
# Both stages use handleInvalid="keep".
indexers = []
encoders = []
for cat_name in cat_cols:
    idx_name = cat_name + "_idx"
    vec_name = cat_name + "_vec"
    indexers.append(
        StringIndexer(inputCol=cat_name, outputCol=idx_name, handleInvalid="keep")
    )
    encoders.append(
        OneHotEncoder(inputCol=idx_name, outputCol=vec_name, handleInvalid="keep")
    )

# 数值特征标准化
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric branch: pack the numeric columns into one vector, then standardize it
# (both centering and scaling are enabled).
assembler_num = VectorAssembler(outputCol="num_features", inputCols=num_cols)
scaler = StandardScaler(
    inputCol="num_features",
    outputCol="scaled_num",
    withMean=True,
    withStd=True,
)

# Final feature vector: every one-hot "<col>_vec" column followed by the scaled
# numeric vector.
feature_cols = [name + "_vec" for name in cat_cols]
feature_cols.append("scaled_num")
final_assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# 目标：对 SalePrice 取 log
import pyspark.sql.functions as F
# The label is log1p(SalePrice), stored in a new column of the training frame.
target_col = "logSalePrice"
train_df = train_df.withColumn(target_col, F.log1p(F.col("SalePrice")))

In [None]:
# === 构建完整 Pipeline ===
from pyspark.ml.regression import RandomForestRegressor

# Random-forest regressor on the assembled "features" vector, predicting the
# log-transformed target. The seed is fixed so runs are repeatable.
rf_params = dict(
    featuresCol="features",
    labelCol=target_col,
    numTrees=300,
    maxDepth=12,
    subsamplingRate=0.8,
    seed=42,
)
rf = RandomForestRegressor(**rf_params)

# Stage order: index categories, one-hot encode them, assemble and scale the
# numeric columns, merge everything into "features", then fit the forest.
rf_stages = [*indexers, *encoders, assembler_num, scaler, final_assembler, rf]
pipeline_rf = Pipeline(stages=rf_stages)

In [None]:
# Split the labelled data 80/20 into training and validation sets (seeded).
split_weights = [0.8, 0.2]
train_set, val_set = train_df.randomSplit(split_weights, seed=42)

# Fit all pipeline stages on the training split, then predict on the
# validation split.
model_rf = pipeline_rf.fit(train_set)
pred_rf = model_rf.transform(val_set)

from pyspark.ml.evaluation import RegressionEvaluator
# Score the validation predictions against the log-transformed label:
# RMSE via `evaluator`, R² via a second evaluator.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=target_col)
r2_evaluator = RegressionEvaluator(metricName="r2", labelCol=target_col)

rmse_rf = evaluator.evaluate(pred_rf)
r2_rf = r2_evaluator.evaluate(pred_rf)
print(f"RF RMSE (log): {rmse_rf:.4f}, R²: {r2_rf:.4f}")

In [None]:
from datetime import datetime
import shutil

# === Predict on the test set and write the submission file ===
test_pred = model_rf.transform(test_df)
# The label was log1p(SalePrice); expm1 is the exact inverse of that transform.
test_pred = test_pred.withColumn("SalePrice_pred", F.expm1(F.col("prediction")))

# Timestamped result sub-directory, e.g. results/2025-11-11-14-30-00
time_str   = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
RESULT_DIR = Path.cwd().parent / 'results' / time_str
RESULT_DIR.mkdir(parents=True, exist_ok=True)

# Spark writes a directory of part files, so write to a temporary directory
# first. coalesce(1) keeps the output in a single part file.
temp_out   = RESULT_DIR / "temp_csv"
result = test_pred.select("Id", "SalePrice_pred").withColumnRenamed("SalePrice_pred", "SalePrice")
result.coalesce(1).write.csv(str(temp_out), header=True, mode="overwrite")

# Locate the part file and move it to submission.csv. Fail with an explicit
# message (instead of a bare IndexError) if Spark produced no part file here.
part_files = sorted(temp_out.glob("part-*-*.csv"))
if not part_files:
    raise FileNotFoundError(f"No Spark part-*.csv file found under {temp_out}")
csv_file = part_files[0]
final_file = RESULT_DIR / "submission.csv"
shutil.move(str(csv_file), str(final_file))

# Remove the temporary directory and whatever else Spark left in it
# (e.g. _SUCCESS and .crc files).
shutil.rmtree(temp_out)