In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings

# Suppress all warning messages so they do not clutter the output
warnings.filterwarnings('ignore')
# Fix NumPy's global random seed so experiment results are reproducible
np.random.seed(111)


def load_data():
    """Read the four CSV data sets from the current working directory.

    Returns:
        A 4-tuple ``(train, test, details, rent)`` of DataFrames. If any
        file cannot be read, the error is printed and a tuple of four
        ``None`` values is returned instead.
    """
    csv_names = (
        'ruc_Class25Q1_train.csv',
        'ruc_Class25Q1_test.csv',
        'ruc_Class25Q1_details.csv',
        'ruc_Class25Q1_rent.csv',
    )
    try:
        # Files are read in order: train, test, details, rent.
        frames = []
        for csv_name in csv_names:
            frames.append(pd.read_csv(csv_name, encoding='utf-8'))
        print("成功加载所有数据文件")
        return tuple(frames)
    except Exception as e:
        print(f"加载数据文件出错: {e}")
        return (None,) * 4


def preprocess_data(df):
    """Derive numeric features from the raw listing columns.

    The input frame is copied, never modified. Each group of derived
    columns is only produced when its source column is present.

    Args:
        df: Raw listing DataFrame (training or test data).

    Returns:
        A new DataFrame holding the original columns plus the derived ones.
        When a ``价格`` column exists, rows with a missing price are dropped
        and ``log_价格`` (``log1p`` of the price) is added.
    """
    out = df.copy()
    # None of the derived column names collide with a source column name,
    # so checking against the initial column set is sufficient.
    present = set(out.columns)

    # Floor area: keep the first numeric token of the text value.
    if '建筑面积' in present:
        area_number = out['建筑面积'].str.extract(r'(\d+\.?\d*)')
        out['建筑面积_数值'] = area_number.astype(float)

    # Ring-road band -> number; unknown or missing bands fall back to 4.5.
    if '环线' in present:
        ring_to_number = {
            '二至三环': 2.5, '三至四环': 3.5, '四至五环': 4.5,
            '五至六环': 5.5, '六环外': 6.5
        }
        out['环线数值'] = out['环线'].map(ring_to_number).fillna(4.5)

    # Layout text: number of rooms and bathrooms, plus their ratio.
    if '房屋户型' in present:
        layout = out['房屋户型']
        for column, pattern in (('房间数', r'(\d+)室'), ('卫生间数', r'(\d+)卫')):
            out[column] = layout.str.extract(pattern).astype(float)
        # A bathroom count of 0 is treated as 1 to avoid dividing by zero.
        safe_bathrooms = out['卫生间数'].replace(0, 1)
        out['室卫比'] = out['房间数'] / safe_bathrooms

    # Orientation: one 0/1 flag per compass character found in the text.
    if '房屋朝向' in present:
        facing = out['房屋朝向']
        compass_flags = (('朝南', '南'), ('朝北', '北'), ('朝东', '东'), ('朝西', '西'))
        for column, character in compass_flags:
            out[column] = facing.str.contains(character, na=False).astype(int)
        out['朝向数量'] = out[['朝南', '朝北', '朝东', '朝西']].sum(axis=1)
        both_south_and_north = out['朝南'].eq(1) & out['朝北'].eq(1)
        out['南北通透'] = both_south_and_north.astype(int)

    # Elevator: 1 only when the value is exactly '有'.
    if '配备电梯' in present:
        out['有电梯'] = out['配备电梯'].eq('有').astype(int)

    # Decoration level as an ordinal number; unmapped values become 1.
    if '装修情况' in present:
        decoration_to_number = {'毛坯': 0, '简装': 1, '精装': 2, '其他': 1.5}
        out['装修情况数值'] = out['装修情况'].map(decoration_to_number).fillna(1)

    # Usage: 1 only for '普通住宅'.
    if '房屋用途' in present:
        out['是否普通住宅'] = out['房屋用途'].eq('普通住宅').astype(int)

    # Floor band flags, matched as substrings of the floor description.
    if '所在楼层' in present:
        floor_text = out['所在楼层']
        floor_flags = (('是否底层', '低楼层'), ('是否顶层', '高楼层'), ('是否中层', '中楼层'))
        for column, keyword in floor_flags:
            out[column] = floor_text.str.contains(keyword, na=False).astype(int)

    # Only frames that carry a price (training data) get the target columns.
    if '价格' in present:
        out = out.dropna(subset=['价格'])
        # log1p makes the price distribution smoother for regression.
        out['log_价格'] = np.log1p(out['价格'])

    return out


def feature_engineering(train_df, test_df, details_df=None, rent_df=None):
    """Build aligned train/test feature matrices and the training target.

    Both input frames are copied, never modified. Features are selected from
    the columns produced by preprocessing, then extended with geographic,
    per-community and polynomial/interaction features where the required
    source columns exist.

    Missing values are imputed with statistics computed on the training
    features only (median for numeric columns, mode otherwise). The same
    fill value is applied to the test features, whether or not the training
    column itself contained missing values.

    Args:
        train_df: Preprocessed training frame. Must contain ``建筑面积_数值``
            and ``环线数值``.
        test_df: Preprocessed test frame with the same feature columns.
        details_df: Optional frame. It only gates the per-community price
            aggregates (it must be non-empty); its content is not read.
        rent_df: Accepted for interface compatibility; not used.

    Returns:
        ``(train_features, test_features, y_train)``. ``y_train`` is
        ``log_价格`` when present, otherwise ``价格``, otherwise ``None``.
    """
    train = train_df.copy()
    test = test_df.copy()

    base_features = ['建筑面积_数值', '环线数值']
    extended_features = [
        '房间数', '卫生间数', '室卫比',
        '朝南', '朝北', '朝东', '朝西', '朝向数量', '南北通透',
        '有电梯', '装修情况数值', '是否普通住宅',
        '是否底层', '是否顶层', '是否中层'
    ]

    # Optional features are used only when preprocessing produced them.
    feature_cols = base_features + [
        feature for feature in extended_features if feature in train.columns
    ]

    # Geographic features.
    if 'lon' in train.columns and 'lat' in train.columns:
        # Reference point used as the city centre; the distance is computed
        # directly on the raw lon/lat differences.
        center_lon, center_lat = 116.4, 39.9
        for frame in (train, test):
            frame['到市中心距离'] = np.sqrt(
                (frame['lon'] - center_lon) ** 2 + (frame['lat'] - center_lat) ** 2
            )
            # 0.1-wide grid cells. np.trunc truncates towards zero exactly
            # like an int cast, but keeps NaN coordinates as NaN (imputed
            # below) where an int cast would raise.
            frame['lon_grid'] = np.trunc(frame['lon'] * 10) / 10
            frame['lat_grid'] = np.trunc(frame['lat'] * 10) / 10
        feature_cols.extend(['到市中心距离', 'lon_grid', 'lat_grid'])

    # Per-community price aggregates (requires the price column).
    if (details_df is not None and not details_df.empty
            and '小区ID' in train.columns and '价格' in train.columns):
        # NOTE(review): these statistics include each training row's own
        # price, so they leak the target into the training features.
        # Consider out-of-fold aggregation before relying on them.
        avg_prices = train.groupby('小区ID')['价格'].agg(['mean', 'median', 'std']).reset_index()
        avg_prices.columns = ['小区ID', '小区均价', '小区中位价', '小区价格标准差']

        train = pd.merge(train, avg_prices, on='小区ID', how='left')
        test = pd.merge(test, avg_prices, on='小区ID', how='left')
        feature_cols.extend(['小区均价', '小区中位价', '小区价格标准差'])

    # Squared and interaction terms.
    for frame in (train, test):
        frame['面积平方'] = frame['建筑面积_数值'] ** 2
        frame['环线面积'] = frame['环线数值'] * frame['建筑面积_数值']
    feature_cols.extend(['面积平方', '环线面积'])

    train_features = train[feature_cols].copy()
    test_features = test[feature_cols].copy()

    # Convert categoricals and impute missing values.
    for col in feature_cols:
        if train_features[col].dtype.name == 'category':
            train_features[col] = train_features[col].astype(float)
            test_features[col] = test_features[col].astype(float)

        train_has_gap = train_features[col].isna().any()
        test_has_gap = test_features[col].isna().any()
        if not (train_has_gap or test_has_gap):
            continue

        # The fill value always comes from the training data.
        if pd.api.types.is_numeric_dtype(train_features[col]):
            fill_value = train_features[col].median()
        else:
            modes = train_features[col].mode()
            if modes.empty:
                # Column is entirely missing in training; nothing to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back instead of using inplace=True on a column
        # selection, which does not update the frame under Copy-on-Write.
        train_features[col] = train_features[col].fillna(fill_value)
        test_features[col] = test_features[col].fillna(fill_value)

    # Target: prefer the log-transformed price.
    if 'log_价格' in train.columns:
        y_train = train['log_价格']
    elif '价格' in train.columns:
        y_train = train['价格']
    else:
        y_train = None

    print(f"使用的特征: {feature_cols}")
    return train_features, test_features, y_train


def train_advanced_model(X_train, X_val, y_train, y_val):
    """Fit several regressors and report MAE in-sample, on validation and by CV.

    The linear models (OLS, LASSO, ElasticNet) are trained on standardized
    features; the tree ensembles are trained on the raw features. Each model
    is also scored by 6-fold cross-validated MAE on the training split, and
    the model with the lowest CV MAE is reported as best.

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix.
        y_train: Training target.
        y_val: Validation target.

    Returns:
        ``(results, best_result)``. ``results`` maps model name to a dict
        with keys ``train_mae``, ``val_mae``, ``train_rmse``, ``val_rmse``,
        ``cv_mae``, ``model`` and ``scaler`` (the fitted scaler for linear
        models, ``None`` otherwise). ``best_result`` is the entry with the
        lowest ``cv_mae``.
    """
    print("\n训练高级模型...")

    # One scaler, fitted on the training split, shared by the linear models.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Names of the models that consume the standardized features.
    scaled_model_names = ('OLS', 'LASSO', 'ElasticNet')

    models = {
        'OLS': LinearRegression(),
        'LASSO': Lasso(alpha=0.001, max_iter=10000, random_state=111),
        'GradientBoosting': GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=4,
            min_samples_split=10,
            random_state=111
        ),
        'RandomForest': RandomForestRegressor(
            n_estimators=200,
            max_depth=15,
            min_samples_split=10,
            random_state=111
        ),
        'ElasticNet': ElasticNet(
            alpha=0.001,
            l1_ratio=0.5,
            max_iter=5000,
            random_state=111
        )
    }

    kf = KFold(n_splits=6, shuffle=True, random_state=111)
    results = {}
    best_model, best_score = None, float('inf')

    # Table header.
    print("\n" + "=" * 80)
    print("{:<15} {:<15} {:<15} {:<15}".format("模型", "In sample", "Out of sample", "Cross-validation"))
    print("-" * 80)

    for name, model in models.items():
        uses_scaled = name in scaled_model_names
        if uses_scaled:
            fit_matrix, val_matrix = X_train_scaled, X_val_scaled
        else:
            fit_matrix, val_matrix = X_train, X_val

        # Fit on the training split, then predict on both splits.
        model.fit(fit_matrix, y_train)
        y_train_pred = model.predict(fit_matrix)
        y_val_pred = model.predict(val_matrix)

        # The scorer returns negated MAE, so flip the sign back.
        cv_scores = -cross_val_score(model, fit_matrix, y_train,
                                     scoring='neg_mean_absolute_error', cv=kf)
        cv_mae = cv_scores.mean()

        train_mae = mean_absolute_error(y_train, y_train_pred)
        val_mae = mean_absolute_error(y_val, y_val_pred)

        results[name] = {
            'train_mae': train_mae,
            'val_mae': val_mae,
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
            'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred)),
            'cv_mae': cv_mae,
            'model': model,
            'scaler': scaler if uses_scaled else None
        }

        # Table row.
        print("{:<15} {:<15.4f} {:<15.4f} {:<15.4f}".format(
            name, train_mae, val_mae, cv_mae))

        # Track the lowest CV MAE seen so far; ties keep the earlier model.
        if cv_mae < best_score:
            best_score, best_model = cv_mae, name

    print("=" * 80)
    print(f"\n最佳模型: {best_model}, CV MAE: {results[best_model]['cv_mae']:.4f}")

    return results, results[best_model]


def generate_predictions(test_data, model_results, test_features, is_log_transformed=False):
    """Predict with every model and write one CSV file per model.

    Args:
        test_data: Frame providing the ``ID`` column for the output files.
        model_results: Mapping of model name to a dict holding a fitted
            ``model`` and an optional ``scaler``.
        test_features: Feature matrix for the test set.
        is_log_transformed: When true, predictions are mapped back with
            ``expm1`` before being written.

    Returns:
        Dict mapping model name to its array of predictions (after inverse
        transform and clipping at zero).
    """
    print("\n生成各模型预测结果...")

    submissions = {}

    for name, result in model_results.items():
        model = result['model']
        scaler = result.get('scaler')

        # Models stored with a scaler expect scaled inputs.
        model_input = test_features if scaler is None else scaler.transform(test_features)
        predictions = model.predict(model_input)

        # Undo the log1p target transform when it was used for training.
        if is_log_transformed:
            predictions = np.expm1(predictions)

        # Clip negative predictions to zero.
        predictions = np.maximum(predictions, 0)

        filename = f'prediction_{name}.csv'
        pd.DataFrame({
            'ID': test_data['ID'],
            '价格': predictions
        }).to_csv(filename, index=False)
        print(f"已生成{name}模型提交文件: {filename}")

        submissions[name] = predictions

    # Returned so callers can analyse the predictions further.
    return submissions


def main():
    """Run the full pipeline: load, preprocess, train, predict, save.

    Any exception raised along the way is caught, reported with its
    traceback, and not re-raised.
    """
    try:
        print("开始加载数据...")
        train_data, test_data, details_data, rent_data = load_data()
        if train_data is None:
            # Loading failed; load_data has already reported the error.
            return

        print("数据预处理...")
        train_processed = preprocess_data(train_data)
        test_processed = preprocess_data(test_data)

        print("特征工程...")
        X_train_full, X_test_full, y_train_full = feature_engineering(
            train_processed, test_processed, details_data, rent_data)

        # Drop samples whose target lies 3 or more standard deviations
        # away from the mean.
        if y_train_full is not None:
            deviation = y_train_full - y_train_full.mean()
            keep_mask = np.abs(deviation / y_train_full.std()) < 3.0
            X_train_full = X_train_full[keep_mask]
            y_train_full = y_train_full[keep_mask]
            print(f"移除异常值后的样本数: {len(y_train_full)}")

        print("划分训练集和验证集...")
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full, test_size=0.2, random_state=111)

        print("训练高级模型...")
        model_results, best_model_result = train_advanced_model(X_train, X_val, y_train, y_val)

        print("生成提交文件...")
        # Predictions must be inverse-transformed if the log target was used.
        is_log_transformed = 'log_价格' in train_processed.columns
        generate_predictions(test_data, model_results, X_test_full, is_log_transformed)

        # Pick the model with the lowest CV MAE; ties keep the earlier one.
        model_names = list(model_results.keys())
        best_model_name = model_names[0]
        for candidate in model_names[1:]:
            if model_results[candidate]['cv_mae'] < model_results[best_model_name]['cv_mae']:
                best_model_name = candidate

        print(f"\n最终最佳模型: {best_model_name}")
        # Re-save the best model's predictions under a fixed file name.
        best_submission = pd.read_csv(f'prediction_{best_model_name}.csv')
        best_submission.to_csv('prediction_best.csv', index=False)
        print("已生成最佳模型提交文件: prediction_best.csv")

    except Exception as e:
        import traceback
        print(f"执行过程中出错: {e}")
        traceback.print_exc()

# Run the pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()

开始加载数据...
成功加载所有数据文件
数据预处理...
特征工程...
使用的特征: ['建筑面积_数值', '环线数值', '房间数', '卫生间数', '室卫比', '朝南', '朝北', '朝东', '朝西', '朝向数量', '南北通透', '有电梯', '装修情况数值', '是否普通住宅', '是否底层', '是否顶层', '是否中层', '到市中心距离', 'lon_grid', 'lat_grid', '面积平方', '环线面积']
移除异常值后的样本数: 83813
划分训练集和验证集...
训练高级模型...

训练高级模型...

模型              In sample       Out of sample   Cross-validation
--------------------------------------------------------------------------------
OLS             0.4097          0.4094          0.4109         
LASSO           0.4100          0.4097          0.4111         
GradientBoosting 0.1843          0.1849          0.1867         
RandomForest    0.1173          0.1360          0.1389         
ElasticNet      0.4099          0.4096          0.4110         

最佳模型: RandomForest, CV MAE: 0.1389
生成提交文件...

生成各模型预测结果...
已生成OLS模型提交文件: prediction_OLS.csv
已生成LASSO模型提交文件: prediction_LASSO.csv
已生成GradientBoosting模型提交文件: prediction_GradientBoosting.csv
已生成RandomForest模型提交文件: prediction_RandomForest.csv
已生成ElasticNe