In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')
np.random.seed(111)

In [3]:
def load_data():
    """加载所有相关数据集"""
    try:
        #注意处理中文字符
        train_data = pd.read_csv('C:/Users/surface/ruc_Class25Q1_train.csv', encoding='utf-8')
        test_data = pd.read_csv('C:/Users/surface/ruc_Class25Q1_test.csv', encoding='utf-8')
        details_data = pd.read_csv('C:/Users/surface/ruc_Class25Q1_details.csv', encoding='utf-8')
        rent_data = pd.read_csv('C:/Users/surface/ruc_Class25Q1_rent.csv', encoding='utf-8')
        print("成功加载所有数据文件")
        return train_data, test_data, details_data, rent_data
    except Exception as e:
        print(f"加载数据文件出错: {e}")
        return None, None, None, None

In [4]:
def preprocess_data(df, is_train=True):
    """数据预处理"""
    #复制数据：防止修改原数据
    df_processed = df.copy()
    
    # 价格处理
    if is_train and '价格' in df_processed.columns:
        df_processed = df_processed.dropna(subset=['价格'])

    # 环线处理
    ring_mapping = {'二环内': 1, '二至三环': 2, '三至四环': 4, '四至五环': 6, '五至六环': 8, '六环外': 10}
    if '环线' in df_processed.columns:
        # 映射转换+填补空缺
        df_processed['环线数值'] = df_processed['环线'].map(ring_mapping).fillna(4)
    else:
        df_processed['环线数值'] = 4
    
    # 提取建筑面积数值
    if '建筑面积' in df_processed.columns:
        #转化文本为浮点数
        df_processed['建筑面积_数值'] = df_processed['建筑面积'].str.extract(r'(\d+\.?\d*)').astype(float)
        df_processed['建筑面积_数值'] = df_processed['建筑面积_数值'].fillna(df_processed['建筑面积_数值'].median())
    else:
        df_processed['建筑面积_数值'] = 90.0
    
    # 处理房屋户型
    if '房屋户型' in df_processed.columns:
        df_processed['房间数'] = df_processed['房屋户型'].str.extract(r'(\d+)室').astype(float).fillna(2)
        df_processed['厅数'] = df_processed['房屋户型'].str.extract(r'(\d+)厅').astype(float).fillna(1)
        df_processed['卫数'] = df_processed['房屋户型'].str.extract(r'(\d+)卫').astype(float).fillna(1)
        #对户型进行得分加权得到总房间数
        df_processed['总房间数'] = df_processed['房间数']*2.0 + df_processed['厅数']*1.0 + df_processed['卫数']*0.5
    else:
        df_processed['房间数'] = 2
        df_processed['厅数'] = 1
        df_processed['卫数'] = 1
        df_processed['总房间数'] = 4
    
    # 处理朝向
    if '房屋朝向' in df_processed.columns:
        for direction in ['南', '北', '东', '西']:
            df_processed[f'朝{direction}'] = df_processed['房屋朝向'].str.contains(direction, na=False).astype(int)
    else:
        df_processed['朝南'], df_processed['朝北'] = 1, 0
        df_processed['朝东'], df_processed['朝西'] = 1, 0
    
    # 处理电梯和装修情况
    if '配备电梯' in df_processed.columns:
        df_processed['有电梯'] = (df_processed['配备电梯'] == '有').astype(int)
    else:
        df_processed['有电梯'] = 0
    
    if '装修情况' in df_processed.columns:
        decoration_map = {'毛坯': 0, '简装': 2, '精装': 4, '其他': 1}
        df_processed['装修情况数值'] = df_processed['装修情况'].map(decoration_map).fillna(2)
    else:
        df_processed['装修情况数值'] = 2
    
    # 处理楼层信息
    if '所在楼层' in df_processed.columns:
        df_processed['楼层类型'] = df_processed['所在楼层'].str.extract(r'(低|中|高)').fillna('中')
        floor_type_map = {'低': 0.7, '中': 1.0, '高': 1.3}
        df_processed['楼层类型数值'] = df_processed['楼层类型'].map(floor_type_map)
        df_processed['总楼层'] = df_processed['所在楼层'].str.extract(r'共(\d+)层').astype(float).fillna(10)
    else:
        df_processed['楼层类型数值'] = 1.0
        df_processed['总楼层'] = 10
    
    return df_processed

In [5]:
def extract_community_features(train_df, test_df, details_df, rent_df):
    """提取和合并更详细的信息"""
    train_enriched = train_df.copy()
    test_enriched = test_df.copy()
    
    # 从details_df提取小区信息
    if details_df is not None and not details_df.empty and '名称' in details_df.columns:
        community_details = details_df[['名称', '环线位置', '容 积 率', '停车位', 'coord_x', 'coord_y']].copy()
        community_details = community_details.rename(columns={
            '名称': '小区名称', '环线位置': '小区环线', '容 积 率': '小区容积率'
        })
        #小区名称列去除重复记录，确保每个小区仅保留一条数据。
        community_details = community_details.drop_duplicates('小区名称')
        
        #左连接将信息合并到房源中
        if '小区名称' in train_enriched.columns:
            train_enriched = pd.merge(train_enriched, community_details, on='小区名称', how='left')
            test_enriched = pd.merge(test_enriched, community_details, on='小区名称', how='left')
    
    # 从rent_df提取租金信息
    if rent_df is not None and not rent_df.empty and '小区名称' in rent_df.columns:
        try:
            rent_df['面积数值'] = pd.to_numeric(
                rent_df['面积'].str.extract(r'(\d+\.?\d*)')[0], errors='coerce'
            ).fillna(70)
            
            rent_df['每平米租金'] = rent_df['价格'] / rent_df['面积数值']
            
            # 仅保留均值和波动
            rent_mean = rent_df.groupby('小区名称')['价格'].mean().rename('小区平均租金')
            rent_std = rent_df.groupby('小区名称')['价格'].std().rename('小区租金波动')
            per_sqm_mean = rent_df.groupby('小区名称')['每平米租金'].mean().rename('小区租金面积比')
            
            avg_rent = pd.DataFrame({'小区名称': rent_mean.index})
            avg_rent['小区平均租金'] = rent_mean.values
            avg_rent['小区租金波动'] = rent_std.reindex(rent_mean.index).values
            avg_rent['小区租金面积比'] = per_sqm_mean.reindex(rent_mean.index).values
            
            if '小区名称' in train_enriched.columns:
                train_enriched = pd.merge(train_enriched, avg_rent, on='小区名称', how='left')
                test_enriched = pd.merge(test_enriched, avg_rent, on='小区名称', how='left')
        except Exception as e:
            print(f"处理租金数据时出错: {e}")
            for df in [train_enriched, test_enriched]:
                df['小区平均租金'] = 4000
                df['小区租金波动'] = 500
                df['小区租金面积比'] = 60
    
    # 计算小区价格统计
    if '小区名称' in train_enriched.columns and '价格' in train_enriched.columns:
        community_price_stats = train_enriched.groupby('小区名称')['价格'].agg(
            ['mean', 'std', 'count']
        ).reset_index()
        
        community_price_stats.columns = ['小区名称', '小区均价', '小区价格波动', '小区房源数']
        
        if '建筑面积_数值' in train_enriched.columns:
            train_enriched['每平米价格'] = train_enriched['价格'] / train_enriched['建筑面积_数值']
            per_sqm_stats = train_enriched.groupby('小区名称')['每平米价格'].mean().reset_index()
            per_sqm_stats.columns = ['小区名称', '小区每平米均价']
            community_price_stats = pd.merge(community_price_stats, per_sqm_stats, on='小区名称', how='left')
        
        train_enriched = pd.merge(train_enriched, community_price_stats, on='小区名称', how='left')
        test_enriched = pd.merge(test_enriched, community_price_stats, on='小区名称', how='left')
    
    return train_enriched, test_enriched

In [None]:
def create_features(train_df, test_df):
    """创建高级特征和交互项"""
    train_featured = train_df.copy()
    test_featured = test_df.copy()
    
    # 北京市中心坐标
    CENTER_LAT, CENTER_LON = 39.909946, 116.397428
    
    for df in [train_featured, test_featured]:
        # 地理位置特征
        if 'lon' in df.columns and 'lat' in df.columns:
            df['lon'] = pd.to_numeric(df['lon'], errors='coerce').fillna(CENTER_LON)
            df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(CENTER_LAT)
            
            try:
                # 计算到市中心的距离及其变换
                df['到市中心距离_km'] = np.sqrt(
                    ((df['lat'] - CENTER_LAT) * 111) ** 2 + 
                    ((df['lon'] - CENTER_LON) * 111 * np.cos(np.radians(CENTER_LAT))) ** 2
                )
                df['到市中心距离_km'] = df['到市中心距离_km'].clip(0, 50)
                df['距离平方'] = df['到市中心距离_km'] ** 2
                df['距离倒数'] = 1 / (df['到市中心距离_km'] + 0.1)
                df['中心区域'] = (df['到市中心距离_km'] < 5).astype(int) * 2
            except:
                df['到市中心距离_km'] = 5.0
                df['距离平方'] = 25.0
                df['距离倒数'] = 0.2
                df['中心区域'] = 0
        
        # 面积特征及变换
        if '建筑面积_数值' in df.columns:
            df['建筑面积平方'] = df['建筑面积_数值'] ** 2
            df['建筑面积平方根'] = np.sqrt(df['建筑面积_数值'])
            
            # 房间交互特征
            if '房间数' in df.columns:
                df['房间数'] = df['房间数'].replace(0, 1)
                df['平均房间面积'] = df['建筑面积_数值'] / df['房间数']
                df['平均房间面积'] = df['平均房间面积'].clip(5, 100)
            
            # 户型特征
            if '厅数' in df.columns and '卫数' in df.columns:
                df['卧厅比'] = df['房间数'] / df['厅数'].replace(0, 0.5)
                df['卧卫比'] = df['房间数'] / df['卫数'].replace(0, 0.5)
                df['户型质量分'] = df['房间数'] * 1.5 + df['厅数'] * 1.0 + df['卫数'] * 0.8
                
                for col in ['卧厅比', '卧卫比']:
                    df[col] = df[col].clip(0.2, 5)
        
        # 朝向特征
        if all(f'朝{d}' in df.columns for d in ['南', '东', '西', '北']):
            df['朝向质量分'] = df['朝南']*6 + df['朝东']*3 + df['朝西']*1 - df['朝北']*1
            df['南北通透'] = ((df['朝南'] == 1) & (df['朝北'] == 1)).astype(int) * 3
            df['东西通透'] = ((df['朝东'] == 1) & (df['朝西'] == 1)).astype(int)
            df['黄金朝向'] = ((df['朝南'] == 1) & (df['朝东'] == 1)).astype(int) * 2.5
            
            # 朝向与面积交互
            if '建筑面积_数值' in df.columns:
                df['南向大户型'] = ((df['朝南'] == 1) & (df['建筑面积_数值'] > 120)).astype(int) * 2
                df['朝向面积交互'] = df['朝向质量分'] * np.log1p(df['建筑面积_数值'])
        
        # 楼层特征
        if '楼层类型数值' in df.columns and '总楼层' in df.columns:
            df['楼层比例'] = df['楼层类型数值'] / df['总楼层']
            df['楼层比例'] = df['楼层比例'].clip(0, 1)
            
            # 电梯楼层交互
            if '有电梯' in df.columns:
                df['高层无电梯惩罚'] = ((df['总楼层'] > 6) & (df['有电梯'] == 0)).astype(int) * -1.5
                df['高层有电梯'] = ((df['总楼层'] > 6) & (df['有电梯'] == 1)).astype(int) * 1.2
                df['电梯溢价'] = df['有电梯'] * np.log1p(df['总楼层'])
        
        # 环线面积交互
        if '建筑面积_数值' in df.columns and '环线数值' in df.columns:
            df['面积环线比'] = df['建筑面积_数值'] / df['环线数值'] ** 2
            df['面积环线比'] = df['面积环线比'].clip(0, 200)
            df['环内大户型'] = ((df['环线数值'] <= 3) & (df['建筑面积_数值'] > 120)).astype(int) * 3
            df['环线倒数'] = 1 / (df['环线数值'] + 0.1)
        
        # 装修面积交互
        if '建筑面积_数值' in df.columns and '装修情况数值' in df.columns:
            df['面积装修交互'] = df['建筑面积_数值'] * df['装修情况数值'] ** 1.5
            
            # 装修年代特征
            if '年份' in df.columns:
                current_year = 2023
                df['建筑年龄'] = current_year - df['年份']
                df['建筑年龄'] = df['建筑年龄'].clip(0, 70)
                df['装修折旧'] = df['装修情况数值'] * np.exp(-0.05 * df['建筑年龄'])
        
        # 距离面积交互
        if '建筑面积_数值' in df.columns and '到市中心距离_km' in df.columns:
            df['到市中心距离_km'] = df['到市中心距离_km'].replace(0, 0.1)
            df['面积距离交互'] = df['建筑面积_数值'] / (df['到市中心距离_km'] ** 1.5)
            df['面积距离交互'] = df['面积距离交互'].clip(0, 1000)
            df['中心区大户型'] = ((df['到市中心距离_km'] < 5) & (df['建筑面积_数值'] > 120)).astype(int) * 4
        
        # 小区价格特征
        if '小区均价' in df.columns:
            df['小区价值指数'] = df['小区均价'] / df['环线数值']
            
            if '到市中心距离_km' in df.columns:
                df['价格距离比'] = df['小区均价'] / (df['到市中心距离_km'] ** 1.2)
                df['价格距离比'] = df['价格距离比'].clip(0, 200000)
            
            # 价格波动特征
            if '小区价格波动' in df.columns:
                df['价格波动率'] = df['小区价格波动'] / (df['小区均价'] + 1)
                df['价格波动率'] = df['价格波动率'].clip(0, 0.5)
        
        # 租金回报率
        if '小区平均租金' in df.columns and '小区均价' in df.columns:
            df['租金回报率'] = (df['小区平均租金'] * 12) / (df['小区均价'] + 1) * 100
            df['租金回报率'] = df['租金回报率'].clip(0, 10)
            
            # 投资稳定性
            if '小区租金波动' in df.columns:
                df['投资稳定性'] = 1 - df['小区租金波动'] / (df['小区平均租金'] + 1)
                df['投资稳定性'] = df['投资稳定性'].clip(0, 1)
    
    return train_featured, test_featured


In [7]:
def main():
    # 数据加载
    train_data, test_data, details_data, rent_data = load_data()
    if train_data is None:
        print("无法加载数据，程序终止")
        return
    
    # 保存测试集ID
    test_ids = test_data['ID'].copy()
    
    # 数据预处理
    train_processed = preprocess_data(train_data, is_train=True)
    test_processed = preprocess_data(test_data, is_train=False)
    
    # 提取小区特征
    train_enriched, test_enriched = extract_community_features(train_processed, test_processed, details_data, rent_data)
    
    # 创建高级特征
    train_featured, test_featured = create_features(train_enriched, test_enriched)
    
    # 处理缺失值和极端值
    for col in train_featured.select_dtypes(include=[np.number]).columns: #筛选出所有数值型列
        if col in test_featured.columns:
            # 替换无穷大为NaN
            train_featured[col] = train_featured[col].replace([np.inf, -np.inf], np.nan)
            test_featured[col] = test_featured[col].replace([np.inf, -np.inf], np.nan)
            
            # 填充缺失值
            median_val = train_featured[col].median()
            if pd.isna(median_val): median_val = 0
            train_featured[col] = train_featured[col].fillna(median_val)
            test_featured[col] = test_featured[col].fillna(median_val)

            #剪切极端值：1%分位数和99%分位数
            try:
                q1, q99 = train_featured[col].quantile(0.01), train_featured[col].quantile(0.99)
                train_featured[col] = train_featured[col].clip(q1, q99)
                test_featured[col] = test_featured[col].clip(q1, q99)
            except:
                pass
    
    # 准备特征矩阵和目标变量
    numerical_features = train_featured.select_dtypes(include=[np.number]).columns.tolist()
    excluded_cols = ['ID', '价格', '每平米价格']
    feature_columns = [col for col in numerical_features if col not in excluded_cols]
    final_features = [col for col in feature_columns if col in train_featured.columns and col in test_featured.columns]
    
    X_train_raw = train_featured[final_features].copy()
    y_raw = train_featured['价格']
    y = np.log1p(y_raw)  # 对数变换
    X_test_raw = test_featured[final_features].copy()
    
    # 标准化特征
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    
    # 特征选择
    n_features = min(40, len(final_features))
    try:
        #基于F统计量进行选择
        selector = SelectKBest(f_regression, k=n_features)
        selector.fit(X_train, y)
        #返回被选中的列
        selected_indices = selector.get_support(indices=True)
        selected_features = [final_features[i] for i in selected_indices]
        X_train_selected = X_train[:, selected_indices]
        X_test_selected = X_test[:, selected_indices]
        
        print(f"选择的{len(selected_features)}个最重要特征:")
        for feature in selected_features[:len(selected_features)]:
            print(f"- {feature}")
    except:
        print("特征选择失败，使用所有特征")
        X_train_selected = X_train
        X_test_selected = X_test
        selected_features = final_features
    
    # 划分训练集和验证集
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train_selected, y, test_size=0.2, random_state=111
    )
    
    # 定义模型
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(alpha=0.5),
        'Lasso': Lasso(alpha=0.0003, max_iter=5000),
        'ElasticNet': ElasticNet(alpha=0.0003, l1_ratio=0.7, max_iter=5000),
        'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=15, 
                                              min_samples_split=5, min_samples_leaf=2, 
                                              max_features='sqrt', random_state=111)
    }
    
    # 训练和评估模型
    results = []
    all_predictions = {}
    
    for name, model in models.items():
        print(f"\n训练模型: {name}")
        
        # 训练模型
        model.fit(X_train_split, y_train_split)
        
        # 预测
        y_train_pred = model.predict(X_train_split)
        y_val_pred = model.predict(X_val)
        
        # 计算误差
        train_mae = mean_absolute_error(np.expm1(y_train_split), np.expm1(y_train_pred))
        val_mae = mean_absolute_error(np.expm1(y_val), np.expm1(y_val_pred))
        train_rmse = np.sqrt(((np.expm1(y_train_split) - np.expm1(y_train_pred)) ** 2).mean())
        val_rmse = np.sqrt(((np.expm1(y_val) - np.expm1(y_val_pred)) ** 2).mean())
        
        # 交叉验证
        cv = KFold(n_splits=6, shuffle=True, random_state=111)
        cv_scores = cross_val_score(model, X_train_selected, y, cv=cv, scoring='neg_mean_absolute_error')
        cv_mae = np.mean(-cv_scores)
        
        results.append({
            '模型': name,
            '训练集MAE': train_mae,
            '测试集MAE': val_mae,
            '训练集RMSE': train_rmse,
            '测试集RMSE': val_rmse,
            '交叉验证MAE': cv_mae
        })
        
        # 训练完整模型并预测
        model.fit(X_train_selected, y)
        test_pred_log = model.predict(X_test_selected)
        test_pred = np.expm1(test_pred_log)
        all_predictions[name] = test_pred
        
        # 保存预测结果
        pd.DataFrame({'ID': test_ids, '价格': test_pred}).to_csv(
            f'house_price_predictions_{name}.csv', index=False
        )
        print(f"{name} - 训练集MAE: {train_mae:.4f}, 测试集MAE: {val_mae:.4f}, 训练集RMSE: {train_rmse:.4f}, 测试集RMSE: {val_rmse:.4f}, CV MAE: {cv_mae:.4f}")
    
    # 输出评估结果
    results_df = pd.DataFrame(results)
    print("\n模型评估结果:")
    print(results_df[['模型', '训练集MAE', '测试集MAE', '训练集RMSE', '测试集RMSE', '交叉验证MAE']])
    
    # 最佳模型
    best_model_index = results_df['测试集MAE'].idxmin()
    best_model_name = results_df.loc[best_model_index, '模型']
    print(f"\n最佳单模型: {best_model_name}, 测试集MAE: {results_df.loc[best_model_index, '测试集MAE']:.4f}, 测试集RMSE: {results_df.loc[best_model_index, '测试集RMSE']:.4f}")

if __name__ == "__main__":
    main()

成功加载所有数据文件
选择的40个最重要特征:
- Unnamed: 0
- 城市
- 年份
- 环线数值
- 建筑面积_数值
- 房间数
- 厅数
- 卫数
- 总房间数
- 朝北
- 装修情况数值
- 小区平均租金
- 小区租金波动
- 小区租金面积比
- 小区均价
- 小区价格波动
- 小区房源数
- 小区每平米均价
- 到市中心距离_km
- 距离平方
- 距离倒数
- 中心区域
- 建筑面积平方
- 建筑面积平方根
- 平均房间面积
- 卧厅比
- 户型质量分
- 南北通透
- 南向大户型
- 朝向面积交互
- 高层无电梯惩罚
- 面积环线比
- 环内大户型
- 面积装修交互
- 建筑年龄
- 装修折旧
- 面积距离交互
- 小区价值指数
- 价格距离比
- 租金回报率

训练模型: LinearRegression
LinearRegression - 训练集MAE: 396785.0944, 测试集MAE: 391823.6220, 训练集RMSE: 1151264.2659, 测试集RMSE: 1021985.5147, CV MAE: 0.1815

训练模型: Ridge
Ridge - 训练集MAE: 396531.9405, 测试集MAE: 391781.2460, 训练集RMSE: 1149513.5227, 测试集RMSE: 1021879.0661, CV MAE: 0.1814

训练模型: Lasso
Lasso - 训练集MAE: 395330.1833, 测试集MAE: 395380.8327, 训练集RMSE: 1137845.9595, 测试集RMSE: 1046053.3284, CV MAE: 0.1817

训练模型: ElasticNet
ElasticNet - 训练集MAE: 395641.1628, 测试集MAE: 395730.7875, 训练集RMSE: 1139058.1711, 测试集RMSE: 1047489.2779, CV MAE: 0.1817

训练模型: RandomForest
RandomForest - 训练集MAE: 127091.3216, 测试集MAE: 170447.8412, 训练集RMSE: 438791.5184, 测试集RMSE: 592300.9972, CV MAE