In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, QuantileTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.decomposition import PCA
import re
import warnings
warnings.filterwarnings('ignore')

# 尝试导入高性能模型
try:
    import xgboost as xgb
    HAS_XGB = True
except:
    HAS_XGB = False
    print("⚠️  XGBoost未安装，将跳过XGBoost模型")

try:
    import lightgbm as lgb
    HAS_LGB = True
except:
    HAS_LGB = False
    print("⚠️  LightGBM未安装，将跳过LightGBM模型")

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

def advanced_feature_engineering(df, is_training=True, training_stats=None):
    """高级特征工程 - 添加训练/测试集标识"""
    data = df.copy()
    
    # 如果是训练集，记录统计信息；如果是测试集，使用训练集的统计信息
    if training_stats is None:
        training_stats = {}
    
    # 通用面积提取函数
    def extract_area(area_str):
        if pd.isna(area_str):
            return np.nan
        try:
            numbers = re.findall(r'\d+\.?\d*', str(area_str))
            if len(numbers) >= 2:
                return (float(numbers[0]) + float(numbers[1])) / 2
            elif len(numbers) == 1:
                return float(numbers[0])
        except:
            pass
        return np.nan
    
    # 1. 建筑面积处理
    if '建筑面积' in data.columns:
        data['建筑面积'] = data['建筑面积'].apply(extract_area)
        # 更保守的异常值处理
        data.loc[(data['建筑面积'] > 800) | (data['建筑面积'] < 15), '建筑面积'] = np.nan
    
    # 2. 套内面积
    if '套内面积' in data.columns:
        data['套内面积'] = data['套内面积'].apply(extract_area)
        data.loc[(data['套内面积'] > 800) | (data['套内面积'] < 15), '套内面积'] = np.nan
        
        # 计算得房率
        if '建筑面积' in data.columns:
            data['得房率'] = data['套内面积'] / data['建筑面积']
            data['得房率'] = data['得房率'].clip(0.5, 1.0)  # 合理范围
    
    # 3. 户型信息
    if '房屋户型' in data.columns:
        def extract_room_info(room_str):
            if pd.isna(room_str):
                return 2, 1, 1
            
            room_str = str(room_str)
            
            room_match = re.search(r'(\d+)室', room_str)
            rooms = int(room_match.group(1)) if room_match else 2
            
            hall_match = re.search(r'(\d+)厅', room_str)
            halls = int(hall_match.group(1)) if hall_match else 1
            
            bath_match = re.search(r'(\d+)卫', room_str)
            baths = int(bath_match.group(1)) if bath_match else 1
            
            rooms = max(1, min(rooms, 8))
            halls = max(0, min(halls, 4))
            baths = max(1, min(baths, 4))
            
            return rooms, halls, baths
        
        room_info = data['房屋户型'].apply(extract_room_info)
        data['室数'] = [info[0] for info in room_info]
        data['厅数'] = [info[1] for info in room_info]
        data['卫数'] = [info[2] for info in room_info]
        data['总房间数'] = data['室数'] + data['厅数'] + data['卫数']
        
        # 户型类型编码
        data['户型类型'] = data['室数'].astype(str) + '室' + data['厅数'].astype(str) + '厅'
        
        # 房间配比特征
        data['室厅比'] = data['室数'] / (data['厅数'] + 1)
        data['室卫比'] = data['室数'] / data['卫数']
    
    # 4. 楼层信息
    if '所在楼层' in data.columns:
        def extract_floor_info(floor_str):
            if pd.isna(floor_str):
                return 5, 20, 0.25
            
            floor_str = str(floor_str)
            patterns = [
                r'第?(\d+)层.*共(\d+)层',
                r'(\d+)/(\d+)',
                r'(\d+)层.*(\d+)层'
            ]
            
            for pattern in patterns:
                match = re.search(pattern, floor_str)
                if match:
                    try:
                        current = int(match.group(1))
                        total = int(match.group(2))
                        if 1 <= current <= total <= 100:
                            ratio = current / total
                            return current, total, ratio
                    except:
                        continue
            
            return 5, 20, 0.25
        
        floor_info = data['所在楼层'].apply(extract_floor_info)
        data['当前楼层'] = [info[0] for info in floor_info]
        data['总楼层'] = [info[1] for info in floor_info]
        data['楼层比例'] = [info[2] for info in floor_info]
        
        # 楼层分类（更细致）
        data['楼层类型_底层'] = (data['楼层比例'] <= 0.2).astype(int)
        data['楼层类型_低层'] = ((data['楼层比例'] > 0.2) & (data['楼层比例'] <= 0.4)).astype(int)
        data['楼层类型_中层'] = ((data['楼层比例'] > 0.4) & (data['楼层比例'] <= 0.7)).astype(int)
        data['楼层类型_高层'] = ((data['楼层比例'] > 0.7) & (data['楼层比例'] < 0.9)).astype(int)
        data['楼层类型_顶层'] = (data['楼层比例'] >= 0.9).astype(int)
        
        # 楼层价值评分（一般中高层最贵）
        data['楼层价值'] = np.where(
            (data['楼层比例'] >= 0.3) & (data['楼层比例'] <= 0.8), 1, 0
        )
    
    # 5. 朝向特征（更全面）
    if '房屋朝向' in data.columns:
        data['朝南'] = data['房屋朝向'].str.contains('南', na=False).astype(int)
        data['朝北'] = data['房屋朝向'].str.contains('北', na=False).astype(int)
        data['朝东'] = data['房屋朝向'].str.contains('东', na=False).astype(int)
        data['朝西'] = data['房屋朝向'].str.contains('西', na=False).astype(int)
        data['南北通透'] = ((data['朝南'] == 1) & (data['朝北'] == 1)).astype(int)
        data['东西通透'] = ((data['朝东'] == 1) & (data['朝西'] == 1)).astype(int)
        
        # 朝向评分（南>东>北>西）
        orientation_score = 0
        orientation_score += data['朝南'] * 4
        orientation_score += data['朝东'] * 3
        orientation_score += data['朝北'] * 2
        orientation_score += data['朝西'] * 1
        data['朝向评分'] = orientation_score
        
        # 通透性评分
        data['通透性'] = data['南北通透'] * 2 + data['东西通透'] * 1
    
    # 6. 装修情况
    if '装修情况' in data.columns:
        decoration_map = {
            '豪华装修': 5, '精装修': 4, '中装修': 3, 
            '简装修': 2, '毛坯房': 1, '其他': 2
        }
        data['装修评分'] = data['装修情况'].fillna('其他').map(decoration_map).fillna(2)
    
    # 7. 环线特征
    if '环线' in data.columns:
        def parse_ring(ring_str):
            if pd.isna(ring_str):
                return 5
            
            ring_str = str(ring_str).lower()
            ring_map = {
                '一环': 1, '内环': 1, '二环': 2, '三环': 3, 
                '四环': 4, '五环': 5, '六环': 6
            }
            
            for key, value in ring_map.items():
                if key in ring_str:
                    return value
            
            numbers = re.findall(r'\d+', ring_str)
            if numbers:
                return min(int(numbers[0]), 8)
            
            return 5
        
        data['环线数值'] = data['环线'].apply(parse_ring)
        # 环线价值（越靠内越贵，但非线性）
        data['环线价值'] = np.exp(-(data['环线数值'] - 1) * 0.3)
    
    # 8. 时间特征
    if '交易时间' in data.columns:
        data['交易时间'] = pd.to_datetime(data['交易时间'], errors='coerce')
        data['交易年份'] = data['交易时间'].dt.year.fillna(2023)
        data['交易月份'] = data['交易时间'].dt.month.fillna(6)
        data['交易季度'] = data['交易时间'].dt.quarter.fillna(2)
        
        # 季节性特征（房地产有明显季节性）
        data['是否旺季'] = data['交易月份'].isin([3, 4, 5, 9, 10, 11]).astype(int)
        data['是否年底'] = data['交易月份'].isin([11, 12]).astype(int)
        
        # 年份热编码 - 使用固定年份避免训练测试集特征不一致
        common_years = [2020, 2021, 2022, 2023, 2024]
        for year in common_years:
            data[f'年份_{year}'] = (data['交易年份'] == year).astype(int)
        
        # 删除原始时间列，避免类型问题
        data = data.drop(['交易时间'], axis=1)
    
    # 9. 房龄特征
    if '年份' in data.columns:
        data['年份'] = pd.to_numeric(data['年份'], errors='coerce')
        data['年份'] = data['年份'].fillna(data['年份'].median())
        
        if '交易年份' in data.columns:
            data['房龄'] = data['交易年份'] - data['年份']
            data['房龄'] = data['房龄'].clip(0, 100)
            
            # 房龄分档（非线性贬值）
            data['房龄_新房'] = (data['房龄'] <= 2).astype(int)
            data['房龄_次新'] = ((data['房龄'] > 2) & (data['房龄'] <= 5)).astype(int)
            data['房龄_中等'] = ((data['房龄'] > 5) & (data['房龄'] <= 10)).astype(int)
            data['房龄_较旧'] = ((data['房龄'] > 10) & (data['房龄'] <= 20)).astype(int)
            data['房龄_老房'] = (data['房龄'] > 20).astype(int)
            
            # 房龄价值衰减
            data['房龄价值'] = np.exp(-data['房龄'] * 0.02)
    
    # 10. 电梯特征
    if '有无电梯' in data.columns:
        data['有电梯'] = (data['有无电梯'] == '有').astype(int)
        
        # 电梯必要性（高层更需要电梯）
        if '总楼层' in data.columns:
            data['电梯必要性'] = np.where(data['总楼层'] > 6, 1, 0)
            data['电梯匹配度'] = data['有电梯'] * data['电梯必要性']
    
    return data, training_stats

def advanced_external_merge(df, community_df, rent_df):
    """高级外部数据融合"""
    result_df = df.copy()
    
    def safe_extract_numeric(x):
        if pd.isna(x):
            return np.nan
        try:
            x_str = str(x).replace('%', '').replace('，', '').replace(',', '')
            numbers = re.findall(r'\d+\.?\d*', x_str)
            if numbers:
                return float(numbers[0])
        except:
            pass
        return np.nan
    
    # 1. 小区数据融合（增强版）
    if community_df is not None and len(community_df) > 0:
        result_df['小区名称_clean'] = result_df['小区名称'].astype(str).str.strip()
        community_df['名称_clean'] = community_df['名称'].astype(str).str.strip()
        
        # 处理小区特征
        if '容 积 率' in community_df.columns:
            community_df['容积率'] = community_df['容 积 率'].apply(safe_extract_numeric)
            community_df['容积率'] = community_df['容积率'].fillna(2.5).clip(0.1, 10)
            # 容积率分档
            community_df['容积率_低密度'] = (community_df['容积率'] <= 1.5).astype(int)
            community_df['容积率_中密度'] = ((community_df['容积率'] > 1.5) & (community_df['容积率'] <= 3.0)).astype(int)
            community_df['容积率_高密度'] = (community_df['容积率'] > 3.0).astype(int)
        
        if '绿 化 率' in community_df.columns:
            community_df['绿化率'] = community_df['绿 化 率'].apply(safe_extract_numeric)
            community_df['绿化率'] = community_df['绿化率'].fillna(30).clip(0, 80) / 100
            # 绿化率分档
            community_df['绿化率_高'] = (community_df['绿化率'] >= 0.35).astype(int)
            community_df['绿化率_中'] = ((community_df['绿化率'] >= 0.25) & (community_df['绿化率'] < 0.35)).astype(int)
            community_df['绿化率_低'] = (community_df['绿化率'] < 0.25).astype(int)
        
        if '物 业 费' in community_df.columns:
            community_df['物业费'] = community_df['物 业 费'].apply(safe_extract_numeric)
            community_df['物业费'] = community_df['物业费'].fillna(community_df['物业费'].median())
            # 物业费分档（通常物业费越高，小区品质越好）
            community_df['物业费_档次'] = pd.cut(community_df['物业费'], 
                                             bins=[0, 2, 4, 6, np.inf], 
                                             labels=['低', '中', '高', '豪华']).astype(str)
        
        # 合并小区数据
        try:
            merge_cols = ['名称_clean', '城市']
            feature_cols = [c for c in community_df.columns if c not in ['名称', '名称_clean']]
            
            merged = result_df.merge(
                community_df[feature_cols + ['名称_clean']],
                left_on='小区名称_clean',
                right_on='名称_clean',
                how='left'
            )
            result_df = merged
        except Exception as e:
            print(f"合并小区数据失败: {e}")
    
    # 2. 租金数据融合（增强版）
    if rent_df is not None and len(rent_df) > 0 and '价格' in rent_df.columns:
        try:
            rent_df['价格'] = pd.to_numeric(rent_df['价格'], errors='coerce')
            rent_df = rent_df.dropna(subset=['价格'])
            
            if len(rent_df) > 0:
                # 按小区和城市统计租金
                if '城市' in rent_df.columns:
                    rent_stats = rent_df.groupby(['小区名称', '城市'])['价格'].agg([
                        'mean', 'median', 'count', 'std', 'min', 'max'
                    ]).reset_index()
                    rent_stats.columns = ['小区名称', '城市', '平均租金', '租金中位数', '租金样本数', '租金标准差', '最低租金', '最高租金']
                    merge_keys = ['小区名称', '城市']
                else:
                    rent_stats = rent_df.groupby('小区名称')['价格'].agg([
                        'mean', 'median', 'count', 'std', 'min', 'max'
                    ]).reset_index()
                    rent_stats.columns = ['小区名称', '平均租金', '租金中位数', '租金样本数', '租金标准差', '最低租金', '最高租金']
                    merge_keys = ['小区名称']
                
                # 计算租金特征
                rent_stats['租金变异系数'] = rent_stats['租金标准差'] / (rent_stats['平均租金'] + 1)
                rent_stats['租金区间'] = rent_stats['最高租金'] - rent_stats['最低租金']
                
                # 填充缺失值
                for col in ['平均租金', '租金中位数', '租金标准差', '最低租金', '最高租金']:
                    if col in rent_stats.columns:
                        rent_stats[col] = rent_stats[col].fillna(rent_stats[col].median())
                
                rent_stats['租金样本数'] = rent_stats['租金样本数'].fillna(0)
                rent_stats['租金变异系数'] = rent_stats['租金变异系数'].fillna(0)
                
                # 合并租金数据
                result_df = result_df.merge(rent_stats, on=merge_keys, how='left')
                
                # 全局租金填充
                global_rent_median = rent_stats['平均租金'].median()
                result_df['平均租金'] = result_df['平均租金'].fillna(global_rent_median)
                result_df['租金中位数'] = result_df['租金中位数'].fillna(global_rent_median)
                
        except Exception as e:
            print(f"合并租金数据失败: {e}")
    
    return result_df

def create_advanced_features(df, is_training=True, training_stats=None):
    """创建高级组合特征 - 确保训练测试集一致性"""
    data = df.copy()
    
    if training_stats is None:
        training_stats = {}
    
    # 1. 面积相关组合特征
    if '建筑面积' in data.columns:
        if '总房间数' in data.columns:
            data['每房间面积'] = data['建筑面积'] / (data['总房间数'] + 1)
            
        if '室数' in data.columns:
            data['每室面积'] = data['建筑面积'] / (data['室数'] + 1)
            
        # 面积分档 - 使用训练集的分位数
        if is_training:
            area_quantiles = data['建筑面积'].quantile([0.25, 0.75])
            training_stats['area_q25'] = area_quantiles[0.25]
            training_stats['area_q75'] = area_quantiles[0.75]
        
        if 'area_q25' in training_stats and 'area_q75' in training_stats:
            data['面积_小户型'] = (data['建筑面积'] <= training_stats['area_q25']).astype(int)
            data['面积_中户型'] = ((data['建筑面积'] > training_stats['area_q25']) & 
                                (data['建筑面积'] <= training_stats['area_q75'])).astype(int)
            data['面积_大户型'] = (data['建筑面积'] > training_stats['area_q75']).astype(int)
    
    # 2. 租售比特征（核心特征）
    if '平均租金' in data.columns and '建筑面积' in data.columns:
        data['月租售比'] = data['平均租金'] / (data['建筑面积'] + 1)
        data['年租售比'] = data['月租售比'] * 12
        
        # 租售比分档 - 使用训练集的分位数
        if is_training and data['年租售比'].std() > 0:
            rent_ratio_quantiles = data['年租售比'].quantile([0.33, 0.67])
            training_stats['rent_ratio_q33'] = rent_ratio_quantiles[0.33]
            training_stats['rent_ratio_q67'] = rent_ratio_quantiles[0.67]
        
        if 'rent_ratio_q33' in training_stats and 'rent_ratio_q67' in training_stats:
            data['租售比_低'] = (data['年租售比'] <= training_stats['rent_ratio_q33']).astype(int)
            data['租售比_中'] = ((data['年租售比'] > training_stats['rent_ratio_q33']) & 
                              (data['年租售比'] <= training_stats['rent_ratio_q67'])).astype(int)
            data['租售比_高'] = (data['年租售比'] > training_stats['rent_ratio_q67']).astype(int)
    
    # 3. 位置价值综合评分
    location_score = 0
    if '环线价值' in data.columns:
        location_score += data['环线价值'] * 0.4
    if '朝向评分' in data.columns:
        location_score += data['朝向评分'] / 10 * 0.2  # 归一化
    if '楼层价值' in data.columns:
        location_score += data['楼层价值'] * 0.2
    if '通透性' in data.columns:
        location_score += data['通透性'] / 2 * 0.2  # 归一化
    
    data['位置价值综合'] = location_score
    
    # 4. 房屋品质评分
    quality_score = 0
    if '装修评分' in data.columns:
        quality_score += data['装修评分'] / 5 * 0.3  # 归一化
    if '房龄价值' in data.columns:
        quality_score += data['房龄价值'] * 0.3
    if '有电梯' in data.columns:
        quality_score += data['有电梯'] * 0.2
    if '得房率' in data.columns:
        quality_score += (data['得房率'] - 0.7) * 0.2  # 得房率越高越好
    
    data['房屋品质综合'] = quality_score
    
    # 5. 小区环境评分
    community_score = 0
    if '绿化率' in data.columns:
        community_score += data['绿化率'] * 0.4
    if '容积率' in data.columns:
        # 容积率越低越好（取倒数）
        community_score += (1 / (data['容积率'] + 1)) * 0.3
    if '物业费' in data.columns:
        # 物业费标准化（通常适中最好）
        if is_training:
            median_fee = data['物业费'].median()
            training_stats['median_fee'] = median_fee
        
        if 'median_fee' in training_stats:
            median_fee = training_stats['median_fee']
            community_score += (1 - abs(data['物业费'] - median_fee) / median_fee) * 0.3
    
    data['小区环境综合'] = community_score
    
    # 6. 户型合理性评分
    if all(col in data.columns for col in ['室数', '厅数', '卫数', '建筑面积']):
        # 标准户型比例
        ideal_ratios = {
            1: (40, 60), 2: (60, 90), 3: (90, 130), 
            4: (130, 180), 5: (180, 250)
        }
        
        data['户型合理性'] = 0.5  # 默认值
        
        for rooms in range(1, 6):
            if rooms in ideal_ratios:
                min_area, max_area = ideal_ratios[rooms]
                mask = data['室数'] == rooms
                area_fit = ((data['建筑面积'] >= min_area) & 
                           (data['建筑面积'] <= max_area))
                data.loc[mask & area_fit, '户型合理性'] = 1.0
                data.loc[mask & ~area_fit, '户型合理性'] = 0.3
    
    # 7. 市场热度特征（基于租金样本数）
    if '租金样本数' in data.columns:
        data['市场热度'] = np.log1p(data['租金样本数'])  # 对数变换
        
        # 热度分档 - 使用训练集的分位数
        if is_training and data['市场热度'].std() > 0:
            heat_quantiles = data['市场热度'].quantile([0.33, 0.67])
            training_stats['heat_q33'] = heat_quantiles[0.33]
            training_stats['heat_q67'] = heat_quantiles[0.67]
        
        if 'heat_q33' in training_stats and 'heat_q67' in training_stats:
            data['市场_冷门'] = (data['市场热度'] <= training_stats['heat_q33']).astype(int)
            data['市场_一般'] = ((data['市场热度'] > training_stats['heat_q33']) & 
                              (data['市场热度'] <= training_stats['heat_q67'])).astype(int)
            data['市场_热门'] = (data['市场热度'] > training_stats['heat_q67']).astype(int)
    
    return data, training_stats

def advanced_preprocessing(X_train, X_test, feature_types):
    """高级预处理"""
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()
    
    print(f"高级预处理开始...")
    print(f"训练集形状: {X_train_processed.shape}")
    print(f"测试集形状: {X_test_processed.shape}")
    print(f"数值特征数量: {len(feature_types['numeric'])}")
    print(f"分类特征数量: {len(feature_types['categorical'])}")
    
    # 1. 数值特征处理
    numeric_features = feature_types['numeric']
    processed_numeric = 0
    
    for feat in numeric_features:
        if feat in X_train_processed.columns:
            # 严格的数据类型检查
            dtype = X_train_processed[feat].dtype
            
            # 跳过非数值类型
            if dtype in ['object', 'datetime64[ns]', 'timedelta64[ns]', 'category']:
                print(f"  跳过非数值特征: {feat} (类型: {dtype})")
                continue
            
            # 转换为数值类型
            try:
                X_train_processed[feat] = pd.to_numeric(X_train_processed[feat], errors='coerce')
                X_test_processed[feat] = pd.to_numeric(X_test_processed[feat], errors='coerce')
            except:
                print(f"  无法转换为数值: {feat}")
                continue
                
            # 计算填充值（使用众数或中位数）
            if X_train_processed[feat].nunique() <= 10:  # 离散数值特征
                fill_value = X_train_processed[feat].mode()
                fill_value = fill_value[0] if len(fill_value) > 0 else 0
            else:  # 连续数值特征
                fill_value = X_train_processed[feat].median()
                if pd.isna(fill_value):
                    fill_value = 0
            
            X_train_processed[feat] = X_train_processed[feat].fillna(fill_value)
            X_test_processed[feat] = X_test_processed[feat].fillna(fill_value)
            
            # 温和的异常值处理（使用更大的倍数）
            try:
                std_val = X_train_processed[feat].std()
                if pd.notna(std_val) and std_val > 0:
                    Q1 = X_train_processed[feat].quantile(0.25)
                    Q3 = X_train_processed[feat].quantile(0.75)
                    IQR = Q3 - Q1
                    
                    if IQR > 0:
                        lower_bound = Q1 - 5 * IQR  # 更宽松的界限
                        upper_bound = Q3 + 5 * IQR
                        
                        X_train_processed[feat] = X_train_processed[feat].clip(lower_bound, upper_bound)
                        X_test_processed[feat] = X_test_processed[feat].clip(lower_bound, upper_bound)
                        processed_numeric += 1
            except Exception as e:
                print(f"  异常值处理失败: {feat} - {e}")
                continue
    
    print(f"成功处理数值特征: {processed_numeric}/{len(numeric_features)}")
    
    # 2. 分类特征处理
    le_dict = {}
    categorical_features = feature_types['categorical']
    processed_categorical = 0
    
    for feat in categorical_features:
        if feat in X_train_processed.columns:
            try:
                # 填充缺失值
                mode_val = X_train_processed[feat].mode()
                fill_val = mode_val[0] if len(mode_val) > 0 else '其他'
                
                X_train_processed[feat] = X_train_processed[feat].fillna(fill_val).astype(str)
                X_test_processed[feat] = X_test_processed[feat].fillna(fill_val).astype(str)
                
                # 处理低频类别
                value_counts = X_train_processed[feat].value_counts()
                rare_categories = value_counts[value_counts < 5].index  # 少于5个样本的类别
                
                X_train_processed[feat] = X_train_processed[feat].replace(rare_categories, '其他')
                X_test_processed[feat] = X_test_processed[feat].replace(rare_categories, '其他')
                
                # 标签编码
                le = LabelEncoder()
                le.fit(X_train_processed[feat])
                
                X_train_processed[feat] = le.transform(X_train_processed[feat])
                
                def safe_transform(x):
                    try:
                        return le.transform([x])[0]
                    except:
                        return 0
                
                X_test_processed[feat] = X_test_processed[feat].apply(safe_transform)
                le_dict[feat] = le
                processed_categorical += 1
                
            except Exception as e:
                print(f"  分类特征处理失败: {feat} - {e}")
                continue
    
    print(f"成功处理分类特征: {processed_categorical}/{len(categorical_features)}")
    
    # 3. 最终数据清理
    X_train_processed = X_train_processed.fillna(0)
    X_test_processed = X_test_processed.fillna(0)
    X_train_processed = X_train_processed.replace([np.inf, -np.inf], 0)
    X_test_processed = X_test_processed.replace([np.inf, -np.inf], 0)
    
    print(f"预处理完成:")
    print(f"  训练集最终形状: {X_train_processed.shape}")
    print(f"  测试集最终形状: {X_test_processed.shape}")
    print(f"  训练集缺失值: {X_train_processed.isnull().sum().sum()}")
    print(f"  测试集缺失值: {X_test_processed.isnull().sum().sum()}")
    
    return X_train_processed, X_test_processed, le_dict

def feature_selection(X_train, y_train, feature_names, n_features=50):
    """特征选择"""
    print(f"\n 进行特征选择...")
    print(f"原始特征数: {X_train.shape[1]}")
    
    # 1. 移除低方差特征
    from sklearn.feature_selection import VarianceThreshold
    var_selector = VarianceThreshold(threshold=0.01)
    X_var = var_selector.fit_transform(X_train)
    selected_features = [feature_names[i] for i in range(len(feature_names)) 
                        if var_selector.get_support()[i]]
    
    print(f"移除低方差特征后: {len(selected_features)}")
    
    # 2. 使用统计方法选择特征
    k_best = SelectKBest(score_func=f_regression, k=min(n_features, len(selected_features)))
    X_selected = k_best.fit_transform(X_var, y_train)
    
    # 获取最终选择的特征名
    feature_scores = k_best.scores_
    selected_indices = k_best.get_support(indices=True)
    final_features = [selected_features[i] for i in selected_indices]
    
    print(f"统计特征选择后: {len(final_features)}")
    
    # 显示最重要的特征
    feature_importance = pd.DataFrame({
        'feature': final_features,
        'score': [feature_scores[i] for i in selected_indices]
    }).sort_values('score', ascending=False)
    
    print(f"前10个最重要特征:")
    print(feature_importance.head(10))
    
    return var_selector, k_best, final_features

def train_advanced_models(X_train, X_test, y_train, y_test):
    """训练高级模型"""
    
    # 数据标准化
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    models = {}
    
    # 传统模型（优化参数）
    models['Ridge'] = Ridge(alpha=5.0, random_state=42)
    models['Lasso'] = Lasso(alpha=0.01, random_state=42, max_iter=5000)
    models['ElasticNet'] = ElasticNet(alpha=0.01, l1_ratio=0.7, random_state=42, max_iter=5000)
    
    # 树模型（优化参数）
    models['RandomForest'] = RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    
    models['ExtraTrees'] = ExtraTreesRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    
    models['GradientBoosting'] = GradientBoostingRegressor(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        max_features='sqrt',
        random_state=42
    )
    
    # 高性能模型
    if HAS_XGB:
        models['XGBoost'] = xgb.XGBRegressor(
            n_estimators=300,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )
    
    if HAS_LGB:
        models['LightGBM'] = lgb.LGBMRegressor(
            n_estimators=300,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )
    
    results = []
    trained_models = {}
    
    for name, model in models.items():
        print(f"\n训练模型: {name}")
        
        try:
            # 选择输入数据
            if name in ['Ridge', 'Lasso', 'ElasticNet']:
                X_tr, X_te = X_train_scaled, X_test_scaled
            else:
                X_tr, X_te = X_train, X_test
            
            # 训练模型
            model.fit(X_tr, y_train)
            
            # 预测
            train_pred = model.predict(X_tr)
            test_pred = model.predict(X_te)
            
            # 计算指标
            train_mae = mean_absolute_error(y_train, train_pred)
            test_mae = mean_absolute_error(y_test, test_pred)
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)
            
            # 交叉验证
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(model, X_tr, y_train, cv=kfold, scoring='neg_mean_absolute_error')
            cv_mae = -cv_scores.mean()
            cv_std = cv_scores.std()
            
            results.append({
                '模型': name,
                '训练MAE': train_mae,
                '测试MAE': test_mae,
                '训练R2': train_r2,
                '测试R2': test_r2,
                'CV_MAE': cv_mae,
                'CV_STD': cv_std
            })
            
            trained_models[name] = (model, scaler if name in ['Ridge', 'Lasso', 'ElasticNet'] else None)
            
            print(f"{name}:")
            print(f"   测试MAE: {test_mae:.0f}")
            print(f"   测试R2: {test_r2:.4f}")
            print(f"   CV MAE: {cv_mae:.0f} ± {cv_std:.0f}")
            
        except Exception as e:
            print(f"{name} 训练失败: {str(e)}")
    
    return pd.DataFrame(results), trained_models

def create_meta_ensemble(trained_models, X_train, y_train, X_val, y_val):
    """创建元学习集成模型 - 修复版本"""
    print(f"\n🔗 创建元学习集成...")
    
    if len(trained_models) < 2:
        print("  模型数量不足，跳过元学习")
        return None, None
    
    # 准备元特征
    meta_features_train = []
    meta_features_val = []
    model_names = []
    
    for name, (model, scaler) in trained_models.items():
        try:
            print(f"  生成 {name} 的元特征...")
            
            if scaler:
                X_tr_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)
                
                # 重新训练模型
                model.fit(X_tr_scaled, y_train)
                train_pred = model.predict(X_tr_scaled)
                val_pred = model.predict(X_val_scaled)
            else:
                # 重新训练模型
                model.fit(X_train, y_train)
                train_pred = model.predict(X_train)
                val_pred = model.predict(X_val)
            
            meta_features_train.append(train_pred)
            meta_features_val.append(val_pred)
            model_names.append(name)
            
        except Exception as e:
            print(f"  模型 {name} 生成元特征失败: {e}")
            continue
    
    if len(meta_features_train) < 2:
        print("  可用模型不足，跳过元学习")
        return None, None
    
    # 构建元特征矩阵
    meta_X_train = np.column_stack(meta_features_train)
    meta_X_val = np.column_stack(meta_features_val)
    
    # 训练元学习器（简单线性回归）
    meta_model = Ridge(alpha=1.0)
    meta_model.fit(meta_X_train, y_train)
    
    # 验证元模型
    meta_pred = meta_model.predict(meta_X_val)
    meta_mae = mean_absolute_error(y_val, meta_pred)
    meta_r2 = r2_score(y_val, meta_pred)
    
    print(f"元学习集成完成:")
    print(f"   使用模型: {', '.join(model_names)}")
    print(f"   验证MAE: {meta_mae:.0f}")
    print(f"   验证R2: {meta_r2:.4f}")
    
    return meta_model, model_names

def main():
    # 数据路径
    train_path = "ruc_Class25Q1_train.csv"
    test_path = "ruc_Class25Q1_test.csv"
    community_path = "ruc_Class25Q1_details.csv"
    rent_path = "ruc_Class25Q1_rent.csv"
    
    print(f"\n数据加载中...")
    
    # 加载数据
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        print(f"主数据加载成功")
        print(f"   训练集: {train_data.shape}")
        print(f"   测试集: {test_data.shape}")
    except Exception as e:
        print(f"主数据加载失败: {e}")
        return None, None
    
    # 加载辅助数据
    community_data, rent_data = None, None
    try:
        community_data = pd.read_csv(community_path)
        print(f"小区数据: {community_data.shape}")
    except:
        print(f"小区数据加载失败")
    
    try:
        rent_data = pd.read_csv(rent_path)
        print(f"租金数据: {rent_data.shape}")
    except:
        print(f"租金数据加载失败")
    
    # 保存测试集ID
    original_test_ids = test_data['ID'].copy()
    
    print(f"\n高级特征工程...")
    
    # 特征工程流水线 - 添加长度监控
    print("基础特征提取...")
    print(f"   处理前 - 训练集: {len(train_data)}, 测试集: {len(test_data)}")
    
    train_processed, training_stats = advanced_feature_engineering(train_data, is_training=True)
    test_processed, _ = advanced_feature_engineering(test_data, is_training=False, training_stats=training_stats)
    
    print(f"   处理后 - 训练集: {len(train_processed)}, 测试集: {len(test_processed)}")
    
    print("外部数据融合...")
    train_merged = advanced_external_merge(train_processed, community_data, rent_data)
    test_merged = advanced_external_merge(test_processed, community_data, rent_data)
    
    print(f"   融合后 - 训练集: {len(train_merged)}, 测试集: {len(test_merged)}")
    
    # 如果测试集长度发生变化，去重处理
    if len(test_merged) != len(test_data):
        print(f" 测试集长度变化: {len(test_data)} -> {len(test_merged)}")
        print("   进行去重处理...")
        
        # 按ID去重，保持第一次出现的记录
        if 'ID' in test_merged.columns:
            test_merged = test_merged.drop_duplicates(subset=['ID'], keep='first')
            print(f"   去重后测试集长度: {len(test_merged)}")
            
            # 确保ID顺序与原始一致
            test_merged = test_merged.set_index('ID').reindex(original_test_ids).reset_index()
            print(f"   重排序后测试集长度: {len(test_merged)}")
    
    print("高级组合特征...")
    train_final, feature_stats = create_advanced_features(train_merged, is_training=True)
    test_final, _ = create_advanced_features(test_merged, is_training=False, training_stats=feature_stats)
    
    print(f"   最终 - 训练集: {len(train_final)}, 测试集: {len(test_final)}")
    
    # 最终确保测试集长度正确
    if len(test_final) != len(original_test_ids):
        print(f"测试集长度仍然不匹配: {len(test_final)} vs {len(original_test_ids)}")
        
        # 强制修正
        if 'ID' in test_final.columns:
            # 按原始ID顺序重建测试集
            test_final_indexed = test_final.set_index('ID')
            test_final = test_final_indexed.reindex(original_test_ids).reset_index()
            
            # 填充缺失行（如果有的话）
            test_final = test_final.ffill().bfill().fillna(0)
            print(f"   修正后测试集长度: {len(test_final)}")
        else:
            # 如果没有ID列，直接截取
            test_final = test_final.iloc[:len(original_test_ids)]
            print(f"   截取后测试集长度: {len(test_final)}")
    
    # 特征选择 - 确保训练集和测试集特征一致，并排除问题特征
    train_features = set(train_final.columns)
    test_features = set(test_final.columns)
    common_features = list(train_features & test_features)
    
    # 排除不需要的列
    exclude_columns = ['ID', '价格', 'Unnamed: 0', '交易时间', '小区名称', '名称'] + \
                     [col for col in common_features if col.endswith('_clean')]
    
    all_features = [col for col in common_features 
                   if col not in exclude_columns]
    
    print(f"训练集特征数: {len(train_features)}")
    print(f"测试集特征数: {len(test_features)}")
    print(f"共同特征数: {len(common_features)}")
    print(f"排除特征数: {len(exclude_columns)}")
    print(f"可用特征数: {len(all_features)}")
    
    # 分离数值和分类特征
    numeric_features = []
    categorical_features = []
    
    for col in all_features:
        if col in train_final.columns:
            if train_final[col].dtype in ['object']:
                categorical_features.append(col)
            else:
                numeric_features.append(col)
    
    feature_types = {
        'numeric': numeric_features,
        'categorical': categorical_features
    }
    
    # 准备训练数据 - 安全的特征选择
    available_features_train = [col for col in all_features if col in train_final.columns]
    available_features_test = [col for col in all_features if col in test_final.columns]
    final_features = list(set(available_features_train) & set(available_features_test))
    
    print(f"最终使用特征数: {len(final_features)}")
    
    if len(final_features) == 0:
        print("没有可用的共同特征!")
        return None, None
    
    X = train_final[final_features]
    y = train_final['价格']
    X_submit = test_final[final_features]
    
    print(f"\n特征统计:")
    print(f"   总特征数: {len(all_features)}")
    print(f"   数值特征: {len(numeric_features)}")
    print(f"   分类特征: {len(categorical_features)}")
    
    # 异常值处理（更温和）- 只对训练集进行
    print(f"\n🔍 异常值处理...")
    Q1 = y.quantile(0.05)  # 更宽松的界限
    Q3 = y.quantile(0.95)
    mask = (y >= Q1) & (y <= Q3)
    
    X_clean = X[mask]
    y_clean = y[mask]
    
    print(f"   原始训练样本: {len(X)}")
    print(f"   清洗后训练样本: {len(X_clean)}")
    print(f"   移除比例: {(1 - len(X_clean)/len(X))*100:.1f}%")
    print(f"   测试集保持不变: {len(X_submit)} 样本")
    
    # 检查测试集长度是否匹配
    if len(X_submit) != len(original_test_ids):
        print(f"测试集长度不匹配: {len(X_submit)} vs {len(original_test_ids)}")
        print("   尝试修正测试集长度...")
        
        if len(X_submit) > len(original_test_ids):
            # 如果测试集变长了，截取到原始长度
            X_submit = X_submit.iloc[:len(original_test_ids)]
            print(f"   截取后测试集长度: {len(X_submit)}")
        else:
            # 如果测试集变短了，用最后一行填充
            last_row = X_submit.iloc[-1:] if len(X_submit) > 0 else pd.DataFrame()
            while len(X_submit) < len(original_test_ids):
                X_submit = pd.concat([X_submit, last_row], ignore_index=True)
            print(f"   填充后测试集长度: {len(X_submit)}")
    
    # 最终确保长度匹配
    assert len(X_submit) == len(original_test_ids), f"修正后测试集长度仍不匹配: {len(X_submit)} vs {len(original_test_ids)}"
    
    # 数据分割
    X_train, X_val, y_train, y_val = train_test_split(
        X_clean, y_clean, test_size=0.2, random_state=42
    )
    
    print(f"\n 数据预处理...")
    X_train_processed, X_val_processed, le_dict = advanced_preprocessing(X_train, X_val, feature_types)
    X_train_processed, X_submit_processed, _ = advanced_preprocessing(X_train_processed, X_submit, feature_types)
    
    # 长度检查
    print(f"预处理后长度检查:")
    print(f"  训练集: {len(X_train_processed)}")
    print(f"  验证集: {len(X_val_processed)}")  
    print(f"  测试集: {len(X_submit_processed)}")
    print(f"  原始测试ID: {len(original_test_ids)}")       
    print(f"修正后测试集长度: {len(X_submit_processed)}")
    
    # 特征选择
    var_selector, k_best, selected_features = feature_selection(
        X_train_processed, y_train, X_train_processed.columns, n_features=60
    )
    
    # 应用特征选择
    X_train_selected = k_best.transform(var_selector.transform(X_train_processed))
    X_val_selected = k_best.transform(var_selector.transform(X_val_processed))
    X_submit_selected = k_best.transform(var_selector.transform(X_submit_processed))
    
    # 特征选择后长度检查
    print(f"特征选择后长度检查:")
    print(f"  训练集: {X_train_selected.shape}")
    print(f"  验证集: {X_val_selected.shape}")
    print(f"  测试集: {X_submit_selected.shape}")
    
    # 确保测试集行数正确
    assert X_submit_selected.shape[0] == len(original_test_ids), f"特征选择后长度不匹配: {X_submit_selected.shape[0]} vs {len(original_test_ids)}"
    
    print(f"\n模型训练开始...")
    print("=" * 50)
    
    # 训练高级模型
    results_df, trained_models = train_advanced_models(
        X_train_selected, X_val_selected, y_train, y_val
    )
    
    # 显示结果
    if len(results_df) > 0:
        print(f"\n模型性能排行榜:")
        print("=" * 80)
        results_sorted = results_df.sort_values('CV_MAE')
        print(results_sorted[['模型', '测试MAE', '测试R2', 'CV_MAE', 'CV_STD']].to_string(index=False))
        
        # 创建元学习集成
        meta_model, meta_model_names = create_meta_ensemble(
            trained_models, X_train_selected, y_train, X_val_selected, y_val
        )
        
        print(f"\n最终预测生成...")
        
        # 选择最佳模型
        best_model_name = results_sorted.iloc[0]['模型']
        best_model, best_scaler = trained_models[best_model_name]
        
        print(f"最佳单一模型: {best_model_name}")
        
        # 重新训练最佳模型
        X_full_train = np.vstack([X_train_selected, X_val_selected])
        y_full_train = np.hstack([y_train, y_val])
        
        if best_scaler:
            X_full_scaled = best_scaler.fit_transform(X_full_train)
            X_submit_scaled = best_scaler.transform(X_submit_selected)
            best_model.fit(X_full_scaled, y_full_train)
            best_predictions = best_model.predict(X_submit_scaled)
        else:
            best_model.fit(X_full_train, y_full_train)
            best_predictions = best_model.predict(X_submit_selected)
        
        # 预测长度检查
        print(f"预测长度检查:")
        print(f"  预测结果: {len(best_predictions)}")
        print(f"  原始测试ID: {len(original_test_ids)}")
        
        # 确保预测长度正确
        if len(best_predictions) != len(original_test_ids):
            print(f"预测长度不匹配，进行修正...")
            if len(best_predictions) > len(original_test_ids):
                best_predictions = best_predictions[:len(original_test_ids)]
            else:
                print(f" 预测结果过短: {len(best_predictions)} < {len(original_test_ids)}")
                return None, None
        
        final_predictions = best_predictions
        
        # 如果有元学习模型，进行集成
        if meta_model and meta_model_names:
            print(f"🔗 应用元学习集成...")
            # 生成元特征
            meta_features_submit = []
            for name in meta_model_names:
                model, scaler = trained_models[name]
                
                # 重新训练每个模型
                if scaler:
                    X_full_scaled = scaler.fit_transform(X_full_train)
                    X_submit_scaled = scaler.transform(X_submit_selected)
                    model.fit(X_full_scaled, y_full_train)
                    pred = model.predict(X_submit_scaled)
                else:
                    model.fit(X_full_train, y_full_train)
                    pred = model.predict(X_submit_selected)
                    
                meta_features_submit.append(pred)
            
            if len(meta_features_submit) > 0:
                meta_X_submit = np.column_stack(meta_features_submit)
                
                # 检查元特征长度
                print(f"元特征长度检查: {meta_X_submit.shape[0]} vs {len(original_test_ids)}")
                
                # 重新训练元模型
                # 生成训练时的元特征
                meta_features_train = []
                for name in meta_model_names:
                    model, scaler = trained_models[name]
                    if scaler:
                        X_scaled = scaler.fit_transform(X_full_train)
                        model.fit(X_scaled, y_full_train)
                        pred = model.predict(X_scaled)
                    else:
                        model.fit(X_full_train, y_full_train)
                        pred = model.predict(X_full_train)
                    meta_features_train.append(pred)
                
                meta_X_train = np.column_stack(meta_features_train)
                meta_model.fit(meta_X_train, y_full_train)
                ensemble_predictions = meta_model.predict(meta_X_submit)
                
                # 长度匹配检查
                if len(ensemble_predictions) == len(best_predictions):
                    # 混合预测（给最佳模型更高权重）
                    final_predictions = 0.8 * best_predictions + 0.2 * ensemble_predictions
                    print(f"使用集成预测 (最佳模型80% + 元学习20%)")
                else:
                    print(f"集成预测长度不匹配，使用最佳单一模型")
            else:
                print(f"元学习特征生成失败，使用最佳单一模型")
        else:
            print(f"使用最佳单一模型预测")
        
        # 创建提交文件
        print(f"\n创建提交文件...")
        print(f"最终长度检查:")
        print(f"  预测结果: {len(final_predictions)}")
        print(f"  测试集ID: {len(original_test_ids)}")
        
        # 最终长度确保
        if len(final_predictions) != len(original_test_ids):
            print(f"最终长度不匹配，进行修正...")
            min_length = min(len(final_predictions), len(original_test_ids))
            final_predictions = final_predictions[:min_length]
            test_ids_to_use = original_test_ids.iloc[:min_length]
        else:
            test_ids_to_use = original_test_ids
        
        submission = pd.DataFrame({
            'ID': test_ids_to_use,
            '价格': final_predictions
        })
        
        submission.to_csv('high_performance_submission.csv', index=False)
        
        print(f"\n预测完成!")
        print("=" * 50)
        print(f"预测统计:")
        print(f"   样本数量: {len(submission)}")
        print(f"   价格范围: {submission['价格'].min():.0f} - {submission['价格'].max():.0f}")
        print(f"   平均价格: {submission['价格'].mean():.0f}")
        print(f"   中位价格: {submission['价格'].median():.0f}")
        print(f"   标准差: {submission['价格'].std():.0f}")
        print(f"\n文件保存: high_performance_submission.csv")
        
        return submission, results_df
    
    else:
        print("没有成功训练的模型")
        return None, None

if __name__ == "__main__":
    submission, results = main()


数据加载中...
主数据加载成功
   训练集: (84133, 32)
   测试集: (14786, 32)
小区数据: (3100, 27)
租金数据: (84150, 23)

高级特征工程...
基础特征提取...
   处理前 - 训练集: 84133, 测试集: 14786
   处理后 - 训练集: 84133, 测试集: 14786
外部数据融合...
合并租金数据失败: '城市'
合并租金数据失败: '城市'
   融合后 - 训练集: 91613, 测试集: 16110
 测试集长度变化: 14786 -> 16110
   进行去重处理...
   去重后测试集长度: 14786
   重排序后测试集长度: 14786
高级组合特征...
   最终 - 训练集: 91613, 测试集: 14786
训练集特征数: 123
测试集特征数: 123
共同特征数: 122
排除特征数: 8
可用特征数: 118
最终使用特征数: 118

特征统计:
   总特征数: 118
   数值特征: 78
   分类特征: 40

🔍 异常值处理...
   原始训练样本: 91613
   清洗后训练样本: 82562
   移除比例: 9.9%
   测试集保持不变: 14786 样本

 数据预处理...
高级预处理开始...
训练集形状: (66049, 118)
测试集形状: (16513, 118)
数值特征数量: 78
分类特征数量: 40
成功处理数值特征: 44/78
成功处理分类特征: 40/40
预处理完成:
  训练集最终形状: (66049, 118)
  测试集最终形状: (16513, 118)
  训练集缺失值: 0
  测试集缺失值: 0
高级预处理开始...
训练集形状: (66049, 118)
测试集形状: (14786, 118)
数值特征数量: 78
分类特征数量: 40
成功处理数值特征: 44/78
成功处理分类特征: 40/40
预处理完成:
  训练集最终形状: (66049, 118)
  测试集最终形状: (14786, 118)
  训练集缺失值: 0
  测试集缺失值: 0
预处理后长度检查:
  训练集: 66049
  验证集: 16513
  测试集: 14786
  原始测试ID: 