In [1]:
import re

import cn2an
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 1. Load the raw tables and attach the community details
train = pd.read_csv('ruc_Class25Q1_train.csv')
test = pd.read_csv('ruc_Class25Q1_test.csv')
details = pd.read_csv('ruc_Class25Q1_details.csv')
rent = pd.read_csv('ruc_Class25Q1_rent.csv')  # not referenced again in the visible code

# Left-join the community details on (community name, district, city);
# both tables are merged with exactly the same keys.
_DETAIL_JOIN = dict(
    how='left',
    left_on=['小区名称', '区域', '城市'],
    right_on=['名称', '区县', '城市'],
)
train = train.merge(details, **_DETAIL_JOIN)
test = test.merge(details, **_DETAIL_JOIN)

# X_1: one-hot encoding of the city; the first level is dropped as baseline
X_1 = pd.get_dummies(train['城市'], prefix='X_1', dtype=int, drop_first=True)

# Feature X2: relative floor position
def parse_floor(floor_str):
    """Split a floor description such as "中楼层 (共6层)" into its two parts.

    Args:
        floor_str: Raw cell value of the floor column. Anything that is not a
            string (NaN, None, numbers) is treated as unparseable.

    Returns:
        ``(desc, total)`` where ``desc`` is the text before the parenthesis
        and ``total`` is the building's floor count as an int, or
        ``(None, None)`` when the value does not match the expected format.
    """
    # Missing cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(floor_str, str):
        return None, None
    match = re.search(r'^(.+?)\s*\(共(\d+)层\)$', floor_str.strip())
    if match is None:
        return None, None
    return match.group(1).strip(), int(match.group(2))

def get_floor_value(desc, total):
    """Map floor descriptions to a relative height coefficient.

    Args:
        desc: Floor description(s) (scalar, list or Series), e.g. "中楼层".
        total: Total floor count(s) of the building, same length as ``desc``.
            Missing or non-numeric counts are treated as 1.

    Returns:
        numpy array of floats: 0.0 for basements, otherwise the midpoint of
        the matching band expressed as a share of the building height
        (rounded to 4 decimals), 1.0 for the top floor, and 0.5 when no
        known keyword is present.
    """
    desc_series = pd.Series(desc).astype(str).str.strip()
    # Coerce first so None / NaN / stray text cannot break the int cast,
    # and keep the count at 1 or more to avoid division by zero below.
    total_series = (
        pd.to_numeric(pd.Series(total), errors='coerce')
        .fillna(1)
        .astype(int)
        .clip(lower=1)
    )
    result = pd.Series(np.nan, index=desc_series.index)

    # Basements take priority over every other keyword.
    basement_mask = desc_series.str.contains("地下室", na=False)
    result.loc[basement_mask] = 0.0

    band_values = {
        "底层": (1 / total_series).round(4),
        "顶层": 1.0,
        "低楼层": ((1 + total_series / 3) / (2 * total_series)).round(4),
        # The rounding must wrap the whole ratio, not only the denominator.
        "中楼层": ((total_series / 3 + 2 * total_series / 3) / (2 * total_series)).round(4),
        "高楼层": ((2 * total_series / 3 + total_series) / (2 * total_series)).round(4),
    }
    for pattern, value in band_values.items():
        mask = desc_series.str.contains(pattern, na=False) & ~basement_mask
        if isinstance(value, (int, float)):
            result.loc[mask] = value
        else:
            result.loc[mask] = value.loc[mask]

    # Descriptions that matched nothing are placed mid-building.
    return result.fillna(0.5).values
# Parse every training row's floor string, then derive the floor coefficient.
parsed_data_2 = [parse_floor(item) for item in train['所在楼层']]
X_2_df = pd.DataFrame(parsed_data_2, columns=['desc', 'total'])
# Computed once; the original evaluated get_floor_value twice and threw the
# first result away.
X_2 = pd.Series(get_floor_value(X_2_df['desc'], X_2_df['total']), name='楼层系数')

# Feature X3: room counts parsed from the layout string
def parse_layout(layout_str):
    """Extract bedroom / living-room / kitchen / bathroom counts from a layout.

    Args:
        layout_str: Layout text such as "3室2厅1厨2卫". "房间" is accepted as a
            synonym for "室".

    Returns:
        dict with keys ``bedrooms``, ``living_rooms``, ``kitchens`` and
        ``bathrooms``. Missing or non-string input yields all zeros. When the
        full four-part pattern is absent, a shorter "<n>室…<m>卫" form is
        tried; living rooms and kitchens are then 1 if their character
        appears in the text and 0 otherwise.
    """
    keys = ('bedrooms', 'living_rooms', 'kitchens', 'bathrooms')
    if pd.isna(layout_str) or not isinstance(layout_str, str):
        return dict.fromkeys(keys, 0)

    # Normalise the alternative wording before matching.
    normalized = layout_str.replace("房间", "室")

    full = re.search(r'(\d+)室(\d+)厅(\d+)厨(\d+)卫', normalized)
    if full:
        return {key: int(count) for key, count in zip(keys, full.groups())}

    # Abbreviated form: only bedrooms and bathrooms carry explicit numbers.
    short = re.search(r'(\d+)室?.*?(\d+)卫', normalized)
    if short:
        bedrooms, bathrooms = int(short.group(1)), int(short.group(2))
    else:
        bedrooms, bathrooms = 0, 0
    return {
        'bedrooms': bedrooms,
        'living_rooms': 1 if '厅' in normalized else 0,
        'kitchens': 1 if '厨' in normalized else 0,
        'bathrooms': bathrooms,
    }
# One dict of room counts per training listing, stacked into a frame.
parsed_data_3 = list(map(parse_layout, train['房屋户型']))
X_3 = pd.DataFrame([rooms for rooms in parsed_data_3 if rooms is not None])

# Feature X4: usable-area ratio; gaps are filled with the city average
def clean_area(series):
    """Convert an area column (e.g. "89.5㎡") to floats.

    Args:
        series: pandas Series of raw area values (strings and/or numbers).

    Returns:
        float Series with the same index. Every character other than digits
        and "." is stripped first; values that still are not a valid number
        afterwards (empty, ".", "1.2.3", missing) become NaN.
    """
    digits_only = series.astype(str).str.replace(r'[^\d.]', '', regex=True)
    # errors='coerce' turns malformed residue into NaN, where a plain
    # astype(float) would abort the whole column with ValueError.
    return pd.to_numeric(digits_only, errors='coerce').astype(float)
def calculate_efficiency(df):
    """Add the usable-area ratio and an imputed version of it to ``df``.

    Two columns are written to ``df`` in place:

    * ``得房率``: ``套内面积 / 建筑面积``; non-finite results (division by a
      zero floor area) are stored as NaN.
    * ``得房率_修正``: the same ratio with gaps filled by the mean ratio of
      the row's city, then by the overall mean when the city is missing or
      has no valid ratio.

    Args:
        df: DataFrame with numeric ``套内面积`` and ``建筑面积`` columns and a
            ``城市`` column.

    Returns:
        The same ``df`` object, for call chaining.
    """
    ratio = df['套内面积'] / df['建筑面积']
    df['得房率'] = ratio.replace([np.inf, -np.inf], np.nan)

    # Series.map returns NaN for cities absent from city_avg (including a
    # missing city); a direct city_avg[city] lookup would raise KeyError.
    city_avg = df.groupby('城市')['得房率'].mean()
    city_fallback = df['城市'].map(city_avg)
    global_avg = df['得房率'].mean()

    df['得房率_修正'] = df['得房率'].fillna(city_fallback).fillna(global_avg)
    return df
# Clean both area columns, then compute the usable-area ratio on the
# training frame (calculate_efficiency returns that same frame).
for _area_col in ('建筑面积', '套内面积'):
    train[_area_col] = clean_area(train[_area_col])
X_4 = calculate_efficiency(train)

# Feature X5: 1 when the orientation text mentions south, otherwise 0
def process_direction(df):
    """Flag listings whose orientation contains "南" (south).

    Args:
        df: pandas Series of orientation strings; missing values are allowed.

    Returns:
        int Series (1 = faces south, 0 = otherwise or missing).
    """
    # Missing orientations are blanked first: str.contains would otherwise
    # return NaN for them and the int cast would raise.
    orientation = df.fillna('').astype(str)
    return orientation.str.contains('南').astype(int)
# South-facing flag for every training listing.
X_5 = train['房屋朝向'].pipe(process_direction)

# Feature X6: renovation level. Missing values are scored 0, the same as
# '毛坯' (the original author noted that the missing rows are all basements).
def process_renovation(df):
    """Encode the renovation label as a numeric level.

    Args:
        df: pandas Series of renovation labels.

    Returns:
        Series where '毛坯' -> 0, '其它' -> 0.1, '简装' -> 0.5, '精装' -> 1;
        missing or unrecognised labels -> 0.
    """
    level_by_label = {
        '毛坯': 0,
        '简装': 0.5,
        '精装': 1,
        '其它': 0.1
    }
    levels = df.map(level_by_label)
    return levels.fillna(0)
# Renovation level of every training listing.
X_6 = train['装修情况'].pipe(process_renovation)

# Feature X7: households per elevator (户 / 梯)
def parse_ratio(text):
    """Parse an elevator/household string such as "一梯两户" into 户 / 梯.

    Args:
        text: Raw cell value; the numbers may be Chinese numerals, which are
            converted with ``cn2an`` in "smart" mode.

    Returns:
        Households per elevator rounded to 2 decimals, or 0 when the value
        is missing, cannot be parsed, or the elevator count is zero.
    """
    if pd.isna(text):
        return 0

    try:
        # Slice out both numeral strings before converting them, so that a
        # malformed value is rejected without involving cn2an at all.
        lift_text = text.split('梯')[0]
        household_text = text.split('户')[0].split('梯')[1]
        ti = cn2an.cn2an(lift_text, "smart")       # elevator count
        hu = cn2an.cn2an(household_text, "smart")  # household count
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Only parsing failures are swallowed. A bare `except:` would also
        # hide NameError (cn2an not installed) and KeyboardInterrupt.
        return 0
    return round(hu / ti, 2) if ti != 0 else 0
# Households-per-elevator ratio, parsed row by row.
X_7 = train['梯户比例'].map(parse_ratio)

# Feature X8: whether the building has an elevator
def process_elevator(df):
    """Return 1 where the elevator column equals '有', otherwise 0.

    Args:
        df: pandas Series holding the elevator label.

    Returns:
        int Series; any value other than '有' (including missing) maps to 0.
    """
    has_elevator = df == '有'
    has_elevator = has_elevator.fillna(0)
    return has_elevator.astype(int)
# Elevator flag of every training listing.
X_8 = train['配备电梯'].pipe(process_elevator)

# Feature X9: villa type
def villa_value(df):
    """Encode the villa type as an ordinal float.

    Args:
        df: pandas Series of villa-type labels; missing values are allowed.

    Returns:
        numpy float array: 1.0 if the label contains '拼', else 2.0 if it
        contains '排', else 3.0 if it contains '独', else 0.0. The checks are
        applied in that order, so the first match wins.
    """
    labels = df.fillna('NA').astype(str)
    has_pin = labels.str.contains('拼')
    has_pai = labels.str.contains('排')
    has_du = labels.str.contains('独')
    # Nested where() keeps the same first-match-wins priority as np.select.
    return np.where(
        has_pin, 1.0,
        np.where(has_pai, 2.0, np.where(has_du, 3.0, 0.0)),
    )
# Villa-type code of every training listing (numpy array).
X_9 = villa_value(train['别墅类型'])

# Feature X10: property usage. 1 when the usage label is one of the
# keywords listed below, 0 for every other label.
housing_keywords = [
    '普通住宅', '公寓/住宅', '公寓', '公寓（住宅）', '公寓/公寓', '住宅式公寓',
    '别墅', '四合院', '平房', '老公寓', '新式里弄', '花园洋房'
]
X_10 = train['房屋用途'].isin(housing_keywords).astype(int)

# Feature X11: ordinal score of the transaction/ownership type
property_score_map = {
    '商品房': 1.00,
    '私产': 0.95,
    '已购公房': 0.70,
    '房改房': 0.65,
    '央产房': 0.60,
    '一类经济适用房': 0.45,
    '限价商品房': 0.40,
    '自住型商品房': 0.38,
    '二类经济适用房': 0.30,
    '使用权': 0.05,
    '集资房': 0.20,  # grouped with "other"
    '拆迁还建房': 0.20,
    '动迁安置房': 0.20,
    '售后公房': 0.20,
    '定向安置房': 0.20
}
def encode_property_type(df_col):
    """Translate ownership-type labels into scores via property_score_map.

    Args:
        df_col: pandas Series of ownership-type labels.

    Returns:
        float Series. Missing labels are first replaced by '其他'; any label
        without an entry in the map (including '其他') receives 0.20, the
        same score as the "other" group in the table above.
    """
    return df_col.fillna('其他').map(property_score_map).fillna(0.20)
# Ownership-type score of every training listing.
X_11 = train['交易权属'].pipe(encode_property_type)

# Feature X12: holding period of the property
年限映射 = {
    '满五年': 1.0,
    '满两年': 0.5,
    '未满两年': 0
}
def year_value(series):
    """Score the holding-period label using 年限映射.

    Args:
        series: pandas Series of holding-period labels.

    Returns:
        float Series; labels without an entry in the map (or missing) -> 0.
    """
    scores = series.map(年限映射)
    return scores.fillna(0)
# Holding-period score of every training listing.
X_12 = train['房屋年限'].pipe(year_value)

# Feature X13: title ownership
def encode_ownership(df):
    """Return 1 where the ownership label equals '非共有', otherwise 0.

    Args:
        df: pandas Series of ownership labels.

    Returns:
        int Series; missing values compare unequal and therefore map to 0.
    """
    return (df == '非共有').astype(int)
# Sole-ownership flag of every training listing.
X_13 = train['产权所属'].pipe(encode_ownership)

# Feature X14: scores derived from the free-text columns
def process_property_text(df):
    """Score the amenities and transport descriptions of each listing.

    Args:
        df: DataFrame with the text columns ``周边配套`` (nearby amenities)
            and ``交通出行`` (transport). ``df`` itself is not modified.

    Returns:
        DataFrame indexed like ``df`` with two columns in [0, 1]:

        * ``配套评分``: sum of the weights of the amenity keywords found
          (医院 0.4, 学校 0.3, 超市 0.2, 公园 0.1), capped at 1.
        * ``交通评分``: up to 0.5 for metro lines mentioned (0.125 per
          "<n>号线" occurrence, at most four), +0.2 if an interchange is
          mentioned, +0.3 if buses are mentioned, capped at 1.

        Empty or missing text scores 0.
    """
    # Only the two text columns are needed, so the frame is not copied.
    amenities_text = df['周边配套'].fillna('').astype(str)
    transport_text = df['交通出行'].fillna('').astype(str)

    facility_weights = {'医院': 0.4, '超市': 0.2, '公园': 0.1, '学校': 0.3}

    def facility_score(text):
        """Weighted presence of the amenity keywords, capped at 1."""
        if not text.strip():
            return 0.0
        score = sum(
            weight for keyword, weight in facility_weights.items()
            if keyword in text
        )
        return min(score, 1)

    def transport_score(text):
        """Metro, interchange and bus contributions, capped at 1."""
        if not text.strip():
            return 0.0
        metro_lines = len(re.findall(r'\d+号线', text))
        metro_bonus = 0.5 * min(metro_lines, 4) / 4
        transfer = 1 if ('换乘' in text) or ('交汇' in text) else 0
        # A bus route is 2-4 digits followed by "路". CJK characters count as
        # word characters, so \b never matches between them and a digit; a
        # "not preceded by a digit" lookbehind expresses the intended
        # boundary without rejecting routes embedded in Chinese text.
        has_route = re.search(r'(?<!\d)\d{2,4}路', text) is not None
        bus = 1 if ('公交' in text) or has_route else 0
        return min(metro_bonus + transfer * 0.2 + bus * 0.3, 1)

    return pd.DataFrame({
        '配套评分': amenities_text.apply(facility_score),
        '交通评分': transport_text.apply(transport_score)
    })
# Amenity and transport scores of every training listing.
X_14 = process_property_text(train)

# Feature X15: one-hot encoding of the district (first level dropped)
X_15 = pd.get_dummies(train['区域'], prefix='X_15', dtype=int, drop_first=True)

# Feature X16: one-hot encoding of the sub-district (first level dropped).
# NOTE(review): the '_x' suffix is presumably added by the merge with
# `details` because both tables carry a '板块' column — confirm.
X_16 = pd.get_dummies(train['板块_x'], prefix='X_16', dtype=int, drop_first=True)

# Feature X17: building age. Many values are year ranges, so the age is
# measured from the upper end of the range to 2020.
def extract_end_year(s):
    """Return the latest year mentioned in a construction-year value.

    Args:
        s: Raw value such as "1998-2005年" or "2010年"; numbers and missing
            values are accepted.

    Returns:
        The largest integer found in the text, capped at 2020, or ``None``
        when the value is missing or contains no digits.
    """
    if pd.isna(s):
        return None
    # str() lets numeric cells through; re.findall only accepts text.
    years = [int(token) for token in re.findall(r"\d+", str(s))]
    if not years:
        # max() of an empty list would raise ValueError.
        return None
    return min(max(years), 2020)  # years beyond 2020 are clamped
# Upper construction year of every training listing.
end = train["建筑年代"].apply(extract_end_year)
# Fill the gaps with the median. Despite its name, median_age holds the
# median end *year*; it is reused when the test set is processed.
median_age = end.median()
end = end.fillna(median_age)
# Building age relative to 2020
X_17 = 2020 - end

# Feature X18: households per building
# The first run of digits in each text column is taken as the count.
train["房屋数"] = train["房屋总数"].str.extract(r"(\d+)").astype(float)
train["楼栋数"] = train["楼栋总数"].str.extract(r"(\d+)").astype(float)
X_18 = pd.DataFrame({"户栋比": train["房屋数"]/train["楼栋数"]})
# DataFrame.median() returns a Series keyed by column name, so the fillna
# below fills column-wise; the same Series is reused for the test set.
median_hudongbi = X_18.median()
X_18 = X_18.fillna(median_hudongbi)

# Feature X19: greening rate
def clean_percent(s):
    """Convert a percentage string such as "35.5%" to a fraction.

    Args:
        s: Raw value; missing values are allowed.

    Returns:
        The percentage divided by 100 and capped at 1, or ``None`` when the
        value is missing or contains no "<number>%" pattern.
    """
    if pd.isna(s):
        return None
    # The optional decimal part must be captured too: with a bare (\d+)%
    # the string "35.5%" would match only "5%" and yield 0.05.
    match = re.search(r"(\d+(?:\.\d+)?)\s*[%％]", str(s))
    if match is None:
        return None
    return min(float(match.group(1)) / 100, 1)  # cap at 100%
# Greening rate; gaps are filled with the training median, which is kept in
# median_value for the test set.
X_19 = train["绿 化 率"].apply(clean_percent)
median_value = X_19.median()
X_19 = X_19.fillna(median_value)

# Feature X20: ring road, combined with the city so that each city gets its
# own ring-road levels
train['城市_环线'] = train['城市'].astype(str) + '_' + train['环线'].fillna('缺失')  # missing ring road gets its own level
X_20 = pd.get_dummies(train['城市_环线'], prefix='X_20', dtype=int, drop_first=True)

# Feature X21: parking spaces
train["停车位"] = train["停车位"].fillna(0)
# 95th percentile of the positive counts, used as an upper cap (also for
# the test set)
q95 = train[train["停车位"] > 0]["停车位"].quantile(0.95)
X_21 = np.where(train["停车位"] > q95, q95, train["停车位"])
# log(1 + x) compresses the long right tail and keeps 0 at 0
X_21 = np.log1p(X_21)

# Feature X22: interaction between the floor coefficient and the elevator flag
X_22 = X_2 * X_8

# Feature X23: floor area (the column was already cleaned for X_4; cleaning
# it again leaves the numbers unchanged)
X_23 = clean_area(train['建筑面积'])

# 2. Feature assembly
# -------------------------------------------------
# Collect the individually built features; array-valued features are
# wrapped in a DataFrame so that they receive a column name.
feature_components = [
    X_1.add_prefix('城市_'),
    X_2,
    X_3.add_prefix('户型_'),
    X_4['得房率_修正'],
    X_5,
    X_6,
    X_7,
    X_8,
    pd.DataFrame(X_9, columns=['别墅类型']),
    X_10,
    X_11,
    X_12,
    X_13,
    X_14,
    X_15.add_prefix('区域_'),
    X_16.add_prefix('板块_'),
    X_17,
    X_18,
    X_19,
    X_20.add_prefix('环线_'),
    pd.DataFrame(X_21, columns=['停车位']),
    pd.DataFrame(X_22, columns=['交互项:楼层系数与是否电梯']),
    X_23
]
# Concatenate column-wise. pd.concat aligns on the index, so every piece
# must share the index of `train`.
X = pd.concat(feature_components, axis=1)

# 3. Standardisation
# -------------------------------------------------
# NOTE(review): the scaler is fitted on every training row before the
# train/validation split below, so validation rows influence the scaling
# statistics — consider fitting on X_train only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# ==== Target: log transform, then standardise ====
y_original = train['价格']
y_log = np.log(y_original)  # natural log; NOTE(review): assumes every price is > 0 — confirm
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y_log.values.reshape(-1, 1)).flatten()

# ==== Hold out 20% of the rows for validation (standardised log target) ====
X_train, X_val, y_train_scaled, y_val_scaled = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=111)


# 4. Test-set features (same processing as the training set)
# -------------------------------------------------
def _aligned_dummies(values, prefix, train_columns):
    """One-hot encode ``values`` and align the result to ``train_columns``.

    Levels the training set never produced are dropped; training columns
    with no occurrence in ``values`` are added as all-zero columns.
    """
    dummies = pd.get_dummies(values, prefix=prefix, dtype=int)
    return dummies.reindex(columns=train_columns, fill_value=0)


def preprocess_test_data(test_df, details_df, train_city_avg=None):
    """Build the feature matrix of the test set.

    The function relies on values fitted on the training set earlier in this
    file: the dummy columns ``X_1``/``X_15``/``X_16``/``X_20``, the medians
    ``median_age``/``median_hudongbi``/``median_value`` and the parking cap
    ``q95``.

    Args:
        test_df: Test listings, already merged with the community details.
            Cleaned and helper columns (areas, 得房率, 房屋数, 楼栋数,
            城市_环线, 停车位) are written back to this frame in place.
        details_df: Unused; accepted for interface compatibility.
        train_city_avg: Mapping city -> mean usable-area ratio of the
            training set, used to fill missing ratios. Cities absent from
            the mapping fall back to 0.75. ``None`` means an empty mapping.

    Returns:
        DataFrame with one row per test listing and the features in the
        same order as the training feature components.
    """
    if train_city_avg is None:
        train_city_avg = {}
    idx = test_df.index

    # City dummies, aligned to the training columns
    X_test_1 = _aligned_dummies(test_df['城市'], 'X_1', X_1.columns)

    # Floor coefficient
    floor_parts = pd.DataFrame(
        [parse_floor(item) for item in test_df['所在楼层']],
        columns=['desc', 'total'],
    )
    X_test_2 = pd.Series(
        get_floor_value(floor_parts['desc'], floor_parts['total']),
        index=idx,
        name='楼层系数',
    )

    # Room counts
    X_test_3 = pd.DataFrame(
        [parse_layout(item) for item in test_df['房屋户型']], index=idx
    )

    # Usable-area ratio; gaps use the training-set city averages
    test_df['建筑面积'] = clean_area(test_df['建筑面积'])
    test_df['套内面积'] = clean_area(test_df['套内面积'])
    test_df['得房率'] = test_df['套内面积'] / test_df['建筑面积']
    city_fallback = test_df['城市'].map(
        lambda city: train_city_avg.get(city, 0.75)
    )
    test_df['得房率_修正'] = (
        test_df['得房率'].fillna(city_fallback).fillna(0.75)
    )
    X_test_4 = test_df['得房率_修正']

    X_test_5 = process_direction(test_df['房屋朝向'])
    X_test_6 = process_renovation(test_df['装修情况'])
    X_test_7 = test_df['梯户比例'].apply(parse_ratio)
    X_test_8 = process_elevator(test_df['配备电梯'])
    X_test_9 = villa_value(test_df['别墅类型'])
    X_test_10 = test_df['房屋用途'].isin(housing_keywords).astype(int)
    X_test_11 = encode_property_type(test_df['交易权属'])
    X_test_12 = year_value(test_df['房屋年限'])
    X_test_13 = encode_ownership(test_df['产权所属'])
    X_test_14 = process_property_text(test_df)
    X_test_15 = _aligned_dummies(test_df['区域'], 'X_15', X_15.columns)
    X_test_16 = _aligned_dummies(test_df['板块_x'], 'X_16', X_16.columns)

    # Building age, gaps filled with the training median end year
    end_test = test_df["建筑年代"].apply(extract_end_year).fillna(median_age)
    X_test_17 = 2020 - end_test

    # Households per building, gaps filled with the training median
    test_df["房屋数"] = test_df["房屋总数"].str.extract(r"(\d+)").astype(float)
    test_df["楼栋数"] = test_df["楼栋总数"].str.extract(r"(\d+)").astype(float)
    X_test_18 = pd.DataFrame(
        {"户栋比": test_df["房屋数"] / test_df["楼栋数"]}
    ).fillna(median_hudongbi)

    # Greening rate, gaps filled with the training median
    X_test_19 = test_df["绿 化 率"].apply(clean_percent).fillna(median_value)

    # City x ring-road dummies. They must carry exactly the training
    # columns: an independent drop_first on the test data can drop a
    # different level and leave the two matrices with different columns.
    test_df['城市_环线'] = (
        test_df['城市'].astype(str) + '_' + test_df['环线'].fillna('缺失')
    )
    X_test_20 = _aligned_dummies(test_df['城市_环线'], 'X_20', X_20.columns)

    # Parking spaces: cap at the training 95th percentile, then log(1 + x)
    test_df["停车位"] = test_df["停车位"].fillna(0)
    X_test_21 = np.log1p(
        np.where(test_df["停车位"] > q95, q95, test_df["停车位"])
    )

    X_test_22 = X_test_2 * X_test_8
    X_test_23 = clean_area(test_df['建筑面积'])

    # Assemble in the same order as the training feature components
    feature_components_test = [
        X_test_1.add_prefix('城市_'),
        X_test_2,
        X_test_3.add_prefix('户型_'),
        X_test_4,
        X_test_5,
        X_test_6,
        X_test_7,
        X_test_8,
        pd.DataFrame(X_test_9, columns=['别墅类型'], index=idx),
        X_test_10,
        X_test_11,
        X_test_12,
        X_test_13,
        X_test_14,
        X_test_15.add_prefix('区域_'),
        X_test_16.add_prefix('板块_'),
        X_test_17,
        X_test_18,
        X_test_19,
        X_test_20.add_prefix('环线_'),
        pd.DataFrame(X_test_21, columns=['停车位'], index=idx),
        X_test_22.to_frame('交互项:楼层系数与是否电梯'),
        X_test_23
    ]

    return pd.concat(feature_components_test, axis=1)

# Per-city mean of the imputed usable-area ratio on the training set (used
# to fill missing ratios in the test set)
train_city_avg = X_4.groupby(train['城市'])['得房率_修正'].mean().to_dict()
# Build the test-set feature matrix
X_test = preprocess_test_data(test, details, train_city_avg)

# 5. Standardise the test features with the scaler fitted on the training set
# -------------------------------------------------
X_test_scaled = scaler.transform(X_test)  # transform only; the scaler is not refitted

# 6. Model training and evaluation
# -------------------------------------------------
# Model configuration: one estimator instance plus its hyper-parameter grid
# per model name. Every grid currently holds a single candidate.
models_config = {
    'OLS': {
        'model': LinearRegression(),
        'params': {},
    },
    'Lasso': {
        'model': Lasso(max_iter=10000),
        'params': {
            'alpha': [0.001]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [1000]
        }
    },
    'ElasticNet': {
        'model': ElasticNet(max_iter=10000),
        'params': {
            'alpha': [0.01],
            'l1_ratio': [0.005]
        }
    }
}


def inverse_transform(y_scaled):
    """Map standardised log targets back to the original scale.

    Undoes the standardisation with ``scaler_y`` and then exponentiates.

    Args:
        y_scaled: 1-D numpy array of standardised log values.

    Returns:
        1-D numpy array on the original scale.
    """
    as_column = y_scaled.reshape(-1, 1)
    log_values = scaler_y.inverse_transform(as_column).flatten()
    return np.exp(log_values)
def inverse_mae(y_true_scaled, y_pred_scaled):
    """Mean absolute error measured on the original (un-transformed) scale."""
    return mean_absolute_error(
        inverse_transform(y_true_scaled),
        inverse_transform(y_pred_scaled),
    )
def inverse_rmse(y_true_scaled, y_pred_scaled):
    """Root mean squared error measured on the original (un-transformed) scale."""
    squared_error = mean_squared_error(
        inverse_transform(y_true_scaled),
        inverse_transform(y_pred_scaled),
    )
    return np.sqrt(squared_error)

# Both metrics are errors (lower is better). greater_is_better=False makes
# scikit-learn negate them, so that GridSearchCV, which always maximises the
# score, selects the parameters with the smallest error. The values stored
# in cv_results_ are therefore negative and are flipped back when read.
scoring = {
    'MAE': make_scorer(inverse_mae, greater_is_better=False),
    'RMSE': make_scorer(inverse_rmse, greater_is_better=False)
}

# ==== Model training and evaluation ====
performance_report = []
results = {}


def _evaluate_split(model, features, target_scaled, label, metrics):
    """Store the MAE/RMSE of ``model`` on one data split in ``metrics``.

    The errors are measured on the original scale and written under the
    keys ``'<label> MAE'`` and ``'<label> RMSE'``.
    """
    predicted_scaled = model.predict(features)
    metrics[f'{label} MAE'] = inverse_mae(target_scaled, predicted_scaled)
    metrics[f'{label} RMSE'] = inverse_rmse(target_scaled, predicted_scaled)


for model_name in models_config:
    print(f"\n=== 训练 {model_name} ===")
    config = models_config[model_name]
    metrics = {'Model': model_name}

    if model_name == 'OLS':
        # No hyper-parameters: fit on the training split ...
        holdout_model = config['model'].fit(X_train, y_train_scaled)

        # ... and run the cross-validation by hand so that the errors are
        # measured after the inverse transform.
        kf = KFold(n_splits=6, shuffle=True, random_state=111)
        cv_mae_scores = []
        cv_rmse_scores = []
        for train_idx, val_idx in kf.split(X_scaled):
            model_cv = LinearRegression().fit(X_scaled[train_idx], y_scaled[train_idx])
            y_pred_scaled = model_cv.predict(X_scaled[val_idx])
            cv_mae_scores.append(inverse_mae(y_scaled[val_idx], y_pred_scaled))
            cv_rmse_scores.append(inverse_rmse(y_scaled[val_idx], y_pred_scaled))
        metrics['CV MAE'] = np.mean(cv_mae_scores)
        metrics['CV RMSE'] = np.mean(cv_rmse_scores)

    else:
        # Grid search with the custom scorers; the winner is refitted on
        # all rows and kept for the final test-set predictions.
        gscv = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=6,
            scoring=scoring,
            refit='RMSE',
            return_train_score=True,
            n_jobs=-1
        )
        gscv.fit(X_scaled, y_scaled)

        results[model_name] = {
            'model': gscv.best_estimator_,
            'params': gscv.best_params_
        }
        print(f"{model_name}训练完成，最佳参数: {gscv.best_params_}")

        # Flip the sign back: the scorers report negated errors.
        metrics['CV MAE'] = -gscv.cv_results_['mean_test_MAE'][gscv.best_index_]
        metrics['CV RMSE'] = -gscv.cv_results_['mean_test_RMSE'][gscv.best_index_]

        # best_estimator_ has seen the validation rows, so evaluating it on
        # X_val would not be an out-of-sample estimate. A fresh copy with
        # the same parameters is fitted on the training split instead.
        holdout_model = clone(gscv.best_estimator_).fit(X_train, y_train_scaled)

    # In-sample and held-out errors, measured the same way for every model.
    _evaluate_split(holdout_model, X_train, y_train_scaled, 'In-sample', metrics)
    _evaluate_split(holdout_model, X_val, y_val_scaled, 'Out-sample', metrics)
    performance_report.append(metrics)

# Score: the weighted error of each model relative to the worst model. It
# has to be computed once every model is known; normalising a single value
# by itself would give the same constant for all models.
weighted_errors = np.array([
    0.3 * entry['CV MAE'] + 0.4 * entry['Out-sample MAE'] + 0.3 * entry['Out-sample RMSE']
    for entry in performance_report
])
worst_error = weighted_errors.max() if len(weighted_errors) else 0.0
if worst_error > 0:
    scores = ((1 - weighted_errors / (worst_error * 1.2)).clip(0, 1) * 100).round(2)
else:
    # All errors are zero: every model is perfect.
    scores = np.full(len(weighted_errors), 100.0)
for entry, score in zip(performance_report, scores):
    entry['Score'] = score


# 5. Build the report table
# -------------------------------------------------
columns_order = ['Model', 'In-sample MAE', 'In-sample RMSE',
                 'Out-sample MAE', 'Out-sample RMSE',
                 'CV MAE', 'CV RMSE', 'Score']
# One row per model, columns arranged in the presentation order above.
report_df = pd.DataFrame(performance_report)[columns_order]

print("\n=== 性能报告 ===")
print(report_df.to_markdown(index=False))

# 6. Outlier statistics and saving of the predictions
# -------------------------------------------------
def calculate_and_save(model_name, predictions_scaled):
    """Save one model's test predictions and summarise their outliers.

    The predictions are converted back to the original scale, written to
    ``predictions_<model_name>.csv`` next to the test IDs, and checked
    against the 1.5 * IQR rule.

    Args:
        model_name: Name used in the output file name and in the summary.
        predictions_scaled: Predictions on the standardised log scale.

    Returns:
        dict with the model name, the outlier count and ratio (formatted as
        a percentage string), both IQR bounds rounded to 2 decimals, and the
        predictions on the original scale.
    """
    predictions = inverse_transform(predictions_scaled)

    # Tukey fences computed from the predictions themselves
    q1, q3 = np.percentile(predictions, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    below = predictions < lower_bound
    above = predictions > upper_bound
    outlier_count = (below | above).sum()

    # Persist the predictions on the original scale
    result_df = test[['ID']].copy()
    result_df['Price'] = np.round(predictions, 2)
    result_df.to_csv(f'predictions_{model_name}.csv', index=False)

    summary = {
        'model': model_name,
        'outlier_count': outlier_count,
        'outlier_ratio': f"{outlier_count / len(predictions):.2%}",
        'lower_bound': round(lower_bound, 2),
        'upper_bound': round(upper_bound, 2),
        'predictions': predictions
    }
    return summary

# Predict the test set with every model and save the results
outlier_stats = []
for model_name, config in models_config.items():
    tuned = results.get(model_name)
    if tuned is None:
        # No tuned estimator was stored for this model: fit the configured
        # one on all training rows.
        model = config['model'].fit(X_scaled, y_scaled)
        pred_scaled = model.predict(X_test_scaled)
    else:
        pred_scaled = tuned['model'].predict(X_test_scaled)

    stats = calculate_and_save(model_name, pred_scaled)
    outlier_stats.append(stats)
    total_predictions = len(stats['predictions'])
    print(f"{model_name}异常值统计:")
    print(f"  异常值数量: {stats['outlier_count']}/{total_predictions}")
    print(f"  异常值比例: {stats['outlier_ratio']}")
    print(f"  正常值范围: [{stats['lower_bound']}, {stats['upper_bound']}]")

# Combined outlier report (the raw predictions are left out)
summary_columns = ['model', 'outlier_count', 'outlier_ratio', 'lower_bound', 'upper_bound']
outlier_report = pd.DataFrame(outlier_stats)[summary_columns]
outlier_report.to_csv('outliers_summary.csv', index=False)
print("\n各模型异常值统计已保存到 outliers_summary.csv")



ModuleNotFoundError: No module named 'cn2an'