## 期中展示

## 1.OLS回归，并加上了皮尔逊系数检验与共线性检验，便于即时调整变量。

In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# ========== 1. 读取数据 ==========
train_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
test_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')
original_test = test_df[['ID']].copy()

# ========== 2. 特征工程函数 ==========
def clean_area(df):
    for col in ['建筑面积', '套内面积']:
        df[col] = df[col].astype(str).str.replace('㎡', '', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['有效面积'] = df['套内面积'].fillna(df['建筑面积'])
    df['有效面积'] = df['有效面积'].fillna(df['有效面积'].median())
    return df
# 重新计算房屋用途均价映射（确保已生成）
用途均价 = train_df.groupby('房屋用途')['价格'].mean()
train_df['房屋用途均价'] = train_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())

# 确保城市编码与房屋年限等级字段已存在
if '城市编码' not in train_df.columns:
    train_df['城市编码'] = train_df['城市'].astype('category').cat.codes
if '房屋年限等级' not in train_df.columns:
    train_df['房屋年限等级'] = train_df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))

# 创建城市年限交互项
train_df['城市年限交互'] = train_df['城市编码'] * train_df['房屋年限等级']
def extract_floor(df):
    df['当前楼层_raw'] = df['所在楼层'].str.extract(r'第?(\d+)[层楼]')[0].astype(float)
    df['总楼层'] = df['所在楼层'].str.extract(r'共?(\d+)[层楼]')[0].astype(float)
    df.loc[df['当前楼层_raw'].isna(), '当前楼层_raw'] = df['所在楼层'].str.extract(r'(底|低|中|高|顶)')[0].map({
        '底': 1, '低': 2, '中': 3, '高': 4, '顶': 5
    })
    df['当前楼层_raw'] = df['当前楼层_raw'].fillna(1)
    df['总楼层'] = df['总楼层'].fillna(df['总楼层'].median())
    df['当前楼层'] = (df['当前楼层_raw'] / df['总楼层']).clip(0, 1)
    df.drop(columns=['当前楼层_raw'], inplace=True)
    return df

def extract_layout(df):
    df['室'] = df['房屋户型'].str.extract(r'(\d+)室').fillna(0).astype(int)
    df['厅'] = df['房屋户型'].str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['厨'] = df['房屋户型'].str.extract(r'(\d+)厨').fillna(0).astype(int)
    df['卫'] = df['房屋户型'].str.extract(r'(\d+)卫').fillna(0).astype(int)
    df['总房间数'] = df[['室', '厅', '厨', '卫']].sum(axis=1)
    return df

def enrich_features(df):
    df['是否满五'] = df['房屋年限'].apply(lambda x: 1 if '满五' in str(x) else 0)
    df['朝向_含南'] = df['房屋朝向'].apply(lambda x: 1 if '南' in str(x) else 0)
    df['配备电梯'] = df['配备电梯'].map({'有': 1, '无': 0}).fillna(0)
    df['城市编码'] = df['城市'].astype('category').cat.codes
    df['区域编码'] = df['区域'].astype('category').cat.codes
    df['房屋年限等级'] = df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))
    ring_map = {'一环内': 1.0, '一至二环': 1.5, '二环内': 2.0, '二至三环': 2.5,
                '三至四环': 3.5, '四至五环': 4.5, '三环外': 5.0, '五至六环': 5.5,
                '四环外': 6.0, '六环外': 6.5, '内环内': 1.0, '内环至中环': 1.8,
                '中环至外环': 4.2, '内环至外环': 3.0, '外环外': 7.0}
    df['环线数值'] = df['环线'].map(ring_map).fillna(4.0)
    df['环线面积'] = df['环线数值'] * df['有效面积']
    df['是否精装'] = (df['装修情况'].fillna('其他') == '精装').astype(int)
    df['精装面积'] = df['是否精装'] * df['有效面积']
    df['产权_非共有'] = (df['产权所属'].fillna('其他') == '非共有').astype(int)
    df['产权_共有'] = (df['产权所属'].fillna('其他') == '共有').astype(int)
    df['产权_非共有_面积'] = df['产权_非共有'] * df['有效面积']
    df['产权_共有_面积'] = df['产权_共有'] * df['有效面积']
    return df

def cluster_price(df, ref_df=None):
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    if ref_df is None:
        kmeans = KMeans(n_clusters=10, random_state=42)
        df['聚类标签'] = kmeans.fit_predict(df[['lon', 'lat']])
        df['单价'] = df['价格'] / df['有效面积']
        df['聚类均单价'] = df.groupby('聚类标签')['单价'].transform('mean')
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df, kmeans, df.groupby('聚类标签')['单价'].mean()
    else:
        df['聚类标签'] = ref_df[0].predict(df[['lon', 'lat']])
        df['聚类均单价'] = df['聚类标签'].map(ref_df[1])
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df

# ========== 3. 清洗与特征工程 ==========
for df in [train_df, test_df]:
    df = clean_area(df)
    df = extract_floor(df)
    df = extract_layout(df)
    df = enrich_features(df)
test_df['房屋用途均价'] = test_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())
test_df['城市年限交互'] = test_df['城市编码'] * test_df['房屋年限等级']
train_df, kmeans_model, price_map = cluster_price(train_df)
test_df = cluster_price(test_df, ref_df=(kmeans_model, price_map))

for df in [train_df, test_df]:
    df['房间×聚类均单价'] = df['总房间数'] * df['聚类均单价']
    df['电梯×面积'] = df['配备电梯'] * df['有效面积']
    df['精装×面积×均价'] = df['是否精装'] * df['有效面积'] * df['聚类均单价']
    df['非共有×电梯×面积'] = df['产权_非共有'] * df['配备电梯'] * df['有效面积']
    df['房数×电梯×环线'] = df['总房间数'] * df['配备电梯'] * df['环线数值']

# ========== 4. 去除极端值 ==========
q1 = train_df['价格'].quantile(0.01)
q99 = train_df['价格'].quantile(0.99)
train_df = train_df[(train_df['价格'] >= q1) & (train_df['价格'] <= q99)].copy()
def clip_outliers(df, column, lower_quantile=0.01, upper_quantile=0.99):
    lower = df[column].quantile(lower_quantile)
    upper = df[column].quantile(upper_quantile)
    df[column] = df[column].clip(lower, upper)
    return df

for col in ['有效面积', '总楼层', '聚类估价']:
    train_df = clip_outliers(train_df, col)


from sklearn.model_selection import cross_val_score, KFold

# ========== 5. 训练模型 ==========
features = [
    '城市编码','区域编码','房屋年限等级','是否满五','朝向_含南','房屋用途均价',
    '室','厅','厨','卫','总楼层','有效面积','配备电梯',
    '环线数值','lon','城市编码','城市年限交互',
    '精装面积', '城市', '聚类均单价', '环线面积',
    '产权_非共有_面积', '产权_共有_面积', '总房间数', '房间×聚类均单价',
    '电梯×面积', '精装×面积×均价', '非共有×电梯×面积', '房数×电梯×环线'
]


X = train_df[all_features].dropna()
面积_series = train_df.loc[X.index, '有效面积']
y = train_df.loc[X.index, '价格'] / 面积_series  # 单位房价作为目标

print(f"✅ 剔除异常值后用于训练的数据数量: {len(X)}")

X_train, X_val, y_train, y_val, area_train, area_val = train_test_split(
    X, y, 面积_series, test_size=0.2, random_state=42
)

from sklearn.preprocessing import MinMaxScaler  # 替换导入

# 生成测试集特征
test_X = test_df[all_features].copy()

# 使用极差标准化（MinMaxScaler）
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.fit_transform(X)
test_X_scaled = scaler.transform(test_X)

# 模型训练
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# ========== 6. 评估 ==========
# 验证集预测
y_val_pred_unit = model.predict(X_val_scaled)
y_val_pred_price = y_val_pred_unit * area_val
y_val_true_price = y_val * area_val

rmse_val = mean_squared_error(y_val_true_price, y_val_pred_price, squared=False)
mae_val = mean_absolute_error(y_val_true_price, y_val_pred_price)
print(f"✅ 验证集 RMSE（还原后）: {rmse_val:,.2f}")
print(f"✅ 验证集 MAE （还原后）: {mae_val:,.2f}")

# 训练集预测
y_train_pred_unit = model.predict(X_train_scaled)
y_train_pred_price = y_train_pred_unit * area_train
y_train_true_price = y_train * area_train

rmse_train = mean_squared_error(y_train_true_price, y_train_pred_price, squared=False)
mae_train = mean_absolute_error(y_train_true_price, y_train_pred_price)
print(f"✅ 训练集 RMSE（还原后）: {rmse_train:,.2f}")
print(f"✅ 训练集 MAE （还原后）: {mae_train:,.2f}")

# ========== 7. 6-Fold 交叉验证（RMSE & MAE，还原后） ==========
def cross_val_rmse_mae(X, y_unit, area_series, n_splits=6):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_list, mae_list = [], []

    for i, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y_unit.iloc[train_idx], y_unit.iloc[val_idx]
        area_train_cv, area_val_cv = area_series.iloc[train_idx], area_series.iloc[val_idx]

        scaler_cv = MinMaxScaler()
        X_train_scaled = scaler_cv.fit_transform(X_train_cv)
        X_val_scaled = scaler_cv.transform(X_val_cv)

        model_cv = LinearRegression()
        model_cv.fit(X_train_scaled, y_train_cv)
        y_pred_unit = model_cv.predict(X_val_scaled)

        # 还原为总价
        y_pred_price = y_pred_unit * area_val_cv
        y_true_price = y_val_cv * area_val_cv

        rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
        mae = mean_absolute_error(y_true_price, y_pred_price)

        print(f"Fold {i}: RMSE = {rmse:,.2f}, MAE = {mae:,.2f}")
        rmse_list.append(rmse)
        mae_list.append(mae)

    print(f"\n✅ 6折CV 平均 RMSE（还原价）: {np.mean(rmse_list):,.2f}")
    print(f"✅ 6折CV 平均 MAE （还原价）: {np.mean(mae_list):,.2f}")

# 调用函数
cross_val_rmse_mae(X, y, 面积_series)

# ========== 8. 回归方程 ==========
print("✅ 回归方程:")
print("Intercept:", model.intercept_)
for name, coef in zip(X.columns, model.coef_):
    print(f"{name}: {coef:.4f}")

# ========== 9. 预测测试集 ==========
test_X = test_df[all_features].copy()
test_X_scaled = scaler.transform(test_X)
test_unit_price_pred = model.predict(test_X_scaled)
test_area = test_df['建筑面积']
test_price_pred = test_unit_price_pred * test_area

# ========== 10. 生成提交文件 ==========
submission = test_df[['ID']].copy()
submission['Price'] = np.round(test_price_pred).astype(int)
submission.to_csv('submission.csv', index=False)
print("✅ 预测结果已保存为 submission.csv")
# ========== 11. 分析单位房价与自变量的相关性 + 共线性 ==========

# 重新计算单位房价
train_df['单位房价'] = train_df['价格'] / train_df['有效面积']

# 可用特征 = 当前用于训练的 all_features
features_to_analyze = all_features.copy()

# 只保留在 train_df 中存在且为数值型的列
features_to_analyze = [f for f in features_to_analyze if f in train_df.columns and np.issubdtype(train_df[f].dtype, np.number)]

# 添加单位房价用于相关性分析
correlation_df = train_df[features_to_analyze + ['单位房价']].copy().dropna()

# 计算单位房价与每个变量的皮尔逊相关系数
correlations = correlation_df.corr()['单位房价'].drop('单位房价').sort_values(ascending=False)

print("\n📊 与单位房价的皮尔逊相关系数（前20个）:")
print(correlations.head(20))

from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd


X_vif = train_df[all_features].dropna().copy()

# 有些模型（如 LinearRegression）需要剔除非数值列（如 '城市' ）
X_vif = X_vif.select_dtypes(include=[np.number])

# 计算 VIF
vif_df = pd.DataFrame()
vif_df["Feature"] = X_vif.columns
vif_df["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

# 排序输出
vif_df = vif_df.sort_values(by="VIF", ascending=False)

# 显示高 VIF 特征（> 10 的常被视为高度共线）
print("\n📊 VIF 检查结果（前几项）:")
print(vif_df.head(15))

high_vif = vif_df[vif_df["VIF"] > 10]
if not high_vif.empty:
    print("\n⚠️ 以下变量存在高共线性（VIF > 10）:")
    print(high_vif)
else:
    print("✅ 未发现 VIF > 10 的变量，共线性处于可接受范围")

✅ 剔除异常值后用于训练的数据数量: 82386
✅ 验证集 RMSE（还原后）: 1,016,894.67
✅ 验证集 MAE （还原后）: 574,432.51
✅ 训练集 RMSE（还原后）: 1,053,513.27
✅ 训练集 MAE （还原后）: 592,190.03
Fold 1: RMSE = 1,013,263.62, MAE = 572,010.91
Fold 2: RMSE = 1,064,575.10, MAE = 590,765.82
Fold 3: RMSE = 1,046,965.05, MAE = 586,461.15
Fold 4: RMSE = 1,019,420.36, MAE = 584,046.09
Fold 5: RMSE = 1,060,740.82, MAE = 597,158.51
Fold 6: RMSE = 1,071,619.50, MAE = 602,055.35

✅ 6折CV 平均 RMSE（还原价）: 1,046,097.41
✅ 6折CV 平均 MAE （还原价）: 588,749.64
✅ 回归方程:
Intercept: -8436.147789056227
城市编码: 2906.9185
区域编码: 4802.8151
房屋年限等级: 2431.4137
是否满五: 351.0997
厅: -11911.9841
当前楼层: 0.0000
总楼层: 5138.1640
有效面积: -15333.1903
lon: 1635.6306
lat: 919.3524
城市年限交互: -5541.2640
聚类均单价: 62529.0886
总房间数: 80968.7581
房间×聚类均单价: -34085.2304
精装×面积×均价: 16354.3818
✅ 预测结果已保存为 submission.csv

📊 与单位房价的皮尔逊相关系数（前20个）:
聚类均单价       0.830390
房间×聚类均单价    0.734685
房屋年限等级      0.436869
精装×面积×均价    0.406465
是否满五        0.386126
lon         0.213123
lat         0.190246
区域编码       -0.087104
有效面积    

## 2.LASSO

In [9]:
#lasso
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
# ========== 1. 读取数据 ==========
train_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
test_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')
original_test = test_df[['ID']].copy()

# ========== 2. 特征工程函数 ==========
def clean_area(df):
    for col in ['建筑面积', '套内面积']:
        df[col] = df[col].astype(str).str.replace('㎡', '', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['有效面积'] = df['套内面积'].fillna(df['建筑面积'])
    df['有效面积'] = df['有效面积'].fillna(df['有效面积'].median())
    return df
# 重新计算房屋用途均价映射（确保已生成）
用途均价 = train_df.groupby('房屋用途')['价格'].mean()
train_df['房屋用途均价'] = train_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())

# 确保城市编码与房屋年限等级字段已存在
if '城市编码' not in train_df.columns:
    train_df['城市编码'] = train_df['城市'].astype('category').cat.codes
if '房屋年限等级' not in train_df.columns:
    train_df['房屋年限等级'] = train_df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))

# 创建城市年限交互项
train_df['城市年限交互'] = train_df['城市编码'] * train_df['房屋年限等级']
def extract_floor(df):
    df['当前楼层_raw'] = df['所在楼层'].str.extract(r'第?(\d+)[层楼]')[0].astype(float)
    df['总楼层'] = df['所在楼层'].str.extract(r'共?(\d+)[层楼]')[0].astype(float)
    df.loc[df['当前楼层_raw'].isna(), '当前楼层_raw'] = df['所在楼层'].str.extract(r'(底|低|中|高|顶)')[0].map({
        '底': 1, '低': 2, '中': 3, '高': 4, '顶': 5
    })
    df['当前楼层_raw'] = df['当前楼层_raw'].fillna(1)
    df['总楼层'] = df['总楼层'].fillna(df['总楼层'].median())
    df['当前楼层'] = (df['当前楼层_raw'] / df['总楼层']).clip(0, 1)
    df.drop(columns=['当前楼层_raw'], inplace=True)
    return df

def extract_layout(df):
    df['室'] = df['房屋户型'].str.extract(r'(\d+)室').fillna(0).astype(int)
    df['厅'] = df['房屋户型'].str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['厨'] = df['房屋户型'].str.extract(r'(\d+)厨').fillna(0).astype(int)
    df['卫'] = df['房屋户型'].str.extract(r'(\d+)卫').fillna(0).astype(int)
    df['总房间数'] = df[['室', '厅', '厨', '卫']].sum(axis=1)
    return df

def enrich_features(df):
    df['是否满五'] = df['房屋年限'].apply(lambda x: 1 if '满五' in str(x) else 0)
    df['朝向_含南'] = df['房屋朝向'].apply(lambda x: 1 if '南' in str(x) else 0)
    df['配备电梯'] = df['配备电梯'].map({'有': 1, '无': 0}).fillna(0)
    df['城市编码'] = df['城市'].astype('category').cat.codes
    df['区域编码'] = df['区域'].astype('category').cat.codes
    df['房屋年限等级'] = df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))
    ring_map = {'一环内': 1.0, '一至二环': 1.5, '二环内': 2.0, '二至三环': 2.5,
                '三至四环': 3.5, '四至五环': 4.5, '三环外': 5.0, '五至六环': 5.5,
                '四环外': 6.0, '六环外': 6.5, '内环内': 1.0, '内环至中环': 1.8,
                '中环至外环': 4.2, '内环至外环': 3.0, '外环外': 7.0}
    df['环线数值'] = df['环线'].map(ring_map).fillna(4.0)
    df['环线面积'] = df['环线数值'] * df['有效面积']
    df['是否精装'] = (df['装修情况'].fillna('其他') == '精装').astype(int)
    df['精装面积'] = df['是否精装'] * df['有效面积']
    df['产权_非共有'] = (df['产权所属'].fillna('其他') == '非共有').astype(int)
    df['产权_共有'] = (df['产权所属'].fillna('其他') == '共有').astype(int)
    df['产权_非共有_面积'] = df['产权_非共有'] * df['有效面积']
    df['产权_共有_面积'] = df['产权_共有'] * df['有效面积']
    return df

def cluster_price(df, ref_df=None):
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    if ref_df is None:
        kmeans = KMeans(n_clusters=10, random_state=42)
        df['聚类标签'] = kmeans.fit_predict(df[['lon', 'lat']])
        df['单价'] = df['价格'] / df['有效面积']
        df['聚类均单价'] = df.groupby('聚类标签')['单价'].transform('mean')
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df, kmeans, df.groupby('聚类标签')['单价'].mean()
    else:
        df['聚类标签'] = ref_df[0].predict(df[['lon', 'lat']])
        df['聚类均单价'] = df['聚类标签'].map(ref_df[1])
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df

# ========== 3. 清洗与特征工程 ==========
for df in [train_df, test_df]:
    df = clean_area(df)
    df = extract_floor(df)
    df = extract_layout(df)
    df = enrich_features(df)
test_df['房屋用途均价'] = test_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())
test_df['城市年限交互'] = test_df['城市编码'] * test_df['房屋年限等级']
train_df, kmeans_model, price_map = cluster_price(train_df)
test_df = cluster_price(test_df, ref_df=(kmeans_model, price_map))

for df in [train_df, test_df]:
    df['房间×聚类均单价'] = df['总房间数'] * df['聚类均单价']
    df['电梯×面积'] = df['配备电梯'] * df['有效面积']
    df['精装×面积×均价'] = df['是否精装'] * df['有效面积'] * df['聚类均单价']
    df['非共有×电梯×面积'] = df['产权_非共有'] * df['配备电梯'] * df['有效面积']
    df['房数×电梯×环线'] = df['总房间数'] * df['配备电梯'] * df['环线数值']

# ========== 4. 去除极端值 ==========
q1 = train_df['价格'].quantile(0.01)
q99 = train_df['价格'].quantile(0.99)
train_df = train_df[(train_df['价格'] >= q1) & (train_df['价格'] <= q99)].copy()
def clip_outliers(df, column, lower_quantile=0.01, upper_quantile=0.99):
    lower = df[column].quantile(lower_quantile)
    upper = df[column].quantile(upper_quantile)
    df[column] = df[column].clip(lower, upper)
    return df

for col in ['有效面积', '总楼层', '聚类估价']:
    train_df = clip_outliers(train_df, col)




# ========== 5. 训练模型 ==========

from sklearn.linear_model import Lasso  # 使用Lasso

all_features = [
    '城市编码','区域编码','房屋年限等级','是否满五','朝向_含南','房屋用途均价',
    '室','厅','厨','卫','总楼层','有效面积','配备电梯',
    '环线数值','lon','lat','城市编码','城市年限交互',
    '精装面积', '城市', '聚类均单价', '环线面积',
    '产权_非共有_面积', '产权_共有_面积', '总房间数', '房间×聚类均单价',
    '电梯×面积', '精装×面积×均价', '非共有×电梯×面积', '房数×电梯×环线'
]

X = train_df[all_features].dropna()
面积_series = train_df.loc[X.index, '有效面积']
y = train_df.loc[X.index, '价格'] / 面积_series  # 单位房价作为目标

print(f"✅ 剔除异常值后用于训练的数据数量: {len(X)}")

X_train, X_val, y_train, y_val, area_train, area_val = train_test_split(
    X, y, 面积_series, test_size=0.2, random_state=111
)

# 生成测试集特征
test_X = test_df[all_features].copy()

# 使用极差标准化（MinMaxScaler）
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.fit_transform(X)
test_X_scaled = scaler.transform(test_X)

# 模型训练
model = Lasso(alpha=0.1)
model.fit(X_train_scaled, y_train)

# ========== 6. 评估 ==========
y_val_pred_unit = model.predict(X_val_scaled)
y_val_pred_price = y_val_pred_unit * area_val
y_val_true_price = y_val * area_val

rmse_val = mean_squared_error(y_val_true_price, y_val_pred_price, squared=False)
mae_val = mean_absolute_error(y_val_true_price, y_val_pred_price)
print(f"✅ 验证集 RMSE（还原后）: {rmse_val:,.2f}")
print(f"✅ 验证集 MAE （还原后）: {mae_val:,.2f}")

y_train_pred_unit = model.predict(X_train_scaled)
y_train_pred_price = y_train_pred_unit * area_train
y_train_true_price = y_train * area_train

rmse_train = mean_squared_error(y_train_true_price, y_train_pred_price, squared=False)
mae_train = mean_absolute_error(y_train_true_price, y_train_pred_price)
print(f"✅ 训练集 RMSE（还原后）: {rmse_train:,.2f}")
print(f"✅ 训练集 MAE （还原后）: {mae_train:,.2f}")

# ========== 7. 6-Fold 交叉验证（RMSE & MAE，还原后） ==========
def cross_val_rmse_mae(X, y_unit, area_series, n_splits=6):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_list, mae_list = [], []

    for i, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y_unit.iloc[train_idx], y_unit.iloc[val_idx]
        area_train_cv, area_val_cv = area_series.iloc[train_idx], area_series.iloc[val_idx]

        scaler_cv = MinMaxScaler()
        X_train_scaled = scaler_cv.fit_transform(X_train_cv)
        X_val_scaled = scaler_cv.transform(X_val_cv)

        model_cv = Lasso(alpha=1)
        model_cv.fit(X_train_scaled, y_train_cv)
        y_pred_unit = model_cv.predict(X_val_scaled)

        y_pred_price = y_pred_unit * area_val_cv
        y_true_price = y_val_cv * area_val_cv

        rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
        mae = mean_absolute_error(y_true_price, y_pred_price)

        print(f"Fold {i}: RMSE = {rmse:,.2f}, MAE = {mae:,.2f}")
        rmse_list.append(rmse)
        mae_list.append(mae)

    print(f"\n✅ 6折CV 平均 RMSE（还原价）: {np.mean(rmse_list):,.2f}")
    print(f"✅ 6折CV 平均 MAE （还原价）: {np.mean(mae_list):,.2f}")

cross_val_rmse_mae(X, y, 面积_series)

# ========== 8. 回归方程 ==========
print("✅ 回归方程:")
print("Intercept:", model.intercept_)
for name, coef in zip(X.columns, model.coef_):
    print(f"{name}: {coef:.4f}")

# ========== 9. 预测测试集 ==========
test_unit_price_pred = model.predict(test_X_scaled)
test_area = test_df['有效面积']
test_price_pred = test_unit_price_pred * test_area

# ========== 10. 生成提交文件 ==========
submission = test_df[['ID']].copy()
submission['Price'] = np.round(test_price_pred).astype(int)
submission.to_csv('submission.csv', index=False)
print("✅ 预测结果已保存为 submission.csv")

✅ 剔除异常值后用于训练的数据数量: 82459


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


✅ 验证集 RMSE（还原后）: 990,639.54
✅ 验证集 MAE （还原后）: 581,383.28
✅ 训练集 RMSE（还原后）: 994,865.18
✅ 训练集 MAE （还原后）: 589,683.78


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Fold 1: RMSE = 1,001,941.78, MAE = 590,340.06


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Fold 2: RMSE = 1,019,331.93, MAE = 592,284.02
Fold 3: RMSE = 994,168.83, MAE = 582,061.15
Fold 4: RMSE = 1,027,850.71, MAE = 594,914.60


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Fold 5: RMSE = 991,092.79, MAE = 588,495.74


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


Fold 6: RMSE = 1,015,009.22, MAE = 601,801.70

✅ 6折CV 平均 RMSE（还原价）: 1,008,232.54
✅ 6折CV 平均 MAE （还原价）: 591,649.54
✅ 回归方程:
Intercept: -2586.77929633213
城市编码: 199.6758
区域编码: 5721.7846
房屋年限等级: 4489.0617
是否满五: 101.2579
朝向_含南: 56.5939
房屋用途均价: 14643.7825
室: -4102.4708
厅: -7251.9568
厨: 22093.5688
卫: 12522.8733
总楼层: 1196.1769
有效面积: -17320.7768
配备电梯: -1592.0934
环线数值: -22169.7842
lon: 2582.1745
lat: 2452.4973
城市编码: 0.0000
城市年限交互: -9561.2792
精装面积: 4396.6165
城市: 4059.1120
聚类均单价: 60579.3215
环线面积: -192227.4407
产权_非共有_面积: 636513.0680
产权_共有_面积: 44801.5532
总房间数: 34510.0876
房间×聚类均单价: -22551.4101
电梯×面积: -350044.6444
精装×面积×均价: 10386.3671
非共有×电梯×面积: -63821.6495
房数×电梯×环线: 35135.6556
✅ 预测结果已保存为 submission.csv


## 3. RIDGE(最佳）

In [13]:
#ridge

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# ========== 1. 读取数据 ==========
train_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
test_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')
original_test = test_df[['ID']].copy()

# ========== 2. 特征工程函数 ==========
def clean_area(df):
    for col in ['建筑面积', '套内面积']:
        df[col] = df[col].astype(str).str.replace('㎡', '', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['有效面积'] = df['套内面积'].fillna(df['建筑面积'])
    df['有效面积'] = df['有效面积'].fillna(df['有效面积'].median())
    return df
# 重新计算房屋用途均价映射（确保已生成）
用途均价 = train_df.groupby('房屋用途')['价格'].mean()
train_df['房屋用途均价'] = train_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())

# 确保城市编码与房屋年限等级字段已存在
if '城市编码' not in train_df.columns:
    train_df['城市编码'] = train_df['城市'].astype('category').cat.codes
if '房屋年限等级' not in train_df.columns:
    train_df['房屋年限等级'] = train_df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))

# 创建城市年限交互项
train_df['城市年限交互'] = train_df['城市编码'] * train_df['房屋年限等级']
def extract_floor(df):
    df['当前楼层_raw'] = df['所在楼层'].str.extract(r'第?(\d+)[层楼]')[0].astype(float)
    df['总楼层'] = df['所在楼层'].str.extract(r'共?(\d+)[层楼]')[0].astype(float)
    df.loc[df['当前楼层_raw'].isna(), '当前楼层_raw'] = df['所在楼层'].str.extract(r'(底|低|中|高|顶)')[0].map({
        '底': 1, '低': 2, '中': 3, '高': 4, '顶': 5
    })
    df['当前楼层_raw'] = df['当前楼层_raw'].fillna(1)
    df['总楼层'] = df['总楼层'].fillna(df['总楼层'].median())
    df['当前楼层'] = (df['当前楼层_raw'] / df['总楼层']).clip(0, 1)
    df.drop(columns=['当前楼层_raw'], inplace=True)
    return df

def extract_layout(df):
    df['室'] = df['房屋户型'].str.extract(r'(\d+)室').fillna(0).astype(int)
    df['厅'] = df['房屋户型'].str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['厨'] = df['房屋户型'].str.extract(r'(\d+)厨').fillna(0).astype(int)
    df['卫'] = df['房屋户型'].str.extract(r'(\d+)卫').fillna(0).astype(int)
    df['总房间数'] = df[['室', '厅', '厨', '卫']].sum(axis=1)
    return df

def enrich_features(df):
    df['是否满五'] = df['房屋年限'].apply(lambda x: 1 if '满五' in str(x) else 0)
    df['朝向_含南'] = df['房屋朝向'].apply(lambda x: 1 if '南' in str(x) else 0)
    df['配备电梯'] = df['配备电梯'].map({'有': 1, '无': 0}).fillna(0)
    df['城市编码'] = df['城市'].astype('category').cat.codes
    df['区域编码'] = df['区域'].astype('category').cat.codes
    df['房屋年限等级'] = df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))
    ring_map = {'一环内': 1.0, '一至二环': 1.5, '二环内': 2.0, '二至三环': 2.5,
                '三至四环': 3.5, '四至五环': 4.5, '三环外': 5.0, '五至六环': 5.5,
                '四环外': 6.0, '六环外': 6.5, '内环内': 1.0, '内环至中环': 1.8,
                '中环至外环': 4.2, '内环至外环': 3.0, '外环外': 7.0}
    df['环线数值'] = df['环线'].map(ring_map).fillna(4.0)
    df['环线面积'] = df['环线数值'] * df['有效面积']
    df['是否精装'] = (df['装修情况'].fillna('其他') == '精装').astype(int)
    df['精装面积'] = df['是否精装'] * df['有效面积']
    df['产权_非共有'] = (df['产权所属'].fillna('其他') == '非共有').astype(int)
    df['产权_共有'] = (df['产权所属'].fillna('其他') == '共有').astype(int)
    df['产权_非共有_面积'] = df['产权_非共有'] * df['有效面积']
    df['产权_共有_面积'] = df['产权_共有'] * df['有效面积']
    return df

def cluster_price(df, ref_df=None):
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    if ref_df is None:
        kmeans = KMeans(n_clusters=10, random_state=42)
        df['聚类标签'] = kmeans.fit_predict(df[['lon', 'lat']])
        df['单价'] = df['价格'] / df['有效面积']
        df['聚类均单价'] = df.groupby('聚类标签')['单价'].transform('mean')
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df, kmeans, df.groupby('聚类标签')['单价'].mean()
    else:
        df['聚类标签'] = ref_df[0].predict(df[['lon', 'lat']])
        df['聚类均单价'] = df['聚类标签'].map(ref_df[1])
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df

# ========== 3. 清洗与特征工程 ==========
for df in [train_df, test_df]:
    df = clean_area(df)
    df = extract_floor(df)
    df = extract_layout(df)
    df = enrich_features(df)
test_df['房屋用途均价'] = test_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())
test_df['城市年限交互'] = test_df['城市编码'] * test_df['房屋年限等级']
train_df, kmeans_model, price_map = cluster_price(train_df)
test_df = cluster_price(test_df, ref_df=(kmeans_model, price_map))

for df in [train_df, test_df]:
    df['房间×聚类均单价'] = df['总房间数'] * df['聚类均单价']
    df['电梯×面积'] = df['配备电梯'] * df['有效面积']
    df['精装×面积×均价'] = df['是否精装'] * df['有效面积'] * df['聚类均单价']
    df['非共有×电梯×面积'] = df['产权_非共有'] * df['配备电梯'] * df['有效面积']
    df['房数×电梯×环线'] = df['总房间数'] * df['配备电梯'] * df['环线数值']

# ========== 4. 去除极端值 ==========
q1 = train_df['价格'].quantile(0.01)
q99 = train_df['价格'].quantile(0.99)
train_df = train_df[(train_df['价格'] >= q1) & (train_df['价格'] <= q99)].copy()
def clip_outliers(df, column, lower_quantile=0.01, upper_quantile=0.99):
    lower = df[column].quantile(lower_quantile)
    upper = df[column].quantile(upper_quantile)
    df[column] = df[column].clip(lower, upper)
    return df

for col in ['有效面积', '总楼层', '聚类估价']:
    train_df = clip_outliers(train_df, col)


from sklearn.linear_model import Ridge  # 使用 Ridge 回归

# ========== 5. 训练模型 ==========
all_features = [
    '城市编码','区域编码','房屋年限等级','是否满五','朝向_含南','房屋用途均价',
    '室','厅','厨','卫','当前楼层','总楼层','配备电梯',
    '环线数值','lon','lat','城市编码','有效面积','城市年限交互',
    '精装面积', '城市', '聚类均单价', '环线面积',
    '产权_非共有_面积', '产权_共有_面积', '总房间数', '房间×聚类均单价',
    '电梯×面积', '精装×面积×均价', '非共有×电梯×面积', '房数×电梯×环线'
]

X = train_df[all_features].dropna()
面积_series = train_df.loc[X.index, '有效面积']
y = train_df.loc[X.index, '价格'] / 面积_series

print(f"✅ 剔除异常值后用于训练的数据数量: {len(X)}")

X_train, X_val, y_train, y_val, area_train, area_val = train_test_split(
    X, y, 面积_series, test_size=0.2, random_state=42
)

test_X = test_df[all_features].copy()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.fit_transform(X)
test_X_scaled = scaler.transform(test_X)

# 模型训练（Ridge）
model = Ridge(alpha=0.1)
model.fit(X_train_scaled, y_train)

# ========== 6. 评估 ==========
y_val_pred_unit = model.predict(X_val_scaled)
y_val_pred_price = y_val_pred_unit * area_val
y_val_true_price = y_val * area_val

rmse_val = mean_squared_error(y_val_true_price, y_val_pred_price, squared=False)
mae_val = mean_absolute_error(y_val_true_price, y_val_pred_price)
print(f"✅ 验证集 RMSE（还原后）: {rmse_val:,.2f}")
print(f"✅ 验证集 MAE （还原后）: {mae_val:,.2f}")

y_train_pred_unit = model.predict(X_train_scaled)
y_train_pred_price = y_train_pred_unit * area_train
y_train_true_price = y_train * area_train

rmse_train = mean_squared_error(y_train_true_price, y_train_pred_price, squared=False)
mae_train = mean_absolute_error(y_train_true_price, y_train_pred_price)
print(f"✅ 训练集 RMSE（还原后）: {rmse_train:,.2f}")
print(f"✅ 训练集 MAE （还原后）: {mae_train:,.2f}")

# ========== 7. 6-Fold 交叉验证（RMSE & MAE，还原后） ==========
def cross_val_rmse_mae(X, y_unit, area_series, n_splits=6):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_list, mae_list = [], []

    for i, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y_unit.iloc[train_idx], y_unit.iloc[val_idx]
        area_train_cv, area_val_cv = area_series.iloc[train_idx], area_series.iloc[val_idx]

        scaler_cv = MinMaxScaler()
        X_train_scaled = scaler_cv.fit_transform(X_train_cv)
        X_val_scaled = scaler_cv.transform(X_val_cv)

        model_cv = Ridge(alpha=0.1)
        model_cv.fit(X_train_scaled, y_train_cv)
        y_pred_unit = model_cv.predict(X_val_scaled)

        y_pred_price = y_pred_unit * area_val_cv
        y_true_price = y_val_cv * area_val_cv

        rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
        mae = mean_absolute_error(y_true_price, y_pred_price)

        print(f"Fold {i}: RMSE = {rmse:,.2f}, MAE = {mae:,.2f}")
        rmse_list.append(rmse)
        mae_list.append(mae)

    print(f"\n✅ 6折CV 平均 RMSE（还原价）: {np.mean(rmse_list):,.2f}")
    print(f"✅ 6折CV 平均 MAE （还原价）: {np.mean(mae_list):,.2f}")

cross_val_rmse_mae(X, y, 面积_series)

# ========== 8. 回归方程 ==========
print("✅ 回归方程:")
print("Intercept:", model.intercept_)
for name, coef in zip(X.columns, model.coef_):
    print(f"{name}: {coef:.4f}")

# ========== 9. 预测测试集 ==========
test_unit_price_pred = model.predict(test_X_scaled)
test_area = test_df['有效面积']
test_price_pred = test_unit_price_pred * test_area

# ========== 10. 生成提交文件 ==========
submission = test_df[['ID']].copy()
submission['Price'] = np.round(test_price_pred).astype(int)
submission.to_csv('submission.csv', index=False)
print("✅ 预测结果已保存为 submission.csv")

✅ 剔除异常值后用于训练的数据数量: 82386
✅ 验证集 RMSE（还原后）: 977,203.08
✅ 验证集 MAE （还原后）: 592,062.87
✅ 训练集 RMSE（还原后）: 1,000,861.78
✅ 训练集 MAE （还原后）: 602,237.81
Fold 1: RMSE = 972,690.76, MAE = 588,763.84
Fold 2: RMSE = 1,007,689.72, MAE = 602,810.92
Fold 3: RMSE = 1,004,834.55, MAE = 600,401.14
Fold 4: RMSE = 967,938.53, MAE = 589,722.97
Fold 5: RMSE = 1,005,951.88, MAE = 605,310.61
Fold 6: RMSE = 1,027,290.66, MAE = 609,677.09

✅ 6折CV 平均 RMSE（还原价）: 997,732.68
✅ 6折CV 平均 MAE （还原价）: 599,447.76
✅ 回归方程:
Intercept: -541.9388602440522
城市编码: 1252.1358
区域编码: 5620.7528
房屋年限等级: 3428.5726
是否满五: 250.0679
朝向_含南: -44.8971
房屋用途均价: 21778.9007
室: 4296.1051
厅: -1812.3877
厨: 29363.6455
卫: 17928.3534
当前楼层: 0.0000
总楼层: 1295.7406
配备电梯: -3009.9434
环线数值: -30592.7743
lon: 2219.2049
lat: 3511.3794
城市编码: 1252.1358
有效面积: -14931.7787
城市年限交互: -7743.8468
精装面积: 3780.3611
城市: 1252.1358
聚类均单价: 61628.7780
环线面积: 82751.3605
产权_非共有_面积: 267046.4623
产权_共有_面积: 81908.7827
总房间数: 14902.8773
房间×聚类均单价: -26460.5921
电梯×面积: -229315.2515
精装×面积×均价: 10317.7

## 4.Elastic Net

In [14]:
#elastic net

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# ========== 1. 读取数据 ==========
train_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_train.csv')
test_df = pd.read_csv('/home/mw/input/quant4533/ruc_Class25Q1_test.csv')
original_test = test_df[['ID']].copy()

# ========== 2. 特征工程函数 ==========
def clean_area(df):
    for col in ['建筑面积', '套内面积']:
        df[col] = df[col].astype(str).str.replace('㎡', '', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['有效面积'] = df['套内面积'].fillna(df['建筑面积'])
    df['有效面积'] = df['有效面积'].fillna(df['有效面积'].median())
    return df
# 重新计算房屋用途均价映射（确保已生成）
用途均价 = train_df.groupby('房屋用途')['价格'].mean()
train_df['房屋用途均价'] = train_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())

# 确保城市编码与房屋年限等级字段已存在
if '城市编码' not in train_df.columns:
    train_df['城市编码'] = train_df['城市'].astype('category').cat.codes
if '房屋年限等级' not in train_df.columns:
    train_df['房屋年限等级'] = train_df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))

# 创建城市年限交互项
train_df['城市年限交互'] = train_df['城市编码'] * train_df['房屋年限等级']
def extract_floor(df):
    df['当前楼层_raw'] = df['所在楼层'].str.extract(r'第?(\d+)[层楼]')[0].astype(float)
    df['总楼层'] = df['所在楼层'].str.extract(r'共?(\d+)[层楼]')[0].astype(float)
    df.loc[df['当前楼层_raw'].isna(), '当前楼层_raw'] = df['所在楼层'].str.extract(r'(底|低|中|高|顶)')[0].map({
        '底': 1, '低': 2, '中': 3, '高': 4, '顶': 5
    })
    df['当前楼层_raw'] = df['当前楼层_raw'].fillna(1)
    df['总楼层'] = df['总楼层'].fillna(df['总楼层'].median())
    df['当前楼层'] = (df['当前楼层_raw'] / df['总楼层']).clip(0, 1)
    df.drop(columns=['当前楼层_raw'], inplace=True)
    return df

def extract_layout(df):
    df['室'] = df['房屋户型'].str.extract(r'(\d+)室').fillna(0).astype(int)
    df['厅'] = df['房屋户型'].str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['厨'] = df['房屋户型'].str.extract(r'(\d+)厨').fillna(0).astype(int)
    df['卫'] = df['房屋户型'].str.extract(r'(\d+)卫').fillna(0).astype(int)
    df['总房间数'] = df[['室', '厅', '厨', '卫']].sum(axis=1)
    return df

def enrich_features(df):
    df['是否满五'] = df['房屋年限'].apply(lambda x: 1 if '满五' in str(x) else 0)
    df['朝向_含南'] = df['房屋朝向'].apply(lambda x: 1 if '南' in str(x) else 0)
    df['配备电梯'] = df['配备电梯'].map({'有': 1, '无': 0}).fillna(0)
    df['城市编码'] = df['城市'].astype('category').cat.codes
    df['区域编码'] = df['区域'].astype('category').cat.codes
    df['房屋年限等级'] = df['房屋年限'].map(lambda x: 3 if '满五' in str(x) else (2 if '满两' in str(x) else 1))
    ring_map = {'一环内': 1.0, '一至二环': 1.5, '二环内': 2.0, '二至三环': 2.5,
                '三至四环': 3.5, '四至五环': 4.5, '三环外': 5.0, '五至六环': 5.5,
                '四环外': 6.0, '六环外': 6.5, '内环内': 1.0, '内环至中环': 1.8,
                '中环至外环': 4.2, '内环至外环': 3.0, '外环外': 7.0}
    df['环线数值'] = df['环线'].map(ring_map).fillna(4.0)
    df['环线面积'] = df['环线数值'] * df['有效面积']
    df['是否精装'] = (df['装修情况'].fillna('其他') == '精装').astype(int)
    df['精装面积'] = df['是否精装'] * df['有效面积']
    df['产权_非共有'] = (df['产权所属'].fillna('其他') == '非共有').astype(int)
    df['产权_共有'] = (df['产权所属'].fillna('其他') == '共有').astype(int)
    df['产权_非共有_面积'] = df['产权_非共有'] * df['有效面积']
    df['产权_共有_面积'] = df['产权_共有'] * df['有效面积']
    return df

def cluster_price(df, ref_df=None):
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce') + np.random.normal(0, 0.01, size=len(df))
    if ref_df is None:
        kmeans = KMeans(n_clusters=10, random_state=42)
        df['聚类标签'] = kmeans.fit_predict(df[['lon', 'lat']])
        df['单价'] = df['价格'] / df['有效面积']
        df['聚类均单价'] = df.groupby('聚类标签')['单价'].transform('mean')
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df, kmeans, df.groupby('聚类标签')['单价'].mean()
    else:
        df['聚类标签'] = ref_df[0].predict(df[['lon', 'lat']])
        df['聚类均单价'] = df['聚类标签'].map(ref_df[1])
        df['聚类估价'] = df['聚类均单价'] * df['有效面积']
        return df

# ========== 3. 清洗与特征工程 ==========
for df in [train_df, test_df]:
    df = clean_area(df)
    df = extract_floor(df)
    df = extract_layout(df)
    df = enrich_features(df)
test_df['房屋用途均价'] = test_df['房屋用途'].map(用途均价).fillna(train_df['价格'].mean())
test_df['城市年限交互'] = test_df['城市编码'] * test_df['房屋年限等级']
train_df, kmeans_model, price_map = cluster_price(train_df)
test_df = cluster_price(test_df, ref_df=(kmeans_model, price_map))

for df in [train_df, test_df]:
    df['房间×聚类均单价'] = df['总房间数'] * df['聚类均单价']
    df['电梯×面积'] = df['配备电梯'] * df['有效面积']
    df['精装×面积×均价'] = df['是否精装'] * df['有效面积'] * df['聚类均单价']
    df['非共有×电梯×面积'] = df['产权_非共有'] * df['配备电梯'] * df['有效面积']
    df['房数×电梯×环线'] = df['总房间数'] * df['配备电梯'] * df['环线数值']

# ========== 4. 去除极端值 ==========
q1 = train_df['价格'].quantile(0.01)
q99 = train_df['价格'].quantile(0.99)
train_df = train_df[(train_df['价格'] >= q1) & (train_df['价格'] <= q99)].copy()
def clip_outliers(df, column, lower_quantile=0.01, upper_quantile=0.99):
    lower = df[column].quantile(lower_quantile)
    upper = df[column].quantile(upper_quantile)
    df[column] = df[column].clip(lower, upper)
    return df

for col in ['有效面积', '总楼层', '聚类估价']:
    train_df = clip_outliers(train_df, col)


from sklearn.linear_model import ElasticNet  # 使用 ElasticNet 回归

# ========== 5. 训练模型 ==========
all_features = [
    '城市编码','区域编码','房屋年限等级','是否满五','朝向_含南','房屋用途均价',
    '室','厅','厨','卫','总楼层','有效面积','配备电梯',
    '环线数值','lon','lat','城市编码','城市年限交互',
    '精装面积', '城市', '聚类均单价', '环线面积',
    '产权_非共有_面积', '产权_共有_面积', '总房间数', '房间×聚类均单价',
    '电梯×面积', '精装×面积×均价', '非共有×电梯×面积', '房数×电梯×环线'
]

X = train_df[all_features].dropna()
面积_series = train_df.loc[X.index, '有效面积']
y = train_df.loc[X.index, '价格'] / 面积_series

print(f"✅ 剔除异常值后用于训练的数据数量: {len(X)}")

X_train, X_val, y_train, y_val, area_train, area_val = train_test_split(
    X, y, 面积_series, test_size=0.2, random_state=42
)

test_X = test_df[all_features].copy()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.fit_transform(X)
test_X_scaled = scaler.transform(test_X)

# 模型训练（ElasticNet）
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X_train_scaled, y_train)

# ========== 6. 评估 ==========
y_val_pred_unit = model.predict(X_val_scaled)
y_val_pred_price = y_val_pred_unit * area_val
y_val_true_price = y_val * area_val

rmse_val = mean_squared_error(y_val_true_price, y_val_pred_price, squared=False)
mae_val = mean_absolute_error(y_val_true_price, y_val_pred_price)
print(f"✅ 验证集 RMSE（还原后）: {rmse_val:,.2f}")
print(f"✅ 验证集 MAE （还原后）: {mae_val:,.2f}")

y_train_pred_unit = model.predict(X_train_scaled)
y_train_pred_price = y_train_pred_unit * area_train
y_train_true_price = y_train * area_train

rmse_train = mean_squared_error(y_train_true_price, y_train_pred_price, squared=False)
mae_train = mean_absolute_error(y_train_true_price, y_train_pred_price)
print(f"✅ 训练集 RMSE（还原后）: {rmse_train:,.2f}")
print(f"✅ 训练集 MAE （还原后）: {mae_train:,.2f}")

# ========== 7. 6-Fold 交叉验证（RMSE & MAE，还原后） ==========
def cross_val_rmse_mae(X, y_unit, area_series, n_splits=6):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_list, mae_list = [], []

    for i, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y_unit.iloc[train_idx], y_unit.iloc[val_idx]
        area_train_cv, area_val_cv = area_series.iloc[train_idx], area_series.iloc[val_idx]

        scaler_cv = MinMaxScaler()
        X_train_scaled = scaler_cv.fit_transform(X_train_cv)
        X_val_scaled = scaler_cv.transform(X_val_cv)

        model_cv = ElasticNet(alpha=0.1, l1_ratio=0.5)
        model_cv.fit(X_train_scaled, y_train_cv)
        y_pred_unit = model_cv.predict(X_val_scaled)

        y_pred_price = y_pred_unit * area_val_cv
        y_true_price = y_val_cv * area_val_cv

        rmse = mean_squared_error(y_true_price, y_pred_price, squared=False)
        mae = mean_absolute_error(y_true_price, y_pred_price)

        print(f"Fold {i}: RMSE = {rmse:,.2f}, MAE = {mae:,.2f}")
        rmse_list.append(rmse)
        mae_list.append(mae)

    print(f"\n✅ 6折CV 平均 RMSE（还原价）: {np.mean(rmse_list):,.2f}")
    print(f"✅ 6折CV 平均 MAE （还原价）: {np.mean(mae_list):,.2f}")

cross_val_rmse_mae(X, y, 面积_series)

# ========== 8. 回归方程 ==========
print("✅ 回归方程:")
print("Intercept:", model.intercept_)
for name, coef in zip(X.columns, model.coef_):
    print(f"{name}: {coef:.4f}")

# ========== 9. 预测测试集 ==========
test_unit_price_pred = model.predict(test_X_scaled)
test_area = test_df['有效面积']
test_price_pred = test_unit_price_pred * test_area

# ========== 10. 生成提交文件 ==========
submission = test_df[['ID']].copy()
submission['Price'] = np.round(test_price_pred).astype(int)
submission.to_csv('submission.csv', index=False)
print("✅ 预测结果已保存为 submission.csv")

✅ 剔除异常值后用于训练的数据数量: 82459
✅ 验证集 RMSE（还原后）: 1,132,946.25
✅ 验证集 MAE （还原后）: 705,847.13
✅ 训练集 RMSE（还原后）: 1,149,910.34
✅ 训练集 MAE （还原后）: 707,383.14
Fold 1: RMSE = 1,132,910.09, MAE = 704,887.29
Fold 2: RMSE = 1,146,687.56, MAE = 705,034.11
Fold 3: RMSE = 1,132,568.66, MAE = 697,153.31
Fold 4: RMSE = 1,150,048.60, MAE = 708,729.46
Fold 5: RMSE = 1,166,220.53, MAE = 710,716.30
Fold 6: RMSE = 1,154,253.90, MAE = 718,450.50

✅ 6折CV 平均 RMSE（还原价）: 1,147,114.89
✅ 6折CV 平均 MAE （还原价）: 707,495.16
✅ 回归方程:
Intercept: 14225.398166488372
城市编码: -2972.0050
区域编码: 2571.7639
房屋年限等级: 4712.7083
是否满五: 2415.9786
朝向_含南: 69.7005
房屋用途均价: 744.2884
室: -121.4877
厅: -543.2436
厨: 484.9370
卫: 297.6376
总楼层: -404.1836
有效面积: -1567.2995
配备电梯: 1174.2675
环线数值: -7197.0587
lon: 3521.8675
lat: 2725.8243
城市编码: -2971.9860
城市年限交互: -3104.3478
精装面积: 1305.4177
城市: -2971.9944
聚类均单价: 26249.6510
环线面积: -109.9649
产权_非共有_面积: -27.7646
产权_共有_面积: 18.5970
总房间数: 45.1782
房间×聚类均单价: 6917.5880
电梯×面积: 13.7250
精装×面积×均价: 2748.1640
非共有×电梯×面积: 1.7952
房数×电梯×环线