## 欢迎进入 Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [1]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

lasso_model.pkl  price_segment_models.pkl  selected_features.pkl
prediction.csv	 segmented_prediction.csv  word2vec_model.model


In [2]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

quant4533


In [3]:
import subprocess
import sys

# 检测模块是否安装
def install_and_import(package):
    try:
        __import__(package)
        print(f"{package} 已经安装")
    except ImportError:
        print(f"{package} 未安装，正在安装...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"{package} 安装成功！")
        __import__(package)

# 模块名称列表，名称使用pip中的标准模块名称
modules = [
    "numpy", "pandas", "random", "re", "time", "warnings", "pickle",
    "sklearn", "joblib", "numba", "gensim", "matplotlib", "seaborn",
    "geopy", "openpyxl", "tensorflow","tqdm"
]

for module in modules:
    install_and_import(module)

numpy 已经安装
pandas 已经安装
random 已经安装
re 已经安装
time 已经安装
pickle 已经安装
sklearn 已经安装
joblib 已经安装
numba 已经安装


  "class": algorithms.Blowfish,


gensim 已经安装
matplotlib 已经安装
seaborn 已经安装
geopy 已经安装
openpyxl 已经安装
tensorflow 已经安装
tqdm 已经安装


In [4]:
# 基础模块
import numpy as np
import pandas as pd
import random
import re
import time
import warnings
import pickle

# 数据处理与预处理模块
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict, KFold
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel, RFE, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 机器学习模型模块
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,r2_score
from sklearn.neighbors import BallTree

# 并行与加速模块
from joblib import Parallel, delayed
from sklearn.utils import parallel_backend
from numba import njit

# 自然语言处理模块
from gensim.models import Word2Vec

# 数据可视化模块
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm

# 地理计算模块
from geopy.distance import geodesic

# Excel 处理模块
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

# 深度学习模块
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

# 其他模块
import joblib
from tqdm import tqdm
from scipy import stats


GPU Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


In [5]:
# 读取数据
X_train_cleaned = pd.read_csv('/home/mw/temp/X_train_cleaned.csv')
X_test_cleaned = pd.read_csv('/home/mw/temp/X_test_cleaned.csv')
X_predict_cleaned = pd.read_csv('/home/mw/temp/X_predict_cleaned.csv')
y_train = pd.read_csv('/home/mw/temp/y_train.csv')
y_test = pd.read_csv('/home/mw/temp/y_test.csv')

In [6]:
print(type(X_train_cleaned))

<class 'pandas.core.frame.DataFrame'>


In [7]:
X_train_cleaned=X_train_cleaned.astype('float64')
X_test_cleaned=X_test_cleaned.astype('float64')
X_predict_cleaned=X_predict_cleaned.astype('float64')
y_train=y_train.astype('float64')
y_test=y_test.astype('float64')

In [8]:
y_train_original = y_train['价格'].values.flatten()
y_test_original = y_test['价格'].values.flatten()

In [9]:
print(X_train_cleaned.head())
print(y_train.head())

     东    南    西    北   东南   东北   西南   西北   地铁  房本满两年  ...  poly_建筑面积 套内面积  \
0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  ...     5363.054840   
1  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0    0.0  ...      819.462474   
2  1.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0    0.0  ...    11612.658800   
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1.0  ...     1278.320504   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    0.0  ...      776.307148   

   poly_建筑面积 室  poly_建筑面积 厅  poly_套内面积 室  poly_套内面积 厅  poly_室 厅         租金估值  \
0       162.96        81.48   131.641012    65.820506       2.0  2956.818667   
1        31.85        31.85    25.728806    25.728806       1.0  2206.234333   
2       362.82       241.88   288.060000   192.040000       6.0  8915.277795   
3        39.78        39.78    32.134754    32.134754       1.0  1389.240000   
4        31.00        31.00    25.042166    25.042166       1.0  1662.815018   

       房间密度      相对楼层       得房率  
0  0.024546  0.5

In [10]:
# 1. 首先看一下原始分布
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.hist(y_train, bins=50)
plt.title('原始目标变量分布')
plt.xlabel('价格')
plt.ylabel('频率')

Text(0, 0.5, '频率')

In [11]:
# 确保y值是一维数组
y_train_values = y_train['价格'].values.flatten()
y_test_values = y_test['价格'].values.flatten()

# 对数变换
y_train_transformed = np.log1p(y_train_values)

# 绘制变换后的分布
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.hist(y_train_transformed, bins=50)
plt.title(f'对数变换后分布')
plt.tight_layout()
plt.show()

In [12]:
# 对测试集应用相同的变换
y_test_transformed = np.log1p(y_test_values)

# 特征筛选

In [13]:
# 标准化后保留DataFrame结构和列名
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_cleaned),
    columns=X_train_cleaned.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_cleaned),
    columns=X_test_cleaned.columns
)

In [14]:
# 步骤1: 方差过滤 - 移除低方差特征
variance_threshold = 0.008  
var_selector = VarianceThreshold(threshold=variance_threshold)
X_train_var = var_selector.fit_transform(X_train_cleaned)
var_features = X_train_cleaned.columns[var_selector.get_support()]

print(f"方差筛选后的特征数量: {len(var_features)}")
print(f"移除了 {X_train_cleaned.shape[1] - len(var_features)} 个低方差特征")

# 更新数据集，只保留通过方差筛选的特征
X_train_filtered = X_train_cleaned[var_features]


方差筛选后的特征数量: 148
移除了 1106 个低方差特征


In [15]:
print(var_features)

Index(['东', '南', '西', '北', '东南', '东北', '西南', '西北', '地铁', '房本满两年',
       ...
       'poly_室', 'poly_厅', 'poly_建筑面积 套内面积', 'poly_建筑面积 室', 'poly_建筑面积 厅',
       'poly_套内面积 室', 'poly_套内面积 厅', 'poly_室 厅', '租金估值', '相对楼层'],
      dtype='object', length=148)


In [16]:
# 步骤2: 相关性筛选 - 移除高度相关的特征
# 计算相关矩阵
correlation_matrix = X_train_filtered.corr().abs()

# 创建掩码以获取相关矩阵的上三角部分（不包括对角线）
upper_triangle = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
high_correlation = correlation_matrix.where(upper_triangle)

# 设置相关性阈值并找出高度相关的特征
correlation_threshold = 0.70  
features_to_drop = []

# 遍历上三角矩阵找出高度相关的特征对
for col in high_correlation.columns:
    # 获取与当前特征高度相关的特征
    correlated_features = high_correlation[col][high_correlation[col] > correlation_threshold].index.tolist()
    if correlated_features and col not in features_to_drop:
        # 在每对高度相关的特征中，我们处理逻辑
        for feature in correlated_features:
            if feature not in features_to_drop:
                # 如果其中一个是租金估值，则总是排除租金估值
                if col == '租金估值':
                    features_to_drop.append(col)
                    break  # 找到租金估值就直接跳出内循环
                elif feature == '租金估值':
                    features_to_drop.append(feature)
                    continue  # 处理下一个feature
                
                # 否则按原逻辑执行：保留与目标变量相关性更强的那个
                corr_with_target_col = abs(pd.Series(X_train_filtered[col]).corr(pd.Series(y_train_transformed)))
                corr_with_target_feature = abs(pd.Series(X_train_filtered[feature]).corr(pd.Series(y_train_transformed)))
                
                if corr_with_target_col < corr_with_target_feature:
                    features_to_drop.append(col)
                else:
                    features_to_drop.append(feature)

# 确保租金估值在移除列表中
if '租金估值' in X_train_filtered.columns and '租金估值' not in features_to_drop:
    features_to_drop.append('租金估值')

# 去除重复项
features_to_drop = list(set(features_to_drop))
print(f"相关性筛选将移除 {len(features_to_drop)} 个高度相关的特征")
print(f"租金估值是否在移除列表中: {'租金估值' in features_to_drop}")

# 移除高度相关的特征
remaining_features = [f for f in X_train_filtered.columns if f not in features_to_drop]
X_train_filtered_final = X_train_filtered[remaining_features]

print(f"初步筛选后的最终特征数量: {X_train_filtered_final.shape[1]}")

# 保存筛选后的特征集
X_test_filtered = X_test_cleaned[remaining_features]
X_predict_filtered = X_predict_cleaned[remaining_features]

# 显示剩余特征
print(remaining_features)

相关性筛选将移除 31 个高度相关的特征
租金估值是否在移除列表中: True
初步筛选后的最终特征数量: 117
['东', '南', '西', '北', '东南', '东北', '西南', '西北', '地铁', '房本满两年', '房本满五年', '装修', '房屋年限', '环线_一至二环', '环线_三环外', '环线_三至四环', '环线_中环至外环', '环线_二至三环', '环线_五至六环', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '建筑结构_未知结构', '建筑结构_框架结构', '建筑结构_混合结构', '建筑结构_砖混结构', '建筑结构_钢混结构', '建筑结构_钢结构', '装修情况_毛坯', '装修情况_简装', '装修情况_精装', '交易权属_商品房', '交易权属_已购公房', '交易权属_拆迁还建房', '房屋用途_别墅', '房屋用途_商业办公类', '房屋用途_商住两用', '房屋用途_普通住宅', '产权所属_非共有', '年份', '厨', '卫', '楼层数值', '总层数', '梯户比例', '楼龄', '房屋总数数值', '楼栋总数数值', '绿化率数值', '容积率数值', '停车位数值', '停车费用数值', '每平米月租金中位数', '交易年份', '交易月份_sin', '交易季度_cos', '上次交易年份', '上次交易月份_cos', '上次交易季度_sin', '城市_1', '城市_2', '城市_3', '城市_4', '城市_5', '城市_6', '区域_5.0', '区域_7.0', '区域_11.0', '区域_12.0', '区域_13.0', '区域_20.0', '区域_22.0', '区域_30.0', '区域_40.0', '区域_43.0', '区域_45.0', '区域_46.0', '区域_53.0', '区域_59.0', '区域_60.0', '区域_62.0', '区域_64.0', '区域_65.0', '区域_66.0', '区域_71.0', '区域_74.0', '区域_80.0', '区域_81.0', '区域_82.0', '区域_87.0', '区域_93.0', '区域_97.0', '区域_98.0

In [17]:
# 将筛选后的数据保存到临时目录
X_train_filtered_final.to_csv('/home/mw/temp/X_train_filtered.csv', index=False)
X_test_filtered.to_csv('/home/mw/temp/X_test_filtered.csv', index=False)
X_predict_filtered.to_csv('/home/mw/temp/X_predict_filtered.csv', index=False)

In [18]:
# 加载初步筛选后的数据
X_train = pd.read_csv('/home/mw/temp/X_train_filtered.csv')
y_train = y_train_transformed

In [19]:
# 方法1: 使用随机森林模型计算特征重要性
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# 获取特征重要性
rf_importances = rf_model.feature_importances_
rf_indices = np.argsort(rf_importances)[::-1]

# 可视化随机森林的特征重要性（前20个特征）
plt.figure(figsize=(12, 8))
plt.title('随机森林 - 特征重要性')
plt.bar(range(20), rf_importances[rf_indices[:20]])
plt.xticks(range(20), X_train.columns[rf_indices[:20]], rotation=90)
plt.tight_layout()
plt.show()

# 方法2: 使用梯度提升树模型计算特征重要性
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    random_state=42
)
gb_model.fit(X_train, y_train)


# 获取特征重要性
gb_importances = gb_model.feature_importances_
gb_indices = np.argsort(gb_importances)[::-1]

# 可视化梯度提升树的特征重要性（前20个特征）
plt.figure(figsize=(12, 8))
plt.title('梯度提升树 - 特征重要性')
plt.bar(range(20), gb_importances[gb_indices[:20]])
plt.xticks(range(20), X_train.columns[gb_indices[:20]], rotation=90)
plt.tight_layout()
plt.show()

# 合并两个模型的特征重要性结果
combined_importances = (rf_importances + gb_importances) / 2
combined_indices = np.argsort(combined_importances)[::-1]

In [20]:
print("combined_indices长度:", len(combined_indices))
print("X_train.columns长度:", len(X_train.columns))

combined_indices长度: 117
X_train.columns长度: 117


In [21]:
# 使用组合模型的特征重要性进行特征选择
# 设置阈值，选择重要性最高的前n个特征
n_features_to_select = 23  

# 选择最重要的n个特征
top_features_indices = combined_indices[0:n_features_to_select]
top_features = X_train.columns[top_features_indices].tolist()

print(f"\n选择了前 {n_features_to_select} 个最重要的特征")
print(f"特征重要性范围: {combined_importances[top_features_indices[0]]:.4f} 到 {combined_importances[top_features_indices[-1]]:.4f}")

# 创建最终的特征集
X_train_final = X_train[top_features]
X_test = pd.read_csv('/home/mw/temp/X_test_filtered.csv')
X_test_final = X_test[top_features]
X_predict = pd.read_csv('/home/mw/temp/X_predict_filtered.csv')
X_predict_final = X_predict[top_features]

print(f"\n最终选择的特征数量: {X_train_final.shape[1]}")

# 显示所选特征及其重要性
feature_importance_df = pd.DataFrame({
    'Feature': top_features,
    'Importance': combined_importances[top_features_indices]
})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

print("\20个最重要的特征及其重要性:")
print(feature_importance_df.head(23))


选择了前 23 个最重要的特征
特征重要性范围: 0.3044 到 0.0076

最终选择的特征数量: 23
个最重要的特征及其重要性:
      Feature  Importance
0   每平米月租金中位数    0.304402
1      poly_室    0.137103
2           卫    0.067911
3        城市_5    0.057526
4        城市_2    0.035244
5        楼层数值    0.025733
6     环线_五至六环    0.023438
7      停车费用数值    0.021648
8         总层数    0.020455
9        梯户比例    0.020071
10      房本满五年    0.019173
11       城市_3    0.017212
12       城市_6    0.017075
13    区域_45.0    0.016215
14         楼龄    0.015983
15       城市_4    0.014969
16      容积率数值    0.012772
17    区域_62.0    0.012557
18      停车位数值    0.011665
19     房屋总数数值    0.009157
20    环线_四至五环    0.008954
21     楼栋总数数值    0.007876
22      绿化率数值    0.007569


In [22]:
# 保存最终的特征集
X_train_final.to_csv('/home/mw/temp/X_train_final.csv', index=False)
X_test_final.to_csv('/home/mw/temp/X_test_final.csv', index=False)
X_predict_final.to_csv('/home/mw/temp/X_predict_final.csv', index=False)

# 保存特征重要性文件，以便后续分析
feature_importance_df.to_csv('/home/mw/temp/feature_importance.csv', index=False)

In [23]:
# 1. 首先检查特征列名
print("实际可用的特征列名:")
print(X_train.columns.tolist())

实际可用的特征列名:
['东', '南', '西', '北', '东南', '东北', '西南', '西北', '地铁', '房本满两年', '房本满五年', '装修', '房屋年限', '环线_一至二环', '环线_三环外', '环线_三至四环', '环线_中环至外环', '环线_二至三环', '环线_五至六环', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '建筑结构_未知结构', '建筑结构_框架结构', '建筑结构_混合结构', '建筑结构_砖混结构', '建筑结构_钢混结构', '建筑结构_钢结构', '装修情况_毛坯', '装修情况_简装', '装修情况_精装', '交易权属_商品房', '交易权属_已购公房', '交易权属_拆迁还建房', '房屋用途_别墅', '房屋用途_商业办公类', '房屋用途_商住两用', '房屋用途_普通住宅', '产权所属_非共有', '年份', '厨', '卫', '楼层数值', '总层数', '梯户比例', '楼龄', '房屋总数数值', '楼栋总数数值', '绿化率数值', '容积率数值', '停车位数值', '停车费用数值', '每平米月租金中位数', '交易年份', '交易月份_sin', '交易季度_cos', '上次交易年份', '上次交易月份_cos', '上次交易季度_sin', '城市_1', '城市_2', '城市_3', '城市_4', '城市_5', '城市_6', '区域_5.0', '区域_7.0', '区域_11.0', '区域_12.0', '区域_13.0', '区域_20.0', '区域_22.0', '区域_30.0', '区域_40.0', '区域_43.0', '区域_45.0', '区域_46.0', '区域_53.0', '区域_59.0', '区域_60.0', '区域_62.0', '区域_64.0', '区域_65.0', '区域_66.0', '区域_71.0', '区域_74.0', '区域_80.0', '区域_81.0', '区域_82.0', '区域_87.0', '区域_93.0', '区域_97.0', '区域_98.0', '板块_43.0', '板块_127.0', '板块_213.0', '板块_242.0

In [24]:
# 定义反变换函数
def inverse_transform(y_pred):
    return np.expm1(y_pred)  # exp(x)-1


In [25]:
def evaluate_all(model_name, model, X_train, X_test, y_train_t, y_test_t, lambda_opt=None):
    # 确保有原始值用于比较
    y_train_original = y_train_values  # 使用全局变量中的原始值
    y_test_original = y_test_values    # 使用全局变量中的原始值
    
    # 训练集预测
    y_train_pred_t = model.predict(X_train)
    # 使用log变换的反变换 - 指数函数
    y_train_pred = np.expm1(y_train_pred_t)  # exp(x) - 1
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train_original, y_train_pred))
    
    # 测试集预测
    y_test_pred_t = model.predict(X_test)
    y_test_pred = np.expm1(y_test_pred_t)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred))
    
    # 添加异常值检测和处理
    # 如果预测值超出合理范围，进行截断
    y_train_pred = np.clip(y_train_pred, 
                           a_min=max(0, y_train_original.min() * 0.5), 
                           a_max=y_train_original.max() * 1.5)
    y_test_pred = np.clip(y_test_pred, 
                          a_min=max(0, y_test_original.min() * 0.5), 
                          a_max=y_test_original.max() * 1.5)
    
    # 简化：使用普通交叉验证，然后自己计算原始尺度的MAE
    kf = KFold(n_splits=6, shuffle=True, random_state=42)
    cv_maes = []
    
    for train_idx, val_idx in kf.split(X_train):
        # 分割数据
        X_cv_train, X_cv_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_cv_train, y_cv_val_t = y_train_t[train_idx], y_train_t[val_idx]
        
        # 转换y_cv_val_t回原始尺度 - 使用log变换的反变换
        y_cv_val = np.expm1(y_cv_val_t)
        
        # 训练模型
        model.fit(X_cv_train, y_cv_train)
        
        # 预测并转回原始尺度
        y_cv_pred_t = model.predict(X_cv_val)
        y_cv_pred = np.expm1(y_cv_pred_t)
        
        # 对预测值进行截断
        y_cv_pred = np.clip(y_cv_pred, 
                            a_min=max(0, y_train_original.min() * 0.5), 
                            a_max=y_train_original.max() * 1.5)
        
        # 计算MAE并保存
        mae = mean_absolute_error(y_cv_val, y_cv_pred)
        cv_maes.append(mae)
    
    # 计算平均交叉验证MAE
    cv_mae = np.mean(cv_maes)
    
    # 显示结果（所有指标都在原始尺度上）
    print(f"\n{model_name} 评估:")
    print(f"训练集 MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}")
    print(f"测试集 MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}")
    print(f"6折交叉验证 MAE: {cv_mae:.2f}")
    
    return {
        'Model': model_name,
        'Train MAE': train_mae,
        'Train RMSE': train_rmse,
        'Test MAE': test_mae,
        'Test RMSE': test_rmse,
        'CV MAE': cv_mae
    }

## OLS模型

In [26]:
# 加载特征选择后的数据
X_train = pd.read_csv('/home/mw/temp/X_train_final.csv')
X_test = pd.read_csv('/home/mw/temp/X_test_final.csv')

print(f"进一步筛选前的特征数量: {X_train.shape[1]}")


进一步筛选前的特征数量: 23


In [27]:
# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [29]:
# 计算VIF值
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data.sort_values("VIF", ascending=False)

# 计算特征的VIF
vif_df = calculate_vif(pd.DataFrame(X_train_scaled))
print("\n方差膨胀因子 (VIF) - 前10个最高值:")
print(vif_df.head(10))


方差膨胀因子 (VIF) - 前10个最高值:
    Feature       VIF
4         4  4.086671
12       12  3.153772
8         8  2.709892
3         3  2.645262
0         0  2.254259
1         1  2.065795
2         2  1.969258
6         6  1.924586
15       15  1.847046
5         5  1.839605


In [30]:
# 设置VIF阈值
vif_threshold = 5.0
high_vif_features = vif_df[vif_df["VIF"] > vif_threshold]["Feature"].tolist()

# 移除高VIF值的特征
X_train = pd.DataFrame(X_train_scaled)
X_test = pd.DataFrame(X_test_scaled)
X_train_filtered = X_train.drop(columns=high_vif_features, errors='ignore')
X_test_filtered = X_test.drop(columns=high_vif_features, errors='ignore')

print(f"\n移除了 {len(high_vif_features)} 个高VIF值的特征")
print(f"筛选后的特征数量: {X_train_filtered.shape[1]}")


移除了 0 个高VIF值的特征
筛选后的特征数量: 23


In [31]:
results = []

In [32]:
print("="*50)
print("训练线性回归 (OLS) 模型")
lr_model = LinearRegression()
lr_model.fit(X_train_filtered, y_train_transformed)
results.append(evaluate_all('OLS', lr_model, X_train_filtered, X_test_filtered, 
                         y_train_transformed, y_test_transformed, lambda_opt = None))


训练线性回归 (OLS) 模型

OLS 评估:
训练集 MAE: 531011.47, RMSE: 5736385.38
测试集 MAE: 677767.63, RMSE: 1977980.43
6折交叉验证 MAE: 492734.97


# LASSO

In [33]:
# 加载特征选择后的数据
X_train = pd.read_csv('/home/mw/temp/X_train_final.csv').astype(float)
X_test = pd.read_csv('/home/mw/temp/X_test_final.csv').astype(float)
X_predict = pd.read_csv('/home/mw/temp/X_predict_final.csv').astype(float)

# 标准化特征
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
print(type(X_train))

<class 'numpy.ndarray'>


In [35]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [36]:
# Lasso回归（L1正则化）
print("="*50)
print("训练Lasso回归模型，带网格搜索")
# 定义参数网格
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# 网格搜索
lasso_grid = GridSearchCV(Lasso(max_iter=10000, random_state=42), 
                          param_grid, cv=6, scoring='neg_mean_absolute_error')
lasso_grid.fit(X_train, y_train_transformed)

# 输出最佳参数
best_alpha = lasso_grid.best_params_['alpha']
print(f"最佳alpha参数: {best_alpha}")

训练Lasso回归模型，带网格搜索
最佳alpha参数: 0.001


In [37]:
# 使用最佳参数训练模型
lasso_model = Lasso(alpha=best_alpha, max_iter=10000, random_state=42)
lasso_model.fit(X_train, y_train_transformed)

# 评估模型
lasso_results = evaluate_all('Lasso', lasso_model, X_train, X_test, 
                             y_train_transformed, y_test_transformed, lambda_opt = None)
results.append(lasso_results)


Lasso 评估:
训练集 MAE: 530657.21, RMSE: 5608490.81
测试集 MAE: 677862.50, RMSE: 1945572.37
6折交叉验证 MAE: 493046.42


# Ridge

In [38]:
# 岭回归（L2正则化）
print("="*50)
print("训练岭回归模型，带网格搜索")

# 定义参数网格
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 网格搜索
ridge_grid = GridSearchCV(Ridge(random_state=42), 
                          param_grid, cv=6, scoring='neg_mean_absolute_error')
ridge_grid.fit(X_train, y_train_transformed)

# 输出最佳参数
best_alpha = ridge_grid.best_params_['alpha']
print(f"最佳alpha参数: {best_alpha}")

训练岭回归模型，带网格搜索
最佳alpha参数: 0.001


In [39]:
# 使用最佳参数训练模型
ridge_model = Ridge(alpha=best_alpha, random_state=42)
ridge_model.fit(X_train, y_train_transformed)

# 评估模型
ridge_results = evaluate_all('Ridge', ridge_model, X_train, X_test, 
                            y_train_transformed, y_test_transformed, lambda_opt = None)
results.append(ridge_results)


Ridge 评估:
训练集 MAE: 531011.47, RMSE: 5736385.03
测试集 MAE: 677767.64, RMSE: 1977980.35
6折交叉验证 MAE: 492734.97


# Elastic Net

In [40]:
# 弹性网络（结合L1和L2正则化），使用SGD加速
print("="*50)
print("训练弹性网络模型（SGD），带网格搜索")

# 定义参数网格，包含alpha（正则化强度）和l1_ratio（L1比例）
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# 网格搜索
sgd_elastic_grid = GridSearchCV(
    SGDRegressor(penalty='elasticnet', max_iter=1000, tol=1e-3, random_state=42),
    param_grid, cv=6, scoring='neg_mean_absolute_error'
)
sgd_elastic_grid.fit(X_train, y_train_transformed)

# 输出最佳参数
best_alpha = sgd_elastic_grid.best_params_['alpha']
best_l1_ratio = sgd_elastic_grid.best_params_['l1_ratio']
print(f"最佳参数: alpha={best_alpha}, l1_ratio={best_l1_ratio}")

训练弹性网络模型（SGD），带网格搜索
最佳参数: alpha=0.0001, l1_ratio=0.3


In [41]:
# 使用最佳参数训练模型
sgd_elastic_model = SGDRegressor(
    penalty='elasticnet', alpha=best_alpha, l1_ratio=best_l1_ratio,
    max_iter=1000, tol=1e-3, random_state=42
)
sgd_elastic_model.fit(X_train, y_train_transformed)

# 评估模型
elastic_results = evaluate_all('Elastic Net (SGD)', sgd_elastic_model, X_train, X_test, 
                              y_train_transformed, y_test_transformed, lambda_opt = None)
results.append(elastic_results)



Elastic Net (SGD) 评估:
训练集 MAE: 536804.26, RMSE: 6290262.12
测试集 MAE: 677563.73, RMSE: 2150258.52
6折交叉验证 MAE: 495699.14


In [42]:
# 整理所有模型的性能结果
results_df = pd.DataFrame(results)

# 根据测试集 MAE 指标选择最优模型（值越小越好）
best_model_index = results_df['Test MAE'].idxmin()
best_model_name = results_df.loc[best_model_index, 'Model']
print(f"\n根据测试集 MAE，最优模型是: {best_model_name}")
print(f"性能指标: MAE = {results_df.loc[best_model_index, 'Test MAE']:.2f}, RMSE = {results_df.loc[best_model_index, 'Test RMSE']:.2f}")

# 找到最优模型的对象
if best_model_name == 'OLS':
    best_model = lr_model
elif best_model_name == 'Lasso':
    best_model = lasso_model
elif best_model_name == 'Ridge':
    best_model = ridge_model
elif best_model_name == 'Elastic Net (SGD)':
    best_model = sgd_elastic_model
else:
    raise ValueError(f"未知的最优模型名称: {best_model_name}")





根据测试集 MAE，最优模型是: Elastic Net (SGD)
性能指标: MAE = 677563.73, RMSE = 2150258.52


In [43]:
# 1. 加载预测数据
X_predict = pd.read_csv('/home/mw/temp/X_predict_final.csv')

# 2. 因为X_train现在是DataFrame化的标准化数据，列名可能丢失
# 我们需要确保使用相同的特征
X_train_original = pd.read_csv('/home/mw/temp/X_train_final.csv')
original_columns = X_train_original.columns

# 确保X_predict包含所需的所有列
X_predict = X_predict[original_columns]

# 3. 应用与训练数据相同的标准化
X_predict_scaled = scaler.transform(X_predict)

# 4. 进行预测（使用转换后的尺度）
predictions_transformed = ridge_model.predict(X_predict_scaled)

# 5. 将预测结果转换回原始尺度 - 使用对数变换的反变换
predictions = np.expm1(predictions_transformed)  # exp(x) - 1

# 6. 对异常值进行处理 - 基于训练数据的范围
max_price = y_train_original.max() * 1.5  # 允许预测值比训练集最大值高50%
min_price = max(0, y_train_original.min() * 0.5)  # 最低为0或训练集最小值的一半
predictions = np.clip(predictions, a_min=min_price, a_max=max_price)

# 输出预测值范围，用于检查合理性
print(f"预测价格范围: {predictions.min()} 至 {predictions.max()}")
print(f"训练数据价格范围: {y_train_original.min()} 至 {y_train_original.max()}")

# 7. 创建提交文件
submission = pd.DataFrame({
    'Id': range(len(predictions)),  # 或使用适当的ID列
    'Price': predictions
})

# 8. 保存预测结果到CSV文件
submission.to_csv('/home/mw/project/prediction.csv', index=False)

print("预测完成，结果已保存至 prediction.csv 文件")

预测价格范围: 236333.44539557354 至 11103000.0
训练数据价格范围: 78280.0 至 7402000.0
预测完成，结果已保存至 prediction.csv 文件
