In [1]:
#导入必要的库
import pandas as pd
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import MinMaxScaler
import re
import os
import numpy as np

# 数据预处理

### 1、去除建筑面积数据中的单位，用每平方米房价（价格/建筑面积）代替价格作为回归变量进行回归。
### 2、对建筑面积进行等距划分
### 3、对所需要的变量进行独热编码

In [2]:
# 加载训练集数据
train_data_path = r"C:\Users\邓双贤\Desktop\train_data .csv"
train_df = pd.read_csv(train_data_path)

# 计算每平方米房价
train_df['建筑面积data'] = train_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
train_df['每平方米房价'] = (train_df['价格'] / train_df['建筑面积data']).astype(int)

# 定义建筑面积区间并进行独热编码
bins = np.arange(0, train_df['建筑面积data'].max() + 20, 20)
labels = [f'{int(bins[i])}-{int(bins[i+1])}' for i in range(len(bins) - 1)]
train_df['建筑面积区间'] = pd.cut(train_df['建筑面积data'], bins=bins, labels=labels, right=False)
area_range_dummies = pd.get_dummies(train_df['建筑面积区间'], prefix='建筑面积区间')
train_df = pd.concat([train_df, area_range_dummies], axis=1)

#选择特征变量
train_features = ['城市','区域','板块','小区名称' ,'装修情况', '配备电梯', '交易权属', '房屋用途', '房屋年限', '产权所属', '年份' , '所在楼层', '别墅类型','梯户比例','房屋优势'] + list(area_range_dummies.columns)

X_train = train_df[train_features]
y_train = train_df['每平方米房价']

# 对需要独热编码的特征进行处理
categorical_cols = ['城市','区域','板块','小区名称' ,'装修情况', '配备电梯', '交易权属', '房屋用途', '房屋年限', '产权所属', '年份' , '所在楼层', '别墅类型','梯户比例','房屋优势']
X_train = pd.get_dummies(X_train, columns=categorical_cols)

# 保存训练集的列名
train_columns = X_train.columns

# 训练集数据归一化
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

# 加载测试数据
test_data_path = r"C:\Users\邓双贤\Desktop\test_data.csv"
test_df = pd.read_csv(test_data_path)

# 对测试集建筑面积进行处理
test_df['建筑面积data'] = test_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
test_df['建筑面积区间'] = pd.cut(test_df['建筑面积data'], bins=bins, labels=labels, right=False)
test_area_range_dummies = pd.get_dummies(test_df['建筑面积区间'], prefix='建筑面积区间')
test_df = pd.concat([test_df, test_area_range_dummies], axis=1)

# 对测试集需要独热编码的特征进行处理
test_df = pd.get_dummies(test_df, columns=categorical_cols)
# 确保测试集数据的特征列与训练集一致
test_df = test_df.reindex(columns=train_columns, fill_value=0)

# 对测试集数据进行归一化
test_df = scaler.transform(test_df)

  train_df['建筑面积data'] = train_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
  test_df['建筑面积data'] = test_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


# 构建OLS模型预测

In [3]:
#构建线性回归模型
model_1 = LinearRegression()  # alpha是正则化强度，可调整
model_1.fit(X_train, y_train)

In [4]:
# 进行预测
y_pred = model_1.predict(test_df)

# 结合建筑面积计算总房价
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
total_price_pred = y_pred * test_df_original['建筑面积data']

# 创建包含编号和预测总房价的表格
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred
})

print('预测结果：')
print(result_df)

# 指定保存结果的文件夹路径
save_folder = r"C:\Users\邓双贤\Desktop\HW3"
# 如果文件夹不存在，则创建文件夹
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# 构建保存文件的完整路径
csv_path = os.path.join(save_folder, 'LinearRgeression.csv')
# 将结果保存为 CSV 文件
result_df.to_csv(csv_path, index=False)

  test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


预测结果：
          id         price
0          0  2.204612e+07
1          1  1.090666e+07
2          2  3.555474e+06
3          3  2.626686e+06
4          4  5.266061e+06
...      ...           ...
14781  14781 -9.031345e+17
14782  14782 -1.894825e+18
14783  14783 -1.058152e+18
14784  14784 -1.308260e+18
14785  14785 -2.512521e+18

[14786 rows x 2 columns]


In [5]:
#保存模型系数
model_1_csv_path = os.path.join(save_folder, '线性回归模型系数.csv')
coefficients_df = pd.DataFrame({'Feature': train_columns, 'Coefficient': model_1.coef_})
coefficients_df.to_csv(model_1_csv_path, index=False)

# 构建Ridge模型预测（alpha取1）

In [6]:
#构建Ridge回归模型
model_2 = Ridge(alpha=1)  # alpha是正则化强度，可调整
model_2.fit(X_train, y_train)

In [24]:
# 进行预测
y_pred = model_2.predict(test_df)

# 结合建筑面积计算总房价
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
total_price_pred = y_pred * test_df_original['建筑面积data']

# 创建包含编号和预测总房价的表格
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred
})

print('预测结果：')
print(result_df)

# 指定保存结果的文件夹路径
save_folder = r"C:\Users\邓双贤\Desktop\HW3"
# 如果文件夹不存在，则创建文件夹
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# 构建保存文件的完整路径
csv_path = os.path.join(save_folder, 'Ridge(alpha=1.0).csv')
# 将结果保存为 CSV 文件
result_df.to_csv(csv_path, index=False)

  test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


预测结果：
          id         price
0          0  2.111474e+07
1          1  1.098914e+07
2          2  3.507857e+06
3          3  2.618809e+06
4          4  5.276066e+06
...      ...           ...
14781  14781  3.790670e+05
14782  14782  3.415633e+05
14783  14783  7.258100e+05
14784  14784  8.545331e+05
14785  14785  2.951273e+05

[14786 rows x 2 columns]


In [8]:
#保存模型系数
model_2_csv_path = os.path.join(save_folder, 'Ridge(alpha=1.0)模型系数.csv')
coefficients_df = pd.DataFrame({'Feature': train_columns, 'Coefficient': model_2.coef_})
coefficients_df.to_csv(model_2_csv_path, index=False)

# 构建Lasso模型预测（alpha取1）

In [11]:
#构建Lasso回归模型
model_3 = Lasso(alpha=1)  # alpha是正则化强度，可调整
model_3.fit(X_train, y_train)

In [12]:
# 进行预测
y_pred = model_3.predict(test_df)

# 结合建筑面积计算总房价
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
total_price_pred = y_pred * test_df_original['建筑面积data']

# 创建包含编号和预测总房价的表格
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred
})

print('预测结果：')
print(result_df)

# 指定保存结果的文件夹路径
save_folder = r"C:\Users\邓双贤\Desktop\HW3"
# 如果文件夹不存在，则创建文件夹
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# 构建保存文件的完整路径
csv_path = os.path.join(save_folder, 'LASSO(alpha=1.0).csv')
# 将结果保存为 CSV 文件
result_df.to_csv(csv_path, index=False)

  test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


预测结果：
          id         price
0          0  1.896318e+07
1          1  1.145202e+07
2          2  3.413721e+06
3          3  2.594425e+06
4          4  5.396250e+06
...      ...           ...
14781  14781  5.374573e+05
14782  14782  1.011584e+06
14783  14783  1.020825e+06
14784  14784  1.272245e+06
14785  14785  1.230555e+06

[14786 rows x 2 columns]


In [13]:
#保存模型系数
model_3_csv_path = os.path.join(save_folder, 'LASSO(alpha=1.0)模型系数.csv')
coefficients_df = pd.DataFrame({'Feature': train_columns, 'Coefficient': model_3.coef_})
coefficients_df.to_csv(model_3_csv_path, index=False)

# 计算模型mae，mrse和进行六折交叉验证

## OLS模型

In [14]:
#导入必要的库
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split,KFold

In [15]:
# 划分数据集，80% 用于训练，20% 用于验证
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 创建六折交叉验证对象
kf = KFold(n_splits=6, shuffle=True, random_state=42)

In [16]:
# 训练模型
model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X_train_train, y_train_train)

# 在训练集上进行预测
y_train_pred_LinearRegression = model_LinearRegression.predict(X_train)

# 将每平方米房价还原成价格
train_price = y_train * train_df['建筑面积data']
train_pred_price_LinearRegression = y_train_pred_LinearRegression * train_df['建筑面积data']

# 计算训练集的 MAE 和 RMSE
train_mae_LinearRegression = mean_absolute_error(train_price, train_pred_price_LinearRegression)
train_rmse_LinearRegression = np.sqrt(mean_squared_error(train_price, train_pred_price_LinearRegression))

# 在测试集上进行预测
y_test_pred_LinearRegression = model_LinearRegression.predict(X_train_test)

# 将测试集每平方米房价还原成价格
test_price = y_train_test * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']
test_pred_price_LinearRegression = y_test_pred_LinearRegression * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']

# 计算测试集的 MAE 和 RMSE
test_mae_LinearRegression = mean_absolute_error(test_price, test_pred_price_LinearRegression)
test_rmse_LinearRegression = np.sqrt(mean_squared_error(test_price, test_pred_price_LinearRegression))

# 输出结果
print(f"训练集 MAE_LinearRegression: {train_mae_LinearRegression}")
print(f"训练集 RMSE_LinearRegression: {train_rmse_LinearRegression}")
print(f"测试集 MAE_LinearRegression: {test_mae_LinearRegression}")
print(f"测试集 RMSE_LinearRegression: {test_rmse_LinearRegression}")

训练集 MAE_LinearRegression: 1943229823931381.8
训练集 RMSE_LinearRegression: 3.084359243496618e+17
测试集 MAE_LinearRegression: 9715918153305010.0
测试集 RMSE_LinearRegression: 6.896754961542548e+17


In [17]:
# 初始化存储每次验证的 MAE 和 RMSE 的列表
mae_scores_LinearRegression = []
rmse_scores_LinearRegression = []

# 进行六折交叉验证
for train_index, test_index in kf.split(X_train):
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train.iloc[train_index], y_train.iloc[test_index]

    # 创建并训练模型
    model = LinearRegression()
    model.fit(X_train_train, y_train_train)

    # 在验证集上进行预测
    y_pred = model.predict(X_train_test)

    # 将每平方米房价还原成价格
    val_price = y_train_test * train_df.iloc[test_index]['建筑面积data']
    val_pred_price = y_pred * train_df.iloc[test_index]['建筑面积data']

    # 计算 MAE 和 RMSE
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    # 将每次验证的 MAE 和 RMSE 添加到列表中
    mae_scores_LinearRegression.append(mae)
    rmse_scores_LinearRegression.append(rmse)

# 计算平均 MAE 和 RMSE
average_mae_LinearRegression = np.mean(mae_scores_LinearRegression)
average_rmse_LinearRegression = np.mean(rmse_scores_LinearRegression)

print(f"六折交叉验证平均 MAE_LinearRegression: {average_mae_LinearRegression}")
print(f"六折交叉验证平均 RMSE_LinearRegression: {average_rmse_LinearRegression}")

六折交叉验证平均 MAE_LinearRegression: 3664633694302102.5
六折交叉验证平均 RMSE_LinearRegression: 2.6929283235104048e+17


## Ridge模型（alpha=1）

In [18]:
# 划分数据集，80% 用于训练，20% 用于验证
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 创建六折交叉验证对象
kf = KFold(n_splits=6, shuffle=True, random_state=42)

In [19]:
# 训练Ridge模型
model_Ridge = Ridge(alpha=1.0)  # alpha是正则化强度，可调整
model_Ridge.fit(X_train_train, y_train_train)

# 在训练集上进行预测
y_train_pred_Ridge = model_Ridge.predict(X_train)

# 将每平方米房价还原成价格
train_price = y_train * train_df['建筑面积data']
train_pred_price_Ridge = y_train_pred_Ridge * train_df['建筑面积data']

# 计算训练集的 MAE 和 RMSE
train_mae_Ridge = mean_absolute_error(train_price, train_pred_price_Ridge)
train_rmse_Ridge = np.sqrt(mean_squared_error(train_price, train_pred_price_Ridge))

# 在测试集上进行预测
y_test_pred_Ridge = model_Ridge.predict(X_train_test)

# 将测试集每平方米房价还原成价格
test_price = y_train_test * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']
test_pred_price_Ridge = y_test_pred_Ridge * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']

# 计算测试集的 MAE 和 RMSE
test_mae_Ridge = mean_absolute_error(test_price, test_pred_price_Ridge)
test_rmse_Ridge = np.sqrt(mean_squared_error(test_price, test_pred_price_Ridge))

# 输出结果
print(f"训练集 MAE_Ridge: {train_mae_Ridge}")
print(f"训练集 RMSE_Ridge: {train_rmse_Ridge}")
print(f"测试集 MAE_Ridge: {test_mae_Ridge}")
print(f"测试集 RMSE_Ridge: {test_rmse_Ridge}")

训练集 MAE_Ridge: 180544.59254791614
训练集 RMSE_Ridge: 563248.6060159897
测试集 MAE_Ridge: 204989.08606196844
测试集 RMSE_Ridge: 911592.9431834978


In [20]:
# 初始化存储每次验证的 MAE 和 RMSE 的列表
mae_scores_Ridge = []
rmse_scores_Ridge = []

# 进行六折交叉验证
for train_index, test_index in kf.split(X_train):
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train.iloc[train_index], y_train.iloc[test_index]

    # 创建并训练模型
    model = Ridge(alpha=1.0)
    model.fit(X_train_train, y_train_train)

    # 在验证集上进行预测
    y_pred = model.predict(X_train_test)

    # 将每平方米房价还原成价格
    val_price = y_train_test * train_df.iloc[test_index]['建筑面积data']
    val_pred_price = y_pred * train_df.iloc[test_index]['建筑面积data']

    # 计算 MAE 和 RMSE
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    # 将每次验证的 MAE 和 RMSE 添加到列表中
    mae_scores_Ridge.append(mae)
    rmse_scores_Ridge.append(rmse)

# 计算平均 MAE 和 RMSE
average_mae_Ridge = np.mean(mae_scores_Ridge)
average_rmse_Ridge = np.mean(rmse_scores_Ridge)

print(f"六折交叉验证平均 MAE_Ridge: {average_mae_Ridge}")
print(f"六折交叉验证平均 RMSE_Ridge: {average_rmse_Ridge}")

六折交叉验证平均 MAE_Ridge: 192397.4539808272
六折交叉验证平均 RMSE_Ridge: 603142.5206718688


## Lasso模型（alpha=1）

In [21]:
# 划分数据集，80% 用于训练，20% 用于验证
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 创建六折交叉验证对象
kf = KFold(n_splits=6, shuffle=True, random_state=42)

In [22]:
# 训练Lasso模型
model_Lasso = Lasso(alpha=1.0)  # alpha是正则化强度，可调整
model_Lasso.fit(X_train_train, y_train_train)

# 在训练集上进行预测
y_train_pred_Lasso = model_Lasso.predict(X_train)

# 将每平方米房价还原成价格
train_price = y_train * train_df['建筑面积data']
train_pred_price_Lasso = y_train_pred_Lasso * train_df['建筑面积data']

# 计算训练集的 MAE 和 RMSE
train_mae_Lasso = mean_absolute_error(train_price, train_pred_price_Lasso)
train_rmse_Lasso = np.sqrt(mean_squared_error(train_price, train_pred_price_Lasso))

# 在测试集上进行预测
y_test_pred_Lasso = model_Lasso.predict(X_train_test)

# 将测试集每平方米房价还原成价格
test_price = y_train_test * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']
test_pred_price_Lasso = y_test_pred_Lasso * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']

# 计算测试集的 MAE 和 RMSE
test_mae_Lasso = mean_absolute_error(test_price, test_pred_price_Lasso)
test_rmse_Lasso = np.sqrt(mean_squared_error(test_price, test_pred_price_Lasso))

# 输出结果
print(f"训练集 MAE_Lasso: {train_mae_Lasso}")
print(f"训练集 RMSE_Lasso: {train_rmse_Lasso}")
print(f"测试集 MAE_Lasso: {test_mae_Lasso}")
print(f"测试集 RMSE_Lasso: {test_rmse_Lasso}")

  model = cd_fast.enet_coordinate_descent(


训练集 MAE_Lasso: 238799.41142467328
训练集 RMSE_Lasso: 729446.8435517849
测试集 MAE_Lasso: 259820.68901244795
测试集 RMSE_Lasso: 1112560.9213454449


In [23]:
# 初始化存储每次验证的 MAE 和 RMSE 的列表
mae_scores_Lasso = []
rmse_scores_Lasso = []

# 进行六折交叉验证
for train_index, test_index in kf.split(X_train):
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train.iloc[train_index], y_train.iloc[test_index]

    # 创建并训练模型
    model = Lasso(alpha=1.0)
    model.fit(X_train_train, y_train_train)

    # 在验证集上进行预测
    y_pred = model.predict(X_train_test)

    # 将每平方米房价还原成价格
    val_price = y_train_test * train_df.iloc[test_index]['建筑面积data']
    val_pred_price = y_pred * train_df.iloc[test_index]['建筑面积data']

    # 计算 MAE 和 RMSE
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    # 将每次验证的 MAE 和 RMSE 添加到列表中
    mae_scores_Lasso.append(mae)
    rmse_scores_Lasso.append(rmse)

# 计算平均 MAE 和 RMSE
average_mae_Lasso = np.mean(mae_scores_Lasso)
average_rmse_Lasso = np.mean(rmse_scores_Lasso)

print(f"六折交叉验证平均 MAE_Lasso: {average_mae_Lasso}")
print(f"六折交叉验证平均 RMSE_Lasso: {average_rmse_Lasso}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


六折交叉验证平均 MAE_Lasso: 244670.07048609146
六折交叉验证平均 RMSE_Lasso: 718453.3087542999


  model = cd_fast.enet_coordinate_descent(


# 最优模型：采用Ridge回归（alpha=0.12）并在此前基础上增添几个特征变量

In [2]:
# 加载训练集数据
train_data_path = r"C:\Users\邓双贤\Desktop\train_data .csv"
train_df = pd.read_csv(train_data_path)

# 计算每平方米房价
train_df['建筑面积data'] = train_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
train_df['每平方米房价'] = (train_df['价格'] / train_df['建筑面积data']).astype(int)

# 对上次交易数据进行分类处理
train_df['上次交易分类'] = train_df['上次交易'].notna().astype(int)
train_df['核心卖点分类'] = train_df['核心卖点'].notna().astype(int)
train_df['户型介绍分类'] = train_df['户型介绍'].notna().astype(int)
train_df['周边配套分类'] = train_df['周边配套'].notna().astype(int)
train_df['交通出行分类'] = train_df['交通出行'].notna().astype(int)

# 定义建筑面积区间并进行独热编码
bins = np.arange(0, train_df['建筑面积data'].max() + 11, 11)
labels = [f'{int(bins[i])}-{int(bins[i+1])}' for i in range(len(bins) - 1)]
train_df['建筑面积区间'] = pd.cut(train_df['建筑面积data'], bins=bins, labels=labels, right=False)
area_range_dummies = pd.get_dummies(train_df['建筑面积区间'], prefix='建筑面积区间')
train_df = pd.concat([train_df, area_range_dummies], axis=1)

#选择特征变量
train_features = ['城市','区域','板块' ,'小区名称','装修情况', '配备电梯', '交易权属', '交易时间','房屋用途', '房屋年限', '产权所属', '年份' , '所在楼层','核心卖点分类' ,'周边配套分类','别墅类型','梯户比例','房屋优势','上次交易分类'] + list(area_range_dummies.columns)
,
X_train = train_df[train_features]
y_train = train_df['每平方米房价']

# 对需要独热编码的特征进行处理
categorical_cols = ['城市','区域','板块' ,'小区名称','装修情况', '配备电梯', '交易权属', '交易时间','房屋用途', '房屋年限', '产权所属', '年份' , '所在楼层', '别墅类型','梯户比例','房屋优势']
X_train = pd.get_dummies(X_train, columns=categorical_cols)

# 保存训练集的列名
train_columns = X_train.columns

# 训练集数据归一化
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

# 加载测试数据
test_data_path = r"C:\Users\邓双贤\Desktop\test_data.csv"
test_df = pd.read_csv(test_data_path)

# 对测试集建筑面积进行处理
test_df['建筑面积data'] = test_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
test_df['建筑面积区间'] = pd.cut(test_df['建筑面积data'], bins=bins, labels=labels, right=False)
test_area_range_dummies = pd.get_dummies(test_df['建筑面积区间'], prefix='建筑面积区间')
test_df = pd.concat([test_df, test_area_range_dummies], axis=1)

# 对测试集上次交易数据进行分类处理
test_df['上次交易分类'] = test_df['上次交易'].notna().astype(int)
test_df['核心卖点分类'] = train_df['核心卖点'].notna().astype(int)
test_df['户型介绍分类'] = train_df['户型介绍'].notna().astype(int)
test_df['周边配套分类'] = train_df['周边配套'].notna().astype(int)
test_df['交通出行分类'] = train_df['交通出行'].notna().astype(int)

# 对测试集需要独热编码的特征进行处理
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# 确保测试集数据的特征列与训练集一致
test_df = test_df.reindex(columns=train_columns, fill_value=0)

# 对测试集数据进行归一化
test_df = scaler.transform(test_df)

  train_df['建筑面积data'] = train_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
  test_df['建筑面积data'] = test_df['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


In [3]:
#构建Ridge回归模型
model_2 = Ridge(alpha=0.12)  # alpha是正则化强度，可调整
model_2.fit(X_train, y_train)

# 进行预测
y_pred = model_2.predict(test_df)

# 结合建筑面积计算总房价
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)
total_price_pred = y_pred * test_df_original['建筑面积data']

# 创建包含编号和预测总房价的表格
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred
})

print('预测结果：')
print(result_df)

# 指定保存结果的文件夹路径
save_folder = r"C:\Users\邓双贤\Desktop\HW3"
# 如果文件夹不存在，则创建文件夹
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# 构建保存文件的完整路径
csv_path = os.path.join(save_folder, 'Ridge(alpha=0.12).csv')
# 将结果保存为 CSV 文件
result_df.to_csv(csv_path, index=False)

  test_df_original['建筑面积data'] = test_df_original['建筑面积'].str.extract('(\d+\.?\d*)').astype(float)


预测结果：
          id         price
0          0  2.210900e+07
1          1  1.089942e+07
2          2  3.536460e+06
3          3  2.627118e+06
4          4  5.235206e+06
...      ...           ...
14781  14781  2.697238e+05
14782  14782  1.802037e+05
14783  14783  5.577225e+05
14784  14784  6.726198e+05
14785  14785  3.898741e+04

[14786 rows x 2 columns]


In [4]:
#导入必要的库
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split,KFold

# 划分数据集，80% 用于训练，20% 用于验证
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 创建六折交叉验证对象
kf = KFold(n_splits=6, shuffle=True, random_state=42)

# 训练Ridge模型
model_Ridge = Ridge(alpha=0.12)  # alpha是正则化强度，可调整
model_Ridge.fit(X_train_train, y_train_train)

# 在训练集上进行预测
y_train_pred_Ridge = model_Ridge.predict(X_train)

# 将每平方米房价还原成价格
train_price = y_train * train_df['建筑面积data']
train_pred_price_Ridge = y_train_pred_Ridge * train_df['建筑面积data']

# 计算训练集的 MAE 和 RMSE
train_mae_Ridge = mean_absolute_error(train_price, train_pred_price_Ridge)
train_rmse_Ridge = np.sqrt(mean_squared_error(train_price, train_pred_price_Ridge))

# 在测试集上进行预测
y_test_pred_Ridge = model_Ridge.predict(X_train_test)

# 将测试集每平方米房价还原成价格
test_price = y_train_test * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']
test_pred_price_Ridge = y_test_pred_Ridge * train_df.iloc[train_test_split(X_train, y_train, test_size=0.2, random_state=42)[3].index]['建筑面积data']

# 计算测试集的 MAE 和 RMSE
test_mae_Ridge = mean_absolute_error(test_price, test_pred_price_Ridge)
test_rmse_Ridge = np.sqrt(mean_squared_error(test_price, test_pred_price_Ridge))

# 输出结果
print(f"训练集 MAE_Ridge: {train_mae_Ridge}")
print(f"训练集 RMSE_Ridge: {train_rmse_Ridge}")
print(f"测试集 MAE_Ridge: {test_mae_Ridge}")
print(f"测试集 RMSE_Ridge: {test_rmse_Ridge}")

# 初始化存储每次验证的 MAE 和 RMSE 的列表
mae_scores_Ridge = []
rmse_scores_Ridge = []

# 进行六折交叉验证
for train_index, test_index in kf.split(X_train):
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train.iloc[train_index], y_train.iloc[test_index]

    # 创建并训练模型
    model = Ridge(alpha=0.12)
    model.fit(X_train_train, y_train_train)

    # 在验证集上进行预测
    y_pred = model.predict(X_train_test)

    # 将每平方米房价还原成价格
    val_price = y_train_test * train_df.iloc[test_index]['建筑面积data']
    val_pred_price = y_pred * train_df.iloc[test_index]['建筑面积data']

    # 计算 MAE 和 RMSE
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    # 将每次验证的 MAE 和 RMSE 添加到列表中
    mae_scores_Ridge.append(mae)
    rmse_scores_Ridge.append(rmse)

# 计算平均 MAE 和 RMSE
average_mae_Ridge = np.mean(mae_scores_Ridge)
average_rmse_Ridge = np.mean(rmse_scores_Ridge)

print(f"六折交叉验证平均 MAE_Ridge: {average_mae_Ridge}")
print(f"六折交叉验证平均 RMSE_Ridge: {average_rmse_Ridge}")

训练集 MAE_Ridge: 173419.86912447814
训练集 RMSE_Ridge: 522229.5684565084
测试集 MAE_Ridge: 203875.9710073116
测试集 RMSE_Ridge: 880276.6230610757
六折交叉验证平均 MAE_Ridge: 192772.06162695357
六折交叉验证平均 RMSE_Ridge: 597325.9685277418
