In [None]:
#导入必要的库
import pandas as pd
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
import re
import os
import numpy as np
from sklearn.linear_model import RidgeCV , LassoCV

# 数据预处理

### 1、去除建筑面积数据中的单位，用每平方米房价（价格/建筑面积）代替价格作为回归变量进行回归。
### 2、对建筑面积进行等距划分
### 3、对所需要的变量进行独热编码
### 4、补充缺失数据

In [None]:
# Load the training data.
train_data_path = r"C:\Users\21517\Desktop\期中考试\input\quant4533\train_data.csv"
train_df = pd.read_csv(train_data_path)

# Pull the leading number out of the area strings (drops the trailing unit text).
# Raw string: `\d` and `\.` are regex tokens, not Python string escapes.
area_pattern = r'(\d+\.?\d*)'
train_df['建筑面积data'] = train_df['建筑面积'].str.extract(area_pattern, expand=False).astype(float)
train_df['套内面积data'] = train_df['套内面积'].str.extract(area_pattern, expand=False).astype(float)

# Regression target: price divided by gross floor area.
train_df['每平方米房价'] = (train_df['价格'] / train_df['建筑面积data']).astype(float)

# Share ratio as computed here: interior area divided by gross floor area.
train_df['公摊比'] = train_df['套内面积data'] / train_df['建筑面积data']

# Fill missing ratios with the column mean. The result is assigned back instead of
# using `inplace=True` on the selected column, which does not reliably update the
# parent frame (and is a no-op under pandas Copy-on-Write).
mean_share_ratio = train_df['公摊比'].mean()
train_df['公摊比'] = train_df['公摊比'].fillna(mean_share_ratio)

# Bucket the ratio into left-closed intervals of width 0.1.
bins = np.arange(0, train_df['公摊比'].max() + 0.1, 0.1)
labels = [f'{bins[i]:.1f}-{bins[i + 1]:.1f}' for i in range(len(bins) - 1)]
train_df['公摊比区间'] = pd.cut(train_df['公摊比'], bins=bins, labels=labels, right=False)

# One-hot encode the ratio buckets and append them to the frame.
share_ratio_dummies = pd.get_dummies(train_df['公摊比区间'], prefix='公摊比区间')
train_df = pd.concat([train_df, share_ratio_dummies], axis=1)

# Indicator: 1 when the listing has a recorded previous transaction, else 0.
train_df['多手房'] = train_df['上次交易'].notna().astype(int)

# Drop rows whose gross floor area is outside the open interval (20, 1000).
area_in_range = (train_df['建筑面积data'] > 20) & (train_df['建筑面积data'] < 1000)
train_df = train_df[area_in_range]

# Drop unit-price outliers with the 1.5 * IQR rule (bounds inclusive).
q1 = train_df['每平方米房价'].quantile(0.25)
q3 = train_df['每平方米房价'].quantile(0.75)
iqr = q3 - q1
price_in_range = (train_df['每平方米房价'] >= (q1 - 1.5 * iqr)) & (train_df['每平方米房价'] <= (q3 + 1.5 * iqr))
# .copy() makes the filtered frame independent, so the column assignments below
# do not write to a slice of the unfiltered frame (SettingWithCopyWarning).
train_df = train_df[price_in_range].copy()

# Bucket the gross floor area into left-closed intervals of width 20 and one-hot encode.
bins = np.arange(0, train_df['建筑面积data'].max() + 20, 20)
labels = [f'{int(bins[i])}-{int(bins[i+1])}' for i in range(len(bins) - 1)]
train_df['建筑面积区间'] = pd.cut(train_df['建筑面积data'], bins=bins, labels=labels, right=False)
area_range_dummies = pd.get_dummies(train_df['建筑面积区间'], prefix='建筑面积区间')
train_df = pd.concat([train_df, area_range_dummies], axis=1)

# Impute missing '环线' values with a nearest-neighbour classifier on (lon, lat).
# The coordinates are coerced to numeric BEFORE the known/unknown subsets are taken,
# so the classifier actually sees the cleaned values.
train_df['lon'] = pd.to_numeric(train_df['lon'], errors='coerce')
train_df['lat'] = pd.to_numeric(train_df['lat'], errors='coerce')

# Rows without usable coordinates cannot take part: the classifier rejects NaN inputs.
has_coords = train_df[['lon', 'lat']].notna().all(axis=1)
known_ringline = train_df[train_df['环线'].notna() & has_coords]
unknown_ringline = train_df[train_df['环线'].isna() & has_coords]

X_known = known_ringline[['lon', 'lat']]
y_known = known_ringline['环线']
X_unknown = unknown_ringline[['lon', 'lat']]

# Skip when there is nothing to learn from or nothing to fill (empty inputs raise).
if len(X_known) > 0 and len(X_unknown) > 0:
    # Cap k at the number of labelled rows so very small inputs still work.
    knn_clf = KNeighborsClassifier(n_neighbors=min(3, len(X_known)))
    knn_clf.fit(X_known, y_known)
    predicted_ringlines = knn_clf.predict(X_unknown)
    train_df.loc[unknown_ringline.index, '环线'] = predicted_ringlines

# Columns that are one-hot encoded before modelling.
categorical_cols = [
    '城市', '区域', '板块', '环线', '小区名称', '装修情况', '配备电梯', '交易权属', '房屋用途', '房屋年限',
    '产权所属', '交易时间', '年份', '所在楼层', '别墅类型', '梯户比例', '房屋优势',
]

# Model inputs: the raw columns, the previous-transaction flag, and the two sets of bucket dummies.
train_features = [
    '城市', '区域', '板块', '环线', '年份', '小区名称', '装修情况', '配备电梯', '交易权属', '房屋用途',
    '房屋年限', '产权所属', '交易时间', '所在楼层', '别墅类型', '梯户比例', '房屋优势', '多手房',
]
train_features = train_features + list(area_range_dummies.columns)
train_features = train_features + list(share_ratio_dummies.columns)

# Target is the unit price; the design matrix is the selected features, one-hot encoded.
y_train = train_df['每平方米房价']
X_train = pd.get_dummies(train_df[train_features], columns=categorical_cols)

# Remember the encoded column layout so other frames can be aligned to it.
train_columns = X_train.columns

# Fit the min-max scaler on the training matrix, then scale it (X_train becomes an array).
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Load the test data.
test_data_path = r"C:\Users\21517\Desktop\期中考试\input\quant4533\test_data.csv"
test_df = pd.read_csv(test_data_path)

# Pull the leading number out of the area strings.
# Raw string: `\d` and `\.` are regex tokens, not Python string escapes.
area_pattern = r'(\d+\.?\d*)'

# Bucket the test gross floor area with bin edges derived from the training frame,
# so the bucket labels line up with the training dummies.
bins = np.arange(0, train_df['建筑面积data'].max() + 20, 20)
labels = [f'{int(bins[i])}-{int(bins[i+1])}' for i in range(len(bins) - 1)]
test_df['建筑面积data'] = test_df['建筑面积'].str.extract(area_pattern, expand=False).astype(float)
test_df['建筑面积区间'] = pd.cut(test_df['建筑面积data'], bins=bins, labels=labels, right=False)
test_area_range_dummies = pd.get_dummies(test_df['建筑面积区间'], prefix='建筑面积区间')
test_df = pd.concat([test_df, test_area_range_dummies], axis=1)

# Share ratio for the test rows.
test_df['套内面积data'] = test_df['套内面积'].str.extract(area_pattern, expand=False).astype(float)
test_df['公摊比'] = test_df['套内面积data'] / test_df['建筑面积data']

# Fill missing ratios with the TRAINING mean so both sets are imputed with the same
# statistic. The result is assigned back rather than using `inplace=True` on a selected column.
test_df['公摊比'] = test_df['公摊比'].fillna(mean_share_ratio)

# Bucket the ratio with edges derived from the training frame and one-hot encode.
bins = np.arange(0, train_df['公摊比'].max() + 0.1, 0.1)
labels = [f'{bins[i]:.1f}-{bins[i + 1]:.1f}' for i in range(len(bins) - 1)]
test_df['公摊比区间'] = pd.cut(test_df['公摊比'], bins=bins, labels=labels, right=False)
test_share_ratio_dummies = pd.get_dummies(test_df['公摊比区间'], prefix='公摊比区间')
test_df = pd.concat([test_df, test_share_ratio_dummies], axis=1)

# Impute missing '环线' values in the test set with a nearest-neighbour classifier
# trained on the training rows' (lon, lat).
# The TEST coordinates are coerced from the test frame itself (not from train_df),
# and before the unknown subset is taken so the subset sees the cleaned values.
test_df['lon'] = pd.to_numeric(test_df['lon'], errors='coerce')
test_df['lat'] = pd.to_numeric(test_df['lat'], errors='coerce')

# Rows without usable coordinates cannot take part: the classifier rejects NaN inputs.
train_has_coords = train_df[['lon', 'lat']].notna().all(axis=1)
test_has_coords = test_df[['lon', 'lat']].notna().all(axis=1)
known_ringline = train_df[train_df['环线'].notna() & train_has_coords]
unknown_ringline = test_df[test_df['环线'].isna() & test_has_coords]

X_known = known_ringline[['lon', 'lat']]
y_known = known_ringline['环线']
X_unknown = unknown_ringline[['lon', 'lat']]

# Skip when there is nothing to learn from or nothing to fill (empty inputs raise).
if len(X_known) > 0 and len(X_unknown) > 0:
    # Cap k at the number of labelled rows so very small inputs still work.
    knn_clf = KNeighborsClassifier(n_neighbors=min(3, len(X_known)))
    knn_clf.fit(X_known, y_known)
    predicted_ringlines = knn_clf.predict(X_unknown)
    test_df.loc[unknown_ringline.index, '环线'] = predicted_ringlines

# Build the previous-transaction flag exactly as it is built for the training frame.
# Without this the reindex below would silently set '多手房' to 0 for every test row.
# NOTE(review): guarded in case the test file lacks '上次交易' — confirm the test schema.
if '上次交易' in test_df.columns:
    test_df['多手房'] = test_df['上次交易'].notna().astype(int)

# One-hot encode the categorical columns of the test set.
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# Align to the training column layout: extra columns are dropped, missing ones become 0.
test_df = test_df.reindex(columns=train_columns, fill_value=0)

# Scale with the scaler fitted on the training matrix (test_df becomes an array).
test_df = scaler.transform(test_df)

# 构建OLS模型预测

In [None]:
# Ordinary least squares on the scaled training matrix; the target is the unit price.
model_1 = LinearRegression()
model_1.fit(X=X_train, y=y_train)

In [None]:
# Predict the unit price for every test row.
y_pred = model_1.predict(test_df)

# Read the raw test file once and recover the numeric gross floor area
# (raw-string regex: `\d` / `\.` are regex tokens, not string escapes).
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = (
    test_df_original['建筑面积'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
)

# Total price = predicted unit price * gross floor area.
total_price_pred = y_pred * test_df_original['建筑面积data']

# Working table: id, predicted total price, predicted unit price, block, area.
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred,
    '单位房价': y_pred,
    '板块': test_df_original['板块'],
    'area': test_df_original['建筑面积data']
})

# Rows predicted below 200000 are treated as implausible.
low_price_mask = result_df['price'] < 200000
filtered_df = result_df[result_df['price'] >= 200000]

# Mean predicted unit price per block, computed from the plausible rows only.
block_means = filtered_df.groupby('板块')['单位房价'].mean()

# Replace each implausible price with (its block's mean unit price) * (its own area).
# The earlier version multiplied the row's own unit price by its own area, which
# reproduced the same low price and never used block_means.
# Rows whose block has no plausible prediction keep their original value.
block_unit_price = result_df['板块'].map(block_means)
replace_mask = low_price_mask & block_unit_price.notna()
result_df.loc[replace_mask, 'price'] = (
    block_unit_price[replace_mask] * result_df.loc[replace_mask, 'area']
)

# Keep only the id and price columns.
result_df = result_df[['id', 'price']]

print('预测结果：')
print(result_df)

# Output folder; created if it does not exist yet.
save_folder = r"C:\Users\21517\Desktop\期中考试\HW3"
os.makedirs(save_folder, exist_ok=True)

# Write the predictions to CSV.
csv_path = os.path.join(save_folder, 'OLS.csv')
result_df.to_csv(csv_path, index=False)

In [None]:
# Pair every encoded feature column with its fitted OLS coefficient and write the table to CSV.
coefficients_df = pd.DataFrame(
    {
        'Feature': train_columns,
        'Coefficient': model_1.coef_,
    }
)
model_1_csv_path = os.path.join(save_folder, '线性回归模型系数.csv')
coefficients_df.to_csv(model_1_csv_path, index=False)

# 构建Ridge模型预测

In [None]:
from sklearn.linear_model import RidgeCV

# Ridge regression; the regularisation strength is chosen from this grid by 5-fold CV.
alphas = [0.01, 0.1, 1, 10, 100]
model_2 = RidgeCV(alphas=alphas, cv=5)
model_2.fit(X_train, y_train)
print(f"最佳alpha: {model_2.alpha_}")

In [None]:
# Predict the unit price for every test row.
y_pred = model_2.predict(test_df)

# Read the raw test file once and recover the numeric gross floor area
# (raw-string regex: `\d` / `\.` are regex tokens, not string escapes).
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = (
    test_df_original['建筑面积'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
)

# Total price = predicted unit price * gross floor area.
total_price_pred = y_pred * test_df_original['建筑面积data']

# Working table: id, predicted total price, predicted unit price, block, area.
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred,
    '单位房价': y_pred,
    '板块': test_df_original['板块'],
    'area': test_df_original['建筑面积data']
})

# Rows predicted below 200000 are treated as implausible.
low_price_mask = result_df['price'] < 200000
filtered_df = result_df[result_df['price'] >= 200000]

# Mean predicted unit price per block, computed from the plausible rows only.
block_means = filtered_df.groupby('板块')['单位房价'].mean()

# Replace each implausible price with (its block's mean unit price) * (its own area).
# The earlier version multiplied the row's own unit price by its own area, which
# reproduced the same low price and never used block_means.
# Rows whose block has no plausible prediction keep their original value.
block_unit_price = result_df['板块'].map(block_means)
replace_mask = low_price_mask & block_unit_price.notna()
result_df.loc[replace_mask, 'price'] = (
    block_unit_price[replace_mask] * result_df.loc[replace_mask, 'area']
)

# Keep only the id and price columns.
result_df = result_df[['id', 'price']]

print('预测结果：')
print(result_df)

# Output folder; created if it does not exist yet.
save_folder = r"C:\Users\21517\Desktop\期中考试\HW3"
os.makedirs(save_folder, exist_ok=True)

# Write the predictions to CSV.
csv_path = os.path.join(save_folder, 'Ridge.csv')
result_df.to_csv(csv_path, index=False)

In [None]:
# Pair every encoded feature column with its fitted Ridge coefficient and write the table to CSV.
coefficients_df = pd.DataFrame(
    {
        'Feature': train_columns,
        'Coefficient': model_2.coef_,
    }
)
model_2_csv_path = os.path.join(save_folder, 'Ridge模型系数.csv')
coefficients_df.to_csv(model_2_csv_path, index=False)

# 构建Lasso模型预测

In [None]:
from sklearn.linear_model import LassoCV

# Lasso regression; the regularisation strength is chosen from this grid by 5-fold CV.
alphas = [0.01, 0.1, 1, 10, 100]
model_3 = LassoCV(alphas=alphas, cv=5)
model_3.fit(X_train, y_train)
print(f"最佳alpha: {model_3.alpha_}")

In [None]:
# Predict the unit price for every test row.
y_pred = model_3.predict(test_df)

# Read the raw test file once and recover the numeric gross floor area
# (raw-string regex: `\d` / `\.` are regex tokens, not string escapes).
test_df_original = pd.read_csv(test_data_path)
test_df_original['建筑面积data'] = (
    test_df_original['建筑面积'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
)

# Total price = predicted unit price * gross floor area.
total_price_pred = y_pred * test_df_original['建筑面积data']

# Working table: id, predicted total price, predicted unit price, block, area.
result_df = pd.DataFrame({
    'id': range(0, len(total_price_pred)),
    'price': total_price_pred,
    '单位房价': y_pred,
    '板块': test_df_original['板块'],
    'area': test_df_original['建筑面积data']
})

# Rows predicted below 200000 are treated as implausible.
low_price_mask = result_df['price'] < 200000
filtered_df = result_df[result_df['price'] >= 200000]

# Mean predicted unit price per block, computed from the plausible rows only.
block_means = filtered_df.groupby('板块')['单位房价'].mean()

# Replace each implausible price with (its block's mean unit price) * (its own area).
# The earlier version multiplied the row's own unit price by its own area, which
# reproduced the same low price and never used block_means.
# Rows whose block has no plausible prediction keep their original value.
block_unit_price = result_df['板块'].map(block_means)
replace_mask = low_price_mask & block_unit_price.notna()
result_df.loc[replace_mask, 'price'] = (
    block_unit_price[replace_mask] * result_df.loc[replace_mask, 'area']
)

# Keep only the id and price columns.
result_df = result_df[['id', 'price']]

print('预测结果：')
print(result_df)

# Output folder; created if it does not exist yet.
save_folder = r"C:\Users\21517\Desktop\期中考试\HW3"
os.makedirs(save_folder, exist_ok=True)

# Write the predictions to CSV.
csv_path = os.path.join(save_folder, 'Lasso.csv')
result_df.to_csv(csv_path, index=False)

In [None]:
# Pair every encoded feature column with its fitted Lasso coefficient and write the table to CSV.
coefficients_df = pd.DataFrame(
    {
        'Feature': train_columns,
        'Coefficient': model_3.coef_,
    }
)
model_3_csv_path = os.path.join(save_folder, 'LASSO模型系数.csv')
coefficients_df.to_csv(model_3_csv_path, index=False)

# 计算模型MAE、RMSE并进行六折交叉验证

## OLS模型

In [None]:
#导入必要的库
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split,KFold

In [None]:

# Shuffle the training frame (fixed seed) and renumber the rows from 0.
train_df = (
    train_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Target and one-hot encoded feature matrix rebuilt from the shuffled frame.
y_train = train_df['每平方米房价']
X_train = pd.get_dummies(train_df[train_features], columns=categorical_cols)

# Hold-out split: 80% for fitting, 20% for validation.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Six-fold splitter with shuffling and a fixed seed.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

In [None]:
# Fit OLS on the 80% training split.
model_LinearRegression = LinearRegression()
model_LinearRegression.fit(X_train_train, y_train_train)

# Gross floor area of each split, looked up through the split's own index labels
# (no need to re-run train_test_split just to recover the hold-out index).
train_area = train_df.loc[y_train_train.index, '建筑面积data']
test_area = train_df.loc[y_train_test.index, '建筑面积data']

# Training metrics are computed on the rows the model was fitted on only;
# predicting on the full X_train would mix hold-out rows into the "training" error.
y_train_pred_LinearRegression = model_LinearRegression.predict(X_train_train)

# Convert unit prices back to total prices.
train_price = y_train_train * train_area
train_pred_price_LinearRegression = y_train_pred_LinearRegression * train_area

# Training MAE and RMSE.
train_mae_LinearRegression = mean_absolute_error(train_price, train_pred_price_LinearRegression)
train_rmse_LinearRegression = np.sqrt(mean_squared_error(train_price, train_pred_price_LinearRegression))

# Predict on the 20% hold-out split.
y_test_pred_LinearRegression = model_LinearRegression.predict(X_train_test)

# Convert hold-out unit prices back to total prices.
test_price = y_train_test * test_area
test_pred_price_LinearRegression = y_test_pred_LinearRegression * test_area

# Hold-out MAE and RMSE.
test_mae_LinearRegression = mean_absolute_error(test_price, test_pred_price_LinearRegression)
test_rmse_LinearRegression = np.sqrt(mean_squared_error(test_price, test_pred_price_LinearRegression))

# Report.
print(f"训练集 MAE_LinearRegression: {train_mae_LinearRegression}")
print(f"训练集 RMSE_LinearRegression: {train_rmse_LinearRegression}")
print(f"测试集 MAE_LinearRegression: {test_mae_LinearRegression}")
print(f"测试集 RMSE_LinearRegression: {test_rmse_LinearRegression}")

In [None]:
# Per-fold MAE and RMSE.
mae_scores_LinearRegression = []
rmse_scores_LinearRegression = []

# Six-fold cross-validation.
for train_index, test_index in kf.split(X_train):
    # kf.split yields integer positions, and X_train is a DataFrame here, so rows
    # must be taken with .iloc (plain [] would look the integers up as column labels).
    # Fold-local names keep the 80/20 hold-out split variables intact.
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit a fresh model on this fold's training part.
    model = LinearRegression()
    model.fit(X_fold_train, y_fold_train)

    # Predict this fold's validation part.
    y_pred = model.predict(X_fold_val)

    # Convert unit prices back to total prices.
    val_area = train_df.iloc[test_index]['建筑面积data']
    val_price = y_fold_val * val_area
    val_pred_price = y_pred * val_area

    # Fold MAE and RMSE.
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    mae_scores_LinearRegression.append(mae)
    rmse_scores_LinearRegression.append(rmse)

# Average over the folds.
average_mae_LinearRegression = np.mean(mae_scores_LinearRegression)
average_rmse_LinearRegression = np.mean(rmse_scores_LinearRegression)

print(f"六折交叉验证平均 MAE_LinearRegression: {average_mae_LinearRegression}")
print(f"六折交叉验证平均 RMSE_LinearRegression: {average_rmse_LinearRegression}")

## Ridge模型

In [None]:
# Shuffle the training frame (fixed seed) and renumber the rows from 0.
train_df = (
    train_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Target and one-hot encoded feature matrix rebuilt from the shuffled frame.
y_train = train_df['每平方米房价']
X_train = pd.get_dummies(train_df[train_features], columns=categorical_cols)

# Hold-out split: 80% for fitting, 20% for validation.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Six-fold splitter with shuffling and a fixed seed.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

In [None]:
# Fit Ridge on the 80% training split (alpha is the regularisation strength; tunable).
model_Ridge = Ridge(alpha=1.0)
model_Ridge.fit(X_train_train, y_train_train)

# Gross floor area of each split, looked up through the split's own index labels
# (no need to re-run train_test_split just to recover the hold-out index).
train_area = train_df.loc[y_train_train.index, '建筑面积data']
test_area = train_df.loc[y_train_test.index, '建筑面积data']

# Training metrics are computed on the rows the model was fitted on only;
# predicting on the full X_train would mix hold-out rows into the "training" error.
y_train_pred_Ridge = model_Ridge.predict(X_train_train)

# Convert unit prices back to total prices.
train_price = y_train_train * train_area
train_pred_price_Ridge = y_train_pred_Ridge * train_area

# Training MAE and RMSE.
train_mae_Ridge = mean_absolute_error(train_price, train_pred_price_Ridge)
train_rmse_Ridge = np.sqrt(mean_squared_error(train_price, train_pred_price_Ridge))

# Predict on the 20% hold-out split.
y_test_pred_Ridge = model_Ridge.predict(X_train_test)

# Convert hold-out unit prices back to total prices.
test_price = y_train_test * test_area
test_pred_price_Ridge = y_test_pred_Ridge * test_area

# Hold-out MAE and RMSE.
test_mae_Ridge = mean_absolute_error(test_price, test_pred_price_Ridge)
test_rmse_Ridge = np.sqrt(mean_squared_error(test_price, test_pred_price_Ridge))

# Report.
print(f"训练集 MAE_Ridge: {train_mae_Ridge}")
print(f"训练集 RMSE_Ridge: {train_rmse_Ridge}")
print(f"测试集 MAE_Ridge: {test_mae_Ridge}")
print(f"测试集 RMSE_Ridge: {test_rmse_Ridge}")

In [None]:
# Per-fold MAE and RMSE.
mae_scores_Ridge = []
rmse_scores_Ridge = []

# Six-fold cross-validation.
for train_index, test_index in kf.split(X_train):
    # kf.split yields integer positions, and X_train is a DataFrame here, so rows
    # must be taken with .iloc (plain [] would look the integers up as column labels).
    # Fold-local names keep the 80/20 hold-out split variables intact.
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit a fresh model on this fold's training part.
    model = Ridge(alpha=1.0)
    model.fit(X_fold_train, y_fold_train)

    # Predict this fold's validation part.
    y_pred = model.predict(X_fold_val)

    # Convert unit prices back to total prices.
    val_area = train_df.iloc[test_index]['建筑面积data']
    val_price = y_fold_val * val_area
    val_pred_price = y_pred * val_area

    # Fold MAE and RMSE.
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    mae_scores_Ridge.append(mae)
    rmse_scores_Ridge.append(rmse)

# Average over the folds.
average_mae_Ridge = np.mean(mae_scores_Ridge)
average_rmse_Ridge = np.mean(rmse_scores_Ridge)

print(f"六折交叉验证平均 MAE_Ridge: {average_mae_Ridge}")
print(f"六折交叉验证平均 RMSE_Ridge: {average_rmse_Ridge}")

## Lasso模型

In [None]:
# Shuffle the training frame (fixed seed) and renumber the rows from 0.
train_df = (
    train_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Target and one-hot encoded feature matrix rebuilt from the shuffled frame.
y_train = train_df['每平方米房价']
X_train = pd.get_dummies(train_df[train_features], columns=categorical_cols)

# Hold-out split: 80% for fitting, 20% for validation.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Six-fold splitter with shuffling and a fixed seed.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

In [None]:
# Choose the Lasso regularisation strength by 6-fold CV on the TRAINING split only,
# so the hold-out rows do not influence the hyperparameter they are later scored with.
# Stored under its own name so the RidgeCV model bound to `model_2` is not overwritten.
alphas = [0.01, 0.1, 1, 10, 100]
model_LassoCV = LassoCV(alphas=alphas, cv=6).fit(X_train_train, y_train_train)
print(f"最佳alpha: {model_LassoCV.alpha_}")

# Fit Lasso with the selected alpha on the 80% training split.
model_Lasso = Lasso(alpha=model_LassoCV.alpha_)
model_Lasso.fit(X_train_train, y_train_train)

# Gross floor area of each split, looked up through the split's own index labels
# (no need to re-run train_test_split just to recover the hold-out index).
train_area = train_df.loc[y_train_train.index, '建筑面积data']
test_area = train_df.loc[y_train_test.index, '建筑面积data']

# Training metrics are computed on the rows the model was fitted on only;
# predicting on the full X_train would mix hold-out rows into the "training" error.
y_train_pred_Lasso = model_Lasso.predict(X_train_train)

# Convert unit prices back to total prices.
train_price = y_train_train * train_area
train_pred_price_Lasso = y_train_pred_Lasso * train_area

# Training MAE and RMSE.
train_mae_Lasso = mean_absolute_error(train_price, train_pred_price_Lasso)
train_rmse_Lasso = np.sqrt(mean_squared_error(train_price, train_pred_price_Lasso))

# Predict on the 20% hold-out split.
y_test_pred_Lasso = model_Lasso.predict(X_train_test)

# Convert hold-out unit prices back to total prices.
test_price = y_train_test * test_area
test_pred_price_Lasso = y_test_pred_Lasso * test_area

# Hold-out MAE and RMSE.
test_mae_Lasso = mean_absolute_error(test_price, test_pred_price_Lasso)
test_rmse_Lasso = np.sqrt(mean_squared_error(test_price, test_pred_price_Lasso))

# Report.
print(f"训练集 MAE_Lasso: {train_mae_Lasso}")
print(f"训练集 RMSE_Lasso: {train_rmse_Lasso}")
print(f"测试集 MAE_Lasso: {test_mae_Lasso}")
print(f"测试集 RMSE_Lasso: {test_rmse_Lasso}")

In [None]:
# Per-fold MAE and RMSE.
mae_scores_Lasso = []
rmse_scores_Lasso = []

# Six-fold cross-validation.
for train_index, test_index in kf.split(X_train):
    # kf.split yields integer positions, and X_train is a DataFrame here, so rows
    # must be taken with .iloc (plain [] would look the integers up as column labels).
    # Fold-local names keep the 80/20 hold-out split variables intact.
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit a fresh model on this fold's training part, with the same alpha as the
    # hold-out Lasso model (model_Lasso) so both evaluations score the same model.
    model = Lasso(alpha=model_Lasso.alpha)
    model.fit(X_fold_train, y_fold_train)

    # Predict this fold's validation part.
    y_pred = model.predict(X_fold_val)

    # Convert unit prices back to total prices.
    val_area = train_df.iloc[test_index]['建筑面积data']
    val_price = y_fold_val * val_area
    val_pred_price = y_pred * val_area

    # Fold MAE and RMSE.
    mae = mean_absolute_error(val_price, val_pred_price)
    rmse = np.sqrt(mean_squared_error(val_price, val_pred_price))

    mae_scores_Lasso.append(mae)
    rmse_scores_Lasso.append(rmse)

# Average over the folds.
average_mae_Lasso = np.mean(mae_scores_Lasso)
average_rmse_Lasso = np.mean(rmse_scores_Lasso)

print(f"六折交叉验证平均 MAE_Lasso: {average_mae_Lasso}")
print(f"六折交叉验证平均 RMSE_Lasso: {average_rmse_Lasso}")