In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime


In [2]:
# 加载数据
data = pd.read_csv('ruc_Class25Q1_train.csv')

# 数据预处理

In [3]:
# 处理建筑面积的数据
# 定义函数来提取建筑面积的数值
def extract_area_value(area_str):
    if isinstance(area_str, str):
        # 提取所有数字和小数点
        numbers = ''.join(filter(lambda x: x.isdigit() or x == '.', area_str))
        try:
            return float(numbers)
        except ValueError:
            return None
    return None

# 应用函数到建筑面积列
data['square'] = data['建筑面积'].apply(extract_area_value)

# 查看处理后的结果（前几行）
print(data[['建筑面积','square']].head())

      建筑面积  square
0    52.3㎡   52.30
1  127.44㎡  127.44
2  228.54㎡  228.54
3    43.6㎡   43.60
4   39.85㎡   39.85


In [4]:
# 定义函数将汉字数字转换为阿拉伯数字
def chinese_to_arabic(chinese_num):
    chinese_num_map = {
        '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10, '百': 100, '千': 1000, '万': 10000,
        '亿': 100000000
    }
    result = 0
    temp = 0
    for char in chinese_num:
        if char in ['十', '百', '千', '万', '亿']:
            if temp == 0:
                temp = 1
            result += temp * chinese_num_map[char]
            temp = 0
        else:
            temp = chinese_num_map[char]
    result += temp
    return result


# 定义函数计算一梯有几户
def calculate_units_per_elevator(ratio):
    if isinstance(ratio, str):
        try:
            parts = ratio.split('梯')
            if len(parts) == 2:
                elevators = chinese_to_arabic(parts[0])
                units = chinese_to_arabic(parts[1].split('户')[0])
                return units / elevators if elevators > 0 else None
        except ValueError:
            pass
    return None


# 应用函数生成新列
data['units_elevator'] = data['梯户比例'].apply(calculate_units_per_elevator)

# 查看结果的前几行
print(data[['梯户比例', 'units_elevator']].head())


     梯户比例  units_elevator
0    一梯三户        3.000000
1    一梯两户        2.000000
2    一梯一户        1.000000
3  三梯二十三户        7.666667
4   两梯十一户        5.500000


In [6]:
# 处理交易时间和上次交易的数据
def time_interval(df):
    time_list = []
    for index, row in df.iterrows():
        try:
            # 尝试把交易时间转换为日期时间格式
            transaction_time = pd.to_datetime(row['交易时间'])
            # 尝试把上次交易时间转换为日期时间格式
            last_transaction_time = pd.to_datetime(row['上次交易'])
            # 计算时间间隔
            interval = (transaction_time - last_transaction_time).days
            time_list.append(interval)
        except (ValueError, TypeError):
            # 若转换失败，把该时间间隔设为 None
            time_list.append(None)
    # 把时间间隔列表添加为新的一列
    df['time_interval'] = time_list
    return df
data=time_interval(data)
print(data[['time_interval']].head())

   time_interval
0         2568.0
1         3381.0
2         2824.0
3          892.0
4          787.0


In [5]:
# 处理核心卖点的数据
def classify_core_selling_point(row):
    if pd.notnull(row['核心卖点']) and row['核心卖点'] != '':
        return 1
    return 0
data['selling_point']=data.apply(classify_core_selling_point, axis = 1)

In [7]:
# 处理周边配套的数据
def classify_surrounding(row):
    if pd.notnull(row['周边配套']) and row['周边配套'] != '':
        return 1
    return 0
data['surrounding']=data.apply(classify_surrounding, axis = 1)

In [8]:
# 处理交通出行的数据
def classify_traffic(row):
    if pd.notnull(row['交通出行']) and row['交通出行'] != '':
        return 1
    return 0
data['traffic']=data.apply(classify_traffic, axis = 1)

In [9]:
# 处理登记年份的数据
def calculate_year_interval(df):
    # 获取当前年份
    current_year = datetime.now().year
    # 计算时间间隔（以年为单位）
    df['time_year'] = current_year - df['年份']
    return df


data = calculate_year_interval(data)

In [10]:
# 进行独热编码
categorical_features = ['城市','区域','板块','环线','建筑结构','装修情况','交易权属','别墅类型','房屋用途','房屋年限','产权所属','配备电梯','房屋朝向','所在楼层','房屋户型']  
data_encoded = pd.get_dummies(data, columns=categorical_features)

# 获取独热编码后新增的特征名
new_features = [col for col in data_encoded.columns if col not in data.columns]

# 建立模型

In [11]:

data['price'] = data['价格'] 

# 选择特定的特征
selected_features = [
    'square',
    'units_elevator',
    'time_interval',
    'selling_point',
    'surrounding',
    'traffic',
    'time_year',
    'lon',
    'lat']+new_features

X = data_encoded[selected_features]

# 计算每列的均值
means = X.mean()

# 使用均值填充缺失值
X = X.fillna(means)


y = data['price']


# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=111)

In [12]:
# 使用选择后的特征进行训练
model = LinearRegression()

model.fit(X_train, y_train)

# 进行预测
y_pred = model.predict(X_test)

# 计算 MSE
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差 (MSE): {mse}")



# 计算 MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"平均绝对误差 (MAE): {mae}")

# 计算 RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"均方根误差 (RMSE): {rmse}")

均方误差 (MSE): 1598366013515.8877
平均绝对误差 (MAE): 551471.9325790198
均方根误差 (RMSE): 1264265.0092112364


In [17]:
# Ridge 回归

ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print(f"Ridge 回归均方误差 (MSE): {mse_ridge}")
print(f"Ridge 平均绝对误差 (MAE): {mae_ridge}")
print(f"Ridge 均方根误差 (RMSE): {rmse_ridge}")

Ridge 回归均方误差 (MSE): 1574937423168.9668
Ridge 平均绝对误差 (MAE):, 550754.6040536367
Ridge 均方根误差 (RMSE): 1254965.1083472269


In [14]:
# Lasso 回归

lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print(f"Lasso 回归均方误差 (MSE): {mse_lasso}")
print(f"Lasso 平均绝对误差 (MAE): {mae_lasso}")
print(f"Lasso 均方根误差 (RMSE): {rmse_lasso}")

  model = cd_fast.enet_coordinate_descent(


Lasso 回归均方误差 (MSE): 1567046402607.2651
Lasso 平均绝对误差 (MAE): 551394.17733307
Lasso 均方根误差 (RMSE): 1251817.2400982762


In [20]:
# Elastic Net 回归

elastic_net_model = ElasticNet(alpha=0.1, l1_ratio=1)
elastic_net_model.fit(X_train, y_train)
y_pred_elastic_net = elastic_net_model.predict(X_test)
mse_elastic_net = mean_squared_error(y_test, y_pred_elastic_net)
mae_elastic_net = mean_absolute_error(y_test, y_pred_elastic_net)
rmse_elastic_net = np.sqrt(mean_squared_error(y_test, y_pred_elastic_net))
print(f"Elastic Net 回归均方误差 (MSE): {mse_elastic_net}")
print(f"Elastic Net 平均绝对误差 (MAE): {mae_elastic_net}")
print(f"Elastic Net 均方根误差 (RMSE): {rmse_elastic_net}")

  model = cd_fast.enet_coordinate_descent(


Elastic Net 回归均方误差 (MSE): 1660894123417.3818
Elastic Net 平均绝对误差 (MAE): 552717.233477041
Elastic Net 均方根误差 (RMSE): 1288756.8131410137


# 进行预测

In [16]:
test_data = pd.read_csv('ruc_Class25Q1_test.csv')



test_data['square'] = test_data['建筑面积'].apply(extract_area_value)
test_data['units_elevator'] = test_data['梯户比例'].apply(calculate_units_per_elevator)
test_data = time_interval(test_data)
test_data['selling_point'] = test_data.apply(classify_core_selling_point, axis = 1)
test_data['surrounding'] = test_data.apply(classify_surrounding, axis = 1)
test_data['traffic'] = test_data.apply(classify_traffic, axis = 1)
test_data = calculate_year_interval(test_data)
test_data = pd.get_dummies(test_data, columns=categorical_features)

In [22]:
# 提取房屋 ID 列
house_ids = test_data['ID']

# 检查 selected_features 中的特征是否在 test_data 中缺失
missing_features = [feature for feature in selected_features if feature not in test_data.columns]

# 为缺失的特征添加列，并将值设为 0
for feature in missing_features:
    test_data[feature] = 0
# 选择相同的特征
X_test_new = test_data[selected_features]

# 计算每列的均值
means = X_test_new.mean()

# 使用均值填充缺失值
X_test_new = X_test_new.fillna(means)



# 使用训练好的模型进行预测
y_pred_new = model.predict(X_test_new)

# 创建包含房屋 ID 和预测价格的 DataFrame
result_df = pd.DataFrame({
    'ID': house_ids,
    'Price': y_pred_new
})

# 将结果保存为 CSV 文件
result_df.to_csv('prediction_result1.csv', index=False)

print("预测结果已保存为 prediction_result1.csv")


预测结果已保存为 prediction_result1.csv


In [21]:
# 提取房屋 ID 列
house_ids = test_data['ID']

# 检查 selected_features 中的特征是否在 test_data 中缺失
missing_features = [feature for feature in selected_features if feature not in test_data.columns]

# 为缺失的特征添加列，并将值设为 0
for feature in missing_features:
    test_data[feature] = 0
# 选择相同的特征
X_test_new = test_data[selected_features]

# 计算每列的均值
means = X_test_new.mean()

# 使用均值填充缺失值
X_test_new = X_test_new.fillna(means)

# 使用训练好的模型进行预测
y_pred_new = ridge_model.predict(X_test_new)

# 创建包含房屋 ID 和预测价格的 DataFrame
result_df = pd.DataFrame({
    'ID': house_ids,
    'Price': y_pred_new
})

# 将结果保存为 CSV 文件
result_df.to_csv('prediction_result2.csv', index=False)

print("预测结果已保存为 prediction_result2.csv")


预测结果已保存为 prediction_result2.csv


In [18]:
# 提取房屋 ID 列
house_ids = test_data['ID']

# 检查 selected_features 中的特征是否在 test_data 中缺失
missing_features = [feature for feature in selected_features if feature not in test_data.columns]

# 为缺失的特征添加列，并将值设为 0
for feature in missing_features:
    test_data[feature] = 0
# 选择相同的特征
X_test_new = test_data[selected_features]

# 计算每列的均值
means = X_test_new.mean()

# 使用均值填充缺失值
X_test_new = X_test_new.fillna(means)

# 使用训练好的模型进行预测
y_pred_new = lasso_model.predict(X_test_new)

# 创建包含房屋 ID 和预测价格的 DataFrame
result_df = pd.DataFrame({
    'ID': house_ids,
    'Price': y_pred_new
})

# 将结果保存为 CSV 文件
result_df.to_csv('prediction_result3.csv', index=False)

print("预测结果已保存为 prediction_result3.csv")


预测结果已保存为 prediction_result3.csv


In [21]:
# 提取房屋 ID 列
house_ids = test_data['ID']

# 检查 selected_features 中的特征是否在 test_data 中缺失
missing_features = [feature for feature in selected_features if feature not in test_data.columns]

# 为缺失的特征添加列，并将值设为 0
for feature in missing_features:
    test_data[feature] = 0
# 选择相同的特征
X_test_new = test_data[selected_features]

# 计算每列的均值
means = X_test_new.mean()

# 使用均值填充缺失值
X_test_new = X_test_new.fillna(means)

# 使用训练好的模型进行预测
y_pred_new = elastic_net_model.predict(X_test_new)

# 创建包含房屋 ID 和预测价格的 DataFrame
result_df = pd.DataFrame({
    'ID': house_ids,
    'Price': y_pred_new
})

# 将结果保存为 CSV 文件
result_df.to_csv('prediction_result4.csv', index=False)

print("预测结果已保存为 prediction_result4.csv")


预测结果已保存为 prediction_result4.csv
