In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
from scipy import stats

In [3]:
# Load the full modelling table; the labelled and unlabelled rows are split apart further down.
# NOTE(review): the absolute path is environment-specific — consider a configurable data directory.
fullData = pd.read_csv('/home/mw/project/fullData.csv')

In [4]:
# Summarise row count, column count, dtypes and memory usage of the loaded table.
fullData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98919 entries, 0 to 98918
Columns: 109 entries, Unnamed: 0 to cluster_label_12
dtypes: float64(13), int64(96)
memory usage: 82.3 MB


In [5]:
# Preview the first five rows.
fullData.head(5)

Unnamed: 0.1,Unnamed: 0,价格,建筑面积,lon,lat,ID,建筑年代,房屋总数,楼栋总数,绿 化 率,...,cluster_label_3,cluster_label_4,cluster_label_5,cluster_label_6,cluster_label_7,cluster_label_8,cluster_label_9,cluster_label_10,cluster_label_11,cluster_label_12
0,0,6564200.0,52.3,116.389326,39.963727,,1977.5,1317,19,30.0,...,0,0,0,0,0,0,0,0,0,0
1,1,4174000.0,127.44,116.354287,40.079237,,2005.0,2317,40,30.0,...,0,0,0,0,0,0,0,0,0,0
2,2,16310000.0,228.54,116.543168,40.078165,,2001.5,1249,565,30.1,...,0,0,0,0,0,0,0,0,0,0
3,3,2834600.0,43.6,116.357585,39.98003,,2015.0,577,12,40.0,...,0,0,0,0,0,0,0,0,0,0
4,4,1954000.0,39.85,116.299697,39.940604,,2010.5,1685,19,60.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Remove the 'Unnamed: 0' and 'ID' columns before modelling.
# One non-mutating drop with errors='ignore' keeps this cell safe to re-run:
# the previous in-place drops raised KeyError on a second execution because
# the columns were already gone.
fullData = fullData.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')

In [5]:
# Put the target on a log scale: 价格 -> log(1 + 价格). Predictions are mapped back with np.expm1 later on.
# NOTE(review): this overwrites the column, so running the cell twice applies the log twice —
# execute it exactly once after each fresh load of the data.
fullData["价格"] = np.log1p(fullData["价格"])

In [6]:
def replace_outliers_with_quantiles(df, columns, whisker=1.5):
    """Cap outliers in the given columns at their IQR fences.

    For every column the 25th and 75th percentiles (q1, q3) are computed.
    Values below ``q1 - whisker * (q3 - q1)`` are replaced by that lower
    bound and values above ``q3 + whisker * (q3 - q1)`` by that upper bound.
    Values inside the fences are kept, and so is NaN, because NaN compares
    False against both bounds.

    Args:
        df: DataFrame to process. It is modified in place.
        columns: Iterable of column labels to cap; a missing label raises
            KeyError from the column lookup.
        whisker: Multiplier applied to the inter-quartile range when
            building the fences. The default of 1.5 reproduces the
            behaviour of the original two-argument function.

    Returns:
        The same ``df`` object, so ``df = replace_outliers_with_quantiles(df, cols)``
        and a bare call are both valid.
    """
    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = whisker * (q3 - q1)
        lower_bound = q1 - spread
        upper_bound = q3 + spread

        # Cap the high side first, then the low side; the two conditions
        # can never both hold for the same value.
        capped_high = np.where(values > upper_bound, upper_bound, values)
        df[column] = np.where(values < lower_bound, lower_bound, capped_high)
    return df

In [7]:
# Numeric columns whose outliers are capped at the IQR fences.
columns_to_process = [
    '建筑面积',
    '房屋总数',
    '楼栋总数',
    '停车位',
    '一户平均电梯数',
    '房间数量',
    '客厅数量',
    '厨房数量',
    '卫生间数量',
    '平均租金',
]

# Cap the outliers; the fences are computed over every row of fullData.
fullData = replace_outliers_with_quantiles(fullData, columns_to_process)

In [8]:
# Two interaction features, each the element-wise product of two existing columns.
parking_by_transit = fullData['交通出行_count'] * fullData['停车位']
floor_by_elevator = fullData['实际楼层位置'] * fullData['配备电梯_有']

fullData['交通出行_停车位'] = parking_by_transit
fullData['楼层_电梯'] = floor_by_elevator

# 查看相关系数

In [9]:
# Pairwise correlation matrix of all columns, including the log-transformed price.
corrDf = fullData.corr()

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Heatmap of the full correlation matrix on a 20x20 inch canvas, saved to heatmap.jpg.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corrDf, cmap=plt.cm.RdBu, linecolor='white', annot=False, ax=ax)
fig.savefig("heatmap.jpg")

In [10]:
# Absolute correlation of every column with the price, strongest first.
price_corr_abs = corrDf['价格'].abs()
result = price_corr_abs.sort_values(ascending=False)

# Lift pandas' row-display limit so the whole ranking is printed.
pd.set_option('display.max_rows', None)
print(result)

价格                  1.000000
平均租金                0.739200
建筑面积                0.497669
环线_无                0.457431
卫生间数量               0.397088
房间数量                0.384442
城市_5                0.368926
环线_五至六环             0.285618
城市_3                0.273653
一户平均电梯数             0.257174
客厅数量                0.247288
环线_四至五环             0.236789
城市_2                0.207129
房屋用途_车库             0.197780
北                   0.164797
装修情况_精装             0.160693
环线_三至四环             0.156508
环线_二环内              0.152283
停车费用                0.145789
环线_内环至中环            0.135793
容 积 率               0.135160
卖点-户型_count         0.133373
南                   0.132208
上次交易间隔              0.131304
房屋用途_别墅             0.124188
环线_中环至外环            0.123380
年份_2020.0           0.118449
建筑结构_混合结构           0.117866
房屋总数                0.117800
房屋用途_商住两用           0.117467
别墅类型_无              0.113444
年份_2022.0           0.112324
环线_外环外              0.110061
城市_6                0.103993
年份_2019.0     

In [11]:
# Keep only the columns whose absolute correlation with the price exceeds 0.1
# (the price itself passes the filter and stays in the frame).
price_corr = corrDf['价格']
strong_enough = price_corr.abs() > 0.1
cols = price_corr[strong_enough].index
fullNew = fullData[cols]
print(fullNew.columns)

Index(['价格', '建筑面积', '房屋总数', '容 积 率', '停车费用', '平均租金', '房间数量', '客厅数量', '卫生间数量',
       '南', '北', '一户平均电梯数', '卖点-户型_count', '上次交易间隔', '城市_2', '城市_3', '城市_5',
       '城市_6', '环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_五至六环', '环线_内环至中环',
       '环线_四至五环', '环线_外环外', '环线_无', '建筑结构_混合结构', '装修情况_精装', '别墅类型_无',
       '房屋用途_别墅', '房屋用途_商住两用', '房屋用途_车库', '年份_2020.0', '年份_2022.0'],
      dtype='object')


In [15]:
# Correlation heatmap restricted to the selected columns, without cell annotations.
# It is saved under its own file name: the full-matrix heatmap is also written to
# "heatmap.jpg", so sharing that name made one image overwrite the other.
sns.heatmap(fullNew.corr(), cmap=plt.cm.RdBu, linecolor='white', annot=False)
plt.savefig("heatmap_selected.jpg")
plt.show()

In [17]:
# Split on whether the price is known instead of on a hard-coded row label.
# Rows with a price form the training set; rows without one are the rows to predict.
# On the data recorded in this notebook this reproduces the previous positional split
# at row 84133 (84133 priced rows, 14786 unpriced rows), and row order is preserved.
has_price = fullNew["价格"].notna()
X_train = fullNew.loc[has_price].drop(columns=["价格"])
X_test = fullNew.loc[~has_price].drop(columns=["价格"])
y_train = fullNew.loc[has_price, "价格"]
X_train.shape,y_train.shape,X_test.shape

((84133, 33), (84133,), (14786, 33))

In [18]:
# Hold out 20% of the labelled rows as a validation split; the fixed random_state keeps the split reproducible.
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train,test_size=0.2, random_state=111)

In [19]:
def calculate_metrics(y_true, y_pred):
    """Return the pair ``(MAE, RMSE)`` for predictions against true values.

    Both metrics are on the scale of the inputs; RMSE is the square root of
    the mean squared error.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return mean_absolute_error(y_true, y_pred), np.sqrt(squared_error)

In [16]:
# Summary statistics of the selected columns.
# In the recorded output the 价格 count (84133) is lower than the other counts (98919);
# presumably the remaining rows are the unlabelled rows to predict — confirm against the data source.
fullNew.describe()

Unnamed: 0,价格,建筑面积,房屋总数,容 积 率,停车费用,平均租金,房间数量,客厅数量,卫生间数量,南,...,环线_外环外,环线_无,建筑结构_混合结构,装修情况_精装,别墅类型_无,房屋用途_别墅,房屋用途_商住两用,房屋用途_车库,年份_2020.0,年份_2022.0
count,84133.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,...,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0,98919.0
mean,14.048803,93.34055,1866.817346,2.7434,234.205368,3812.852256,2.419525,1.396248,1.353522,0.833419,...,0.034432,0.473771,0.114033,0.38022,0.991963,0.011423,0.011565,0.006136,0.125628,0.437853
std,0.889657,36.772977,1437.36691,1.446804,226.041606,1904.634227,0.893648,0.599648,0.582651,0.372603,...,0.182338,0.499314,0.317853,0.485443,0.089288,0.106269,0.106918,0.078094,0.331431,0.496125
min,11.26806,10.0,1.0,0.03,0.0,1375.5,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.455116,66.18,729.0,2.0,50.0,2434.575,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,13.952225,88.76,1461.0,2.5,150.0,2933.111111,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,14.592999,115.0,2651.0,3.2,400.0,4713.222222,3.0,2.0,2.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
max,18.196912,188.23,5534.0,35.0,2500.0,8131.193056,4.5,3.5,3.5,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# LinearRegression

In [20]:
# Ordinary least squares, fitted on the 80% training split only.
linear_model = LinearRegression()
linear_model.fit(X_train_part, y_train_part)

LinearRegression()

In [21]:
# Out-of-sample check: (MAE, RMSE) on the held-out validation split, on the log-price scale.
y_pred_linear = linear_model.predict(X_val)
calculate_metrics(y_val, y_pred_linear)

(0.27482779716189215, 0.3687819927156506)

In [18]:
# Disabled experiment kept as a bare string literal: nothing inside the quotes is executed.
'''
#最后可以试着用全部的来预测————效果不好
l_2 = linear_model.fit(X_train, y_train)
'''

In [22]:
# "In-sample" check: refit on all labelled rows (X_val is a subset of them) and score on X_val.
# fit() returns the estimator itself, so l_2_model and linear_model are the same object;
# from here on linear_model is the model trained on the full labelled set.
l_2_model = linear_model.fit(X_train, y_train)
y_pred_2 = l_2_model.predict(X_val)
calculate_metrics(y_val, y_pred_2)

(0.2746525450929027, 0.3685418701288531)

In [23]:
# Predict log-prices for the unlabelled rows with the linear model, undo the
# log1p transform, and take absolute values so that no exported price is negative.
y_final_pre_linear = linear_model.predict(X_test)
y_test_original = np.abs(np.expm1(y_final_pre_linear))

# Export as a two-column CSV: a sequential ID starting at 0 and the predicted price.
csv_file_path = '/home/mw/project/linear.csv'
df = pd.DataFrame({'ID': range(len(y_test_original)), 'Price': y_test_original})
df.to_csv(csv_file_path, index=False)

In [39]:
# Disabled experiment (linear model on standardised features) kept as a bare string literal.
# NOTE(review): the tuple recorded as this cell's output cannot come from the string as written;
# it is presumably left over from a run in which the code was active.
'''
#标准化会不会更好？————没啥变化？
l_s_model = linear_model.fit(X_train_part_s, y_train_part)
#in sample
y_pred_linear_s = l_s_model.predict(X_val_s)
calculate_metrics(y_val, y_pred_linear_s)
'''

(0.2760676067124385, 0.3708686149769649)

In [24]:
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error

In [25]:
# Scorers used for 6-fold cross-validation.
# scikit-learn scorers follow a "greater is better" convention, so error metrics
# are exposed negated and the sign is flipped back when the results are printed.
# The built-in scorer names replace make_scorer(mean_squared_error, squared=False):
# the `squared` argument is deprecated and removed in newer scikit-learn releases.
scoring = {
    'RMSE': 'neg_root_mean_squared_error',
    'MAE': 'neg_mean_absolute_error',
}


def cv_score(model, X, y):
    """Print the mean 6-fold cross-validated RMSE and MAE of ``model`` on ``(X, y)``.

    Args:
        model: Estimator passed to ``cross_validate``.
        X: Feature matrix.
        y: Target values.

    Returns:
        None; the two averaged metrics are printed.
    """
    # Train scores were requested before but never read, so they are no longer computed.
    cv_results = cross_validate(model, X, y, cv=6, scoring=scoring)
    # Negate to turn the "greater is better" scores back into error values.
    print("Test RMSE:", -cv_results['test_RMSE'].mean())
    print("Test MAE:", -cv_results['test_MAE'].mean())

In [26]:
# Cross-validated RMSE / MAE of the linear model over all labelled rows.
cv_score(linear_model,X_train,y_train)

Test RMSE: 0.48434350078714483
Test MAE: 0.38349993965074863


# Lasso

In [27]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Disabled feature-standardisation step kept as a bare string literal; none of the *_s variables are created.
'''
# 对特征进行标准化处理
scaler = StandardScaler()
X_train_part_s = scaler.fit_transform(X_train_part)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)
'''

In [28]:
# Out-of-sample check for Lasso with the library's default settings (no arguments are passed),
# fitted on the 80% training split and scored on the validation split.
lasso_model = Lasso()
lasso_model.fit(X_train_part, y_train_part)

y_pred_lasso = lasso_model.predict(X_val)

# (MAE, RMSE) on the log-price scale
calculate_metrics(y_val, y_pred_lasso)

(0.3571935801975962, 0.48529041381537735)

In [29]:
# "In-sample" check: refit on all labelled rows (X_val is a subset of them) and score on X_val.
# fit() returns the estimator itself, so la_2_model is lasso_model, now trained on the full labelled set.
la_2_model = lasso_model.fit(X_train, y_train)
y_pred_la2 = la_2_model.predict(X_val)
calculate_metrics(y_val, y_pred_la2)

(0.35714698352016405, 0.4852619049430544)

In [30]:
# Predict log-prices for the unlabelled rows and map them back to the price scale.
# The absolute value is taken only after expm1, exactly as in the other models'
# export cells. Taking it in log space as well (as this cell did before) would turn
# a negative log-prediction into a large price instead of a near-zero one.
y_final_lasso = lasso_model.predict(X_test)

y_test_original_l = np.expm1(y_final_lasso)

# Force every exported price to be non-negative.
y_test_original_l = np.abs(y_test_original_l)

In [31]:
# Export the Lasso predictions as a two-column CSV: sequential ID from 0 and predicted price.
csv_file_path = '/home/mw/project/lasso.csv'
df = pd.DataFrame({'ID': range(len(y_test_original_l)), 'Price': y_test_original_l})
df.to_csv(csv_file_path, index=False)

In [32]:
# 6-fold cross-validated RMSE / MAE of the Lasso model over all labelled rows.
cv_score(lasso_model,X_train,y_train)

Test RMSE: 0.509884778801541
Test MAE: 0.39014115343653094


# Ridge

In [33]:
# Ridge regression with the library's default settings (no arguments are passed).
ridge_model = Ridge()

In [35]:
# Fit on the 80% training split only.
ridge_model.fit(X_train_part, y_train_part)

Ridge()

In [36]:
# Out-of-sample check: (MAE, RMSE) on the held-out validation split, on the log-price scale.
y_pre_ridge = ridge_model.predict(X_val)
calculate_metrics(y_val, y_pre_ridge)

(0.27483068660951754, 0.36878848311728035)

In [42]:
# "In-sample" check: refit on all labelled rows (X_val is a subset of them) and score on X_val.
# fit() returns the estimator itself, so r2_model is ridge_model, now trained on the full labelled set.
# NOTE(review): despite the name, r2_model is an estimator, not an R^2 score.
r2_model = ridge_model.fit(X_train, y_train)
y_pre_r2 = r2_model.predict(X_val)
calculate_metrics(y_val, y_pre_r2)

(0.27465418283044796, 0.36854636164405224)

In [38]:
# Predict log-prices for the unlabelled rows with Ridge, undo the log1p
# transform, and take absolute values so that no price is negative.
y_final_pre_ridge = ridge_model.predict(X_test)
y_test_original_r = np.abs(np.expm1(y_final_pre_ridge))

In [39]:
# Export the Ridge predictions as a two-column CSV: sequential ID from 0 and predicted price.
csv_file_path = '/home/mw/project/ridge.csv'
df = pd.DataFrame({'ID': range(len(y_test_original_r)), 'Price': y_test_original_r})
df.to_csv(csv_file_path, index=False)

In [40]:
# 6-fold cross-validated RMSE / MAE of the Ridge model over all labelled rows.
cv_score(ridge_model,X_train,y_train)

Test RMSE: 0.4842561933609073
Test MAE: 0.3834490671954305


# Elastic Net

In [43]:
from sklearn.linear_model import ElasticNet

In [73]:
# Elastic net with hand-picked regularisation settings.
e_model = ElasticNet(alpha=0.01, l1_ratio=0.3)

# Train on the 80% training split
e_model.fit(X_train_part, y_train_part)

# Predict on the held-out validation split (X_val), not on the final unlabelled rows
y_pred_e = e_model.predict(X_val)

# Compute the evaluation metrics: (MAE, RMSE) on the log-price scale
calculate_metrics(y_val, y_pred_e)

(0.2874469608019353, 0.38869862908224634)

In [70]:
# Refit the same estimator on all labelled rows and predict on X_val, which is a subset of them.
# fit() returns the estimator itself, so e2_model and e_model are the same object.
e2_model=e_model.fit(X_train, y_train)
y_pred_e2 = e2_model.predict(X_val)

# Compute the evaluation metrics (in-sample, because X_val was part of the fit)
calculate_metrics(y_val, y_pred_e2)

(0.2746733593691009, 0.3685852607940574)

In [72]:
# 6-fold cross-validated RMSE / MAE of the elastic-net model over all labelled rows.
cv_score(e_model,X_train,y_train)

Test RMSE: 0.48376818489209866
Test MAE: 0.38319268470057216


In [74]:
# Predict log-prices for the unlabelled rows with the elastic net, undo the
# log1p transform, and take absolute values so that no price is negative.
# NOTE(review): the recorded execution counts (In [73] fits on the training split,
# In [70] refits on all labelled rows) suggest e_model may still hold the
# partial-split fit when this cell runs — confirm the intended training set.
y_final_pre_e = e_model.predict(X_test)
y_test_original_e = np.abs(np.expm1(y_final_pre_e))

# Export as a two-column CSV: sequential ID from 0 and predicted price.
csv_file_path = '/home/mw/project/enet.csv'
df = pd.DataFrame({'ID': range(len(y_test_original_e)), 'Price': y_test_original_e})
df.to_csv(csv_file_path, index=False)

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

In [67]:
# Candidate regularisation strengths and L1/L2 mixing ratios for the elastic net.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.01, 0.1,0.3, 0.5, 0.7]
}
model = ElasticNet()
# Tune the hyper-parameters with GridSearchCV (5 folds), using MAE as the selection metric.
# The search runs on the 80% training split only, so X_val stays unseen.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train_part, y_train_part)

# Report the best parameters and the best score (the scorer is negated MAE, hence the sign flip)
print("最优参数:", grid_search.best_params_)
print("最优负 MAE 得分:", grid_search.best_score_)
print("最优 MAE 得分:", -grid_search.best_score_)

# Predict with the best estimator found by the search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

# Evaluation metrics on the validation split (X_val); the printed labels call it the test set
test_mae = mean_absolute_error(y_val, y_pred)
test_r2 = r2_score(y_val, y_pred)

print(f"测试集 MAE: {test_mae}")
print(f"测试集 R^2: {test_r2}")

最优参数: {'alpha': 0.001, 'l1_ratio': 0.01}
最优负 MAE 得分: -0.27594338147499897
最优 MAE 得分: 0.27594338147499897
测试集 MAE: 0.2753635452229726
测试集 R^2: 0.8248433302948701


# 模型评估

In [51]:
# Determine the best model: the one with the lowest cross-validated RMSE,
# then collect that model's estimator and its train / validation / CV MAE figures.
# NOTE(review): cv_rmse_*, train_mae_*, val_mae_*, cv_mae_* and ols_model are not defined
# anywhere in the visible notebook, so this cell raises NameError on a fresh kernel —
# confirm where these values are supposed to come from.
# NOTE(review): this rebinds best_model, which the grid-search cell also assigns.
models = {
    'OLS': cv_rmse_ols,
    'LASSO': cv_rmse_lasso,
    'Ridge': cv_rmse_ridge
}
# min over the dict keys, ordered by their RMSE value
best_model_name = min(models, key=models.get)
if best_model_name == 'OLS':
    best_model = ols_model
    best_train_mae = train_mae_ols
    best_val_mae = val_mae_ols
    best_cv_mae = cv_mae_ols
elif best_model_name == 'LASSO':
    best_model = lasso_model
    best_train_mae = train_mae_lasso
    best_val_mae = val_mae_lasso
    best_cv_mae = cv_mae_lasso
else:
    best_model = ridge_model
    best_train_mae = train_mae_ridge
    best_val_mae = val_mae_ridge
    best_cv_mae = cv_mae_ridge

In [52]:
# Build the summary table: one row per model (OLS, LASSO, best model) with its
# in-sample, out-of-sample and cross-validated MAE.
# NOTE(review): the 'Metrics' column actually holds model names.
# NOTE(review): the recorded output shows a 'Datahub Score' column that this code does not
# create, so that output presumably comes from an earlier version of the cell.
data = {
    'Metrics': ['OLS', 'LASSO', 'Best Model'],
    'In sample': [train_mae_ols, train_mae_lasso, best_train_mae],
    'out of sample': [val_mae_ols, val_mae_lasso, best_val_mae],
    'Cross-validation': [cv_mae_ols, cv_mae_lasso, best_cv_mae],
}
df = pd.DataFrame(data)
print(df)

      Metrics      In sample  out of sample  Cross-validation  Datahub Score
0         OLS  755815.932180  778998.642401     758718.337169             60
1       LASSO  755932.955388  779005.902572     758827.910246             61
2  Best Model  755815.932180  778998.642401     758718.337169             62


In [41]:
# Blend the linear and Ridge predictions by averaging them on the price scale.
y_pred_combine = (y_test_original + y_test_original_r) / 2

# Export the blend as a two-column CSV: sequential ID from 0 and predicted price.
csv_file_path = '/home/mw/project/combine.csv'
df = pd.DataFrame({'ID': range(len(y_pred_combine)), 'Price': y_pred_combine})
df.to_csv(csv_file_path, index=False)

# 优化

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest baseline on all labelled rows (log-price target).
# A fixed random_state makes the fit reproducible; 111 is the seed already used for train_test_split.
# NOTE(review): this rebinds `model`, which the grid-search cell uses for its ElasticNet instance.
model = RandomForestRegressor(random_state=111)
model.fit(X_train, y_train)

In [76]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
     |████████████████████████████████| 255.9 MB 60 kB/s              ██                        | 63.9 MB 60.5 MB/s eta 0:00:04                 | 86.9 MB 8.8 MB/s eta 0:00:20  | 102.2 MB 8.8 MB/s eta 0:00:18MB/s eta 0:00:18███                  | 112.7 MB 4.3 MB/s eta 0:00:34
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
Note: you may need to restart the kernel to use updated packages.


In [77]:
import xgboost as xgb

In [78]:
# Gradient-boosted trees on the log-price target, trained on all labelled rows.
modelxg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
modelxg.fit(X_train, y_train)

# Predict the unlabelled rows, undo the log1p transform, and take absolute
# values so that no exported price is negative.
y_pred_xg = modelxg.predict(X_test)
y_test_original_xg = np.abs(np.expm1(y_pred_xg))

# Export as a two-column CSV: sequential ID from 0 and predicted price.
csv_file_path = '/home/mw/project/xg.csv'
df = pd.DataFrame({'ID': range(len(y_test_original_xg)), 'Price': y_test_original_xg})
df.to_csv(csv_file_path, index=False)