In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
using_data = pd.read_csv("/home/mw/project/using.csv")

In [2]:
# 缩尾处理
lower_percentile = 1
upper_percentile = 99

lower_bound = np.percentile(using_data["单位面积均价"], lower_percentile)
upper_bound = np.percentile(using_data["单位面积均价"], upper_percentile)

filtered_data = using_data[(using_data["单位面积均价"] >= lower_bound)&(using_data["单位面积均价"] <= upper_bound)]

print(f"原始样本量: {len(using_data)}")
print(f"去异常值后样本量: {len(filtered_data)}")
print(f"去除比例: {100 - len(filtered_data)/len(using_data)*100:.2f}%")

using_data = filtered_data

原始样本量: 84133
去异常值后样本量: 82449
去除比例: 2.00%


In [3]:
region_columns = using_data.columns[using_data.columns.str.startswith("区域板块")]
ring_columns = using_data.columns[using_data.columns.str.startswith("城市环线")]
X = using_data[["建筑面积","房间数","厅数","厨房数","卫生间数"]+region_columns.tolist()+ring_columns.tolist()]
y = using_data[["单位面积均价"]]

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=111,shuffle=True)

In [5]:
# 训练ols模型

model_ols = LinearRegression()
model_ols.fit(X_train, y_train)

y_train_pred = model_ols.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
print(f"样本内MAE: {mae_train:.2f}")

y_test_pred = model_ols.predict(X_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"样本外MAE: {mae_test:.2f}")

kfold = KFold(n_splits=6, shuffle=True, random_state=111)
mae_scores = cross_val_score(model_ols,X,y,cv=kfold,scoring="neg_mean_absolute_error")
mae_scores = -mae_scores  
mean_mae = np.mean(mae_scores)
std_mae = np.std(mae_scores)

print(f"六折交叉验证MAE:")
print(f"均值 = {mean_mae:.2f} ± {std_mae:.2f}")
print("各折MAE:", np.round(mae_scores, 2))

样本内MAE: 2729.72
样本外MAE: 2757.95
六折交叉验证MAE:
均值 = 2773.23 ± 20.66
各折MAE: [2771.33 2737.72 2781.88 2769.34 2771.22 2807.91]


In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# 训练lasso模型

model_lasso = Lasso(alpha=0.2, max_iter=20)
model_lasso.fit(X_train_scaled, y_train)
print("OK")

y_train_pred = model_lasso.predict(X_train_scaled)
print("OK")
mae_train = mean_absolute_error(y_train, y_train_pred)
print(f"样本内MAE: {mae_train:.2f}")

y_test_pred = model_lasso.predict(X_test_scaled)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"样本外MAE: {mae_test:.2f}")

mae_scores = cross_val_score(model_lasso,X,y,cv=kfold,scoring="neg_mean_absolute_error")
mae_scores = -mae_scores  
mean_mae = np.mean(mae_scores)
std_mae = np.std(mae_scores)

print(f"六折交叉验证MAE:")
print(f"均值 = {mean_mae:.2f} ± {std_mae:.2f}")
print("各折MAE:", np.round(mae_scores, 2))

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


OK
OK
样本内MAE: 2763.92
样本外MAE: 2805.50


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


六折交叉验证MAE:
均值 = 2824.19 ± 19.59
各折MAE: [2827.61 2792.67 2813.07 2828.79 2824.88 2858.13]


In [8]:
# 训练ridge模型

model_ridge = Ridge(alpha=1000.0)  # alpha是正则化强度
model_ridge.fit(X_train_scaled, y_train)

y_train_pred = model_ridge.predict(X_train_scaled)
mae_train = mean_absolute_error(y_train, y_train_pred)
print(f"样本内MAE: {mae_train:.2f}")

y_test_pred = model_ridge.predict(X_test_scaled)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"样本外MAE: {mae_test:.2f}")

mae_scores = cross_val_score(model_ridge,X,y,cv=kfold,scoring="neg_mean_absolute_error")
mae_scores = -mae_scores  
mean_mae = np.mean(mae_scores)
std_mae = np.std(mae_scores)

print(f"六折交叉验证MAE:")
print(f"均值 = {mean_mae:.2f} ± {std_mae:.2f}")
print("各折MAE:", np.round(mae_scores, 2))

样本内MAE: 2760.41
样本外MAE: 2788.41
六折交叉验证MAE:
均值 = 6895.53 ± 54.61
各折MAE: [6806.36 6952.18 6887.59 6850.78 6915.39 6960.91]


In [19]:
predict_data = pd.read_csv("/home/mw/project/predict.csv")

# 二者需要保持一致
columns_to_drop=predict_data.columns.difference(X.columns)
predict_data=predict_data.drop(columns=columns_to_drop)
missing=X.columns.difference(predict_data.columns)
predict_data=predict_data.assign(**{col: 0 for col in missing})
predict_data= predict_data.reindex(columns=X.columns)

X_predict=predict_data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_predict)

# 注意这里同样是有选择性的，三种是类似的操作
y_predict=model_ridge.predict(X_scaled)
# y_predict=model_lasso.predict(X_scaled)
# y_predict=model_ols.predict(X_predict)

predict_data = pd.read_csv("/home/mw/input/quant4533/ruc_Class25Q1_test.csv")
out_data = pd.DataFrame()
out_data["ID"]=predict_data["ID"]
out_data["price"]=y_predict*X_predict[["建筑面积"]]
out_data.to_csv("/home/mw/project/output.csv",index=False)