In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# 加载数据
df = pd.read_parquet('C:/Users/86138/Tigercut/midterm/train_plus.parquet')

# 数据预处理
df['城市'] = df['城市'].astype('category')
df['区域'] = df['区域'].astype('category')
df['板块'] = df['板块'].astype('category')

# 定义特征和标签
X = df.drop(columns=['价格', '小区名称', '交易时间'])
y = df['价格']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

# 定义预处理器
numeric_features = X.select_dtypes(include=['int32', 'int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, interaction_only=True))
    ]), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# 构建模型管道
models = {
    'OLS': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]),
    'LASSO': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Lasso(alpha=1.0, random_state=111))
    ]),
    'Ridge': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=0.1, random_state=111))
    ]),
    'ElasticNet': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', ElasticNet(alpha=0.001, l1_ratio=0.8, random_state=111))
    ])
}

# 初始化结果存储
results = []

# 6折交叉验证
cv = KFold(n_splits=6, shuffle=True, random_state=111)

for name, model in models.items():
    # 训练模型
    model.fit(X_train, y_train)
    
    # 训练集评估
    y_train_pred = model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    
    # 测试集评估
    y_test_pred = model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    
    # 交叉验证评估
    cv_rmse = np.sqrt(-cross_val_score(model, X_train, y_train, 
                                      cv=cv, scoring='neg_mean_squared_error')).mean()
    cv_r2 = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2').mean()
    
    # 保存结果
    results.append({
        'Model': name,
        'Train RMSE': train_rmse,
        'Train R²': train_r2,
        'Test RMSE': test_rmse,
        'Test R²': test_r2,
        'CV RMSE': cv_rmse,
        'CV R²': cv_r2
    })

# 创建结果DataFrame
results_df = pd.DataFrame(results)

# 生成最终报告表格
report_columns = ['Model', 'Train R²', 'Test R²', 'CV R²']
final_report = results_df[report_columns].copy()
final_report.columns = ['模型', '训练集R²', '测试集R²', '交叉验证R²']

# 添加MAE和RMSE的详细表格
detailed_metrics = results_df[['Model', 'Train RMSE', 'Test RMSE', 'CV RMSE']].copy()
detailed_metrics.columns = ['模型', '训练集RMSE', '测试集RMSE', '交叉验证RMSE']

# 打印结果
print("="*50)
print("主要性能指标报告:")
print("="*50)
print(final_report.to_markdown(index=False))

print("\n" + "="*50)
print("详细误差指标报告:")
print("="*50)
print(detailed_metrics.to_markdown(index=False))

# 计算去除异常值后的预测数量（示例）
# 这里需要根据你的异常值定义标准来计算
print("\n去除异常值后的总预测数量:", len(y_test))  # 这里只是示例，实际应根据你的异常值处理逻辑

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

主要性能指标报告:
| 模型       |   训练集R² |     测试集R² |   交叉验证R² |
|:-----------|-----------:|-------------:|-------------:|
| OLS        |   0.920809 | -8.37118e+12 | -3.05623e+13 |
| LASSO      |   0.920506 |  0.914823    |  0.913025    |
| Ridge      |   0.920632 |  0.91488     |  0.913221    |
| ElasticNet |   0.914058 |  0.909978    |  0.907796    |

详细误差指标报告:
| 模型       |   训练集RMSE |       测试集RMSE |     交叉验证RMSE |
|:-----------|-------------:|-----------------:|-----------------:|
| OLS        |       460202 |      4.71745e+12 |      6.03521e+12 |
| LASSO      |       461082 | 475856           | 482193           |
| Ridge      |       460715 | 475697           | 481654           |
| ElasticNet |       479415 | 489202           | 496464           |

去除异常值后的总预测数量: 16473
