In [1]:
import os
import warnings

import pandas as pd
from joblib import dump
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Load the training table.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_parquet('C:/Users/86138/Tigercut/midterm/train_plus.parquet')

# --- Preprocessing ---
# Cast the columns that must be treated as categorical variables so that the
# dtype-based selection below routes them to the one-hot encoder.
category_columns = ['城市', '区域', '板块']
df = df.astype({column: 'category' for column in category_columns})

# Features and label: the target column and two further columns are excluded
# from the feature matrix.
X = df.drop(columns=['价格', '小区名称', '交易时间'])
y = df['价格']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

# Pick feature columns by dtype.
numeric_features = X.select_dtypes(include=['int32', 'int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# ColumnTransformer uses remainder='drop' by default, so any column whose dtype
# matches neither filter above (e.g. float32, bool, object) would vanish from
# the model without notice. Surface that explicitly instead of failing silently.
routed_columns = set(numeric_features) | set(categorical_features)
unrouted_columns = [column for column in X.columns if column not in routed_columns]
if unrouted_columns:
    warnings.warn(
        "These feature columns match neither the numeric nor the categorical "
        f"dtype filter and will be dropped by the ColumnTransformer: {unrouted_columns}"
    )

# Numeric columns: standardise, then expand to degree-2 polynomial terms
# (no bias column). Categorical columns: one-hot encode, ignoring categories
# that were not seen during fit.
numeric_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [2]:
# Build one pipeline per regressor.
# Each pipeline receives its own clone of the preprocessor: Pipeline.fit fits
# its steps in place, so sharing a single ColumnTransformer instance would make
# every fit overwrite the transformer state held by the other pipelines.
regressors = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(alpha=1.0, max_iter=10000, tol=0.0001, random_state=111),
    # Originally annotated as the best R² so far; in the recorded run below the
    # plain LinearRegression scores marginally higher.
    'Ridge': Ridge(alpha=0.1, random_state=111),
    'ElasticNet': ElasticNet(alpha=0.001, l1_ratio=0.8, max_iter=10000, random_state=111),
}
models = {
    model_name: Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor)
    ])
    for model_name, regressor in regressors.items()
}

# Fit every pipeline on the training split and score it on the held-out split.
# NOTE(review): the recorded output shows two warnings raised from
# cd_fast.enet_coordinate_descent -- presumably convergence warnings from the
# Lasso / ElasticNet solvers; confirm and tune max_iter / tol / alpha if so.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation metrics on the test split; RMSE is the square root of MSE.
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        'Model': name,
        'MSE': mse,
        'RMSE': mse ** 0.5,
        'R² Score': r2_score(y_test, y_pred)
    })

# Show the comparison table, one row per model.
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


            Model          MSE          RMSE  R² Score
Linear Regression 2.220564e+11 471228.633820  0.916471
            Lasso 2.225014e+11 471700.496721  0.916304
            Ridge 2.223108e+11 471498.416683  0.916376
       ElasticNet 2.356048e+11 485391.345130  0.911375


In [3]:
# Persist every fitted pipeline to disk, one .joblib file per model.
# `os` and `joblib.dump` are imported in the notebook's top import cell.
# NOTE(review): absolute, machine-specific path -- consider a configurable output directory.
model_dir = 'C:/Users/86138/Tigercut/midterm/hypermodels'

# Create the output directory if it does not exist yet.
os.makedirs(model_dir, exist_ok=True)

for name, model in models.items():
    # File name is the model's display name lower-cased, with spaces turned
    # into underscores (e.g. 'Linear Regression' -> 'linear_regression').
    file_stem = name.lower().replace(" ", "_")
    dump(model, f'{model_dir}/{file_stem}.joblib')