In [110]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# 读取处理后的数据集
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_valid = pd.read_csv('data/processed/X_valid.csv')
y_valid = pd.read_csv('data/processed/y_valid.csv') 
X_train_full = pd.read_csv('data/processed/X_train_full.csv') 
y_train_full = pd.read_csv('data/processed/y_train_full.csv') 
X_test = pd.read_csv('data/processed/X_test.csv')

print('数据集读取完成')
print(f'训练集大小: {X_train.shape}')
print(f'验证集大小: {X_valid.shape}')
print(f'全量训练集大小: {X_train_full.shape}')
print(f'测试集大小: {X_test.shape}')


数据集读取完成
训练集大小: (20000, 74)
验证集大小: (5000, 74)
全量训练集大小: (25000, 74)
测试集大小: (10000, 74)


In [114]:
# 直接定义列名配置
del_cols = ['listing_id', 'original_reg_date', 'opc_scheme', 'lifespan', 'eco_category', 'indicative_price']
text_cols = ['title', 'description', 'features', 'accessories']
date_cols = ['reg_date']
numeric_cols = ['manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation', 'coe', 'road_tax', 
                'dereg_value', 'mileage', 'omv', 'arf', 'year', 'month',
                'text_brand_popularity_score', 'text_model_value_score', 'text_condition_score',
                'text_feature_rarity_score', 'text_performance_score', 'text_sentiment_score']
log_cols = ['manufactured', 'curb_weight', 'power_log', 'engine_cap_log', 'depreciation_log', 'coe', 
            'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log', 'year', 'month']
root_cols = ['manufactured', 'curb_weight', 'power_root', 'engine_cap_root', 'depreciation_root', 'coe', 
             'road_tax_root', 'dereg_value_root', 'mileage_root', 'omv_root', 'arf_root', 'year', 'month']
categorical_cols = ['make', 'model', 'type_of_vehicle', 'category', 'transmission', 'fuel_type', 'no_of_owners']

# 更新变换列，添加GPT特征
cat_nu_cols = [
    "manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", 
    "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded",
    "text_brand_popularity_score", "text_model_value_score", "text_condition_score",
    "text_feature_rarity_score", "text_performance_score", "text_sentiment_score",
    "-", "almost new car", "coe car", "consignment car", "direct owner sale", 
    "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", 
    "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", 
    "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", 
    "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", 
    "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", 
    "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", 
    "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", 
    "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", 
    "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"
]

cat_log_cols = [
    "manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", 
    "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", 
    "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", 
    "text_condition_score", "text_feature_rarity_score", "text_performance_score", 
    "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", 
    "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", 
    "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", 
    "sgcarmart warranty cars", "sta evaluated car", "vintage cars", 
    "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", 
    "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", 
    "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", 
    "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", 
    "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", 
    "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", 
    "fuel_type_nan", "transmission_manual", "year", "month"
]

cat_root_cols = [
    "manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", 
    "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", 
    "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", 
    "text_condition_score", "text_feature_rarity_score", "text_performance_score", 
    "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", 
    "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", 
    "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", 
    "sgcarmart warranty cars", "sta evaluated car", "vintage cars", 
    "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", 
    "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", 
    "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", 
    "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", 
    "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", 
    "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", 
    "fuel_type_nan", "transmission_manual", "year", "month"
]

# 丢弃log和root变换的结果
X_train = X_train[cat_nu_cols]
X_valid = X_valid[cat_nu_cols]
X_test = X_test[cat_nu_cols]
X_train_full = X_train_full[cat_nu_cols]

print(f'训练集: {X_train.shape}')
print(f'全量集: {X_train_full.shape}')
print(f'测试集: {X_test.shape}')

训练集: (20000, 55)
全量集: (25000, 55)
测试集: (10000, 55)


## Additional Feature Engineering

In [79]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor

def create_new_features(df):
    """创建更有针对性的特征"""
    
    # 1. 核心价值特征 - 更关注直接影响价格的因素
    df['total_base_cost'] = df['omv'] + df['arf']  # 基础成本(不含COE)
    df['coe_ratio'] = df['coe'] / df['total_base_cost'].clip(lower=1)  # COE占比
    df['dereg_ratio'] = df['dereg_value'] / df['total_base_cost'].clip(lower=1)  # 折价率
    
    # 2. 车况评估 - 重点关注车辆状态
    df['age'] = 2024 - df['manufactured']
    df['km_per_year'] = df['mileage'] / df['age'].clip(lower=1)
    df['wear_tear_index'] = (
        df['km_per_year'] / 20000 +  # 标准年行驶里程
        df['age'] / 10 +             # 车龄影响
        df['no_of_owners'] / 2       # 车主数影响
    )
    
    # 3. 性能价值比 - 更注重实用性指标
    df['power_price_ratio'] = df['power'] / df['total_base_cost'].clip(lower=1) * 1000
    df['weight_efficiency'] = df['power'] / df['curb_weight'].clip(lower=1)
    
    return df

def select_features(X_train, y_train, X_valid, feature_names, threshold=0.001):
    """特征选择"""
    # 1. 基于XGBoost的特征重要性
    xgb_selector = SelectFromModel(
        XGBRegressor(
            n_estimators=500,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ),
        threshold=threshold
    )
    
    # 2. 保留重要特征
    X_train_selected = xgb_selector.fit_transform(X_train, y_train)
    selected_features_mask = xgb_selector.get_support()
    selected_features = [f for f, selected in zip(feature_names, selected_features_mask) if selected]
    
    # 3. 转换验证集
    X_valid_selected = xgb_selector.transform(X_valid)
    
    return X_train_selected, X_valid_selected, selected_features

# 主流程
def feature_engineering_pipeline(X_train, X_valid, y_train, y_valid):
    """特征工程主流程"""
    print("开始特征工程...")
    
    # 1. 创建新特征
    print("创建新特征...")
    X_train_new = create_new_features(X_train.copy())
    X_valid_new = create_new_features(X_valid.copy())
    
    # 2. 特征选择
    print("进行特征选择...")
    feature_names = list(X_train_new.columns)
    X_train_selected, X_valid_selected, selected_features = select_features(
        X_train_new, y_train, X_valid_new, feature_names
    )
    
    print(f"原始特征数: {len(feature_names)}")
    print(f"选择后特征数: {len(selected_features)}")
    print("\n重要特征:")
    print(selected_features)
    
    return X_train_selected, X_valid_selected, selected_features


# 使用示例
# X_train_final, X_valid_final, selected_features = feature_engineering_pipeline(
#     X_train, X_valid, y_train, y_valid
# )

# 不进行特征选择
X_train_final = create_new_features(X_train.copy())
X_valid_final = create_new_features(X_valid.copy())

# 评估新特征的效果
model = XGBRegressor(
    # 基本参数
    n_estimators=1000,        # 树的数量
    max_depth=6,              # 树的最大深度，避免过拟合
    learning_rate=0.01,       # 较小的学习率，提高模型稳定性
    
    # 防止过拟合的参数
    min_child_weight=5,       # 控制过拟合
    gamma=0.1,               # 节点分裂所需的最小损失函数下降值
    subsample=0.8,           # 随机采样训练样本的比例
    colsample_bytree=0.8,    # 随机采样特征的比例
    
    # 正则化参数
    reg_alpha=0.1,           # L1正则化
    reg_lambda=1,            # L2正则化
    
    # 其他参数
    objective='reg:squarederror',  # 回归任务
    random_state=42,
    n_jobs=-1,               # 使用所有CPU核心
    verbosity=0
)

model.fit(X_train_final, y_train)
y_pred = model.predict(X_valid_final)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f"\n模型性能:")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.4f}")


模型性能:
RMSE: 23517.44
R2: 0.9732


## Concat BERT Vectors

In [115]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from umap.umap_ import UMAP  # 正确的导入方式

# 加载BERT向量
bert_train_vectors = np.load('data/processed/train_vectors.npy')
bert_valid_vectors = np.load('data/processed/valid_vectors.npy')
bert_train_full_vectors = np.load('data/processed/train_full_vectors.npy')
bert_test_vectors = np.load('data/processed/test_vectors.npy')

# BERT降维
scaler = StandardScaler()

# UMAP参数设置
umap = UMAP(
    n_components=8,
    n_neighbors=20,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)

# umap = UMAP(
#     n_components=16,
#     n_neighbors=30,
#     min_dist=0.3,
#     metric='cosine',
#     random_state=42
# )

# umap = UMAP(
#     n_components=24,
#     n_neighbors=50,
#     min_dist=0.5,
#     metric='cosine',
#     random_state=42
# )

# 对BERT向量进行UMAP降维
bert_train_scaled = scaler.fit_transform(bert_train_vectors)
bert_train_umap = umap.fit_transform(bert_train_scaled)

# 对验证集和测试集应用相同的转换
bert_valid_scaled = scaler.transform(bert_valid_vectors)
bert_valid_umap = umap.transform(bert_valid_scaled)

bert_test_scaled = scaler.transform(bert_test_vectors)
bert_test_umap = umap.transform(bert_test_scaled)

# 对完整训练集进行转换
bert_train_full_scaled = scaler.transform(bert_train_full_vectors)
bert_train_full_umap = umap.transform(bert_train_full_scaled)

# 拼接特征
X_train_combined = np.hstack((X_train[cat_nu_cols].values, bert_train_umap))
X_valid_combined = np.hstack((X_valid[cat_nu_cols].values, bert_valid_umap))
X_test_combined = np.hstack((X_test[cat_nu_cols].values, bert_test_umap))
X_train_full_combined = np.hstack((X_train_full[cat_nu_cols].values, bert_train_full_umap))

# X_train_combined = np.hstack((X_train_final, bert_train_umap))
# X_valid_combined = np.hstack((X_valid_final, bert_valid_umap))
# X_test_combined = np.hstack((X_test_final, bert_test_umap))

# 打印维度信息
print("特征维度:")
print(f"原始特征: {X_train[cat_nu_cols].shape[1]}")
print(f"UMAP特征: {bert_train_umap.shape[1]}")
print(f"组合特征: {X_train_combined.shape[1]}")

  warn(


特征维度:
原始特征: 55
UMAP特征: 8
组合特征: 63


In [117]:
print(X_train_full_combined.shape)
print(X_test_combined.shape)

(25000, 63)
(10000, 63)


## Baseline

In [124]:
# 使用数值特征 + 分类特征
for li in [cat_nu_cols, cat_log_cols, cat_root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Mean Squared Error: 1992363754.8481019
Root Mean Squared Error: 44635.902083951456
R² Score: 0.9000067197210553


KeyError: "['power_log', 'engine_cap_log', 'depreciation_log', 'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log'] not in index"

In [25]:
# Assuming 'data' is your DataFrame and 'target' is the name of your target variable
X = X_train_combined 
y = y_train

# Creating the linear regression model
model = LinearRegression()

# Fitting the model
model.fit(X, y)

# Making predictions
y_valid_pred = model.predict(X_valid_combined)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Mean Squared Error: 1993880884.6093085
Root Mean Squared Error: 44652.89335092754
R² Score: 0.8999029127909969


## Other ML Regression Models

In [93]:
from xgboost import XGBRegressor

# # 创建XGBoost模型
model = XGBRegressor(
    # 基本参数
    n_estimators=1000,        # 树的数量
    max_depth=6,              # 树的最大深度，避免过拟合
    learning_rate=0.01,       # 较小的学习率，提高模型稳定性
    
    # 防止过拟合的参数
    min_child_weight=5,       # 控制过拟合
    gamma=0.1,               # 节点分裂所需的最小损失函数下降值
    subsample=0.8,           # 随机采样训练样本的比例
    colsample_bytree=0.8,    # 随机采样特征的比例
    
    # 正则化参数
    reg_alpha=0.1,           # L1正则化
    reg_lambda=1,            # L2正则化
    
    # 其他参数
    objective='reg:squarederror',  # 回归任务
    random_state=42,
    n_jobs=-1,               # 使用所有CPU核心
    verbosity=0
)


# Assuming 'data' is your DataFrame and 'target' is the name of your target variable
X = X_train[cat_nu_cols]  # Features
y = y_train       # Target variable

print(X.shape)

# 添加早停以防止过拟合
eval_set = [(X_valid[cat_nu_cols], y_valid)]

model.fit(
    X, 
    y,
    eval_set=eval_set
)

# Making predictions
y_valid_pred = model.predict(X_valid[cat_nu_cols])

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
# print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
# print(f'R² Score: {r2}')

(20000, 55)
[0]	validation_0-rmse:142302.58503
[1]	validation_0-rmse:141059.21290


[2]	validation_0-rmse:139814.90232
[3]	validation_0-rmse:138596.87475
[4]	validation_0-rmse:137383.82899
[5]	validation_0-rmse:136174.87417
[6]	validation_0-rmse:134990.07053
[7]	validation_0-rmse:133801.25630
[8]	validation_0-rmse:132657.01807
[9]	validation_0-rmse:131527.25182
[10]	validation_0-rmse:130388.18631
[11]	validation_0-rmse:129256.71292
[12]	validation_0-rmse:128152.58535
[13]	validation_0-rmse:127038.98374
[14]	validation_0-rmse:125937.67423
[15]	validation_0-rmse:124875.60048
[16]	validation_0-rmse:123788.04927
[17]	validation_0-rmse:122715.07888
[18]	validation_0-rmse:121695.51921
[19]	validation_0-rmse:120644.18487
[20]	validation_0-rmse:119639.02184
[21]	validation_0-rmse:118635.35593
[22]	validation_0-rmse:117621.28938
[23]	validation_0-rmse:116587.83682
[24]	validation_0-rmse:115607.78485
[25]	validation_0-rmse:114628.93368
[26]	validation_0-rmse:113671.92554
[27]	validation_0-rmse:112737.42231
[28]	validation_0-rmse:111764.94953
[29]	validation_0-rmse:110792.32706


In [94]:
print(X.shape)

(20000, 55)


In [122]:
from xgboost import XGBRegressor

# # 创建XGBoost模型
model = XGBRegressor(
    # 基本参数
    n_estimators=1000,        # 树的数量
    max_depth=6,              # 树的最大深度，避免过拟合
    learning_rate=0.01,       # 较小的学习率，提高模型稳定性
    
    # 防止过拟合的参数
    min_child_weight=5,       # 控制过拟合
    gamma=0.1,               # 节点分裂所需的最小损失函数下降值
    subsample=0.8,           # 随机采样训练样本的比例
    colsample_bytree=0.8,    # 随机采样特征的比例
    
    # 正则化参数
    reg_alpha=0.1,           # L1正则化
    reg_lambda=1,            # L2正则化
    
    # 其他参数
    objective='reg:squarederror',  # 回归任务
    random_state=42,
    n_jobs=-1,               # 使用所有CPU核心
    verbosity=0
)


# Assuming 'data' is your DataFrame and 'target' is the name of your target variable
X = X_train_combined  # Features
y = y_train       # Target variable

# 添加早停以防止过拟合
eval_set = [(X_valid_combined, y_valid)]

model.fit(
    X, 
    y,
    eval_set=eval_set
)

# Making predictions
y_valid_pred = model.predict(X_valid_combined)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
# print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
# print(f'R² Score: {r2}')

[0]	validation_0-rmse:142308.20010
[1]	validation_0-rmse:141066.81316
[2]	validation_0-rmse:139825.76145
[3]	validation_0-rmse:138569.25814
[4]	validation_0-rmse:137398.23651
[5]	validation_0-rmse:136214.05422
[6]	validation_0-rmse:135011.12224
[7]	validation_0-rmse:133807.26726
[8]	validation_0-rmse:132641.79629
[9]	validation_0-rmse:131474.29748
[10]	validation_0-rmse:130364.61580
[11]	validation_0-rmse:129224.62829
[12]	validation_0-rmse:128134.81280
[13]	validation_0-rmse:127031.73400
[14]	validation_0-rmse:125968.01218
[15]	validation_0-rmse:124898.26006
[16]	validation_0-rmse:123815.28734
[17]	validation_0-rmse:122743.82624
[18]	validation_0-rmse:121693.46088
[19]	validation_0-rmse:120666.11756
[20]	validation_0-rmse:119664.58005
[21]	validation_0-rmse:118660.23255
[22]	validation_0-rmse:117642.88000
[23]	validation_0-rmse:116655.81372
[24]	validation_0-rmse:115687.45828
[25]	validation_0-rmse:114697.81665
[26]	validation_0-rmse:113755.70685
[27]	validation_0-rmse:112805.18121
[2

In [123]:
# 获取UMAP降维后的维度
n_umap_components = bert_train_umap.shape[1]

# 动态生成特征名称
feature_names = cat_nu_cols + [f'bert_{i}' for i in range(n_umap_components)]

# 分析特征重要性
feature_importance = model.feature_importances_
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20))

Top 20 most important features:
                        feature  importance
11                          arf    0.463326
8                   dereg_value    0.129638
2                         power    0.072489
31                rare & exotic    0.055533
5                  depreciation    0.033955
10                          omv    0.027136
53                         year    0.025933
14       text_model_value_score    0.024259
9                       mileage    0.016032
0                  manufactured    0.013859
3                    engine_cap    0.011343
12          make_target_encoded    0.010820
13  text_brand_popularity_score    0.009710
6                           coe    0.008674
26        imported used vehicle    0.007563
1                   curb_weight    0.006558
17       text_performance_score    0.005882
57                       bert_2    0.005337
43          type_of_vehicle_suv    0.005263
16    text_feature_rarity_score    0.004458


In [98]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

# 1. LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# 2. CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)

# 3. Stacking
base_models = [
    ('xgb', XGBRegressor(verbosity=0, random_state=42)),
    ('lgb', lgb.LGBMRegressor(verbose=-1, random_state=42)),
    ('cat', CatBoostRegressor(verbose=False, random_seed=42))
]
stacking = StackingRegressor(
    estimators=base_models,
    final_estimator=lgb.LGBMRegressor(verbose=-1),
    cv=5
)

# 4. RandomForest
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 5. GradientBoosting
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# 比较函数
def compare_models(models, X_train, y_train, X_valid, y_valid):
    results = []
    for name, model in models.items():
        print(f"训练 {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        r2 = r2_score(y_valid, y_pred)
        results.append({
            'Model': name,
            'RMSE': rmse,
            'R2': r2
        })
    return pd.DataFrame(results).sort_values('RMSE')

# 比较所有模型
models = {
    'LightGBM': lgb_model,
    'CatBoost': cat_model,
    'Stacking': stacking,
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'XGBoost': model  # 之前定义的XGBoost模型
}

# 使用cat_log_cols特征集（通常对价格预测效果更好）
results = compare_models(models, X_train_combined, y_train, 
                       X_valid_combined, y_valid)
print("\n模型性能比较:")
print(results)

训练 LightGBM


  y = column_or_1d(y, warn=True)


训练 CatBoost
训练 Stacking


  y = column_or_1d(y, warn=True)


训练 RandomForest


  return fit_method(estimator, *args, **kwargs)


训练 GradientBoosting


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


训练 XGBoost

模型性能比较:
              Model          RMSE        R2
5           XGBoost  21412.378914  0.977743
0          LightGBM  22889.269216  0.974566
1          CatBoost  26369.873762  0.966243
2          Stacking  27671.744487  0.962828
3      RandomForest  29122.231478  0.958829
4  GradientBoosting  29166.783512  0.958703


In [104]:
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    """Optuna目标函数，用于优化XGBoost参数"""
    param = {
        # 基本参数
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        
        # 防止过拟合的参数
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-4, 2.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        
        # 正则化参数
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 20.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 20.0, log=True),
        
        # 固定参数
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    
    # 创建模型
    model = XGBRegressor(**param)
    
    try:
        # 训练模型
        model.fit(X_train_combined, y_train.values.ravel())
        
        # 预测验证集
        y_pred = model.predict(X_valid_combined)
        
        # 计算RMSE
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        return rmse
        
    except Exception as e:
        print(f"训练出错: {str(e)}")
        return float('inf')

# 创建Optuna study
study = optuna.create_study(direction='minimize')

# 运行优化
print("开始Optuna参数优化...")
study.optimize(objective, n_trials=200, show_progress_bar=True)

# 打印最佳参数
print("\n最佳参数:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")
print(f"\n最佳RMSE: {study.best_value:.4f}")

# 使用最佳参数创建最终模型
best_params = study.best_params
best_params.update({
    'objective': 'reg:squarederror',
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0
})

# 创建并训练最终模型
final_model = XGBRegressor(**best_params)
final_model.fit(X_train_combined, y_train.values.ravel())

# 评估最终模型
y_pred = final_model.predict(X_valid_combined)
final_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"\n最终模型在验证集上的RMSE: {final_rmse:.4f}")

# 更新模型字典
if 'models' not in locals():
    models = {}
models['XGBoost (Optimized)'] = final_model

[I 2024-11-08 19:29:20,389] A new study created in memory with name: no-name-e7ef19f3-de52-4d5e-a5cd-635d8e6bc745


开始Optuna参数优化...


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2024-11-08 19:29:21,429] Trial 0 finished with value: 111476.69695382394 and parameters: {'n_estimators': 777, 'max_depth': 6, 'learning_rate': 0.0003727245808413232, 'min_child_weight': 4, 'gamma': 0.016757760705157044, 'subsample': 0.7186246784881116, 'colsample_bytree': 0.5042242382086655, 'reg_alpha': 0.0007825629111476852, 'reg_lambda': 0.16412310198613533}. Best is trial 0 with value: 111476.69695382394.
[I 2024-11-08 19:29:23,589] Trial 1 finished with value: 26265.15765344184 and parameters: {'n_estimators': 4087, 'max_depth': 3, 'learning_rate': 0.2818477743389873, 'min_child_weight': 13, 'gamma': 0.011970593718404329, 'subsample': 0.4326829786956926, 'colsample_bytree': 0.615058771933839, 'reg_alpha': 0.246550175085745, 'reg_lambda': 0.0019240267465539958}. Best is trial 1 with value: 26265.15765344184.
[I 2024-11-08 19:29:27,483] Trial 2 finished with value: 24913.432866165942 and parameters: {'n_estimators': 2805, 'max_depth': 8, 'learning_rate': 0.00213322386775019, 'mi

In [120]:
# 使用最佳参数创建最终模型
best_model = XGBRegressor(
    n_estimators=4163,
    max_depth=6, 
    learning_rate=0.0110604556318349,
    min_child_weight=3,
    gamma=0.0036210695281719447,
    subsample=0.6453444676549167,
    colsample_bytree=0.6761141085642669,
    reg_alpha=0.34592583966227625,
    reg_lambda=0.01554727012065511,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

# best_model = XGBRegressor(
#     n_estimators=10000,
#     max_depth=6, 
#     learning_rate=0.0110604556318349,
#     min_child_weight=3,
#     gamma=0.0036210695281719447,
#     subsample=0.6453444676549167,
#     colsample_bytree=0.6761141085642669,
#     reg_alpha=0.34592583966227625,
#     reg_lambda=0.01554727012065511,
#     objective='reg:squarederror',
#     random_state=42,
#     n_jobs=-1,
#     verbosity=0
# )

# 创建并训练最终模型
best_model.fit(X_train_combined, y_train.values.ravel())

# 评估最终模型
y_pred = best_model.predict(X_valid_combined)
final_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"\n最终模型在验证集上的RMSE: {final_rmse:.4f}")


最终模型在验证集上的RMSE: 19481.5527


## Prediction on test set

In [121]:
print("使用全量数据训练最终模型...")
best_model.fit(X_train_full_combined, y_train_full)

print("生成测试集预测结果...")
test_predictions = best_model.predict(X_test_combined)

# 创建预测结果DataFrame
predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})

# 保存预测结果
predictions_df.to_csv('data/predictions.csv', index=False)
print("预测结果已保存到 data/predictions.csv")

使用全量数据训练最终模型...
生成测试集预测结果...
预测结果已保存到 data/predictions.csv
