In [64]:
# Some magic so that the notebook will reload the external python script file any time you edit and save the .py file;
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import numpy as np
import pandas as pd
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

In [66]:
# 加载 train.csv 文件
train_file_path = "./train.csv"
test_file_path = "./test.csv"
train_df = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

# 确认数据加载成功，并查看数据的前几行
print("数据集前几行：")
print(train_df.head())

数据集前几行：
   listing_id                                              title           make    model                                        description  manufactured original_reg_date     reg_date  type_of_vehicle                                  category transmission  curb_weight  power fuel_type  engine_cap  no_of_owners  depreciation    coe  road_tax  dereg_value   mileage      omv       arf opc_scheme lifespan   eco_category                                           features                                        accessories  indicative_price     price
0     1292132  Land Rover Range Rover Velar 3.0A Si6 R-Dynami...     land rover    range  1 owner, no repairs needed! it looks great, in...        2018.0               NaN  08-mar-2018              suv                                  parf car         auto       1884.0  280.0       NaN      2995.0           2.0       34270.0  48011    2380.0     103323.0   96000.0  88906.0  132031.0        NaN      NaN  uncategorized  3l supercharged v6 

In [67]:
# 输出列名以确认数据集结构
print("数据集的列名：", train_df.columns)

数据集的列名： Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured', 'original_reg_date', 'reg_date', 'type_of_vehicle', 'category', 'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category', 'features', 'accessories', 'indicative_price', 'price'], dtype='object')


In [68]:
# 检查目标列是否存在
required_columns = ['make', 'type_of_vehicle', 'category', 'transmission', 'price']  # 需要处理的列
missing_columns = [col for col in required_columns if col not in train_df.columns]
if missing_columns:
    raise ValueError(f"数据集中缺少以下列：{missing_columns}")


In [69]:
def target_encode_make(df, column, target):
    """使用MEstimateEncoder对make进行目标编码"""
    # 初始化编码器
    encoder = MEstimateEncoder(
        cols=[column],
        m=5.0,  # 平滑参数
    )
    
    # 训练编码器
    encoder.fit(df, df[target])
    
    # 计算默认值（用于处理未见过的类别）
    default_mean = df[target].mean()
    
    return encoder, default_mean

def apply_target_encoding(df, column, encoder, default_mean):
    """应用编码器到数据集"""
    # 转换数据
    df_encoded = encoder.transform(df)
    # 处理可能的空值
    df[f"{column}_target_encoded"] = df_encoded[column].fillna(default_mean)
    return df

In [70]:
def encode_categories_train(df, column_name):
    """在训练数据上初始化并应用MultiLabelBinarizer，返回编码器以备未来使用"""
    # 将字符串转换为列表，每个类别作为列表的一个元素
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 初始化MultiLabelBinarizer
    mlb = MultiLabelBinarizer()

    # 使用MultiLabelBinarizer进行编码
    mlb.fit_transform(df[f"{column_name}_list"])

    return mlb

def apply_categories_encoding(df, column_name, mlb):
    """应用已保存的MultiLabelBinarizer到新DataFrame"""
    # 转换字符串为列表
    df[f"{column_name}_list"] = df[column_name].apply(lambda x: x.split(', '))

    # 使用MultiLabelBinarizer进行编码
    df_encoded = mlb.transform(df[f"{column_name}_list"])

    # 转换回DataFrame并添加列名
    df_encoded = pd.DataFrame(df_encoded, columns=mlb.classes_, index=df.index)

    # 将编码后的DataFrame合并到原始DataFrame
    df = pd.concat([df, df_encoded], axis=1)
    df = df.drop(columns=[column_name, f"{column_name}_list"])

    return df

In [71]:
def target_encode_make(df, column, target):
    """使用MEstimateEncoder对make进行目标编码"""
    # 初始化编码器
    encoder = MEstimateEncoder(
        cols=[column],
        m=5.0,  # 平滑参数
    )
    
    # 在全量数据上训练编码器
    encoder.fit(df[[column]], df[target])
    
    # 计算默认值（用于处理未见过的类别）
    default_mean = df[target].mean()
    
    return encoder, default_mean

def apply_target_encoding(df, column, encoder, default_mean):
    """应用编码器到数据集"""
    # 创建数据副本并只保留需要的列
    df_temp = df[[column]].copy()
    
    # 转换数据
    encoded_values = encoder.transform(df_temp)
    
    # 将编码结果添加到原始数据框中
    df[f"{column}_target_encoded"] = encoded_values[column]
    
    return df

In [72]:
def onehot_encode_columns_train(df, columns):
    """在训练数据上初始化并应用OneHotEncoder，返回编码器以备未来使用"""
    encoders = {}
    for column in columns:
        onehot_encoder = OneHotEncoder()
        # 注意这里转换为DataFrame是为了保持输入格式一致
        df_encoded = onehot_encoder.fit_transform(df[[column]])
        encoders[column] = onehot_encoder
    return encoders

def apply_onehot_encoding(df, columns, encoders):
    """应用已保存的OneHotEncoder到新DataFrame"""
    for column in columns:
        # 使用已保存的编码器进行transform操作，并转换为数组
        df_encoded = encoders[column].transform(df[[column]]).toarray()

        # 转换回DataFrame，列名使用encoder中的类别名称
        df_encoded = pd.DataFrame(df_encoded, columns=encoders[column].get_feature_names_out([column]), index=df.index)

        # 将编码后的DataFrame合并到原始DataFrame
        df = pd.concat([df, df_encoded], axis=1)
        df = df.drop(columns=column)

    return df

In [73]:
del_cols = ['listing_id', 'original_reg_date','opc_scheme', 'lifespan','eco_category', 'indicative_price']
text_cols = ['title', 'description', 'features', 'accessories'] # may need further processing
date_cols = ['reg_date'] # need to transform
numeric_cols = ['manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'year', 'month']
log_cols = ['manufactured', 'curb_weight', 'power_log', 'engine_cap_log', 'depreciation_log', 'coe', 'road_tax_log', 'dereg_value_log', 'mileage_log', 'omv_log', 'arf_log', 'year', 'month']
root_cols = ['manufactured', 'curb_weight', 'power_root', 'engine_cap_root', 'depreciation_root', 'coe', 'road_tax_root', 'dereg_value_root', 'mileage_root', 'omv_root', 'arf_root', 'year', 'month']
categorical_cols = ['make', 'model', 'type_of_vehicle', 'category', 'transmission', 'fuel_type', 'no_of_owners']

In [74]:
def get_maxmin_dict(data, numeric_cols):
    max_dict = dict()
    min_dict = dict()
    for feature in numeric_cols:
        max_dict[feature] = data[feature].max()
        min_dict[feature] = data[feature].min()
    return max_dict, min_dict


## 清理数据

In [75]:
def preprocess_data_cat(data, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders):
    """处理分类特征"""
    data = data.drop(columns=del_cols)
    data = data.drop(columns=text_cols)
    
    # 应用目标编码
    data = apply_target_encoding(data, 'make', target_encoder, default_mean)

    # 应用多标签二值化编码
    data = apply_categories_encoding(data, 'category', mlb_encoder)

    # 应用OneHot编码
    data = apply_onehot_encoding(data, ['type_of_vehicle', 'fuel_type', 'transmission'], onehot_encoders)

    # 处理日期特征
    data['reg_date'] = pd.to_datetime(data['reg_date'], format='%d-%b-%Y')  
    data['year'] = data['reg_date'].dt.year
    data['month'] = data['reg_date'].dt.month
    data = data.drop(columns='reg_date')
    data['no_of_owners'].fillna(2, inplace=True)
    
    return data

def preprocess_data_num(data, max_dict, min_dict, remove_outliers=False):
    """处理数值特征"""
    # 1. 填充缺失值
    for feature in numeric_cols:
        data[feature] = data[feature].fillna(data[feature].median())
    
    # 2. 创建异常值掩码
    mask = ~((data[numeric_cols] - data[numeric_cols].mean()).abs() > 3 * data[numeric_cols].std()).any(axis=1)
    
    # 如果需要移除异常值
    if remove_outliers:
        data = data[mask]
    
    # 3. 特征转换
    long_tail_features = ['omv', 'arf', 'depreciation', 'dereg_value']
    root_transform_features = ['power', 'engine_cap', 'road_tax', 'mileage']
    
    # 对长尾特征进行变换
    for feature in long_tail_features:
        data[f'{feature}_log'] = np.log1p(data[feature])
        data[f'{feature}_root'] = np.sqrt(data[feature])

    # 对特定特征进行变换
    for feature in root_transform_features:
        data[f'{feature}_root'] = np.sqrt(data[feature])
        data[f'{feature}_log'] = np.log1p(data[feature])
    
    # 4. 归一化处理
    for feature in numeric_cols:
        data[feature] = (data[feature] - data[feature].min()) / (data[feature].max() - data[feature].min())
        if feature in long_tail_features or feature in root_transform_features:
            for transform in ['log', 'root']:
                feat_name = f'{feature}_{transform}'
                max_dict[feat_name] = data[feat_name].max()
                min_dict[feat_name] = data[feat_name].min()
                data[feat_name] = (data[feat_name] - data[feat_name].min()) / (data[feat_name].max() - data[feat_name].min())
    
    return data, mask if remove_outliers else None

In [76]:
X_train, y_train  = train.drop(columns=['price']), train['price']
X_valid, y_valid = valid.drop(columns=['price']), valid['price']
X_test = test

target_encoder, default_mean = target_encode_make(train_df, 'make', 'price')
mlb_encoder = encode_categories_train(train, 'category')
onehot_encoders = onehot_encode_columns_train(train, ['type_of_vehicle', 'fuel_type', 'transmission'])

In [77]:
# 1. 首先处理分类特征
X_train = preprocess_data_cat(X_train, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)
X_valid = preprocess_data_cat(X_valid, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)
X_test = preprocess_data_cat(X_test, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)

# 2. 然后处理数值特征
max_dict, min_dict = get_maxmin_dict(X_train, numeric_cols)

X_train, mask = preprocess_data_num(X_train, max_dict, min_dict, remove_outliers=False)
if mask is not None:
    y_train = y_train[mask]

X_valid, _ = preprocess_data_num(X_valid, max_dict, min_dict, remove_outliers=False)
X_test, _ = preprocess_data_num(X_test, max_dict, min_dict, remove_outliers=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['no_of_owners'].fillna(2, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['no_of_owners'].fillna(2, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [78]:
# # 5. 保存处理后的数据集
# processed_file_path = "./processed_train.csv"
# data.to_csv(processed_file_path, index=False)
# print(f"处理后的数据已保存至：{processed_file_path}")

# with open('max.json', 'w') as f:
#     json.dump(max_dict, f)
    
# with open('min.json', 'w') as f:
#     json.dump(min_dict, f)

In [79]:
# 只使用数值特征

for li in [numeric_cols, log_cols, root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    # print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Root Mean Squared Error: 54055.16124963882
R² Score: 0.8605522380162519
Root Mean Squared Error: 293232.5713536402
R² Score: -2.294859290869838
Root Mean Squared Error: 102662.46848904116
R² Score: 0.46739278240606175


In [80]:
cat_nu_cols = ["manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_log_cols = ["manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_root_cols = ["manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

In [81]:
# 设置显示最大行数
pd.set_option('display.max_rows', None)  # None表示显示所有行，或者设置一个具体的数字

# 设置显示最大列数
pd.set_option('display.max_columns', None)  # None表示显示所有列，或者设置一个具体的数字

# 也可以设置显示的宽度以适应你的屏幕
pd.set_option('display.width', 1000)  # 根据你的屏幕大小调整这个数值
missing_values = X_train.isnull().sum()
missing_percentages = (missing_values / len(X_train)) * 100
missing_info = pd.DataFrame({'Number of missing values': missing_values, 'Percentage of missing values (%)': missing_percentages})
print(missing_info)

                                 Number of missing values  Percentage of missing values (%)
make                                                 1029                             5.145
model                                                   0                             0.000
manufactured                                            0                             0.000
curb_weight                                             0                             0.000
power                                                   0                             0.000
engine_cap                                              0                             0.000
no_of_owners                                            0                             0.000
depreciation                                            0                             0.000
coe                                                     0                             0.000
road_tax                                                0                       

In [82]:
# 使用数值特征 + 分类特征

for li in [cat_nu_cols, cat_log_cols, cat_root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Mean Squared Error: 2145702504.0982034
Root Mean Squared Error: 46321.72820716217
R² Score: 0.9005586474891843
Mean Squared Error: 6081443392.787511
Root Mean Squared Error: 77983.60977017871
R² Score: 0.7011914614537941
Mean Squared Error: 3770552987.5335155
Root Mean Squared Error: 61404.828698836995
R² Score: 0.8132048901320689


## 使用不同模型 （num + cat 特征）

In [None]:
from xgboost import XGBRegressor

# # 创建XGBoost模型
model = XGBRegressor(
    # 基本参数
    n_estimators=1000,        # 树的数量
    max_depth=6,              # 树的最大深度，避免过拟合
    learning_rate=0.01,       # 较小的学习率，提高模型稳定性
    
    # 防止过拟合的参数
    min_child_weight=5,       # 控制过拟合
    gamma=0.1,               # 节点分裂所需的最小损失函数下降值
    subsample=0.8,           # 随机采样训练样本的比例
    colsample_bytree=0.8,    # 随机采样特征的比例
    
    # 正则化参数
    reg_alpha=0.1,           # L1正则化
    reg_lambda=1,            # L2正则化
    
    # 其他参数
    objective='reg:squarederror',  # 回归任务
    random_state=42,
    n_jobs=-1,               # 使用所有CPU核心
    verbosity=0
)

# model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)


# for li in [cat_nu_cols, cat_log_cols, cat_root_cols]:
for li in [cat_nu_cols]:
    
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # 添加早停以防止过拟合
    eval_set = [(X_valid[li], y_valid)]
    
    model.fit(
        X, 
        y,
        eval_set=eval_set
    )

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    # print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    # print(f'R² Score: {r2}')

In [None]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

# 1. LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# 2. CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)

# 3. Stacking
base_models = [
    ('xgb', XGBRegressor(verbosity=0, random_state=42)),
    ('lgb', lgb.LGBMRegressor(verbose=-1, random_state=42)),
    ('cat', CatBoostRegressor(verbose=False, random_seed=42))
]
stacking = StackingRegressor(
    estimators=base_models,
    final_estimator=lgb.LGBMRegressor(verbose=-1),
    cv=5
)

# 4. RandomForest
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 5. GradientBoosting
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# 比较函数
def compare_models(models, X_train, y_train, X_valid, y_valid):
    results = []
    for name, model in models.items():
        print(f"训练 {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        r2 = r2_score(y_valid, y_pred)
        results.append({
            'Model': name,
            'RMSE': rmse,
            'R2': r2
        })
    return pd.DataFrame(results).sort_values('RMSE')

# 比较所有模型
models = {
    'LightGBM': lgb_model,
    'CatBoost': cat_model,
    'Stacking': stacking,
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'XGBoost': model  # 之前定义的XGBoost模型
}

# 使用cat_log_cols特征集（通常对价格预测效果更好）
results = compare_models(models, X_train[cat_nu_cols], y_train, 
                       X_valid[cat_nu_cols], y_valid)
print("\n模型性能比较:")
print(results)

In [None]:
import optuna
from sklearn.ensemble import VotingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# 1. 优化版LightGBM（使用optuna自动调参）
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0)
    }
    
    model = lgb.LGBMRegressor(**params, random_state=42, verbose=-1)
    model.fit(X_train[cat_nu_cols], y_train)
    y_pred = model.predict(X_valid[cat_nu_cols])
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    return rmse

# 运行optuna优化
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# 使用最佳参数创建LightGBM模型
best_lgb = lgb.LGBMRegressor(**study.best_params, random_state=42, verbose=-1)

# 2. Voting集成（组合多个最佳模型）
voting_regressor = VotingRegressor([
    ('lgb', best_lgb),
    ('cat', CatBoostRegressor(
        iterations=2000,
        learning_rate=0.005,
        depth=8,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    )),
    ('xgb', XGBRegressor(
        n_estimators=2000,
        max_depth=7,
        learning_rate=0.005,
        min_child_weight=5,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=42,
        verbosity=0
    ))
])

# 3. 深度神经网络
nn_model = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
    early_stopping=True
)

# 4. SVR with RBF kernel（对于中等规模数据集）
svr_model = SVR(
    kernel='rbf',
    C=1.0,
    epsilon=0.1,
    gamma='scale'
)

# 5. KernelRidge（结合核方法和岭回归）
kr_model = KernelRidge(
    alpha=1.0,
    kernel='rbf',
    gamma=0.1
)

# 更新模型字典
models.update({
    'OptimizedLightGBM': best_lgb,
    'VotingEnsemble': voting_regressor,
    'NeuralNetwork': nn_model,
    'SVR': svr_model,
    'KernelRidge': kr_model
})

# 比较所有模型
results = compare_models(models, X_train[cat_nu_cols], y_train, 
                       X_valid[cat_nu_cols], y_valid)
print("\n增强版模型性能比较:")
print(results)

In [None]:
# 1. 准备数据
X_full = train_df.drop(columns=['price'])
y_full = train_df['price']
X_test = test

# 2. 特征处理
# 对全量训练数据进行编码
target_encoder, default_mean = target_encode_cross_validation(train_df, 'make', 'price')
mlb_encoder = encode_categories_train(train_df, 'category')
onehot_encoders = onehot_encode_columns_train(train_df, ['type_of_vehicle', 'fuel_type', 'transmission'])

# 处理分类特征
X_full = preprocess_data_cat(X_full, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)
X_test = preprocess_data_cat(X_test, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders)

# 处理数值特征
max_dict, min_dict = get_maxmin_dict(X_full, numeric_cols)
X_full, _ = preprocess_data_num(X_full, max_dict, min_dict, remove_outliers=False)
X_test, _ = preprocess_data_num(X_test, max_dict, min_dict, remove_outliers=False)

# 3. 使用表现最好的模型 - OptimizedLightGBM
final_model = lgb.LGBMRegressor(
    n_estimators=966,
    learning_rate=0.00747521912854598,
    num_leaves=30,
    max_depth=12,
    min_child_samples=14,
    subsample=0.6966427407695847,
    colsample_bytree=0.661691991287904,
    reg_alpha=8.153776462333733,
    reg_lambda=0.0013136798177146663
)

# 4. 在全量训练数据上训练模型
print("训练最终模型...")
final_model.fit(X_full[cat_nu_cols], y_full)

# 5. 对测试集进行预测
print("生成预测结果...")
test_predictions = final_model.predict(X_test[cat_nu_cols])

# 6. 保存预测结果
predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})
predictions_df.to_csv('predictions.csv', index=False)

print("\n预测完成！结果已保存到 predictions.csv")
print("\n预测结果统计：")
print(predictions_df['Predicted'].describe())

# 7. 输出特征重要性
feature_importance = pd.DataFrame({
    'feature': cat_nu_cols,
    'importance': final_model.feature_importances_
})
print("\n最重要的10个特征：")
print(feature_importance.sort_values('importance', ascending=False).head(10))