In [15]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# 读取处理后的数据集
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_valid = pd.read_csv('data/processed/X_valid.csv')
y_valid = pd.read_csv('data/processed/y_valid.csv') 
X_train_full = pd.read_csv('data/processed/X_train_full.csv') 
y_train_full = pd.read_csv('data/processed/y_train_full.csv') 
X_test = pd.read_csv('data/processed/X_test.csv')

print('数据集读取完成')
print(f'训练集大小: {X_train.shape}')
print(f'验证集大小: {X_valid.shape}')
print(f'全量训练集大小: {X_train_full.shape}')
print(f'测试集大小: {X_test.shape}')


数据集读取完成
训练集大小: (18150, 68)
验证集大小: (5000, 68)
全量训练集大小: (25000, 68)
测试集大小: (10000, 68)


In [16]:
# 从JSON文件读取列名配置
with open('./data/columns.json', 'r') as f:
    columns_dict = json.load(f)

# 从字典中读取列名
del_cols = columns_dict['del_cols']
text_cols = columns_dict['text_cols'] 
date_cols = columns_dict['date_cols']
numeric_cols = columns_dict['numeric_cols']
log_cols = columns_dict['log_cols']
root_cols = columns_dict['root_cols']
categorical_cols = columns_dict['categorical_cols']

# 定义变换列
cat_nu_cols = ["manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_log_cols = ["manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]
cat_root_cols = ["manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", "make_target_encoded", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

## Baseline

In [17]:
# 使用数值特征 + 分类特征
for li in [cat_nu_cols, cat_log_cols, cat_root_cols]:
    # Assuming 'data' is your DataFrame and 'target' is the name of your target variable
    X = X_train[li]  # Features
    y = y_train       # Target variable

    # Creating the linear regression model
    model = LinearRegression()

    # Fitting the model
    model.fit(X, y)

    # Making predictions
    y_valid_pred = model.predict(X_valid[li])

    # Calculating the performance metrics
    mse = mean_squared_error(y_valid_pred, y_valid)
    r2 = r2_score(y_valid_pred, y_valid)
    # Calculating the RMSE
    rmse = math.sqrt(mse)

    # Printing the MSE, RMSE, and R² Score
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R² Score: {r2}')

Mean Squared Error: 2883238432.9251704
Root Mean Squared Error: 53695.79530023901
R² Score: 0.8240911336521936
Mean Squared Error: 8032677933.33663
Root Mean Squared Error: 89625.20813552753
R² Score: 0.02666173374675751
Mean Squared Error: 5336926720.111189
Root Mean Squared Error: 73054.2724288675
R² Score: 0.4540321116014323


## Other ML Regression Models

In [19]:
from xgboost import XGBRegressor

# # 创建XGBoost模型
model = XGBRegressor(
    # 基本参数
    n_estimators=1000,        # 树的数量
    max_depth=6,              # 树的最大深度，避免过拟合
    learning_rate=0.01,       # 较小的学习率，提高模型稳定性
    
    # 防止过拟合的参数
    min_child_weight=5,       # 控制过拟合
    gamma=0.1,               # 节点分裂所需的最小损失函数下降值
    subsample=0.8,           # 随机采样训练样本的比例
    colsample_bytree=0.8,    # 随机采样特征的比例
    
    # 正则化参数
    reg_alpha=0.1,           # L1正则化
    reg_lambda=1,            # L2正则化
    
    # 其他参数
    objective='reg:squarederror',  # 回归任务
    random_state=42,
    n_jobs=-1,               # 使用所有CPU核心
    verbosity=0
)


# Assuming 'data' is your DataFrame and 'target' is the name of your target variable
X = X_train[cat_nu_cols]  # Features
y = y_train       # Target variable

# 添加早停以防止过拟合
eval_set = [(X_valid[cat_nu_cols], y_valid)]

model.fit(
    X, 
    y,
    eval_set=eval_set
)

# Making predictions
y_valid_pred = model.predict(X_valid[cat_nu_cols])

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
# print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
# print(f'R² Score: {r2}')

[0]	validation_0-rmse:142315.79172
[1]	validation_0-rmse:141068.99195
[2]	validation_0-rmse:139802.93483
[3]	validation_0-rmse:138558.04800
[4]	validation_0-rmse:137343.66625
[5]	validation_0-rmse:136157.88805
[6]	validation_0-rmse:134969.11891
[7]	validation_0-rmse:133794.88149
[8]	validation_0-rmse:132626.02587
[9]	validation_0-rmse:131487.07828
[10]	validation_0-rmse:130362.56039
[11]	validation_0-rmse:129227.38033
[12]	validation_0-rmse:128125.81786
[13]	validation_0-rmse:127025.29280
[14]	validation_0-rmse:125907.00822
[15]	validation_0-rmse:124853.71941
[16]	validation_0-rmse:123767.21615
[17]	validation_0-rmse:122681.79719
[18]	validation_0-rmse:121660.83232
[19]	validation_0-rmse:120628.46347
[20]	validation_0-rmse:119647.20753
[21]	validation_0-rmse:118661.73960
[22]	validation_0-rmse:117680.42062
[23]	validation_0-rmse:116658.98449
[24]	validation_0-rmse:115704.17246
[25]	validation_0-rmse:114763.44273
[26]	validation_0-rmse:113798.52569
[27]	validation_0-rmse:112860.48119
[2

In [20]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

# 1. LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# 2. CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)

# 3. Stacking
base_models = [
    ('xgb', XGBRegressor(verbosity=0, random_state=42)),
    ('lgb', lgb.LGBMRegressor(verbose=-1, random_state=42)),
    ('cat', CatBoostRegressor(verbose=False, random_seed=42))
]
stacking = StackingRegressor(
    estimators=base_models,
    final_estimator=lgb.LGBMRegressor(verbose=-1),
    cv=5
)

# 4. RandomForest
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 5. GradientBoosting
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# 比较函数
def compare_models(models, X_train, y_train, X_valid, y_valid):
    results = []
    for name, model in models.items():
        print(f"训练 {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        r2 = r2_score(y_valid, y_pred)
        results.append({
            'Model': name,
            'RMSE': rmse,
            'R2': r2
        })
    return pd.DataFrame(results).sort_values('RMSE')

# 比较所有模型
models = {
    'LightGBM': lgb_model,
    'CatBoost': cat_model,
    'Stacking': stacking,
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'XGBoost': model  # 之前定义的XGBoost模型
}

# 使用cat_log_cols特征集（通常对价格预测效果更好）
results = compare_models(models, X_train[cat_nu_cols], y_train, 
                       X_valid[cat_nu_cols], y_valid)
print("\n模型性能比较:")
print(results)

训练 LightGBM
训练 CatBoost
训练 Stacking


  y = column_or_1d(y, warn=True)


训练 RandomForest


  return fit_method(estimator, *args, **kwargs)


训练 GradientBoosting


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


训练 XGBoost

模型性能比较:
              Model          RMSE        R2
5           XGBoost  23555.256325  0.973065
0          LightGBM  24887.222889  0.969932
1          CatBoost  27791.193073  0.962506
2          Stacking  28904.091927  0.959443
3      RandomForest  28959.985562  0.959286
4  GradientBoosting  29832.461299  0.956796


## Prediction on test set

In [21]:
# 选择表现最好的模型进行最终训练和预测
best_model = models['XGBoost']  # 可以根据上面的results选择最佳模型

print("使用全量数据训练最终模型...")
best_model.fit(X_train_full[cat_nu_cols], y_train_full)

print("生成测试集预测结果...")
test_predictions = best_model.predict(X_test[cat_nu_cols])

# 创建预测结果DataFrame
predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})

# 保存预测结果
predictions_df.to_csv('data/predictions.csv', index=False)
print("预测结果已保存到 data/predictions.csv")

使用全量数据训练最终模型...
生成测试集预测结果...
预测结果已保存到 data/predictions.csv
