In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import time

# Data loading and preprocessing
def load_and_preprocess_data(file_path):
    """Read a CSV file and split it into features and the ``price`` target.

    A leading column named ``'Unnamed: 0'`` is dropped before the split.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        A tuple ``(X, y)``. When the file has a ``price`` column, ``X`` is the
        frame without it and ``y`` is that column. Otherwise ``X`` is the whole
        frame and ``y`` is ``None``.
    """
    frame = pd.read_csv(file_path)

    leading_column = frame.columns[0]
    if leading_column == 'Unnamed: 0':
        frame = frame.drop(columns=leading_column)

    # No target column: hand everything back as features.
    if 'price' not in frame.columns:
        return frame, None

    target = frame['price']
    features = frame.drop('price', axis=1)
    return features, target

# Column transformer definition
def get_column_transformer(X):
    """Build the (unfitted) ``ColumnTransformer`` used to preprocess features.

    The fixed list of numeric columns below goes through ``StandardScaler``;
    every other column of ``X`` goes through
    ``OneHotEncoder(handle_unknown='ignore')``.

    Args:
        X: Object exposing ``.columns``; only the column names are read here.

    Returns:
        A ``ColumnTransformer`` with a ``'num'`` and a ``'cat'`` step.
    """
    numeric_features = [
        'curb_weight', 'power', 'engine_cap', 'no_of_owners', 'depreciation',
        'coe', 'road_tax', 'dereg_value', 'omv', 'arf', 'vehicle_age',
    ]

    # Every column that is not in the numeric list is treated as categorical.
    categorical_features = []
    for column in X.columns:
        if column not in numeric_features:
            categorical_features.append(column)

    scaling_step = ('num', StandardScaler(), numeric_features)
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

    # sparse_threshold=0 makes the transformer emit a dense matrix.
    return ColumnTransformer(
        transformers=[scaling_step, encoding_step],
        sparse_threshold=0,
    )

# Model training
def train_evaluate_random_forest(X_train, X_val, y_train, y_val, params):
    """Fit a ``RandomForestRegressor`` built from ``params`` on the training data.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        X_val: Accepted but not used by this function.
        y_train: Training targets passed to ``fit``.
        y_val: Accepted but not used by this function.
        params: Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns:
        The fitted model.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return RandomForestRegressor(**params).fit(X_train, y_train)

# Model evaluation
def evaluate_model(model, X, y):
    """Predict on ``X`` and score the predictions against ``y``.

    Args:
        model: Object exposing ``predict``.
        X: Inputs handed to ``model.predict``.
        y: Reference targets to compare the predictions with.

    Returns:
        A tuple ``(rmse, r2, predictions)`` where ``rmse`` is the square root
        of ``mean_squared_error`` and ``r2`` comes from ``r2_score``.
    """
    y_hat = model.predict(X)
    root_mse = np.sqrt(mean_squared_error(y, y_hat))
    return root_mse, r2_score(y, y_hat), y_hat

# Compute and rank feature importances
def get_feature_importance(model, feature_names):
    """Pair each feature name with its importance, sorted highest first.

    Args:
        model: Object exposing ``feature_importances_``.
        feature_names: Iterable of names, one per importance value, in the
            same order as ``model.feature_importances_``.

    Returns:
        A list of ``(name, importance)`` tuples in descending order of
        importance. Entries with equal importance keep their input order.

    Raises:
        ValueError: If the number of names differs from the number of
            importance values. ``zip`` would otherwise truncate silently and
            report importances against the wrong or an incomplete name list.
    """
    importances = model.feature_importances_
    feature_names = list(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} feature importances"
        )

    return sorted(
        zip(feature_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )

# Main entry point
def main():
    """Train, evaluate and persist a random-forest price model, then predict.

    Steps, in order:
      1. Load ``train_cleaned.csv`` and split it 80/20 into train/test, then
         split 10% of the training part off as a validation set.
      2. Fit the column transformer on the training split only and apply it
         to all three splits; pickle it to ``preprocessor_random_forest.pkl``.
      3. Train the random forest, print RMSE/R2 for every split and the
         ranked feature importances, and pickle the model to
         ``random_forest_model.pkl``.
      4. Load ``test_cleaned_new.csv``, predict, replace any NaN prediction
         with the training-target mean, and write the rounded predictions to
         ``submission_random_forest.csv``.
    """
    # Seed NumPy's global RNG for reproducibility
    np.random.seed(42)

    # Load the training data
    X, y = load_and_preprocess_data('train_cleaned.csv')

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Carve a validation set out of the training set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Preprocess: fit on the training split only, then transform the others
    preprocessor = get_column_transformer(X)
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Rebuild the output feature names: numeric columns first, then the
    # one-hot encoded categorical columns
    numeric_features = ['curb_weight', 'power', 'engine_cap', 'no_of_owners', 'depreciation',
                        'coe', 'road_tax', 'dereg_value', 'omv', 'arf', 'vehicle_age']
    categorical_features = [col for col in X.columns if col not in numeric_features]
    onehot_encoder = preprocessor.named_transformers_['cat']
    # Use get_feature_names_out when the encoder has it, else get_feature_names
    if hasattr(onehot_encoder, 'get_feature_names_out'):
        cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
    else:
        cat_feature_names = onehot_encoder.get_feature_names(categorical_features)
    feature_names = numeric_features + cat_feature_names.tolist()

    # Save the fitted preprocessor
    with open('preprocessor_random_forest.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)

    # Random forest hyper-parameters
    params = {
        'n_estimators': 200,
        'max_depth': 30,
        'min_samples_split': 2,
        'min_samples_leaf': 2,
        'random_state': 42,
        'n_jobs': -1
    }

    # Train the model
    start_time = time.time()
    model = train_evaluate_random_forest(X_train_preprocessed, X_val_preprocessed, y_train, y_val, params)
    elapsed_time = time.time() - start_time
    print(f"Training time: {elapsed_time/60:.2f} minutes")

    # Evaluate the model on every split
    rmse_train, r2_train, _ = evaluate_model(model, X_train_preprocessed, y_train)
    rmse_val, r2_val, _ = evaluate_model(model, X_val_preprocessed, y_val)
    rmse_test, r2_test, predictions_test = evaluate_model(model, X_test_preprocessed, y_test)

    print(f"Training Set - RMSE: {rmse_train:.4f}, R2: {r2_train:.4f}")
    print(f"Validation Set - RMSE: {rmse_val:.4f}, R2: {r2_val:.4f}")
    print(f"Test Set - RMSE: {rmse_test:.4f}, R2: {r2_test:.4f}")

    # Compute and print the ranked feature importances
    feature_importance = get_feature_importance(model, feature_names)
    print("\nImportance of features:")
    for feature, importance in feature_importance:
        print(f"{feature}: {importance:.4f}")

    # Save the model
    with open('random_forest_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    print("Model and preprocessor saved")

    # Load the new test data and predict
    X_new_test, _ = load_and_preprocess_data('test_cleaned_new.csv')
    # Reload the preprocessor that was pickled above. The context manager
    # guarantees the file handle is closed (the bare open() call leaked it).
    with open('preprocessor_random_forest.pkl', 'rb') as f:
        preprocessor_loaded = pickle.load(f)
    X_new_test_preprocessed = preprocessor_loaded.transform(X_new_test)

    predictions_new_test = model.predict(X_new_test_preprocessed)

    # Check the predictions for NaN values
    nan_count = np.isnan(predictions_new_test).sum()
    if nan_count > 0:
        print(f"警告: 在 {len(predictions_new_test)} 个总预测中发现 {nan_count} 个NaN预测。")

        # Locate the rows that produced NaN
        nan_indices = np.where(np.isnan(predictions_new_test))[0]
        print("NaN预测的索引:", nan_indices)

        # Show the raw inputs behind those rows
        problematic_inputs = X_new_test.iloc[nan_indices]
        print("有问题的输入:")
        print(problematic_inputs)

        # Replace NaN predictions with the mean of the training targets
        train_mean = y_train.mean()
        predictions_new_test[np.isnan(predictions_new_test)] = train_mean
        print(f"用训练集平均值替换NaN值: {train_mean}")

    # Build the submission; round before casting so floats do not truncate
    submission = pd.DataFrame({
        'Id': range(len(predictions_new_test)),
        'Predicted': np.round(predictions_new_test).astype(int)
    })

    submission.to_csv('submission_random_forest.csv', index=False)
    print("Prediction completed. Submission saved to 'submission_random_forest.csv'")

    # Print summary statistics of the (unrounded) predictions
    print("\nPrediction Statistics::")
    print(f"Min: {predictions_new_test.min()}")
    print(f"Max: {predictions_new_test.max()}")
    print(f"Mean: {predictions_new_test.mean()}")
    print(f"Median: {np.median(predictions_new_test)}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Training time: 0.14 minutes
Training Set - RMSE: 13199.7005, R2: 0.9928
Validation Set - RMSE: 14975.7194, R2: 0.9917
Test Set - RMSE: 15497.1956, R2: 0.9863

Importance of features:
dereg_value: 0.5880
arf: 0.2955
depreciation: 0.0632
vehicle_age: 0.0165
power: 0.0083
coe: 0.0072
rare & exotic_0: 0.0059
omv: 0.0048
rare & exotic_1: 0.0036
curb_weight: 0.0018
road_tax: 0.0007
engine_cap: 0.0006
no_of_owners: 0.0004
manufactured_2023: 0.0003
manufactured_2015: 0.0002
manufactured_2017: 0.0002
parf car_1: 0.0002
parf car_0: 0.0001
coe car_1: 0.0001
manufactured_2010: 0.0001
manufactured_2016: 0.0001
almost new car_1: 0.0001
manufactured_2020: 0.0001
coe car_0: 0.0001
almost new car_0: 0.0001
manufactured_2019: 0.0001
premium ad car_0: 0.0001
premium ad car_1: 0.0001
vehicle_type_sports_car_0: 0.0001
manufactured_2021: 0.0001
vehicle_type_sports_car_1: 0.0001
vehicle_type_suv_1: 0.0001
low mileage car_0: 0.0001
manufactured_2014: 0.0001
manufactured_2022: 0.0001
low mileage car_1: 0.0001


Training time: 1.18 minutes
Training Set - RMSE: 12723.9134, R2: 0.9933
Validation Set - RMSE: 17587.5593, R2: 0.9886
Test Set - RMSE: 17828.9103, R2: 0.9819
Model and preprocessor saved
Prediction completed. Submission saved to 'submission_random_forest.csv'

Prediction Statistics::
Min: 18317.386960891305
Max: 2613896.93
Mean: 112673.36727802617
Median: 76334.6058145435

dereg_value: 0.6259
arf: 0.2538
depreciation: 0.0615
vehicle_age: 0.0157
power: 0.0088
coe: 0.0065
rare & exotic_0: 0.0058
omv: 0.0052
rare & exotic_1: 0.0042
curb_weight: 0.0027

| Feature          | Importance | Feature          | Importance |
|------------------|------------|------------------|------------|
| dereg_value      | 0.6259     | rare & exotic_0  | 0.0058     |
| arf              | 0.2538     | omv              | 0.0052     |
| depreciation     | 0.0615     | rare & exotic_1  | 0.0042     |
| vehicle_age      | 0.0157     | curb_weight      | 0.0027     |
| power            | 0.0088     | coe              | 0.0065     |