# 1 特征工程优化

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 加载已预处理的数据
df = pd.read_csv('../datasets/walmart_preprocessed.csv')

In [2]:
# 1. 基于EDA添加更多交互特征
# 从热力图可以看出有强相关性的特征
df['Age_City'] = df['Age'].astype(str) + "_" + df['City_Category']
df['Gender_City'] = df['Gender'] + "_" + df['City_Category']
df['Age_Marital'] = df['Age'].astype(str) + "_" + df['Marital_Status'].astype(str)

# 2. 基于RFM分析添加客户价值分段
# R(Recency)：停留年限的倒数
df['Recency_Score'] = 5 - df['Stay_Years']
# F(Frequency)：购买频次分段
df['Frequency_Score'] = pd.qcut(df['Purchase_Count'], 5, labels=[1, 2, 3, 4, 5])
# M(Monetary)：总购买金额分段
df['Monetary_Score'] = pd.qcut(df['Total_User_Purchase'], 5, labels=[1, 2, 3, 4, 5])
# 综合RFM得分
df['RFM_Score'] = df['Recency_Score'].astype(str) + df['Frequency_Score'].astype(str) + df['Monetary_Score'].astype(str)

In [3]:
# 3. 产品类别的高阶特征
# 按产品类别的购买方差
product_var = df.groupby('Product_Category')['Purchase'].var().reset_index()
product_var.columns = ['Product_Category', 'Product_Purchase_Var']
df = pd.merge(df, product_var, on='Product_Category', how='left')

# 4. 城市和职业的交互
city_occupation = df.groupby(['City_Category', 'Occupation'])['Purchase'].mean().reset_index()
city_occupation.columns = ['City_Category', 'Occupation', 'City_Occupation_Mean']
df = pd.merge(df, city_occupation, on=['City_Category', 'Occupation'], how='left')

In [4]:
# 5. 用户购买习惯特征
# 用户偏好的产品类别
user_category_pref = df.groupby(['User_ID', 'Product_Category']).size().reset_index(name='Category_Purchase_Count')
user_max_category = user_category_pref.loc[user_category_pref.groupby('User_ID')['Category_Purchase_Count'].idxmax()]
user_max_category.columns = ['User_ID', 'Preferred_Category', 'Category_Count']
df = pd.merge(df, user_max_category[['User_ID', 'Preferred_Category']], on='User_ID', how='left')

# 6. 创建购买频率和金额的比率特征
df['Avg_Transaction_Value'] = df['Total_User_Purchase'] / df['Purchase_Count']

# 7. 对一些特征进行多项式转换
df['Product_Category_Squared'] = df['Product_Category'] ** 2
df['Occupation_Squared'] = df['Occupation'] ** 2

# 8. 对数转换购买金额 (目标变量)
df['Purchase_Log'] = np.log1p(df['Purchase'])

In [5]:
# 准备最终特征集
categorical_features = ['Gender', 'Age', 'City_Category', 'Marital_Status', 
                       'Gender_City', 'Age_City', 'Age_Marital', 'RFM_Score', 
                       'Preferred_Category']

numerical_features = ['Occupation', 'Stay_Years', 'Product_Category',
                     'City_Code', 'Gender_Code', 'Total_User_Purchase',
                     'Purchase_Count', 'Avg_User_Purchase', 'Avg_Category_Purchase',
                     'Product_Purchase_Var', 'City_Occupation_Mean',
                     'Avg_Transaction_Value', 'Product_Category_Squared',
                     'Occupation_Squared', 'Recency_Score']

# 2 特征选择

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectFromModel, RFE

# 准备X和y
target = 'Purchase'  # 或使用'Purchase_Log'如果您想预测log转换后的购买金额
X = df.drop(['Purchase', 'Purchase_Log'], axis=1)
y = df[target]

# 去除不需要的列
cols_to_drop = ['User_ID', 'Product_ID', 'Stay_In_Current_City_Years', 'Purchase_Normalized',
                'Purchase_Standardized', 'Age_Category']
X = X.drop([col for col in cols_to_drop if col in X.columns], axis=1)

In [7]:
# 分割训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.impute import SimpleImputer

# 预处理管道
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

available_num_features = [col for col in numerical_features if col in X_train.columns]
available_cat_features = [col for col in categorical_features if col in X_train.columns]

print("使用的数值特征:", available_num_features)
print("使用的分类特征:", available_cat_features)

使用的数值特征: ['Occupation', 'Stay_Years', 'Product_Category', 'City_Code', 'Gender_Code', 'Total_User_Purchase', 'Purchase_Count', 'Avg_User_Purchase', 'Avg_Category_Purchase', 'Product_Purchase_Var', 'City_Occupation_Mean', 'Avg_Transaction_Value', 'Product_Category_Squared', 'Occupation_Squared', 'Recency_Score']
使用的分类特征: ['Gender', 'Age', 'City_Category', 'Marital_Status', 'Gender_City', 'Age_City', 'Age_Marital', 'RFM_Score', 'Preferred_Category']


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, [col for col in numerical_features if col in X.columns]),
        ('cat', categorical_transformer, [col for col in categorical_features if col in X.columns])
    ])

In [10]:
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_test = preprocessor.transform(X_test)

print(f"预处理后特征数量: {preprocessed_X_train.shape[1]}")

预处理后特征数量: 149


# 3 基础模型构建与训练

## 3.1 构建

In [11]:
# 构建基础模型
base_models = [
    ('rf', RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)),
    ('gbm', GradientBoostingRegressor(n_estimators=50, random_state=42)),
    ('xgb', xgb.XGBRegressor(n_estimators=50, n_jobs=-1, random_state=42)),
    ('lgbm', lgb.LGBMRegressor(n_estimators=50, n_jobs=-1, random_state=42))
]

In [13]:
import time
# 动态确定样本大小
total_size = len(X_train)
print(f"训练集总大小: {total_size}行")

# 根据数据集大小自动确定样本大小
if total_size < 5000:
    # 如果数据集很小，使用全部数据
    sample_size = total_size
    print(f"数据集较小，使用全部{sample_size}个样本")
elif total_size < 50000:
    # 中等大小数据集，使用20%或5000个样本(取较大值)
    sample_size = max(int(total_size * 0.2), 5000)
    print(f"数据集中等大小，使用{sample_size}个样本(约{sample_size/total_size*100:.1f}%的数据)")
else:
    # 大型数据集，使用10%或10000个样本(取较大值)
    sample_size = max(int(total_size * 0.1), 10000)
    print(f"数据集较大，使用{sample_size}个样本(约{sample_size/total_size*100:.1f}%的数据)")

训练集总大小: 440054行
数据集较大，使用44005个样本(约10.0%的数据)


## 3.2 训练

In [14]:
# 如果有足够的样本，则取样本进行训练
if sample_size < total_size:
    sample_indices = np.random.choice(total_size, sample_size, replace=False)
    sample_X_train = preprocessed_X_train[sample_indices]
    sample_y_train = y_train.iloc[sample_indices]
    
    print(f"从{total_size}行数据中抽取{sample_size}行用于快速模型评估")
    
    # 使用样本数据训练模型
    for name, model in base_models:
        start_time = time.time()
        model.fit(sample_X_train, sample_y_train)
        train_time = time.time() - start_time
        
        y_pred = model.predict(preprocessed_X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        print(f"{name} (样本训练) - R²: {r2:.4f}, RMSE: {rmse:.2f}, 训练时间: {train_time:.2f}秒")
else:
    # 数据集很小，使用全部数据
    print("数据集较小，使用全部数据进行训练")
    
    for name, model in base_models:
        start_time = time.time()
        model.fit(preprocessed_X_train, y_train)
        train_time = time.time() - start_time
        
        y_pred = model.predict(preprocessed_X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        print(f"{name} - R²: {r2:.4f}, RMSE: {rmse:.2f}, 训练时间: {train_time:.2f}秒")

从440054行数据中抽取44005行用于快速模型评估
rf (样本训练) - R²: 0.6239, RMSE: 3074.26, 训练时间: 16.87秒
gbm (样本训练) - R²: 0.6712, RMSE: 2874.09, 训练时间: 5.25秒
xgb (样本训练) - R²: 0.6708, RMSE: 2875.86, 训练时间: 0.18秒
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 44005, number of used features: 142
[LightGBM] [Info] Start training from score 9267.369549
lgbm (样本训练) - R²: 0.6738, RMSE: 2863.01, 训练时间: 0.13秒


# 4 随机森林模型

## 4.1 训练

In [18]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

param_grid = {
    'model__n_estimators': [200, 300, 400],
    'model__max_depth': [None, 15, 20, 25],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt']
}

## 4.2 调参

In [19]:
# 随机森林模型调参
grid_search = RandomizedSearchCV(
    rf_pipeline, 
    param_distributions=param_grid,
    n_iter=20,
    cv=5, 
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [20]:
sample_indices = np.random.choice(len(X_train), sample_size, replace=False)
X_train_sample = X_train.iloc[sample_indices]
y_train_sample = y_train.iloc[sample_indices]

print("在样本数据上进行参数搜索...")
grid_search.fit(X_train_sample, y_train_sample)

print("最佳参数:")
print(grid_search.best_params_)

在样本数据上进行参数搜索...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
最佳参数:
{'model__n_estimators': 400, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_features': 'sqrt', 'model__max_depth': 20}


## 4.3 最终模型

In [24]:
# 获取最佳随机森林模型
best_rf = RandomForestRegressor(
    n_estimators=400,           # 来自网格搜索结果
    max_depth=20,               # 来自网格搜索结果
    min_samples_split=10,       # 来自网格搜索结果
    min_samples_leaf=4,         # 来自网格搜索结果
    max_features='sqrt',        # 来自网格搜索结果
    random_state=42,
    n_jobs=-1                   # 使用多核加速
)

# 创建包含预处理器和最佳模型的管道
final_rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_rf)
])

In [None]:
# 在完整训练集上训练最终模型
print("在完整训练集上训练最终模型...")
final_rf_pipeline.fit(X_train, y_train)

## 4.4 保存模型与参数字典

In [22]:
import pickle
import joblib
import os

# 创建保存模型的目录（如果不存在）
os.makedirs('../models', exist_ok=True)

In [25]:
print("保存完整的随机森林管道...")
joblib.dump(final_rf_pipeline, '../models/final_rf_pipeline.joblib')

保存完整的随机森林管道...


['../models/final_rf_pipeline.joblib']

In [23]:
best_params = {
    'n_estimators': 400,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'max_features': 'sqrt',
    'random_state': 42
}

# 将最佳参数保存为pickle文件
with open('../models/best_rf_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

## 4.5 测试集训练

In [None]:
# 在测试集上评估模型
y_pred_rf = final_rf_pipeline.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print(f"最终随机森林模型 - R²: {r2_rf:.4f}, RMSE: {rmse_rf:.2f}")

# 5 Stacking集成模型

In [None]:
from sklearn.linear_model import LinearRegression, Ridge

## 5.1 定义基础模型

In [None]:
base_estimators = [
    ('rf', RandomForestRegressor(
        n_estimators=grid_search.best_params_['model__n_estimators'],
        max_depth=grid_search.best_params_['model__max_depth'],
        min_samples_split=grid_search.best_params_['model__min_samples_split'],
        min_samples_leaf=grid_search.best_params_['model__min_samples_leaf'],
        random_state=42
    )),
    ('gbm', GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    )),
    ('xgb', xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    )),
    ('lgbm', lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    ))
]

## 4.5 Ridge 作为最终模型

In [None]:
stacking = StackingRegressor(
    estimators=base_estimators,
    final_estimator=Ridge(),
    cv=5
)

# 创建完整的Stacking管道
stacking_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking)
])

# 训练Stacking模型
stacking_pipeline.fit(X_train, y_train)

In [None]:
# 在测试集上评估Stacking模型
y_pred_stack = stacking_pipeline.predict(X_test)
r2_stack = r2_score(y_test, y_pred_stack)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))

print(f"Stacking集成 - R²: {r2_stack:.4f}, RMSE: {rmse_stack:.2f}")