In [None]:
# ===============================================================
# 全流程：预处理 ➜ 特征精简 ➜ 两套 RandomForest ➜ 结果对比
# ===============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import joblib
from datetime import datetime

# ---------------- Load data ----------------
# The cleaned CSV is read as comma-separated; the separator is spelled out
# explicitly so the expected delimiter is visible at the call site.
df = pd.read_csv('marketing_campaign_CLEANED.csv', sep=',')

# ---------- 1. Parse Dt_Customer and derive a days-since-signup column ----------
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')

# Treat the latest signup date in the data as "today"; a fixed date such as
# datetime(2025, 1, 1) could be substituted instead.
ref_date = df['Dt_Customer'].max()
df['DaysSinceSignup'] = (ref_date - df['Dt_Customer']).dt.days

# ---------- 2. Routine preprocessing ----------
df['Age'] = 2025 - df['Year_Birth']
df = df[df['Age'] <= 100]            # discard rows whose computed age exceeds 100
df = df.dropna(subset=['Income'])    # rows without Income are removed, not imputed

# One-hot encode the two categorical columns (first level dropped).
df = pd.get_dummies(df, columns=['Education', 'Marital_Status'], drop_first=True)

# TotalSpent: spending across every product category except MntWines (the target).
df['TotalSpent'] = df[['MntFruits', 'MntMeatProducts', 'MntFishProducts',
                       'MntSweetProducts', 'MntGoldProds']].sum(axis=1)

# Activity filter: keep customers whose spending over all categories
# (wine included) reaches Setting_Amount.
Setting_Amount = 50
df['TotalSpendingAll'] = df[['MntWines', 'MntFruits', 'MntMeatProducts',
                             'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']].sum(axis=1)
df_active = df[df['TotalSpendingAll'] >= Setting_Amount].copy()

# ---------- 3. Drop the raw date column; DaysSinceSignup carries that information ----------
df_active = df_active.drop(columns=['Dt_Customer'])

# ---------- 4. Target & features ----------
y = df_active['MntWines'].astype(float)
# TotalSpendingAll contains the target itself, so it must not be a feature.
X_full = df_active.drop(columns=['MntWines', 'TotalSpendingAll'])
# ID is a row identifier, not a customer attribute: a forest can split on it
# and pick up spurious importance, so it is excluded from the predictors.
# errors='ignore' keeps this a no-op when the file has no ID column.
X_full = X_full.drop(columns=['ID'], errors='ignore')


# ----------------------- 3. Hold out the test set first -----------------------
# The split happens BEFORE any feature ranking so that test rows cannot
# influence which features are kept; ranking on the full data would leak
# test-set information into the reduced model and bias the comparison below.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.20, random_state=42)

# ----------------------- 4. Rank features with a first forest -----------------
# Fitted on the training rows only (see note above).
rf_init = RandomForestRegressor(
    n_estimators=400, random_state=42, n_jobs=-1)
rf_init.fit(X_train_full, y_train)

# Keep the features whose importance is strictly above the 0.01 threshold.
imp_series = pd.Series(rf_init.feature_importances_, index=X_train_full.columns)
important_features = imp_series[imp_series > 0.01].index.tolist()
X_reduced = X_full[important_features]
print(important_features)

print(f"❯❯ 重要性阈值 0.01 后，特征数：{len(important_features)} / {X_full.shape[1]}")

# Reduced train/test matrices are column subsets of the same split, so both
# models are trained and evaluated on exactly the same rows.
X_train_red  = X_train_full[important_features]
X_test_red   = X_test_full[important_features]

# ----------------------- 5. Fit the two forests ------------------
# Fixed hyper-parameters shared by both models; a GridSearchCV over a wider
# param_grid could be substituted here.
rf_params = {
    'n_estimators': 500,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1,
}

# Model on every feature.
rf_full = RandomForestRegressor(**rf_params)
rf_full.fit(X_train_full, y_train)

# Model on the importance-filtered subset.
rf_red = RandomForestRegressor(**rf_params)
rf_red.fit(X_train_red, y_train)

# ----------------------- 6. Predict & compare metrics ------------------------
pred_full = rf_full.predict(X_test_full)
pred_red = rf_red.predict(X_test_red)

r2_full = r2_score(y_test, pred_full)
mse_full = mean_squared_error(y_test, pred_full)
r2_red = r2_score(y_test, pred_red)
mse_red = mean_squared_error(y_test, pred_red)

# Fixed-width table: model name, feature count, R², MSE.
print("\n================= RandomForest 对比 =================")
print(f"{'模型':<15}{'特征数':<8}{'R²':>8}{'MSE':>12}")
print(f"{'Full RF':<15}{X_full.shape[1]:<8}{r2_full:8.3f}{mse_full:12.2f}")
print(f"{'Reduced RF':<15}{len(important_features):<8}{r2_red:8.3f}{mse_red:12.2f}")

# ----------------------- 7. Predicted-vs-actual scatter plots --------------------------
fig, axes = plt.subplots(1, 2, figsize=(12,5), sharey=True, sharex=True)

# End points of the y = x reference line, spanning the observed target range.
diag = [y_test.min(), y_test.max()]

# One entry per panel: axis, predictions, marker colour, title.
panels = [
    (axes[0], pred_full, 'steelblue', f'Full RF (R²={r2_full:.3f})'),
    (axes[1], pred_red, 'seagreen', f'Reduced RF (R²={r2_red:.3f})'),
]
for ax, preds, colour, title in panels:
    ax.scatter(y_test, preds, alpha=0.5, color=colour)
    ax.plot(diag, diag, 'r--')
    ax.set_title(title)
    ax.set_xlabel("Actual")

# The y axis is shared, so only the left panel is labelled.
axes[0].set_ylabel("Predicted")

plt.suptitle("Predicted vs Actual Red Wine Spending\nFull vs Reduced Feature Sets")
plt.tight_layout()
plt.show()

# ----------------------- 8. Persist the reduced model (optional) ---------------------
# Serialises the fitted reduced-feature forest into the working directory.
model_path = 'redwine_rf_reduced.pkl'
joblib.dump(rf_red, model_path)
print("✓ 精简 RandomForest 已保存为 redwine_rf_reduced.pkl")


['ID', 'Income', 'Recency', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp4', 'AcceptedCmp5', 'DaysSinceSignup', 'Education_PhD', 'TotalSpent']


ValueError: All arrays must be of the same length