In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

# 1. 加载原始数据
bank_marketing = fetch_ucirepo(id=222)
X = bank_marketing.data.features.copy()
y = bank_marketing.data.targets['y']

# 2. 缺失处理
X['job'] = X['job'].fillna('unknown')
X['education'] = X['education'].fillna('unknown')
X['contact'] = X['contact'].fillna('unknown')
X = X.drop(columns=['poutcome'])  # 缺失太多

# 3. 对偏态数值变量做 log1p（不影响分类变量）
skewed_cols = ['balance', 'duration', 'campaign', 'pdays', 'previous']
for col in skewed_cols:
    if (X[col] <= 0).any():
        shift = 1 - X[col].min()
        X[col] = np.log1p(X[col] + shift)
    else:
        X[col] = np.log1p(X[col])


In [4]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# 将目标变量转为 0/1
y_binary = y.map({'yes': 1, 'no': 0})

# 拷贝一份用于 LabelEncode
X_encoded = X.copy()
categorical_cols = X_encoded.select_dtypes(include='object').columns.tolist()

# 分类变量 LabelEncoder（只用于特征筛选，不用于建模）
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])
    encoders[col] = le

# 计算互信息得分
mi_scores = mutual_info_classif(X_encoded, y_binary, discrete_features='auto')
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# 输出互信息得分
print("各特征互信息得分：")
print(mi_series)

# 选出前 k 个特征（不会超过现有数量）
k = min(20, X_encoded.shape[1])
top_features = mi_series.head(k).index.tolist()

print(f"\n原始特征数量：{X.shape[1]}")
print(f"互信息筛选后保留特征数量：{len(top_features)}")
print("保留的特征名：", top_features)


各特征互信息得分：
duration       0.072425
pdays          0.027103
month          0.026647
balance        0.022435
housing        0.018158
contact        0.016686
previous       0.014274
age            0.012334
job            0.007936
campaign       0.006641
day_of_week    0.006228
education      0.006122
marital        0.005021
loan           0.003710
default        0.000000
dtype: float64

原始特征数量：15
互信息筛选后保留特征数量：15
保留的特征名： ['duration', 'pdays', 'month', 'balance', 'housing', 'contact', 'previous', 'age', 'job', 'campaign', 'day_of_week', 'education', 'marital', 'loan', 'default']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# 取出互信息筛选后的变量
X_selected = X[top_features]  # 保留这15个变量
y_binary = y.map({'yes': 1, 'no': 0})

# 数值+分类列
num_cols = X_selected.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_selected.select_dtypes(include='object').columns.tolist()

# 构建预处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols)
    ]
)

# 建立带稀疏选择的模型管道
model = Pipeline([
    ('preprocess', preprocessor),
    ('select', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', C=0.1)))
])

# 拟合模型
model.fit(X_selected, y_binary)

# 获取稀疏选择后的变量掩码
selected_mask = model.named_steps['select'].get_support()

# 获取预处理后的特征名
ohe_feature_names = list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols))
all_feature_names = num_cols + ohe_feature_names

# 筛选后变量名
final_features = [name for i, name in enumerate(all_feature_names) if selected_mask[i]]

print(f"\nL1 正则筛选后保留特征数：{len(final_features)}")
print("保留的特征名：", final_features)



L1 正则筛选后保留特征数：34
保留的特征名： ['duration', 'pdays', 'balance', 'previous', 'campaign', 'day_of_week', 'month_aug', 'month_dec', 'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep', 'housing_yes', 'contact_telephone', 'contact_unknown', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'education_secondary', 'education_tertiary', 'education_unknown', 'marital_married', 'marital_single', 'loan_yes']


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# 创建随机森林管道
rf_model = Pipeline([
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf_model.fit(X_selected, y_binary)

# 获取特征重要性
importances = rf_model.named_steps['rf'].feature_importances_
importance_series = pd.Series(importances, index=all_feature_names).sort_values(ascending=False)

# 打印重要性排名前N
print("\n随机森林特征重要性排名：")
print(importance_series.head(17))



随机森林特征重要性排名：
duration               0.283234
balance                0.104977
age                    0.100124
day_of_week            0.093040
pdays                  0.058525
campaign               0.041263
previous               0.028802
housing_yes            0.024559
contact_unknown        0.015261
month_mar              0.013945
education_secondary    0.013460
marital_married        0.012279
education_tertiary     0.011989
month_oct              0.011756
month_aug              0.011287
job_technician         0.011215
month_jun              0.011208
dtype: float64


你想我接下来帮你写哪种聚类分析流程？可以选：

✅ PCA + KMeans + 可视化聚类结构
✅ UMAP + KMeans
✅ 用这10个变量做 DBSCAN / Agglomerative clustering
✅ 先降维再可视化 + 群体解释分析（特征均值对比）这四种方法都给我返回一下