In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from ucimlrepo import fetch_ucirepo
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# 1. 导入数据
bank_marketing = fetch_ucirepo(id=222)
X = bank_marketing.data.features
y = bank_marketing.data.targets['y']

# 处理缺失值
X = X.copy()  # 创建副本以避免切片问题
X['job'] = X['job'].fillna('unknown')
X['education'] = X['education'].fillna('unknown')
X['contact'] = X['contact'].fillna('unknown')
X = X.drop(columns=['poutcome'])  # 删除缺失值过多的特征

# 处理右偏连续变量
continuous_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
for col in continuous_cols:
    if col in ['balance', 'duration', 'campaign', 'pdays', 'previous']:  # 针对右偏特征
        if (X[col] <= 0).any():
            print(f"{col} contains non-positive values, applying log transformation with shift.")
            X[col] = np.log1p(X[col] - X[col].min() + 1)  # 移位确保正值
        else:
            X[col] = np.log1p(X[col])  # log(x+1)避免零值问题

# 识别数值和类别特征
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# 创建预处理管道（用于FCM、谱聚类、自编码器）
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# 保存原始数据（用于K-Prototypes）
X_raw = X.copy()

# 应用预处理（生成独热编码数据）
X_processed = preprocessor.fit_transform(X)

# 保存特征名称
feature_names = (numerical_cols.tolist() + 
                 preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist())

# 验证数据
print("Processed data shape:", X_processed.shape)
print("Sample of processed data:\n", X_processed[:5])
print("Feature names:", feature_names)

# 保存处理后的数据
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
X_processed_df.to_csv('processed_bank_data.csv', index=False)
X_raw.to_csv('raw_bank_data.csv', index=False)

# ------------------------------------------------------------------------

# 从保存的原始数据读取
X_raw = pd.read_csv('raw_bank_data.csv')

# 定义数值和类别特征
numerical_cols = ['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month']
categorical_indices = [X_raw.columns.get_loc(col) for col in categorical_cols]

# 标准化数值特征
scaler = StandardScaler()
X_raw[numerical_cols] = scaler.fit_transform(X_raw[numerical_cols])

# K-Prototypes聚类
kproto = KPrototypes(n_clusters=4, init='Cao', n_init=10, random_state=42, verbose=1)
clusters = kproto.fit_predict(X_raw, categorical=categorical_indices)

# 添加聚类标签
X_raw['Cluster'] = clusters

# 计算每个类的订阅率
subscription_rate = X_raw.groupby('Cluster')['y'].mean()
print("Subscription rate for each cluster:\n", subscription_rate)

# 聚类结果的三个评判标准
# 1. 轮廓系数（Silhouette Score）
silhouette_avg = silhouette_score(X_raw[numerical_cols + categorical_cols], clusters)
print(f"Silhouette Score: {silhouette_avg:.4f}")

# 2. Calinski-Harabasz指数
calinski_score = calinski_harabasz_score(X_raw[numerical_cols + categorical_cols], clusters)
print(f"Calinski-Harabasz Score: {calinski_score:.4f}")

# 3. Davies-Bouldin指数
davies_bouldin = davies_bouldin_score(X_raw[numerical_cols + categorical_cols], clusters)
print(f"Davies-Bouldin Score: {davies_bouldin:.4f}")

# 保存结果
X_raw.to_csv('kprototypes_clustered_data_with_metrics.csv', index=False)


balance contains non-positive values, applying log transformation with shift.
duration contains non-positive values, applying log transformation with shift.
pdays contains non-positive values, applying log transformation with shift.
previous contains non-positive values, applying log transformation with shift.
Processed data shape: (45211, 39)
Sample of processed data:
 [[ 1.60696496  0.50784905 -1.29847633  0.42939656 -0.90945712 -0.46629571
  -0.40589769  0.          0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          1.          0.          0.
   1.          0.          0.          1.          0.          0.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.        ]
 [ 0.28852927 -0.59038661 -1.29847633 -0.16545896 -0.90945712 -0.46629571
  -0.40589769  0.          0.          0.          0.          0.
   0.          0.          0.          1.   

KeyError: 'Column not found: y'