In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Build a small example dataset: three categorical columns, two numeric
# columns and a binary target column ('purchased').
data = pd.DataFrame({
    'gender': ['Male', 'Female', 'Male', 'Other', 'Female'],
    'education': ["Bachelor's", "Master's", "High School", "PhD", "Bachelor's"],
    'marital_status': ['Married', 'Single', 'Divorced', 'Married', 'Single'],
    'age': [25, 30, 45, 35, 28],
    'income': [50000, 75000, 60000, 90000, 55000],
    'purchased': [0, 1, 0, 1, 0]
})

# Separate the features from the target variable.
X = data.drop('purchased', axis=1)
y = data['purchased']

# Column groups: the categorical columns are one-hot encoded below, the
# numeric columns are carried over unchanged.
categorical_cols = ['gender', 'education', 'marital_status']
numeric_cols = ['age', 'income']

# Create the OneHotEncoder instance.
encoder = OneHotEncoder(
    handle_unknown='ignore',  # do not raise on categories unseen during fit
    sparse_output=False,      # return a dense array instead of a sparse matrix
    dtype=np.int32            # emit integer indicator columns
)

# One-hot encode the categorical features.
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

# Wrap the encoded array in a DataFrame. Reusing X's index (rather than the
# default 0..n-1 index) keeps every encoded row labelled like its source row,
# so the result stays aligned with `y` even when `data` has a non-default
# index (e.g. after filtering, shuffling or a train/test split).
encoded_df = pd.DataFrame(
    X_categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index
)

# Combine the encoded categorical features with the numeric features. Both
# frames carry X's index, so no index reset is needed before concatenating.
X_encoded = pd.concat([encoded_df, X[numeric_cols]], axis=1)

# Show the original and the encoded data as tab-separated text.
# to_csv() already ends its output with a newline, so each print() below
# leaves one extra blank line after the table.
print("原始数据:")
print(X.to_csv(sep='\t', na_rep='nan'))

print("\n编码后的数据:")
print(X_encoded.to_csv(sep='\t', na_rep='nan'))

# List the categories learned for each encoded column. str() is applied to
# every category so that non-string categories (e.g. integer codes) do not
# make str.join raise a TypeError.
print("\n特征名称映射:")
for feature, categories in zip(categorical_cols, encoder.categories_):
    print(f"{feature}: {', '.join(map(str, categories))}")

原始数据:
	gender	education	marital_status	age	income
0	Male	Bachelor's	Married	25	50000
1	Female	Master's	Single	30	75000
2	Male	High School	Divorced	45	60000
3	Other	PhD	Married	35	90000
4	Female	Bachelor's	Single	28	55000


编码后的数据:
	gender_Female	gender_Male	gender_Other	education_Bachelor's	education_High School	education_Master's	education_PhD	marital_status_Divorced	marital_status_Married	marital_status_Single	age	income
0	0	1	0	1	0	0	0	0	1	0	25	50000
1	1	0	0	0	0	1	0	0	0	1	30	75000
2	0	1	0	0	1	0	0	1	0	0	45	60000
3	0	0	1	0	0	0	1	0	1	0	35	90000
4	1	0	0	1	0	0	0	0	0	1	28	55000


特征名称映射:
gender: Female, Male, Other
education: Bachelor's, High School, Master's, PhD
marital_status: Divorced, Married, Single
