In [1]:
import sys
sys.path.append("./")
sys.path.append("../")

# 使用聚类算法衍生特征
import pandas as pd
from sklearn.cluster import KMeans


def cluster_fea_gen(data, selected_cols, n_clusters):
    """
    Fit a KMeans clustering model on a subset of feature columns.

    :param data: DataFrame holding the input samples (may contain extra columns)
    :param selected_cols: names of the columns used as clustering inputs
    :param n_clusters: number of clusters to fit
    :return: the fitted KMeans model (not the derived features; pass it to
             cluster_fea_apply to obtain those)
    """
    # Fixed random_state keeps the centroid initialisation repeatable.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
    # Only the selected columns take part in the fit.
    kmeans.fit(data.loc[:, selected_cols])
    return kmeans


def cluster_fea_apply(data, selected_cols, clf):
    """
    Derive cluster-based features from an already fitted clustering model.

    For every row the assigned cluster label is predicted, and for each
    selected column the signed difference between the row's value and the
    value of its assigned cluster centre is computed.

    Side effect: ``data`` is modified in place. The columns ``group``,
    ``<col>_center`` and ``<col>_distance`` (for each selected column) are
    added to it, or overwritten if they already exist.

    :param data: DataFrame containing at least the columns in selected_cols
    :param selected_cols: column names used as clustering inputs, in the same
                          order used when the model was fitted
    :param clf: fitted clustering model exposing ``predict`` and
                ``cluster_centers_``
    :return: DataFrame with ``group`` plus one ``<col>_distance`` column per
             selected column, indexed like ``data``
    """
    # Label every row with its predicted cluster.
    data['group'] = clf.predict(data[selected_cols])

    # One row per cluster, one '<col>_center' column per selected feature.
    centers_df = pd.DataFrame(clf.cluster_centers_)
    centers_df.columns = [x + '_center' for x in selected_cols]

    for item in selected_cols:
        center_col = item + '_center'
        # Vectorized lookup: centers_df has a 0..k-1 integer index, so mapping
        # the cluster labels through the centre column picks each row's centre
        # without a per-row Python callback.
        data[center_col] = data['group'].map(centers_df[center_col])
        # Signed offset from the centre (value - centre), not an absolute distance.
        data[item + '_distance'] = data[item] - data[center_col]

    fea_cols = ['group']
    fea_cols.extend([x + '_distance' for x in selected_cols])

    return data.loc[:, fea_cols]

In [8]:
# Collect the fitted cluster centres into a DataFrame for inspection.
# NOTE(review): this cell reads `model`, which is only assigned in a cell
# further down (the one executed as In [3]). On a fresh top-to-bottom run it
# raises NameError; consider moving this cell below the model-fitting cell.
centers_df =pd.DataFrame(model.cluster_centers_)

In [2]:
# Load the feature table from data/order_feas.xlsx, index it by order number
# and replace every missing value with 0.
all_x_y = (
    pd.read_excel('data/order_feas.xlsx')
    .set_index('order_no')
    .fillna(0)
)

# Columns used as clustering inputs.
chose_cols = [
    'orderv1_age',
    'orderv1_90_workday_application_amount_mean',
    'orderv1_history_order_num',
    'orderv1_max_overdue_days',
]

In [3]:
# Fit a 5-cluster KMeans model on the chosen columns of all_x_y.
model = cluster_fea_gen(all_x_y, chose_cols, n_clusters=5)

In [4]:
# Derive the cluster label and the per-column offsets from the assigned centre.
# Note: cluster_fea_apply also adds 'group', '<col>_center' and
# '<col>_distance' columns to all_x_y itself.
fea_cluster = cluster_fea_apply(all_x_y, chose_cols, model)

In [5]:
# Number of derived features: shape[1] counts the returned columns, i.e.
# 'group' plus one '<col>_distance' column per chosen feature.
print("使用聚类算法衍生特征数: \n", fea_cluster.shape[1])

使用聚类算法衍生特征数: 
 5


In [6]:
# Display the derived cluster features (cluster label and per-column offsets).
fea_cluster

Unnamed: 0_level_0,group,orderv1_age_distance,orderv1_90_workday_application_amount_mean_distance,orderv1_history_order_num_distance,orderv1_max_overdue_days_distance
order_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9268058473941147,3,4.818182,21784.512162,-15.318182,-1.166667
9267961137580040,4,-10.120000,-27674.933955,1.760000,0.440000
9267741647036595,3,-4.181818,25784.512162,-10.318182,-0.166667
9267846324091420,3,-1.181818,31784.512162,-5.318182,-0.166667
9268058096167541,3,-7.181818,20117.845495,-7.318182,-1.166667
...,...,...,...,...,...
9268009388699351,3,-7.181818,-6110.224680,57.681818,-0.166667
9268104629475002,3,-12.181818,-32215.487838,-12.318182,-1.166667
9268104706970592,3,0.818182,21784.512162,-18.318182,-1.166667
9268002569304461,0,8.446809,-1195.083927,-9.000000,-0.127660
