In [None]:
# Ensure the on-disk cache folders used by w2v_emb (trained models and
# pickled embedding tables) exist before anything tries to write to them.
for cache_dir in ('model', 'embedding'):
    os.makedirs(cache_dir, exist_ok=True)


def w2v_emb(df, f1, f2, emb_size=16):
    """Build one averaged Word2Vec vector per ``f1`` group from its ``f2`` values.

    Rows of ``df`` are grouped by ``f1``; the ``f2`` values of each group, cast
    to ``str``, form one "sentence".  A Word2Vec model is trained on those
    sentences (or loaded from ``model/`` when a saved one exists) and every
    group is represented by the mean vector of its in-vocabulary tokens.  The
    resulting table is pickled under ``embedding/`` and returned directly on
    later calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``f1`` and ``f2``.
    f1 : str
        Name of the grouping column.
    f2 : str
        Name of the column whose values are used as tokens.
    emb_size : int, default 16
        Dimensionality of the embedding.

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``f1`` column followed by the columns
        ``{f1}_{f2}_emb_0`` ... ``{f1}_{f2}_emb_{emb_size - 1}``.

    Notes
    -----
    Both caches are keyed only by ``f1``, ``f2`` and ``emb_size``.  A cached
    embedding is returned without inspecting ``df``, so delete the files in
    ``embedding/`` and ``model/`` whenever the underlying data changes.
    ``seed`` is read from the notebook's global namespace.
    """
    model_path = 'model/w2v_{}_{}_{}.m'.format(f1, f2, emb_size)
    embedding_path = 'embedding/{}_{}_{}.pkl'.format(f1, f2, emb_size)

    # Fast path: reuse the embedding table computed by an earlier run.
    if os.path.exists(embedding_path):
        return pd.read_pickle(embedding_path)

    # One list of f2 values per f1 key.  ``agg(list)`` replaces the
    # dict-renaming form ``agg({'name': list})``, which pandas >= 1.0 rejects
    # on a SeriesGroupBy with a SpecificationError.
    grouped = df.groupby(f1)[f2].agg(list)
    keys = pd.DataFrame({f1: grouped.index})
    sentences = [[str(x) for x in seq] for seq in grouped]

    if os.path.exists(model_path):
        model = Word2Vec.load(model_path)
    else:
        # These hyper-parameters are a first guess and could be tuned further.
        # NOTE(review): ``size`` is the gensim 3.x keyword; gensim >= 4 names
        # it ``vector_size`` -- adjust when upgrading gensim.
        model = Word2Vec(sentences,
                         size=emb_size,
                         window=5,
                         min_count=5,
                         sg=0,
                         hs=1,
                         seed=seed)
        model.save(model_path)

    # Look vectors up through ``model.wv``; indexing the model object itself
    # (``w in model`` / ``model[w]``) is deprecated.
    wv = model.wv
    zero_vec = np.zeros(emb_size, dtype=np.float32)

    emb_matrix = []
    for seq in tqdm(sentences):
        # Tokens missing from the vocabulary (e.g. rarer than ``min_count``)
        # are skipped.
        vecs = [wv[w] for w in seq if w in wv]
        # A group with no known token falls back to the all-zero vector.
        emb_matrix.append(np.mean(vecs, axis=0) if vecs else zero_vec)

    df_emb = pd.DataFrame(
        emb_matrix,
        columns=['{}_{}_emb_{}'.format(f1, f2, i) for i in range(emb_size)])

    # ``keys`` and ``df_emb`` both carry a fresh 0..n-1 index in group order,
    # so a positional column-wise concat lines them up.
    embedding = pd.concat([keys, df_emb], axis=1)
    embedding.to_pickle(embedding_path)

    return embedding

In [None]:
def tfidf_emb(df, f1, f2, emb_size=10):
    """Embed each ``f1`` group as an SVD-reduced TF-IDF vector of its ``f2`` values.

    The ``f2`` values of every ``f1`` group are joined with commas into one
    text document, the documents are vectorised with ``TfidfVectorizer`` and
    the resulting matrix is reduced to ``emb_size`` components with
    ``TruncatedSVD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``f1`` and ``f2``.  It is not modified.
    f1 : str
        Name of the grouping column.
    f2 : str
        Name of the column whose values are used as tokens.
    emb_size : int, default 10
        Number of SVD components to keep.

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``f1`` column followed by the columns
        ``svd_tfidf_{f2}_0`` ... ``svd_tfidf_{f2}_{emb_size - 1}``.

    Notes
    -----
    ``seed`` is read from the notebook's global namespace.
    """
    # Stringify on a derived Series so the caller's column keeps its dtype.
    # Missing values are mapped to '-1' by masking on the ORIGINAL column:
    # filling after ``astype(str)`` never fires, because NaN has already been
    # turned into the literal string 'nan' by then.
    tokens = df[f2].astype(str).where(df[f2].notna(), '-1')

    # One comma-separated document per f1 key (keys come out sorted).
    docs = tokens.groupby(df[f1]).agg(','.join)

    # NOTE(review): the vectorizer's default token pattern keeps only tokens
    # of two or more word characters, so single-character values and the '-1'
    # placeholder contribute no term -- confirm this is intended.
    tfidf_vec = TfidfVectorizer().fit_transform(docs)

    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    vec_svd = pd.DataFrame(
        svd_enc.fit_transform(tfidf_vec),
        columns=['svd_tfidf_{}_{}'.format(f2, i) for i in range(emb_size)])

    # Both frames use a fresh 0..n-1 index in group order, so the column-wise
    # concat pairs each key with its own vector.
    return pd.concat([pd.DataFrame({f1: docs.index}), vec_svd], axis=1)

In [None]:
def countvec_emb(df, f1, f2, emb_size=10):
    """Embed each ``f1`` group as an SVD-reduced term-count vector of its ``f2`` values.

    The ``f2`` values of every ``f1`` group are joined with commas into one
    text document, the documents are vectorised with ``CountVectorizer`` and
    the resulting matrix is reduced to ``emb_size`` components with
    ``TruncatedSVD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``f1`` and ``f2``.  It is not modified.
    f1 : str
        Name of the grouping column.
    f2 : str
        Name of the column whose values are used as tokens.
    emb_size : int, default 10
        Number of SVD components to keep.

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``f1`` column followed by the columns
        ``svd_countvec_{f2}_0`` ... ``svd_countvec_{f2}_{emb_size - 1}``.

    Notes
    -----
    ``seed`` is read from the notebook's global namespace.
    """
    # Stringify on a derived Series so the caller's column keeps its dtype.
    # Missing values are mapped to '-1' by masking on the ORIGINAL column:
    # filling after ``astype(str)`` never fires, because NaN has already been
    # turned into the literal string 'nan' by then.
    tokens = df[f2].astype(str).where(df[f2].notna(), '-1')

    # One comma-separated document per f1 key (keys come out sorted).
    docs = tokens.groupby(df[f1]).agg(','.join)

    # NOTE(review): the vectorizer's default token pattern keeps only tokens
    # of two or more word characters, so single-character values and the '-1'
    # placeholder contribute no term -- confirm this is intended.
    count_vec = CountVectorizer().fit_transform(docs)

    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    vec_svd = pd.DataFrame(
        svd_enc.fit_transform(count_vec),
        columns=['svd_countvec_{}_{}'.format(f2, i) for i in range(emb_size)])

    # Both frames use a fresh 0..n-1 index in group order, so the column-wise
    # concat pairs each key with its own vector.
    return pd.concat([pd.DataFrame({f1: docs.index}), vec_svd], axis=1)

In [None]:
# Left-join the TF-IDF/SVD embedding of each operation column onto the
# feature table, keyed by the grouping column.
for f1, f2 in (('user', 'op_mode'), ('user', 'op_type')):
    tfidf_features = tfidf_emb(df_op, f1, f2)
    df_feature = df_feature.merge(tfidf_features, how='left', on=f1)

In [12]:
import pandas as pd 
import numpy as np

In [22]:
# Tiny demo frame: one row per (user, channel) event.
df = pd.DataFrame({
    'user': [101, 101, 102, 103],
    'channel': [1, 2, 2, 1],
})
df

Unnamed: 0,user,channel
0,101,1
1,101,2
2,102,2
3,103,1


In [20]:
# Per-user occurrence counts for every value of a categorical feature.
# df_temp is reassigned on each pass, so after the loop it holds the table
# of the last column in the list only.
for col in ['channel']:
    df_temp = df[['user', col]].copy()
    # Helper column of ones: summing it per (user, value) cell gives a count.
    df_temp['tmp'] = 1
    # aggfunc='sum' gives the same totals as np.sum but avoids the pandas
    # FutureWarning raised when a NumPy callable is passed as aggfunc.
    df_temp = df_temp.pivot_table(index='user', columns=col,
                                  values='tmp', aggfunc='sum')
    # (user, value) pairs that never occur come out as NaN -> count of 0.
    df_temp = df_temp.reset_index().fillna(0)
    df_temp.columns = [
        c if c == 'user' else 'op_{}_{}_count'.format(col, c)
        for c in df_temp.columns
    ]

In [21]:
# Display the per-user channel count table produced by the pivot loop.
df_temp

Unnamed: 0,user,op_channel_1_count,op_channel_2_count
0,101,1.0,1.0
1,102,0.0,1.0
2,103,1.0,0.0
