In [56]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import networkx as nx
import igraph as ig
from tqdm import tqdm
import numpy as np
from scipy import sparse

In [127]:
df1 =pd.DataFrame([[101,[1,2,2,2,2,6,4,2,8,1,2,5,9]],[102,[2,3,5,2,6,10,2]],[103,[2,4,1,7,1,5,8,2,1]],[104,[3,3,9,4,1,7,1,8,1,2,3]]],columns=['uid','appid'])

In [128]:
#把里面所有元素转化为字符串格式
df1['appid'] = df1['appid'].apply(lambda x:[str(i) for i in x])

# 列表格式和铺平格式相互转化

In [129]:
def flatten_active(df): 
    df['app_len'] = df['appid'].apply(lambda x: len(x))
    u = []
    a = []
    for i in tqdm(range(len(df['appid'].values))):
        u += [df['uid'].values[i]]*df['app_len'].values[i]
        a += list(df['appid'].values[i])
        
    new_df = pd.DataFrame()
    new_df['uid'] = u
    new_df['appid'] = a
        
    return new_df

In [130]:
df1

Unnamed: 0,uid,appid
0,101,"[1, 2, 2, 2, 2, 6, 4, 2, 8, 1, 2, 5, 9]"
1,102,"[2, 3, 5, 2, 6, 10, 2]"
2,103,"[2, 4, 1, 7, 1, 5, 8, 2, 1]"
3,104,"[3, 3, 9, 4, 1, 7, 1, 8, 1, 2, 3]"


In [131]:
df2 = flatten_active(df1)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


In [132]:
df2

Unnamed: 0,uid,appid
0,101,1
1,101,2
2,101,2
3,101,2
4,101,2
5,101,6
6,101,4
7,101,2
8,101,8
9,101,1


In [43]:
df2[['uid','appid']].groupby(['uid'])['appid'].apply(lambda x:list(x)).reset_index()

Unnamed: 0,uid,appid
0,101,"[1, , 2, , 2, , 2, , 2, , 6, , 4, , 2, ..."
1,102,"[2, , 3, , 5, , 2, , 6, , 1, 0, , 2]"
2,103,"[2, , 4, , 1, , 7, , 1, , 5, , 8, , 2, ..."
3,104,"[3, , 3, , 9, , 4, , 1, , 7, , 1, , 8, ..."


# 词袋和tfidf模型挖掘

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [21]:
df1['appid'] = df1['appid'].map(lambda x:" ".join(x))

In [24]:
tf_vec = TfidfVectorizer(lowercase=False,ngram_range=(1,1),dtype=np.float32,min_df=0.001,token_pattern='(?u)\\b\\w+\\b')

In [34]:
cv_vec = CountVectorizer(lowercase=False,ngram_range=(1,1),dtype=np.float32,min_df=0.001,token_pattern='(?u)\\b\\w+\\b')

In [83]:
full_tfidf = tf_vec.fit_transform(df1['appid']).toarray()
full_cvidf =cv_vec.fit_transform(df1['appid']).toarray()
full_tfidf = pd.DataFrame(full_tfidf,dtype='float16').add_prefix('tf-idf')
full_cvidf = pd.DataFrame(full_cvidf,dtype='float16').add_prefix('count-vec')
full_df = pd.concat([df1[['uid']],full_tfidf,full_cvidf],axis=1)

In [88]:
full_df

Unnamed: 0,uid,tf-idf0,tf-idf1,tf-idf2,tf-idf3,tf-idf4,tf-idf5,tf-idf6,tf-idf7,tf-idf8,...,count-vec0,count-vec1,count-vec2,count-vec3,count-vec4,count-vec5,count-vec6,count-vec7,count-vec8,count-vec9
0,101,0.342529,0.0,0.839844,0.0,0.171265,0.171265,0.211426,0.0,0.171265,...,2.0,0.0,6.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
1,102,0.0,0.442627,0.693359,0.349121,0.0,0.282715,0.349121,0.0,0.0,...,0.0,1.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
2,103,0.745605,0.0,0.40625,0.0,0.248413,0.248413,0.0,0.306885,0.248413,...,3.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
3,104,0.5625,0.0,0.15332,0.694824,0.1875,0.0,0.0,0.231567,0.1875,...,3.0,0.0,1.0,3.0,1.0,0.0,0.0,1.0,1.0,1.0


In [85]:
sparse_df = sparse.csr_matrix(full_df)

In [126]:
df2

Unnamed: 0,uid,appid
0,101,1
1,101,
2,101,2
3,101,
4,101,2
...,...,...
72,104,1
73,104,
74,104,2
75,104,


In [145]:
app_dict = dict(zip(df2.appid.unique().tolist(),np.arange(len(df2.appid.unique()))))

In [146]:
app_dict

{'1': 0,
 '2': 1,
 '6': 2,
 '4': 3,
 '8': 4,
 '5': 5,
 '9': 6,
 '3': 7,
 '10': 8,
 '7': 9}

In [None]:
c_vec1 = CountVectorizer(lowercase=False,ngram_range=(1,1),dtype=np.int8)

In [149]:
df1

Unnamed: 0,uid,appid,app_len
0,101,"[1, 2, 2, 2, 2, 6, 4, 2, 8, 1, 2, 5, 9]",13
1,102,"[2, 3, 5, 2, 6, 10, 2]",7
2,103,"[2, 4, 1, 7, 1, 5, 8, 2, 1]",9
3,104,"[3, 3, 9, 4, 1, 7, 1, 8, 1, 2, 3]",11


In [110]:
x_train = sequence.pad_sequences(df1['appid'], maxlen=10)

In [111]:
x_train

array([[ 2,  2,  6,  4,  2,  8,  1,  2,  5,  9],
       [ 0,  0,  0,  2,  3,  5,  2,  6, 10,  2],
       [ 0,  2,  4,  1,  7,  1,  5,  8,  2,  1],
       [ 3,  9,  4,  1,  7,  1,  8,  1,  2,  3]])

# 从聚合的角度挖掘

In [246]:
from collections import Counter
#用户行为的gini不纯度与熵
def Gini(pr):
    cate=Counter(pr)
    gini=1
    total=len(pr)
    for i in cate:
        gini=gini-(cate[i]/total)**2
    return gini

import math
# Calc Entropy
def entropy(pr):
    cate = Counter(pr)
    log2 = math.log2
    total = len(pr)
    ent = 0
    for i in cate:
        p = float(cate[i] / total)
        if p==0:
            ent=0
            continue
        ent = ent - p * (log2(p))
    return ent

def get_small(x,a,b):
    return int((x>a) & (x<=b))

In [247]:
df2

Unnamed: 0,uid,appid
0,101,1
1,101,2
2,101,2
3,101,2
4,101,2
5,102,2
6,102,3
7,102,5
8,103,2
9,103,4


In [248]:
def get_feature_flatten(df):
    
    fea = []
    # Count Encoder
    t1 = time.time()
    df['appid_count'] = df.groupby(['appid'])['uid'].transform('count')
    fea.append(df[['uid','appid_count']].groupby(['uid'])['appid_count'].agg({'mean','std','min','max','median'}))
    fea.append(df[['uid','appid']].groupby(['uid'])['appid'].apply(Gini))
    fea.append(df[['uid','appid']].groupby(['uid'])['appid'].apply(entropy))
    df['appid_count_0_2'] = df['appid_count'].map(lambda x:get_small(x,0,2))#这种行为总次数是否大于0小于1000
    df['appid_count_2_4'] = df['appid_count'].map(lambda x:get_small(x,2,4))
    df['appid_count_4_6'] = df['appid_count'].map(lambda x:get_small(x,4,6))
    tmp = df.groupby(['uid']).agg({'appid_count_0_2' : ['mean','sum','std'],
                          'appid_count_2_4' : ['mean','sum','std'],
                          'appid_count_4_6' : ['mean','sum','std']}
                          )

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    fea.append(tmp)
    print("Count Active Finish... :",time.time()-t1)
    
    return fea

In [249]:
uid0 = df1[['uid']]
fea0 = get_feature_flatten(df2)

Count Active Finish... : 0.00795125961303711


In [250]:
for i in tqdm(fea0):
    uid0 = uid0.merge(i,how='left',on='uid')

100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1002.64it/s]


In [251]:
 uid0

Unnamed: 0,uid,median,mean,std,max,min,appid_x,appid_y,appid_count_0_2_mean,appid_count_0_2_sum,appid_count_0_2_std,appid_count_2_4_mean,appid_count_2_4_sum,appid_count_2_4_std,appid_count_4_6_mean,appid_count_4_6_sum,appid_count_4_6_std
0,101,6.0,5.2,1.788854,6,2,0.32,0.721928,0.2,1,0.447214,0.0,0,0.0,0.8,4,0.447214
1,102,3.0,3.333333,2.516611,6,1,0.666667,1.584963,0.333333,1,0.57735,0.333333,1,0.57735,0.333333,1,0.57735
2,103,2.0,3.0,2.645751,6,1,0.666667,1.584963,0.666667,2,0.57735,0.0,0,0.0,0.333333,1,0.57735
3,104,3.0,3.0,0.0,3,3,0.0,0.0,0.0,0,0.0,1.0,2,0.0,0.0,0,0.0


# 从图统计量的角度和prone的角度分别挖掘特征

In [252]:
# Graph Feature
import scipy.sparse
from scipy import linalg
from scipy.special import iv
import scipy.sparse as sp

from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD

import argparse
import time

class ProNE():
    def __init__(self, G, emb_size=128, step=10, theta=0.5, mu=0.2, n_iter=5, random_state=2019):
        self.G = G
        self.emb_size = emb_size
        self.G = self.G.to_undirected()
        self.node_number = self.G.number_of_nodes()
        self.random_state = random_state
        self.step = step
        self.theta = theta
        self.mu = mu
        self.n_iter = n_iter
        
        mat = scipy.sparse.lil_matrix((self.node_number, self.node_number))

        for e in tqdm(self.G.edges()):
            if e[0] != e[1]:
                mat[int(e[0]), int(e[1])] = 1
                mat[int(e[1]), int(e[0])] = 1
        self.mat = scipy.sparse.csr_matrix(mat)
        print(mat.shape)

    def get_embedding_rand(self, matrix):
        # Sparse randomized tSVD for fast embedding
        t1 = time.time()
        l = matrix.shape[0]
        smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
        print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)
        U, Sigma, VT = randomized_svd(smat, n_components=self.emb_size, n_iter=self.n_iter, random_state=self.random_state)
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print('sparsesvd time', time.time() - t1)
        return U

    def get_embedding_dense(self, matrix, emb_size):
        # get dense embedding via SVD
        t1 = time.time()
        U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)
        U = np.array(U)
        U = U[:, :emb_size]
        s = s[:emb_size]
        s = np.sqrt(s)
        U = U * s
        U = preprocessing.normalize(U, "l2")
        print('densesvd time', time.time() - t1)
        return U

    def fit(self, tran, mask):
        # Network Embedding as Sparse Matrix Factorization
        t1 = time.time()
        l1 = 0.75
        C1 = preprocessing.normalize(tran, "l1")
        neg = np.array(C1.sum(axis=0))[0] ** l1

        neg = neg / neg.sum()

        neg = scipy.sparse.diags(neg, format="csr")
        neg = mask.dot(neg)
        print("neg", time.time() - t1)

        C1.data[C1.data <= 0] = 1
        neg.data[neg.data <= 0] = 1

        C1.data = np.log(C1.data)
        neg.data = np.log(neg.data)

        C1 -= neg
        F = C1
        features_matrix = self.get_embedding_rand(F)
        return features_matrix

    def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
        # NE Enhancement via Spectral Propagation
        print('Chebyshev Series -----------------')
        t1 = time.time()

        if order == 1:
            return a

        A = sp.eye(self.node_number) + A
        DA = preprocessing.normalize(A, norm='l1')
        L = sp.eye(self.node_number) - DA

        M = L - mu * sp.eye(self.node_number)

        Lx0 = a
        Lx1 = M.dot(a)
        Lx1 = 0.5 * M.dot(Lx1) - a

        conv = iv(0, s) * Lx0
        conv -= 2 * iv(1, s) * Lx1
        for i in range(2, order):
            Lx2 = M.dot(Lx1)
            Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
            #         Lx2 = 2*L.dot(Lx1) - Lx0
            if i % 2 == 0:
                conv += 2 * iv(i, s) * Lx2
            else:
                conv -= 2 * iv(i, s) * Lx2
            Lx0 = Lx1
            Lx1 = Lx2
            del Lx2
            print('Bessell time', i, time.time() - t1)
        mm = A.dot(a - conv)
        self.embeddings = self.get_embedding_dense(mm, self.emb_size)
        return self.embeddings
    
    def transform(self):
        if self.embeddings is None:
            print("Embedding is not train")
            return {}
        self.embeddings = pd.DataFrame(self.embeddings)
        self.embeddings.columns = ['ProNE_Emb_{}'.format(i) for i in range(len(self.embeddings.columns))]
        self.embeddings = self.embeddings.reset_index().rename(columns={'index' : 'nodes'}).sort_values(by=['nodes'],ascending=True).reset_index(drop=True)

        return self.embeddings

避免uid和appid重复进行重新编码

In [253]:
uid_lbl,appid_lbl = LabelEncoder(),LabelEncoder()
df2['new_uid'] = uid_lbl.fit_transform(df2['uid'])
df2['new_appid'] = appid_lbl.fit_transform(df2['appid'])
df2['new_appid'] += df2['new_uid'].max() + 1

In [254]:
df2

Unnamed: 0,uid,appid,appid_count,appid_count_0_2,appid_count_2_4,appid_count_4_6,new_uid,new_appid
0,101,1,2,1,0,0,0,4
1,101,2,6,0,0,1,0,5
2,101,2,6,0,0,1,0,5
3,101,2,6,0,0,1,0,5
4,101,2,6,0,0,1,0,5
5,102,2,6,0,0,1,1,5
6,102,3,3,0,1,0,1,6
7,102,5,1,1,0,0,1,8
8,103,2,6,0,0,1,2,5
9,103,4,1,1,0,0,2,7


In [255]:
def get_graph_embedding(df,prefix):
    
    uid_lbl,appid_lbl = LabelEncoder(),LabelEncoder()
    df['new_uid'] = uid_lbl.fit_transform(df['uid'])
    df['new_appid'] = appid_lbl.fit_transform(df['appid'])
    df['new_appid'] += df['new_uid'].max() + 1
    
    print("Encoder Finished...")
    
    G = ig.Graph()
    G.add_vertices(df['new_appid'].max()+1)
    G.add_edges(df[['new_uid','new_appid']].values)
    print("Build Graph Finished...")
    evcent = G.evcent() # 计算图中节点的向量中心性
    shell_index = G.shell_index() # 计算图中节点度至少为K的最大子图
    degree = G.degree() # 总度数
    pagerank = G.pagerank() # pagerank
    # 以下4个计算的很慢，效果还不错，可以注释掉，观察evcent的效果
    closeness = G.closeness() # 计算节点与网络中其他所有节点的距离的平均值   
    betweenness = G.betweenness() # 计算节点的介值
    constraint = G.constraint()
    eccentricity = G.eccentricity() # 计算给定节点到图中其他节点的最短距离的最大值。
    
    G_stat = pd.DataFrame()
    G_stat['evcent'] = evcent
    G_stat['shell_index'] = shell_index
    G_stat['degree'] = degree
    G_stat['pagerank'] = pagerank
    print("PR Finished...")
    G_stat['closeness'] = closeness
    G_stat['betweenness'] = betweenness
    G_stat['constraint'] = constraint
    G_stat['eccentricity'] = eccentricity
    G_stat = G_stat.reset_index()
    G_stat = G_stat[G_stat['index'].isin(df['new_uid'])]
    G_stat['index'] = uid_lbl.inverse_transform(G_stat['index'])
    G_stat.rename(columns={'index' : 'uid'},inplace=True)
    
    print("Graph Stat Finished...")
    G_stat.to_pickle("Graph_Stat_{}.pickle".format(prefix))
    
    del G
    
    import gc
    gc.collect()
    
    G = nx.Graph()
    G.add_edges_from(df[['new_uid','new_appid']].values)
    model = ProNE(G,emb_size=32,n_iter=6,step=12)
    features_matrix = model.fit(model.mat, model.mat)
    model.chebyshev_gaussian(model.mat, features_matrix, model.step, model.mu, model.theta)
    emb = model.transform()
    fea = emb[emb['nodes'].isin(df['new_uid'])]
    fea['nodes'] = uid_lbl.inverse_transform(fea['nodes'])
    fea.rename(columns={'nodes' : 'uid'},inplace=True)
    del G
    gc.collect()
    print("Embedding Finished...")
    fea.to_pickle("Graph_Bi_{}.pickle".format(prefix))
    
    return fea,G_stat

In [256]:
#test为生成文件的区别标签，add_prefix为特征的区别标签。
fea1,stat1 = get_graph_embedding(df2,'test')

Encoder Finished...
Build Graph Finished...
PR Finished...
Graph Stat Finished...


100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fea['nodes'] = uid_lbl.inverse_transform(fea['nodes'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fea.rename(columns={'nodes' : 'uid'},inplace=True)


(9, 9)
neg 0.0009970664978027344
svd sparse 0.2222222222222222
sparsesvd time 0.0027778148651123047
Chebyshev Series -----------------
Bessell time 2 0.0009980201721191406
Bessell time 3 0.0009980201721191406
Bessell time 4 0.0009980201721191406
Bessell time 5 0.0009980201721191406
Bessell time 6 0.0009980201721191406
Bessell time 7 0.0009980201721191406
Bessell time 8 0.0009980201721191406
Bessell time 9 0.0009980201721191406
Bessell time 10 0.0009980201721191406
Bessell time 11 0.001995086669921875
densesvd time 0.0
Embedding Finished...


In [257]:
stat1

Unnamed: 0,uid,evcent,shell_index,degree,pagerank,closeness,betweenness,constraint,eccentricity
0,101,0.967543,4,5,0.160771,0.421053,4.0,0.68,4.0
1,102,0.25496,2,3,0.124772,0.533333,17.0,0.333333,3.0
2,103,0.306123,2,3,0.120929,0.470588,8.0,0.333333,4.0
3,104,0.032612,2,2,0.089024,0.296296,0.0,1.0,5.0


In [258]:
fea1

Unnamed: 0,uid,ProNE_Emb_0,ProNE_Emb_1,ProNE_Emb_2,ProNE_Emb_3,ProNE_Emb_4,ProNE_Emb_5,ProNE_Emb_6,ProNE_Emb_7,ProNE_Emb_8
0,101,-0.749192,-0.232707,-0.209441,-0.479306,0.216986,-0.003158,-0.210763,0.139448,-8.470183e-10
1,102,-0.529723,0.72046,-0.277333,0.194662,0.157705,0.140218,0.184612,0.083127,-2.33714e-10
2,103,-0.776861,-0.353012,0.285474,0.313275,0.232999,-0.142895,0.04955,-0.122761,2.902806e-10
3,104,-0.117049,0.577222,0.676567,-0.314392,0.060563,0.245739,-0.093973,-0.153765,-4.579472e-09


# 使用word2vec对行为序列进行编码

In [4]:
df1

Unnamed: 0,uid,appid
0,101,"[1, 2, 2, 2, 2, 6, 4, 2, 8, 1, 2, 5, 9]"
1,102,"[2, 3, 5, 2, 6, 10, 2]"
2,103,"[2, 4, 1, 7, 1, 5, 8, 2, 1]"
3,104,"[3, 3, 9, 4, 1, 7, 1, 8, 1, 2, 3]"


In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [15]:
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    输入
    docs:文本列表
    split_char:按什么字符切割
    max_len:截取的最大长度
    
    输出
    X:序列化后的数据
    word_index:文本和数字对应的索引
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index=tokenizer.word_index
    return X, word_index

In [16]:
text_1_list = list(df1['appid'])

In [28]:
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=14)

In [29]:
x1

array([[ 0,  2,  1,  1,  1,  1,  7,  4,  1,  5,  2,  1,  6,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  3,  6,  1,  7, 10,  1],
       [ 0,  0,  0,  0,  0,  1,  4,  2,  9,  2,  6,  5,  1,  2],
       [ 0,  0,  0,  3,  3,  8,  4,  2,  9,  2,  5,  2,  1,  3]])

In [34]:
def get_embedding_matrix(word_index, embed_size=64, Emed_path="w2v_300.txt"):
    embeddings_index = models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    
    print("null cnt",count)
    return embedding_matrix

In [35]:
emb1 = get_embedding_matrix(index_1, Emed_path='w2v.model')

100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]

null cnt 0





In [37]:
index_1

{'2': 1,
 '1': 2,
 '3': 3,
 '4': 4,
 '8': 5,
 '5': 6,
 '6': 7,
 '9': 8,
 '7': 9,
 '10': 10}

In [40]:
emb1.shape

(11, 64)

In [42]:
[emb1]

[array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00,  0.00000000e+0

In [5]:
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument

In [7]:
w2v = models.Word2Vec(df1['appid'].values, size=64, window=10, workers=6,min_count=1,hs=1)
w2v.wv.save_word2vec_format("w2v.model")
vocab = list(w2v.wv.vocab.keys())

In [8]:
w2v_feature = np.zeros((df1.shape[0],64))
w2v_feature_avg = np.zeros((df1.shape[0],64))

In [9]:
for i,line in tqdm(enumerate(df1['appid'].values.tolist())):
    num = 0
    if line == '':
        w2v_feature_avg[i,:] = np.zeros(size)
    else:
        for word in line:
            num += 1
            vec = w2v[word] if word in vocab else np.zeros(size)
            w2v_feature[i,:] += vec
        w2v_feature_avg[i,:] = w2v_feature[i,:] / num
w2v_avg = pd.DataFrame(w2v_feature_avg)
w2v_avg = w2v_avg.add_prefix("W2V_AVG_{}_".format('test'))
w2v_avg['uid'] = df1['uid']

  vec = w2v[word] if word in vocab else np.zeros(size)
4it [00:00, ?it/s]


In [None]:
w2v.

In [10]:
w2v_avg

Unnamed: 0,W2V_AVG_test_0,W2V_AVG_test_1,W2V_AVG_test_2,W2V_AVG_test_3,W2V_AVG_test_4,W2V_AVG_test_5,W2V_AVG_test_6,W2V_AVG_test_7,W2V_AVG_test_8,W2V_AVG_test_9,...,W2V_AVG_test_55,W2V_AVG_test_56,W2V_AVG_test_57,W2V_AVG_test_58,W2V_AVG_test_59,W2V_AVG_test_60,W2V_AVG_test_61,W2V_AVG_test_62,W2V_AVG_test_63,uid
0,0.003226,-0.001955,-0.001613,0.00104,0.003063,-0.001385,0.001535,-0.003816,-0.00163,0.001973,...,-0.00171,0.001595,0.000134,-0.002412,-0.001759,-0.003532,0.000835,0.001547,-4e-06,101
1,0.003098,-0.002775,-0.003447,0.000789,0.002569,-0.00111,0.003224,-0.002711,-0.002717,0.001411,...,-0.001617,-0.000565,-0.001683,-0.002733,0.000548,-0.003351,0.000176,0.004593,0.000687,102
2,0.002334,-0.001546,0.000371,-0.001419,0.002523,0.001004,-0.000591,-0.001243,0.000767,-0.000566,...,-0.001466,0.002589,0.002559,-0.000376,-0.000444,-0.002132,-0.000282,-0.000864,-0.00228,103
3,-0.001266,-0.00051,5e-05,-0.001406,0.000962,0.000303,-0.000604,-0.001018,0.001745,-0.001093,...,0.001416,0.001532,0.000942,-0.001672,0.002009,-0.001472,0.002534,-6.2e-05,-0.001014,104


In [14]:
def load_w2v(word_index,max_features):    
    EMBEDDING_FILE = 'w2v.model'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

# 将向量化的结果和trian，test合并

In [None]:
#之前的每一个embedding的df都有uid
train = pd.merge(train[['uid']],full_df, how='inner', on='uId').fillna(0) #内连接
test = pd.merge(test,df, how='inner', on='uId').fillna(0) 
train.sort_values('uid', axis=0, ascending=True, inplace=True)
test.sort_values('uid', axis=0, ascending=True, inplace=True)
train.drop('uid',axis=1,inplace=True)
test.drop('uid',axis=1,inplace=True)

In [None]:
#压缩为稀疏矩阵并保存
train = csr_matrix(train) 
test = csr_matrix(test) 
sparse.save_npz('for_trian.npz', train)
sparse.save_npz('for_train.npz', test)

In [None]:
#载入
base_train = sparse.load_npz('for_trian.npz')
base_test = sparse.load_npz('for_trian.npz')

In [None]:
#使用时用hstack把各种稀疏矩阵拼接之后直接放入树模型即可
train = sparse.hstack((base_train1,base_train2))