In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Directory that holds the pre-processed pickle files (relative path).
pickle_path = "../pickle/"

# Read the four input frames in one pass; every template is filled with
# pickle_path exactly as before, so the resulting file paths are unchanged.
# NOTE(review): read_pickle can execute arbitrary code embedded in a file;
# these pickles are presumably produced locally by an earlier step -- confirm.
deal_device_app, deal_tag_data, train_data, test_data = [
    pd.read_pickle(template.format(pickle_path))
    for template in (
        "{}/deal_device_new_app.pickle",
        "{}/deal_tag_new_data.pickle",
        "{}/train_data.pickle",
        "{}/test_data.pickle",
    )
]

# Last expression of the cell: show the shapes of three of the loaded frames.
(deal_device_app.shape, train_data.shape, test_data.shape)

In [2]:
# Preview the first rows of the device/app frame (columns shown below: deviceid, applist).
deal_device_app.head()

Unnamed: 0,deviceid,applist
0,000046581b8a28c431be90c278674925,app_133
1,000046581b8a28c431be90c278674925,app_1
2,00016381ab699d4e76dc99291e79e7a1,app_133
3,0001c7e6a85a3a4498fe0c5f29f3a379,app_133
4,000207c515d01c00e9144c6866b546a7,app_133


In [3]:
# Preview the first rows of the device/tag frame (columns shown below: deviceid, all_tag_word).
deal_tag_data.head()

Unnamed: 0,deviceid,all_tag_word
0,000046581b8a28c431be90c278674925,美食
1,000046581b8a28c431be90c278674925,--其他
2,000046581b8a28c431be90c278674925,美食攻略
3,000046581b8a28c431be90c278674925,花絮片段
4,000046581b8a28c431be90c278674925,玩具


In [4]:
# Graph Feature
import scipy.sparse
from scipy import linalg
from scipy.special import iv
import scipy.sparse as sp

from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD

import argparse
import time

class ProNE():
    """Node embeddings via sparse matrix factorization plus spectral propagation.

    Typical use, as done by the caller in this notebook:

        model = ProNE(G, emb_size=...)
        base = model.fit(model.mat, model.mat)
        model.chebyshev_gaussian(model.mat, base, model.step, model.mu, model.theta)
        emb = model.transform()

    Parameters
    ----------
    G : graph object
        Must provide ``to_undirected()``, ``number_of_nodes()`` and ``edges()``
        (the notebook passes a networkx graph). Node labels are used directly
        as row/column indices of the adjacency matrix, so they must be
        integers in ``[0, number_of_nodes)``.
    emb_size : int
        Number of embedding dimensions.
    step, theta, mu : numbers
        Only stored on the instance; the caller passes them on to
        ``chebyshev_gaussian`` as ``order``, ``s`` and ``mu`` respectively.
    n_iter : int
        Passed to ``randomized_svd`` as ``n_iter``.
    random_state : int
        Passed to ``randomized_svd`` as ``random_state``.

    Attributes
    ----------
    mat : scipy.sparse.csr_matrix
        Symmetric 0/1 adjacency matrix without self loops.
    embeddings : None, ndarray or DataFrame
        ``None`` until ``chebyshev_gaussian`` has run, then the raw embedding
        array, and a DataFrame once ``transform`` has been called.
    """

    def __init__(self, G, emb_size=128, step=10, theta=0.5, mu=0.2, n_iter=5, random_state=2019):
        self.G = G
        self.emb_size = emb_size
        self.G = self.G.to_undirected()
        self.node_number = self.G.number_of_nodes()
        self.random_state = random_state
        self.step = step
        self.theta = theta
        self.mu = mu
        self.n_iter = n_iter
        # Initialised here so transform() can report "not trained" instead of
        # failing with AttributeError when it is called too early.
        self.embeddings = None

        print((self.node_number, self.node_number))

        # Stream the edge endpoints into a flat integer array and build the
        # adjacency matrix in one vectorised step instead of assigning single
        # cells of a LIL matrix inside a Python loop.
        endpoints = np.fromiter(
            (int(node) for edge in self.G.edges() for node in (edge[0], edge[1])),
            dtype=np.int64,
        ).reshape(-1, 2)
        endpoints = endpoints[endpoints[:, 0] != endpoints[:, 1]]  # drop self loops
        rows = np.concatenate((endpoints[:, 0], endpoints[:, 1]))
        cols = np.concatenate((endpoints[:, 1], endpoints[:, 0]))
        adjacency = scipy.sparse.coo_matrix(
            (np.ones(rows.shape[0]), (rows, cols)),
            shape=(self.node_number, self.node_number),
        ).tocsr()
        # COO -> CSR sums repeated (row, col) pairs; reset to 1 so repeated
        # edges still yield a 0/1 matrix, as repeated assignment of 1 did.
        adjacency.data[:] = 1
        self.mat = adjacency
        print(self.mat.shape)

    def get_embedding_rand(self, matrix):
        """Embed ``matrix`` with a randomized truncated SVD.

        The left singular vectors are scaled by the square root of the
        singular values and each row is then L2-normalised.

        Returns an array with ``self.emb_size`` columns.
        """
        # Sparse randomized tSVD for fast embedding
        t1 = time.time()
        l = matrix.shape[0]
        smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
        # Ratio of stored entries to l**2 (density of the matrix).
        print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)
        U, Sigma, VT = randomized_svd(smat, n_components=self.emb_size, n_iter=self.n_iter, random_state=self.random_state)
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print('sparsesvd time', time.time() - t1)
        return U

    def get_embedding_dense(self, matrix, emb_size):
        """Embed a dense ``matrix`` with a full SVD truncated to ``emb_size``.

        ``overwrite_a=True`` allows SciPy to destroy the contents of
        ``matrix``; do not reuse the argument afterwards.

        Returns the first ``emb_size`` left singular vectors scaled by the
        square root of their singular values, with L2-normalised rows.
        """
        # get dense embedding via SVD
        t1 = time.time()
        U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)
        U = np.array(U)
        U = U[:, :emb_size]
        s = s[:emb_size]
        s = np.sqrt(s)
        U = U * s
        U = preprocessing.normalize(U, "l2")
        print('densesvd time', time.time() - t1)
        return U

    def fit(self, tran, mask):
        """Compute the initial embedding by factorizing a log-ratio matrix.

        Parameters
        ----------
        tran : sparse matrix
            Matrix whose rows are L1-normalised to obtain the positive term.
        mask : sparse matrix
            Multiplied with the diagonal matrix of negative-sampling weights
            to form the negative term. The notebook passes ``self.mat`` for
            both arguments.

        Returns
        -------
        ndarray
            Result of ``get_embedding_rand`` on ``log(C1) - log(neg)``.
        """
        # Network Embedding as Sparse Matrix Factorization
        t1 = time.time()
        l1 = 0.75
        C1 = preprocessing.normalize(tran, "l1")
        # Column sums of the normalised matrix raised to the power 0.75, then
        # rescaled to sum to one.
        neg = np.array(C1.sum(axis=0))[0] ** l1

        neg = neg / neg.sum()

        neg = scipy.sparse.diags(neg, format="csr")
        neg = mask.dot(neg)
        print("neg", time.time() - t1)

        # Non-positive stored values are replaced by 1 so their log is 0.
        C1.data[C1.data <= 0] = 1
        neg.data[neg.data <= 0] = 1

        C1.data = np.log(C1.data)
        neg.data = np.log(neg.data)

        C1 -= neg
        F = C1
        features_matrix = self.get_embedding_rand(F)
        return features_matrix

    def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
        """Refine embedding ``a`` by propagating it over adjacency ``A``.

        Parameters
        ----------
        A : sparse matrix
            Adjacency matrix; the identity is added to it before use.
        a : ndarray
            Initial embedding, one row per node.
        order : int
            Upper bound of the expansion loop (``range(2, order)``). With
            ``order == 1`` the input ``a`` is returned unchanged and
            ``self.embeddings`` is not updated.
        mu : float
            Shift subtracted from the Laplacian (``M = L - mu * I``).
        s : float
            Argument of the modified Bessel functions ``iv(i, s)`` used as
            series coefficients.

        Returns
        -------
        ndarray
            The refined embedding, also stored in ``self.embeddings``.
        """
        # NE Enhancement via Spectral Propagation
        print('Chebyshev Series -----------------')
        t1 = time.time()

        if order == 1:
            return a

        A = sp.eye(self.node_number) + A
        DA = preprocessing.normalize(A, norm='l1')
        L = sp.eye(self.node_number) - DA

        M = L - mu * sp.eye(self.node_number)

        Lx0 = a
        Lx1 = M.dot(a)
        Lx1 = 0.5 * M.dot(Lx1) - a

        conv = iv(0, s) * Lx0
        conv -= 2 * iv(1, s) * Lx1
        for i in range(2, order):
            Lx2 = M.dot(Lx1)
            Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
            #         Lx2 = 2*L.dot(Lx1) - Lx0
            # Terms enter the sum with alternating sign.
            if i % 2 == 0:
                conv += 2 * iv(i, s) * Lx2
            else:
                conv -= 2 * iv(i, s) * Lx2
            Lx0 = Lx1
            Lx1 = Lx2
            del Lx2
            print('Bessell time', i, time.time() - t1)
        mm = A.dot(a - conv)
        self.embeddings = self.get_embedding_dense(mm, self.emb_size)
        return self.embeddings

    def transform(self):
        """Return the embeddings as a DataFrame with a leading ``nodes`` column.

        The remaining columns are named ``ProNE_Emb_deviceid_<i>``. The frame
        is also stored back into ``self.embeddings``. Calling the method again
        returns that stored frame unchanged rather than wrapping and renaming
        it a second time.

        Returns ``{}`` (after printing a message) when no embedding has been
        computed yet.
        """
        current = getattr(self, 'embeddings', None)
        if current is None:
            print("Embedding is not train")
            return {}
        # Already converted by an earlier call: keep it as is.
        if isinstance(current, pd.DataFrame) and 'nodes' in current.columns:
            return current
        frame = pd.DataFrame(current)
        frame.columns = ['ProNE_Emb_deviceid_{}'.format(i) for i in range(len(frame.columns))]
        frame = frame.reset_index().rename(columns={'index' : 'nodes'}).sort_values(by=['nodes'],ascending=True).reset_index(drop=True)
        self.embeddings = frame

        return self.embeddings

In [11]:
from tqdm import tqdm
import networkx as nx
import igraph as ig

def get_graph_embedding(df,prefix,fea1,fea2,size = 32,slow_fea = False,directed = True):
    """Build a bipartite fea1/fea2 graph and derive per-fea1 graph features.

    Every row of ``df`` becomes one edge between the label-encoded value of
    ``fea1`` and the label-encoded value of ``fea2`` (fea2 codes are shifted
    past the fea1 codes so both share one node id space). Two outputs are
    produced, restricted to the fea1 nodes, and each is also written as CSV
    under ``pickle_path``:

    * ``Graph_edges_<prefix>.csv`` -- igraph node statistics.
    * ``Graph_prone_<prefix>.csv`` -- ProNE embeddings.

    ``df`` itself is not modified; the encoded columns live in a local frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fea1`` and ``fea2``.
    prefix : str
        Inserted into the two CSV file names.
    fea1, fea2 : str
        Column names for the two node types; features are returned per
        distinct ``fea1`` value.
    size : int
        Embedding dimension passed to ``ProNE`` as ``emb_size``.
    slow_fea : bool
        Also compute closeness, betweenness, constraint and eccentricity.
    directed : bool
        Whether the igraph graph used for the statistics is directed
        (fea1 -> fea2). The default keeps the historical behaviour.
        NOTE(review): in the runs recorded in this notebook, ``evcent`` is 0.0
        and ``pagerank`` is the same value for every fea1 row shown, which is
        what one would expect when fea1 nodes have no incoming edges; pass
        ``directed=False`` to get informative values -- confirm the effect on
        downstream models before switching.

    Returns
    -------
    (fea, G_stat) : tuple of pandas.DataFrame
        ``fea`` has a ``fea1`` column plus the embedding columns; ``G_stat``
        has an ``index`` column holding the original fea1 values plus one
        column per statistic.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("get_graph_embedding needs at least one row to build a graph")

    fea1_lbl,fea2_lbl = LabelEncoder(),LabelEncoder()
    new_fea1 = 'new_' + fea1
    new_fea2 = 'new_' + fea2
    fea1_codes = fea1_lbl.fit_transform(df[fea1])
    n_fea1 = int(fea1_codes.max()) + 1
    # Shift fea2 codes so they do not collide with fea1 codes.
    fea2_codes = fea2_lbl.fit_transform(df[fea2]) + n_fea1
    # Local edge list: keeps the caller's frame free of helper columns.
    edges = pd.DataFrame({new_fea1: fea1_codes, new_fea2: fea2_codes}, index=df.index)
    edge_array = edges[[new_fea1,new_fea2]].values

    print("Encoder Finished...")

    G = ig.Graph(directed = directed)
    G.add_vertices(int(fea2_codes.max()) + 1)
    G.add_edges(edge_array)
    print(edges.head())
    print("Build Graph Finished...")
    evcent = G.evcent() # eigenvector centrality of each node
    shell_index = G.shell_index() # coreness: largest k such that the node is in a subgraph of minimum degree k
    degree = G.degree() # total degree
    pagerank = G.pagerank() # pagerank

    G_stat = pd.DataFrame()

    # The next four statistics are slow to compute but were found to work
    # fairly well; leave them off to look at the effect of evcent alone.
    if(slow_fea):
        closeness = G.closeness() # based on the average distance to the other nodes
        betweenness = G.betweenness() # betweenness of each node
        constraint = G.constraint()
        eccentricity = G.eccentricity() # largest shortest-path distance from the node to any other node

        G_stat['closeness'] = closeness
        G_stat['betweenness'] = betweenness
        G_stat['constraint'] = constraint
        G_stat['eccentricity'] = eccentricity

    G_stat['evcent'] = evcent
    G_stat['shell_index'] = shell_index
    G_stat['degree'] = degree
    G_stat['pagerank'] = pagerank
    print("PR Finished...")

    G_stat = G_stat.reset_index()
    # fea1 codes are exactly 0 .. n_fea1 - 1, so this keeps the fea1 nodes only.
    # .copy() makes the following column assignment act on an independent frame.
    G_stat = G_stat[G_stat['index'] < n_fea1].copy()
    G_stat['index'] = fea1_lbl.inverse_transform(G_stat['index'])
    print(G_stat.head())
    print(G_stat.shape)
    print("Graph Stat Finished...")

    G_stat.to_csv("{}/Graph_edges_{}.csv".format(pickle_path,prefix),index = False)

    # Release the igraph graph before building the networkx one.
    del G
    gc.collect()

    G = nx.Graph()
    G.add_edges_from(edge_array)
    model = ProNE(G,emb_size=size,n_iter=6,step=12)
    features_matrix = model.fit(model.mat, model.mat)
    model.chebyshev_gaussian(model.mat, features_matrix, model.step, model.mu, model.theta)
    emb = model.transform()
    fea = emb[emb['nodes'] < n_fea1].copy()
    fea['nodes'] = fea1_lbl.inverse_transform(fea['nodes'])
    fea = fea.rename(columns={'nodes' : fea1})
    del G
    gc.collect()
    print("Embedding Finished...")
    print(fea.head())
    fea.to_csv("{}/Graph_prone_{}.csv".format(pickle_path,prefix),index = False)

    return fea,G_stat

In [12]:
# Graph features for the deviceid / all_tag_word pairs, with 32-dimensional embeddings.
fea_tag,stat_tag = get_graph_embedding(deal_tag_data,'tag','deviceid','all_tag_word',size = 32)

Encoder Finished...
   new_deviceid  new_all_tag_word
0             0            137148
1             0            114569
2             0            137159
3             0            138289
4             0            133984
Build Graph Finished...
PR Finished...
                              index  evcent  shell_index  degree  pagerank
0  000046581b8a28c431be90c278674925     0.0          210     305  0.000004
1  00016381ab699d4e76dc99291e79e7a1     0.0            1       1  0.000004
2  0001c7e6a85a3a4498fe0c5f29f3a379     0.0          151     171  0.000004
3  000207c515d01c00e9144c6866b546a7     0.0            8       8  0.000004
4  000355d66e3fe127c8c2dd1ef60322a3     0.0            3       3  0.000004
(114567, 5)
Graph Stat Finished...


  0%|          | 16627/4321790 [00:00<00:25, 166269.37it/s]

(144986, 144986)


100%|██████████| 4321790/4321790 [00:28<00:00, 153557.22it/s]


(144986, 144986)
neg 0.2760143280029297
svd sparse 0.0004111890295775046
sparsesvd time 9.635740756988525
Chebyshev Series -----------------
Bessell time 2 1.653660535812378
Bessell time 3 2.381338596343994
Bessell time 4 3.0825624465942383
Bessell time 5 3.7954540252685547
Bessell time 6 4.532383441925049
Bessell time 7 5.2146995067596436
Bessell time 8 5.950856685638428
Bessell time 9 6.6685028076171875
Bessell time 10 7.374639987945557
Bessell time 11 8.093040943145752
densesvd time 0.2681765556335449
Embedding Finished...
                           deviceid  ProNE_Emb_deviceid_0  \
0  000046581b8a28c431be90c278674925             -0.258863   
1  00016381ab699d4e76dc99291e79e7a1             -0.000344   
2  0001c7e6a85a3a4498fe0c5f29f3a379             -0.262779   
3  000207c515d01c00e9144c6866b546a7             -0.082072   
4  000355d66e3fe127c8c2dd1ef60322a3             -0.085687   

   ProNE_Emb_deviceid_1  ProNE_Emb_deviceid_2  ProNE_Emb_deviceid_3  \
0              0.001034       

In [4]:
# Graph features for the deviceid / applist pairs, with 32-dimensional embeddings.
fea_app,stat_app = get_graph_embedding(deal_device_app,'app','deviceid','applist',size = 32)

Encoder Finished...
         new_deviceid  new_applist
1927384             0       114584
1205409             0       118252
91513               1       118252
867562              2       118252
668188              3       118252
Build Graph Finished...
PR Finished...
                              index  evcent  shell_index  degree  pagerank  \
0  000046581b8a28c431be90c278674925     0.0            2       2  0.000004   
1  00016381ab699d4e76dc99291e79e7a1     0.0            1       1  0.000004   
2  0001c7e6a85a3a4498fe0c5f29f3a379     0.0            1       1  0.000004   
3  000207c515d01c00e9144c6866b546a7     0.0            2       2  0.000004   
4  000355d66e3fe127c8c2dd1ef60322a3     0.0           82      86  0.000004   

   closeness  betweenness  constraint  eccentricity  
0   0.416604          0.0    0.500000           1.0  
1   0.344240          0.0    1.000000           1.0  
2   0.344240          0.0    1.000000           1.0  
3   0.416604          0.0    0.500000         

  1%|          | 16421/2092443 [00:00<00:12, 164200.76it/s]

(140314, 140314)


100%|██████████| 2092443/2092443 [00:12<00:00, 167709.14it/s]


(140314, 140314)
neg 0.1442568302154541
svd sparse 0.00021256003896960154
sparsesvd time 10.008776903152466
Chebyshev Series -----------------
Bessell time 2 0.7940661907196045
Bessell time 3 1.1532483100891113
Bessell time 4 1.5249197483062744
Bessell time 5 1.8984036445617676
Bessell time 6 2.2626779079437256
Bessell time 7 2.6398448944091797
Bessell time 8 3.0125088691711426
Bessell time 9 3.378295421600342
Bessell time 10 3.740447759628296
Bessell time 11 4.108135461807251
densesvd time 0.2875185012817383
Embedding Finished...
                           deviceid  ProNE_Emb_0  ProNE_Emb_1  ProNE_Emb_2  \
0  000046581b8a28c431be90c278674925    -0.249126    -0.019937     0.212572   
1  00016381ab699d4e76dc99291e79e7a1    -0.229955    -0.032861    -0.138496   
2  0001c7e6a85a3a4498fe0c5f29f3a379    -0.229955    -0.032861    -0.138496   
3  000207c515d01c00e9144c6866b546a7    -0.249126    -0.019937     0.212572   
4  000355d66e3fe127c8c2dd1ef60322a3    -0.015703     0.191333    -0.03718

In [4]:
# Show the (rows, columns) of the train and test frames before combining them.
train_data.shape,test_data.shape

((11376681, 15), (3653592, 13))

In [5]:
# Stack train and test rows into one frame; columns missing from one input are
# filled with NaN for that input's rows.
# NOTE(review): ignore_index is not passed, so each input keeps its own row
# labels and the combined index may contain repeated labels -- confirm that no
# later step relies on a unique index.
all_data = pd.concat([train_data,test_data],sort = False,axis = 0)
all_data.shape

(15030273, 15)

In [6]:
# Preview the first rows of the combined train + test frame.
all_data.head()

Unnamed: 0,id,target,timestamp,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
0,1,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,g4,9,112.5385,37.83793,STF-AL00,1573298086436
1,2,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,w,9,111.7312,35.62274,STF-AL00,1573298087570
2,3,0.0,,832aaa33cdf4a0938ba2c795eb3ffefd,4941885624885390992,d51a157d2b1e0e9aed4dd7f9900b85b2,2,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377075934
3,4,0.0,,832aaa33cdf4a0938ba2c795eb3ffefd,6088376349846612406,d51a157d2b1e0e9aed4dd7f9900b85b2,1,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377044359
4,5,0.0,,67dd9dac18cce1a6d79e8f20eefd98ab,5343094189765291622,625dc45744f59ddbc3ec8df161217188,0,2.1.1,xiaomi,w,9,116.7509,36.56831,Redmi Note 7,1573380989662


In [1]:
# Not enough memory on my own machine; this needs to be run on a server.
# Graph features for the deviceid / newsid pairs over train + test, with 48-dimensional embeddings.
fea_train_device_news,stat_train_device_news = get_graph_embedding(all_data,'all_data_dev_news','deviceid','newsid',size = 48)