In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Input and output directories, given relative to the working directory.
pickle_path = "../pickle"
vector_path = "../vector"

# Load the two pickled tables used by every later cell.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
app_data_nlp = pd.read_pickle("%s/device_new_app.pickle" % pickle_path)
user_fav_nlp = pd.read_pickle("%s/tag_weight_new_data.pickle" % pickle_path)


In [3]:
# Preview the first rows of the app-list table (columns: deviceid, applist, app_len).
app_data_nlp.head()

Unnamed: 0,deviceid,applist,app_len
0,000046581b8a28c431be90c278674925,"[app_133, app_1]",2
1,00016381ab699d4e76dc99291e79e7a1,[app_133],1
2,0001c7e6a85a3a4498fe0c5f29f3a379,[app_133],1
3,000207c515d01c00e9144c6866b546a7,"[app_133, app_1]",2
4,000355d66e3fe127c8c2dd1ef60322a3,"[app_84, app_85, app_4, app_5, app_86, app_87,...",86


In [4]:
# Preview the first rows of the tag table (columns: deviceid, all_tag_word, all_tag_weight).
user_fav_nlp.head()

Unnamed: 0,deviceid,all_tag_word,all_tag_weight
0,000046581b8a28c431be90c278674925,"[美食, --其他, 美食攻略, 花絮片段, 玩具, 吃秀, 社会热点, 中医, 片段, 大...","[0.4171913341996304, 0.36140167938226964, 0.35..."
1,00016381ab699d4e76dc99291e79e7a1,[未知],[0]
2,0001c7e6a85a3a4498fe0c5f29f3a379,"[社会热点, --其他, 古代, 范冰冰, 台湾, 李治廷, 彦希, 灰姑娘, 清朝, 总裁...","[0.8310844893612963, 0.3135020218516166, 6.367..."
3,000207c515d01c00e9144c6866b546a7,"[海军, 航母, 导弹, 武器, 武器, 导弹, 洲际导弹, 大妈, 海军, 航母, 网游,...","[17.15805189101101, 13.780793638746603, 13.220..."
4,000355d66e3fe127c8c2dd1ef60322a3,"[东北, 大盘, 菜谱]","[37.141856323864594, 35.747926949211916, 4.949..."


In [5]:
# GloVe embeddings
# Import only the two classes that are used instead of `from glove import *`,
# so it is clear where every name in this cell comes from.
from glove import Corpus, Glove


def train_glove(sentences, save_path, no_components=300, learning_rate=0.05,
                epochs=12, no_threads=30):
    """Train a GloVe model on token sequences and save it to disk.

    Args:
        sentences: iterable of token lists, one list per device.
        save_path: file the fitted model is written to via ``Glove.save``.
        no_components: dimensionality of the learned vectors.
        learning_rate: learning rate passed to ``Glove``.
        epochs: number of training epochs.
        no_threads: number of training threads; the default of 30 matches the
            original run and should be lowered on smaller machines.

    Returns:
        The fitted ``Glove`` instance with its dictionary attached.
    """
    corpus = Corpus()
    corpus.fit(sentences)  # populates corpus.matrix and corpus.dictionary
    model = Glove(no_components=no_components, learning_rate=learning_rate)
    model.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=1)
    # Attach the token -> id mapping so the saved model can be queried by token.
    model.add_dictionary(corpus.dictionary)
    model.save(save_path)
    return model


# App-list embeddings.
t1 = time.time()
glove = train_glove(app_data_nlp['applist'].values,
                    "{}/app_data_glove300.model".format(vector_path))
print(time.time()-t1)
# Author's recorded elapsed time: ~52 s (varies between runs)

# Favourite-tag embeddings. `glove` is rebound, so it ends up holding this model.
t1 = time.time()
glove = train_glove(user_fav_nlp['all_tag_word'].values,
                    "{}/user_favorite_glove300.model".format(vector_path))
print(time.time()-t1)
# Author's recorded elapsed time: ~179 s (varies between runs)

Performing 12 training epochs with 30 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
36.16325783729553
Performing 12 training epochs with 30 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
121.92749738693237


In [6]:
# Word2Vec embeddings
from gensim import models

# One (token-list column, output file) pair per corpus; both use the same settings.
w2v_jobs = [
    (app_data_nlp['applist'], "{}/app_data_w2v300.model".format(vector_path)),
    (user_fav_nlp['all_tag_word'], "{}/user_fav_w2v300.model".format(vector_path)),
]

for token_series, out_file in w2v_jobs:
    t1 = time.time()
    # `sg` is left at its default; per the original author's note, setting it
    # switches to skip-gram and gave roughly the same results in their tests.
    # NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
    # `vector_size`) - confirm which gensim version this notebook is pinned to.
    w2v = models.Word2Vec(token_series.values, size=300, window=20, workers=40, hs=1)
    # Only the word vectors are written, in word2vec format.
    w2v.wv.save_word2vec_format(out_file)
    print(time.time() - t1)
# Author's recorded elapsed times: ~19 s (app lists) and ~44 s (favourite tags)

19.68525004386902
41.50236511230469


In [7]:
from scipy import sparse

# This pattern keeps single-character tokens; scikit-learn's default pattern
# only matches tokens of two or more characters.
token_pattern = u"(?u)\\b\\w+\\b"

t = []  # TF-IDF matrices: index 0 = app lists, index 1 = favourite tags
c = []  # raw count matrices, same order as `t`

for token_lists in (app_data_nlp['applist'], user_fav_nlp['all_tag_word']):
    t1 = time.time()
    # Join each token list into one space-separated document. This is done once
    # per corpus and shared by both vectorisers.
    docs = token_lists.map(lambda x: ' '.join(x)).values
    tfidf = TfidfVectorizer(analyzer='word', token_pattern=token_pattern, min_df=1, ngram_range=(1, 1))
    t.append(tfidf.fit_transform(docs))
    cv = CountVectorizer(analyzer='word', token_pattern=token_pattern, min_df=1, ngram_range=(1, 1))
    c.append(cv.fit_transform(docs))
    print(time.time() - t1)

# Create the output directory, including missing parents. exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
sparse_dir = "{}/Sparse_Matrix".format(vector_path)
os.makedirs(sparse_dir, exist_ok=True)

sparse.save_npz('{}/app_data_tfidf.npz'.format(sparse_dir), t[0])
sparse.save_npz('{}/user_fav_tfidf.npz'.format(sparse_dir), t[1])

sparse.save_npz('{}/app_data_count.npz'.format(sparse_dir), c[0])
sparse.save_npz('{}/user_fav_count.npz'.format(sparse_dir), c[1])

2.697256326675415
5.720769643783569


In [8]:
# Show the summary repr of the app-list TF-IDF matrix (first entry appended to `t`).
t[0]

<114584x25730 sparse matrix of type '<class 'numpy.float64'>'
	with 2092443 stored elements in Compressed Sparse Row format>

In [9]:
# Show the summary repr of the app-list count matrix (first entry appended to `c`).
c[0]

<114584x25730 sparse matrix of type '<class 'numpy.int64'>'
	with 2092443 stored elements in Compressed Sparse Row format>