# 词向量与主题词挖掘

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
import datetime
import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

pickle_path = "../pickle"

device_app = pd.read_pickle("{}/device_new_app.pickle".format(pickle_path))
tag_data = pd.read_pickle("{}/tag_weight_new_data.pickle".format(pickle_path))

device_app.shape,tag_data.shape

((114584, 3), (114584, 3))

In [2]:
device_app.head()

Unnamed: 0,deviceid,applist,app_len
0,000046581b8a28c431be90c278674925,"[app_133, app_1]",2
1,00016381ab699d4e76dc99291e79e7a1,[app_133],1
2,0001c7e6a85a3a4498fe0c5f29f3a379,[app_133],1
3,000207c515d01c00e9144c6866b546a7,"[app_133, app_1]",2
4,000355d66e3fe127c8c2dd1ef60322a3,"[app_84, app_85, app_4, app_5, app_86, app_87,...",86


In [3]:
tag_data.head()

Unnamed: 0,deviceid,all_tag_word,all_tag_weight
0,000046581b8a28c431be90c278674925,"[美食, --其他, 美食攻略, 花絮片段, 玩具, 吃秀, 社会热点, 中医, 片段, 大...","[0.4171913341996304, 0.36140167938226964, 0.35..."
1,00016381ab699d4e76dc99291e79e7a1,[未知],[0]
2,0001c7e6a85a3a4498fe0c5f29f3a379,"[社会热点, --其他, 古代, 范冰冰, 台湾, 李治廷, 彦希, 灰姑娘, 清朝, 总裁...","[0.8310844893612963, 0.3135020218516166, 6.367..."
3,000207c515d01c00e9144c6866b546a7,"[海军, 航母, 导弹, 武器, 武器, 导弹, 洲际导弹, 大妈, 海军, 航母, 网游,...","[17.15805189101101, 13.780793638746603, 13.220..."
4,000355d66e3fe127c8c2dd1ef60322a3,"[东北, 大盘, 菜谱]","[37.141856323864594, 35.747926949211916, 4.949..."


In [4]:
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument
from glove import *


def get_gensim_feature(now=None,model='word2vec',fea = 'fea_name',size=5,window=10,prefix='active'):
    begin=datetime.datetime.now()
    print("===================================================================")
    print(prefix,model,'begin')
    df = now.copy()
    if os.path.exists("../pickle/{}_{}_emb.pickle".format(prefix,model)):
        return pd.read_pickle("../pickle/{}_{}_emb.pickle".format(prefix,model))
    else:
        dictionary = corpora.Dictionary(df[fea].values)
        corpus = [dictionary.doc2bow(text) for text in df[fea].values]
        if model=='word2vec':
            '''
            word2vec:
            将所有单词映射为向量，如果一个人有多个标签，
            每个标签代表一个词，则吧每个标签映射成的单词向量相加然后取平均
            
            相当于把每一个ｄｅｖｉｃｅｉｄ对应的ａｐｐ，ｔａｇ，映射为一个单词（同维度）
            '''
            if os.path.exists("../vector/w2v_{}.model".format(prefix)):
                w2v = models.KeyedVectors.load_word2vec_format("../vector/w2v_{}.model".format(prefix), binary=False)
            else:
                w2v = models.Word2Vec(df[fea].values, size=size, window=window, workers=40)
                w2v.wv.save_word2vec_format("../vector/w2v_{}.model".format(prefix))
            vocab = list(w2v.wv.vocab.keys())

            # Sentence Embedding

            w2v_feature = np.zeros((df.shape[0],size))
            w2v_feature_avg = np.zeros((df.shape[0],size))

            for i,line in tqdm(enumerate(df[fea].values.tolist())):
                num = 0
                if line == '':
                    w2v_feature_avg[i,:] = np.zeros(size)
                else:
                    for word in line:
                        num += 1
                        vec = w2v[word] if word in vocab else np.zeros(size)
                        w2v_feature[i,:] += vec
                    w2v_feature_avg[i,:] = w2v_feature[i,:] / num
            w2v_avg = pd.DataFrame(w2v_feature_avg)
            w2v_avg = w2v_avg.add_prefix("W2V_AVG_{}_".format(prefix))
            w2v_avg['deviceid'] = df['deviceid']
            df_agg = w2v_avg

        elif model=='lda':
            '''
            LDA文档主题生成模型，也称三层贝叶斯概率模型，包含词、主题和文档三层结构。
            
            本程序是提取出每一行语料的２０个主题词
            col　每一行代表数据中的每一行，有２０个主题词
            '''
            lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=size)
            col = np.zeros((df.shape[0],20))
            ans = lda.get_document_topics(corpus)
            for i in tqdm(range(df.shape[0])):
                for j in ans[i]:
                    col[i][j[0]] = j[1]

            df_agg = pd.DataFrame(col)
            df_agg = df_agg.add_prefix("LDA_TOPIC_{}_".format(prefix))
            df_agg['deviceid'] = df['deviceid']

        elif model=='fasttext':
            
            '''
            fasttext:与word2vec，都是将单词映射为向量
            将所有单词映射为向量，如果一个人有多个标签，
            每个标签代表一个词，则吧每个标签映射成的单词向量相加然后取平均
            
            相当于把每一个ｄｅｖｉｃｅｉｄ对应的ａｐｐ，ｔａｇ，映射为一个单词（同维度）
            
            '''
            
            if os.path.exists("../vector/fasttext_{}.model".format(prefix)):
                fasttext = models.KeyedVectors.load_word2vec_format("../vector/fasttext_{}.model".format(prefix), binary=False)
            else:
                fasttext = models.FastText(df[fea].values, size=size, window=window, workers=40)
                fasttext.wv.save_word2vec_format("../vector/fasttext_{}.model".format(prefix))
            vocab = list(fasttext.wv.vocab.keys())

            fasttext_feature = np.zeros((df.shape[0],size))
            fasttext_feature_avg = np.zeros((df.shape[0],size))

            for i,line in tqdm(enumerate(df[fea].values.tolist())):
                num = 0
                if line == '':
                    fasttext_feature_avg[i,:] = np.zeros(size)
                else:
                    for word in line:
                        num += 1
                        vec = fasttext[word] if word in vocab else np.zeros(size)
                        fasttext_feature[i,:] += vec
                    fasttext_feature_avg[i,:] = fasttext_feature[i,:] / num
            fasttext_avg = pd.DataFrame(fasttext_feature_avg)
            fasttext_avg = fasttext_avg.add_prefix("FASTTEXT_AVG_{}".format(prefix))
            fasttext_avg['deviceid'] = df['deviceid']
            df_agg = fasttext_avg

        elif model=='doc2vec':
            '''
            Doc2vec是一个非常好的技术。它易于使用，效果很好，而且从名称上可以理解，很大程度上基于word2vec
            word2vec是将一个单词映射为向量
            Doc2vec是将一句话或者一片文档映射为向量
            '''
            if os.path.exists("../vector/d2v_{}.model".format(prefix)):
                d2v = models.KeyedVectors.load_word2vec_format("../vector/d2v_{}.model".format(prefix), binary=False)
            else:            
                docs = [TaggedDocument(words=i[1],tags=[str(i[0])]) for i in df[['deviceid',fea]].values]
                d2v = models.Doc2Vec(docs,size=size,window=window,workers=40)
                d2v.wv.save_word2vec_format("../vector/d2v_{}.model".format(prefix))
            vocab = list(d2v.wv.vocab.keys())
            
            d2v_avg = np.zeros((df.shape[0],size))
            
            for index,i in tqdm(enumerate(df[fea].values)):
                line = []
                for j in i:
                    line.append(d2v[j] if j in vocab else 0)
                d2v_avg[index,] = np.mean(line,axis=0)
            
            d2v_avg = pd.DataFrame(d2v_avg)
            d2v_avg = d2v_avg.add_prefix("d2v_AVG_{}".format(prefix))
            d2v_avg['deviceid'] = df['deviceid']
            df_agg = d2v_avg

        elif model=='lsi':
            '''
            主题模型，将每一行样本所对应的主题提取出来
            '''
            lsi = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=size)
            df_agg = []
            for i in tqdm(df[fea].values):
                lsi_ = lsi[dictionary.doc2bow(i)]
                df_agg.append([tmp[1] for tmp in lsi[lsi_]])

            df_agg = pd.DataFrame(df_agg)
            df_agg = df_agg.add_prefix("LSI_TOPIC_{}_".format(prefix))
            df_agg['deviceid'] = df['deviceid']
            
        elif model=='glove':
            '''
            词向量模型，将单词映射为向量
            '''
            matrix =  Corpus()
            matrix.fit(df[fea].values)
            glove = Glove(no_components=size, learning_rate=0.05)
            glove.fit(matrix.matrix,epochs=10,no_threads=30,verbose=1)
            glove.add_dictionary(matrix.dictionary)
            ans = np.zeros((df.shape[0],size))
            for index,i in tqdm(enumerate(df[fea].values)):
                line = []
                for j in i:
                    line.append(glove.word_vectors[glove.dictionary[j]])
                ans[index,] = np.mean(line,axis=0)
            df_agg = pd.DataFrame(ans)
            df_agg = df_agg.add_prefix("Glove_AVG_{}".format(prefix))
            df_agg['deviceid'] = df['deviceid']
            
        df_agg.to_pickle("../pickle/{}_{}_emb.pickle".format(prefix,model))
    end=datetime.datetime.now()
    dur=end-begin
    print(begin)
    print(end)
    print(dur)
    print(prefix,model,'end')
    print("===================================================================")
    return df_agg

In [5]:
from tqdm import tqdm
lsi = get_gensim_feature(device_app,'lsi','applist',32,10,'device_app')
w2v = get_gensim_feature(device_app,'word2vec','applist',64,10,'device_app')
fasttext = get_gensim_feature(device_app,'fasttext','applist',64,10,'device_app')
d2v = get_gensim_feature(device_app,'doc2vec','applist',64,10,'device_app')
lda = get_gensim_feature(device_app,'lda','applist',20,10,'device_app')
glove = get_gensim_feature(device_app,'glove','applist',20,10,'device_app')


lsi_1 = get_gensim_feature(tag_data,'lsi','all_tag_word',32,10,'tag_data')
w2v_1 = get_gensim_feature(tag_data,'word2vec','all_tag_word',64,10,'tag_data')
fasttext_1 = get_gensim_feature(tag_data,'fasttext','all_tag_word',64,10,'tag_data')
d2v_1 = get_gensim_feature(tag_data,'doc2vec','all_tag_word',64,10,'tag_data')
lda_1 = get_gensim_feature(tag_data,'lda','all_tag_word',20,10,'tag_data')
glove_1 = get_gensim_feature(tag_data,'glove','all_tag_word',20,10,'tag_word')


device_app lsi begin


100%|██████████| 114584/114584 [00:33<00:00, 3428.65it/s]


2020-01-21 20:37:48.604555
2020-01-21 20:38:34.202015
0:00:45.597460
device_app lsi end
device_app word2vec begin


114584it [00:32, 3552.01it/s]


2020-01-21 20:38:34.254100
2020-01-21 20:39:15.930744
0:00:41.676644
device_app word2vec end
device_app fasttext begin


114584it [00:35, 3238.00it/s]


2020-01-21 20:39:15.990286
2020-01-21 20:40:37.314681
0:01:21.324395
device_app fasttext end
device_app doc2vec begin


114584it [00:24, 4641.84it/s]


2020-01-21 20:40:37.384237
2020-01-21 20:41:43.749020
0:01:06.364783
device_app doc2vec end
device_app lda begin


100%|██████████| 114584/114584 [00:19<00:00, 6008.29it/s]


2020-01-21 20:41:43.814425
2020-01-21 20:42:25.162747
0:00:41.348322
device_app lda end
device_app glove begin
Performing 10 training epochs with 30 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


114584it [00:02, 44338.38it/s]


2020-01-21 20:42:25.216149
2020-01-21 20:42:37.018814
0:00:11.802665
device_app glove end
tag_data lsi begin


100%|██████████| 114584/114584 [00:36<00:00, 3179.07it/s]


2020-01-21 20:42:37.073969
2020-01-21 20:43:30.151727
0:00:53.077758
tag_data lsi end
tag_data word2vec begin


114584it [02:08, 894.74it/s] 


2020-01-21 20:43:30.247402
2020-01-21 20:45:55.245645
0:02:24.998243
tag_data word2vec end
tag_data fasttext begin


114584it [02:14, 854.24it/s]


2020-01-21 20:45:55.351558
2020-01-21 20:48:41.162880
0:02:45.811322
tag_data fasttext end
tag_data doc2vec begin


114584it [01:45, 1082.17it/s]


2020-01-21 20:48:41.278489
2020-01-21 20:51:21.610242
0:02:40.331753
tag_data doc2vec end
tag_data lda begin


100%|██████████| 114584/114584 [00:27<00:00, 4171.78it/s]


2020-01-21 20:51:21.722276
2020-01-21 20:52:25.805977
0:01:04.083701
tag_data lda end
tag_word glove begin
Performing 10 training epochs with 30 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


114584it [00:04, 26694.89it/s]


2020-01-21 20:52:25.906646
2020-01-21 20:52:58.169498
0:00:32.262852
tag_word glove end


In [6]:
lsi.head()

Unnamed: 0,LSI_TOPIC_device_app_0,LSI_TOPIC_device_app_1,LSI_TOPIC_device_app_2,LSI_TOPIC_device_app_3,LSI_TOPIC_device_app_4,LSI_TOPIC_device_app_5,LSI_TOPIC_device_app_6,LSI_TOPIC_device_app_7,LSI_TOPIC_device_app_8,LSI_TOPIC_device_app_9,...,LSI_TOPIC_device_app_23,LSI_TOPIC_device_app_24,LSI_TOPIC_device_app_25,LSI_TOPIC_device_app_26,LSI_TOPIC_device_app_27,LSI_TOPIC_device_app_28,LSI_TOPIC_device_app_29,LSI_TOPIC_device_app_30,LSI_TOPIC_device_app_31,deviceid
0,0.114507,-0.206172,0.271877,0.224235,0.220379,0.095993,0.036288,-0.097078,-0.005242,-0.021594,...,-0.022837,-0.005368,0.036916,0.013208,-0.003303,0.002353,-0.00887,-0.011904,0.014061,000046581b8a28c431be90c278674925
1,-0.007949,0.010919,0.04071,-0.002762,0.003955,-0.000429,0.010979,-0.002107,-0.012347,-0.002757,...,-0.002875,-0.010878,0.000184,0.02254,0.005806,-0.006936,0.000451,0.024676,-0.004441,00016381ab699d4e76dc99291e79e7a1
2,-0.007949,0.010919,0.04071,-0.002762,0.003955,-0.000429,0.010979,-0.002107,-0.012347,-0.002757,...,-0.002875,-0.010878,0.000184,0.02254,0.005806,-0.006936,0.000451,0.024676,-0.004441,0001c7e6a85a3a4498fe0c5f29f3a379
3,0.114507,-0.206172,0.271877,0.224235,0.220379,0.095993,0.036288,-0.097078,-0.005242,-0.021594,...,-0.022837,-0.005368,0.036916,0.013208,-0.003303,0.002353,-0.00887,-0.011904,0.014061,000207c515d01c00e9144c6866b546a7
4,0.722966,0.371006,0.044857,-0.06087,2.428558,0.31016,1.942502,3.294945,2.506571,0.813158,...,0.090128,0.224837,0.330095,0.253708,-0.072954,-0.025526,0.114629,-0.133873,-0.103463,000355d66e3fe127c8c2dd1ef60322a3


In [27]:
w2v.head()

Unnamed: 0,W2V_AVG_device_app_0,W2V_AVG_device_app_1,W2V_AVG_device_app_2,W2V_AVG_device_app_3,W2V_AVG_device_app_4,W2V_AVG_device_app_5,W2V_AVG_device_app_6,W2V_AVG_device_app_7,W2V_AVG_device_app_8,W2V_AVG_device_app_9,...,W2V_AVG_device_app_55,W2V_AVG_device_app_56,W2V_AVG_device_app_57,W2V_AVG_device_app_58,W2V_AVG_device_app_59,W2V_AVG_device_app_60,W2V_AVG_device_app_61,W2V_AVG_device_app_62,W2V_AVG_device_app_63,deviceid
0,-0.837289,-0.962945,0.335245,-0.298538,-0.122355,-0.181348,-0.326044,-0.246627,-0.569171,-0.564219,...,0.662901,0.145727,0.398236,0.260711,-0.471206,-0.056578,-0.954686,-0.079862,-0.944546,832aaa33cdf4a0938ba2c795eb3ffefd
1,-0.507575,2.159136,0.950175,0.049325,-1.362319,0.422763,0.956467,0.356363,0.706265,0.97776,...,-0.033364,-0.613184,-0.827865,0.458838,0.67319,0.677139,1.178569,-0.876223,0.893812,67dd9dac18cce1a6d79e8f20eefd98ab
2,-0.737489,0.225597,-0.241625,0.410643,-0.713357,-0.043094,-0.515331,0.262719,0.57588,-0.548797,...,-0.240079,-0.077406,0.244009,0.261164,-0.448996,0.044729,0.457355,-0.091712,-0.405887,ddaa88b573f0ec579486de4df7852871
3,-0.737489,0.225597,-0.241625,0.410643,-0.713357,-0.043094,-0.515331,0.262719,0.57588,-0.548797,...,-0.240079,-0.077406,0.244009,0.261164,-0.448996,0.044729,0.457355,-0.091712,-0.405887,132cc4746b2ca645b37d64717bf2ccbd
4,-0.878887,-1.562962,0.445899,-0.586285,-0.222868,-0.823936,-0.37668,-0.409585,-0.557595,-0.422835,...,0.390522,-0.324884,-0.133656,0.843081,-0.77451,-0.222063,-1.439104,-0.097608,-1.106776,19ffd9b567a0a0863a72aee342d2ce9d


In [28]:
fasttext.head()

Unnamed: 0,FASTTEXT_AVG_device_app0,FASTTEXT_AVG_device_app1,FASTTEXT_AVG_device_app2,FASTTEXT_AVG_device_app3,FASTTEXT_AVG_device_app4,FASTTEXT_AVG_device_app5,FASTTEXT_AVG_device_app6,FASTTEXT_AVG_device_app7,FASTTEXT_AVG_device_app8,FASTTEXT_AVG_device_app9,...,FASTTEXT_AVG_device_app55,FASTTEXT_AVG_device_app56,FASTTEXT_AVG_device_app57,FASTTEXT_AVG_device_app58,FASTTEXT_AVG_device_app59,FASTTEXT_AVG_device_app60,FASTTEXT_AVG_device_app61,FASTTEXT_AVG_device_app62,FASTTEXT_AVG_device_app63,deviceid
0,0.124794,0.034739,-0.719205,1.050862,-0.127532,-0.447333,0.477138,-0.211633,-0.540445,-0.414669,...,-0.107022,0.15781,-1.313741,0.625771,0.04752,-0.625732,0.028033,0.001074,-0.167558,832aaa33cdf4a0938ba2c795eb3ffefd
1,-0.070494,2.918461,-2.427351,2.080716,1.080389,-0.368422,-0.193794,1.613857,0.258666,1.65849,...,0.080365,-1.123476,-2.154265,1.151413,0.788787,-0.352311,0.743275,-0.711996,-0.797587,67dd9dac18cce1a6d79e8f20eefd98ab
2,1.143714,-1.698895,-4.565188,3.534946,-0.752508,-2.546585,1.666796,1.92806,2.023513,-1.321478,...,-2.042135,0.035817,0.305704,1.095341,-1.637624,0.549181,0.721665,-1.366975,-1.904164,ddaa88b573f0ec579486de4df7852871
3,1.143714,-1.698895,-4.565188,3.534946,-0.752508,-2.546585,1.666796,1.92806,2.023513,-1.321478,...,-2.042135,0.035817,0.305704,1.095341,-1.637624,0.549181,0.721665,-1.366975,-1.904164,132cc4746b2ca645b37d64717bf2ccbd
4,-0.308178,0.16293,-1.142729,1.638301,0.531207,0.13787,0.804526,-0.432127,-0.089145,-0.273761,...,-0.70238,0.266379,-0.97187,0.617181,-0.233036,-0.972137,0.098042,-0.253425,-0.110168,19ffd9b567a0a0863a72aee342d2ce9d


In [29]:
d2v.head()

Unnamed: 0,d2v_AVG_device_app0,d2v_AVG_device_app1,d2v_AVG_device_app2,d2v_AVG_device_app3,d2v_AVG_device_app4,d2v_AVG_device_app5,d2v_AVG_device_app6,d2v_AVG_device_app7,d2v_AVG_device_app8,d2v_AVG_device_app9,...,d2v_AVG_device_app55,d2v_AVG_device_app56,d2v_AVG_device_app57,d2v_AVG_device_app58,d2v_AVG_device_app59,d2v_AVG_device_app60,d2v_AVG_device_app61,d2v_AVG_device_app62,d2v_AVG_device_app63,deviceid
0,0.257194,-1.065161,-0.479834,-1.010578,-1.013204,-0.410067,-0.299132,0.198686,-1.321432,0.40773,...,0.87054,-0.399427,0.489786,-0.113294,0.139536,0.774823,-1.829945,-1.722254,-2.36377,832aaa33cdf4a0938ba2c795eb3ffefd
1,-0.138025,1.392321,0.678663,-0.182792,-2.496152,0.893098,1.613436,-0.289379,2.069869,-0.187481,...,-0.338117,-0.891907,-0.50964,0.378947,0.274024,0.904589,0.692853,-0.377474,0.179152,67dd9dac18cce1a6d79e8f20eefd98ab
2,-0.843444,0.39085,-0.15183,-0.1626,-0.84911,-0.149006,-0.11637,-0.190058,0.486284,-0.462362,...,-0.490685,-0.010003,0.23825,-0.219885,-0.151092,0.001911,0.010987,-0.677154,-0.404447,ddaa88b573f0ec579486de4df7852871
3,-0.843444,0.39085,-0.15183,-0.1626,-0.84911,-0.149006,-0.11637,-0.190058,0.486284,-0.462362,...,-0.490685,-0.010003,0.23825,-0.219885,-0.151092,0.001911,0.010987,-0.677154,-0.404447,132cc4746b2ca645b37d64717bf2ccbd
4,0.207528,-1.215046,-0.626735,-1.278921,-1.236913,-0.7125,-0.374479,0.129522,-1.365392,0.123608,...,0.666899,-0.67829,0.586811,0.357522,-0.014119,0.98258,-2.052916,-2.245322,-2.520983,19ffd9b567a0a0863a72aee342d2ce9d


In [30]:
lda.head()

Unnamed: 0,LDA_TOPIC_device_app_0,LDA_TOPIC_device_app_1,LDA_TOPIC_device_app_2,LDA_TOPIC_device_app_3,LDA_TOPIC_device_app_4,LDA_TOPIC_device_app_5,LDA_TOPIC_device_app_6,LDA_TOPIC_device_app_7,LDA_TOPIC_device_app_8,LDA_TOPIC_device_app_9,...,LDA_TOPIC_device_app_11,LDA_TOPIC_device_app_12,LDA_TOPIC_device_app_13,LDA_TOPIC_device_app_14,LDA_TOPIC_device_app_15,LDA_TOPIC_device_app_16,LDA_TOPIC_device_app_17,LDA_TOPIC_device_app_18,LDA_TOPIC_device_app_19,deviceid
0,0.027667,0.0,0.018205,0.0,0.0,0.512197,0.0,0.0,0.0,0.0,...,0.0,0.0,0.414606,0.0,0.0,0.0,0.0,0.018065,0.0,832aaa33cdf4a0938ba2c795eb3ffefd
1,0.0,0.018236,0.265793,0.0,0.0,0.0,0.0,0.0,0.702261,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67dd9dac18cce1a6d79e8f20eefd98ab
2,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,...,0.025,0.025,0.025,0.025,0.025,0.525,0.025,0.025,0.025,ddaa88b573f0ec579486de4df7852871
3,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,...,0.025,0.025,0.025,0.025,0.025,0.525,0.025,0.025,0.025,132cc4746b2ca645b37d64717bf2ccbd
4,0.069729,0.0,0.105682,0.0,0.0,0.566115,0.0,0.0,0.0,0.0,...,0.0,0.0,0.246352,0.0,0.0,0.0,0.0,0.0,0.0,19ffd9b567a0a0863a72aee342d2ce9d


In [31]:
glove.head()

Unnamed: 0,Glove_AVG_device_app0,Glove_AVG_device_app1,Glove_AVG_device_app2,Glove_AVG_device_app3,Glove_AVG_device_app4,Glove_AVG_device_app5,Glove_AVG_device_app6,Glove_AVG_device_app7,Glove_AVG_device_app8,Glove_AVG_device_app9,...,Glove_AVG_device_app11,Glove_AVG_device_app12,Glove_AVG_device_app13,Glove_AVG_device_app14,Glove_AVG_device_app15,Glove_AVG_device_app16,Glove_AVG_device_app17,Glove_AVG_device_app18,Glove_AVG_device_app19,deviceid
0,-0.077326,-0.508028,0.17866,-0.208136,-0.08133,-0.109943,0.372201,-0.221732,-0.398203,-0.509056,...,0.416366,0.35048,0.423177,-0.393796,-0.394553,0.217302,0.471634,0.471688,0.460088,832aaa33cdf4a0938ba2c795eb3ffefd
1,-0.027904,-0.530132,-0.409862,-0.246644,0.243521,-0.265671,0.397971,-0.31898,-0.233059,-0.561119,...,0.517736,0.320563,0.359518,-0.473276,-0.194529,0.373079,0.473408,0.515563,0.394593,67dd9dac18cce1a6d79e8f20eefd98ab
2,0.412725,-0.33526,-0.209209,0.455962,-0.699583,-0.501481,0.259683,-0.15531,-0.514994,-0.280712,...,0.336324,0.699503,0.075734,-0.290612,-0.562938,0.171691,0.298542,0.248143,0.494686,ddaa88b573f0ec579486de4df7852871
3,0.412725,-0.33526,-0.209209,0.455962,-0.699583,-0.501481,0.259683,-0.15531,-0.514994,-0.280712,...,0.336324,0.699503,0.075734,-0.290612,-0.562938,0.171691,0.298542,0.248143,0.494686,132cc4746b2ca645b37d64717bf2ccbd
4,0.026485,-0.516828,0.129324,-0.24452,-0.088944,-0.084784,0.39208,-0.173879,-0.375044,-0.517637,...,0.451631,0.444376,0.432516,-0.391968,-0.419878,0.256217,0.488324,0.489212,0.447902,19ffd9b567a0a0863a72aee342d2ce9d
