In [1]:
import pandas as pd
import numpy as np
import re
from gensim.models import KeyedVectors
from tqdm import tqdm

## 1 计算文章关键词向量

In [2]:
# Read the user log and the essay keyword table; from the latter only the
# item id and the raw keyword-weight column are kept.
user_data = pd.read_csv('../users30Days_1203.csv')
item_data = pd.read_csv('../essays_keywords1204.csv')[['itemid', '最终关键词权重']]

#useritem_join = pd.read_csv('../1203_40%.csv')

In [3]:
#user_filterData = pd.DataFrame(useritem_join['deviceid'].unique(),columns=['deviceid'])
#item_filterData = pd.DataFrame(useritem_join['itemid'].unique(),columns=['itemid'])

In [4]:
#user_data = pd.merge(user_data,user_filterData,on='deviceid')
#item_data = pd.merge(item_data,item_filterData,on='itemid')

In [5]:
# Load the pre-trained word vectors (word2vec format) into the notebook-level
# `vec` object, which the vector-averaging functions further down look words
# up in. The file name suggests 100-dimensional vectors — TODO confirm.
vec = KeyedVectors.load_word2vec_format('./model_size100_mincount50_title3.vector')

### 1.1 处理符号

In [6]:
def drop_symbols(data):
    """Parse a stringified keyword/weight list into a normalised weight dict.

    The input is expected to look like ``"[('word', 0.8), ('other', 0.4)]"``.
    Quotes, parentheses, square brackets and spaces are stripped, the rest is
    split on commas and read as alternating keyword / weight tokens. Every
    weight is divided by the first weight in the list, so the first keyword
    always maps to ``1.0``.

    Parameters
    ----------
    data : str
        Raw cell content. Any non-string value (for example the float NaN
        pandas produces for an empty CSV cell) is treated as "no keywords".

    Returns
    -------
    dict
        ``{keyword: weight / first_weight}``. Empty when there is no complete
        keyword/weight pair. A keyword that occurs twice keeps its last weight.

    Raises
    ------
    ValueError
        If a weight token cannot be converted to ``float``.
    ZeroDivisionError
        If the first weight in the list is zero.
    """
    # Missing cells arrive as float NaN (or None), which has no str methods;
    # returning an empty dict lets downstream averaging fall back to zeros.
    if not isinstance(data, str):
        return {}

    # Remove all decoration characters in a single pass.
    tokens = data.translate(str.maketrans('', '', "'()[] ")).split(',')

    # Tokens alternate keyword, weight; a dangling trailing token is ignored.
    pair_count = len(tokens) // 2
    if pair_count == 0:
        return {}

    reference_weight = float(tokens[1])
    key_dict = {}
    for pair in range(pair_count):
        key_dict[tokens[2 * pair]] = float(tokens[2 * pair + 1]) / reference_weight
    return key_dict

In [7]:
# Register tqdm's `progress_apply` on pandas objects, then turn every raw
# keyword-weight string into a {keyword: weight} dict.
tqdm.pandas(desc="my bar!")
item_data['KeyWords_WithWeight'] = (
    item_data['最终关键词权重']
    .progress_apply(drop_symbols)
)

  from pandas import Panel
my bar!: 100%|████████████████████████████████████████████████████████████████| 21043/21043 [00:00<00:00, 88652.32it/s]


### 1.2 计算文章关键词向量(embedding vector)

In [8]:
def compute_essay_vectors(data, model=None):
    """Average the word vectors of an essay's keywords.

    Only the keys of ``data`` are used; the weights stored as values do not
    influence the result (it is a plain, unweighted mean).

    Parameters
    ----------
    data : dict
        Mapping of keyword -> weight for one essay.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and
        ``model.vector_size``. Defaults to the notebook-level ``vec``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(1, model.vector_size)``: the mean vector
        of all keywords found in the model, or all zeros if none were found.
    """
    if model is None:
        model = vec  # notebook-level KeyedVectors loaded earlier

    vector_sum = np.zeros([1, model.vector_size], dtype=np.float32)
    matched = 0
    for word in data.keys():
        # `word in model` is a hash lookup; the previous
        # `word in vec.index2word` scanned the whole vocabulary list for
        # every keyword and relies on an attribute removed in gensim 4.
        if word in model:
            matched += 1
            vector_sum += model[word]

    if matched:
        return vector_sum / matched
    return vector_sum

In [9]:
# One row per essay, 100 columns for the averaged keyword embedding.
essay_array = np.empty((len(item_data), 100))
for row in tqdm(range(len(essay_array))):
    essay_array[row, :] = compute_essay_vectors(item_data.loc[row, 'KeyWords_WithWeight'])

100%|██████████████████████████████████████████████████████████████████████████| 21043/21043 [00:05<00:00, 3694.71it/s]


### 1.3 生成文章词向量数据，并保存

In [10]:
# Put the item ids next to their embedding columns (essayVectors_0 .. essayVectors_99).
essay_data = pd.concat(
    [
        item_data['itemid'],
        pd.DataFrame(
            essay_array,
            columns=['essayVectors_{}'.format(dim) for dim in range(100)],
        ),
    ],
    axis=1,
)

In [11]:
# Persist the essay vectors. `index=False` is not passed, so the DataFrame
# index is written as the first, unnamed CSV column.
essay_data.to_csv('./essay_vectors1203.csv')

## 2 计算用户关键词向量

In [12]:
# Merge user data with essay data: inner join on `itemid`, so every user row
# carries the keyword columns of the essay it refers to.
data = pd.merge(user_data, item_data, on='itemid')

### 2.1 提取用户关键词

In [13]:
def extract_user_keywords(data, top_n=30):
    """Return the highest-weighted keywords across all rows of one user.

    The ``KeyWords_WithWeight`` dicts of every row are merged into a single
    dict (when a keyword occurs in several rows, the weight from the last row
    wins), and the keywords are then ranked by weight, highest first.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to one user; must contain a ``KeyWords_WithWeight``
        column whose cells are ``{keyword: weight}`` dicts.
    top_n : int, optional
        Maximum number of keywords to return (default 30).

    Returns
    -------
    list of str
        At most ``top_n`` keywords, ordered by descending weight.
    """
    merged_weights = {}
    for keyword_weights in data['KeyWords_WithWeight']:
        merged_weights.update(keyword_weights)

    # Rank by the weight (the dict value). The earlier version sorted the keys
    # with `key=lambda x: x[1]`, i.e. by the second character of each keyword,
    # which ignored the weights and failed on one-character keywords.
    ranked = sorted(merged_weights.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]

In [14]:
# `progress_apply` is available because tqdm.pandas(...) was registered in an
# earlier cell. After reset_index the keyword lists live in the column named 0.
user_keywords = (
    data.groupby('deviceid')
    .progress_apply(extract_user_keywords)
    .reset_index()
)

my bar!: 100%|███████████████████████████████████████████████████████████████| 137304/137304 [00:22<00:00, 6111.52it/s]


In [15]:
#保存用户关键词数据
#user_keywords.to_csv('./users_keywords_1204.csv',encoding='utf-8_sig')
#保存用户关键词数据

### 2.2 计算用户关键词向量

In [16]:
def compute_user_vectors(data, model=None):
    """Average the word vectors of a user's keywords.

    Parameters
    ----------
    data : iterable of str
        The user's keywords.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and
        ``model.vector_size``. Defaults to the notebook-level ``vec``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(1, model.vector_size)``: the unweighted
        mean vector of all keywords found in the model, or all zeros if none
        were found.
    """
    if model is None:
        model = vec  # notebook-level KeyedVectors loaded earlier

    vector_sum = np.zeros([1, model.vector_size], dtype=np.float32)
    matched = 0
    for word in data:
        # Hash-based membership test; `word in vec.index2word` walked the
        # entire vocabulary list per keyword and depends on an attribute
        # that gensim 4 removed.
        if word in model:
            matched += 1
            vector_sum += model[word]

    if matched:
        return vector_sum / matched
    return vector_sum

In [17]:
# One row per user, 100 columns for the averaged keyword embedding.
user_array = np.zeros((len(user_keywords), 100))
for row in tqdm(range(len(user_array))):
    user_array[row, :] = compute_user_vectors(user_keywords.loc[row, 0])

100%|█████████████████████████████████████████████████████████████████████████| 137304/137304 [03:10<00:00, 720.08it/s]


### 2.3 保存用户关键词向量

In [18]:
%%time
# Put the device ids next to their embedding columns (userVectors_0 .. userVectors_99).
user_vectors = pd.concat(
    [
        user_keywords[['deviceid']],
        pd.DataFrame(
            user_array,
            columns=['userVectors_{}'.format(dim) for dim in range(100)],
        ),
    ],
    axis=1,
)

Wall time: 181 ms


In [19]:
# Persist the user vectors. `index=False` is not passed, so the DataFrame
# index is written as the first, unnamed CSV column.
user_vectors.to_csv('./user_vectors1203.csv')