In [47]:
import os
import faiss
import random
import numpy as np
import pandas as pd
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

from gensim.models import Word2Vec 

import warnings
warnings.filterwarnings("ignore")

##### 获取数据

In [38]:
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
train_df = train_df[train_df.label==1]
train_df = train_df.sort_values(['user_id', 'click_timestamp'])

test_df = pd.read_csv('../data/ml-1m/test_df.csv')
test_df = test_df[test_df.label==1]

In [39]:
train_df.head()

Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
95513,1,"3118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","3118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1251,1,5,978300055,1,2,1,66,2040812218500000000000,5
383079,1,"3118,1251,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","3118,1251,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1673,1,4,978300055,2,2,1,78,271200000000000000,8
835777,1,"3118,1251,1673,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","3118,1251,1673,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1010,1,5,978300055,3,2,1,31,181500000000000000,3
316861,1,"3118,1251,1673,1010,0,0,0,0,0,0,0,0,0,0,0,0,0,...","3118,1251,1673,1010,0,0,0,0,0,0,0,0,0,0,0,0,0,...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",2272,1,3,978300103,4,2,1,79,16783388208000000000000,14
1004659,1,"3118,1251,1673,1010,2272,0,0,0,0,0,0,0,0,0,0,0...","3118,1251,1673,1010,2272,0,0,0,0,0,0,0,0,0,0,0...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",1769,1,5,978300172,5,2,1,79,199424112826260000000000,8


##### Item2vec

根据用户行为序列训练每个Item的embedding向量。

In [48]:
# 1. 根据用户行为序列训练word2vec模型，并获取item embedding。
data = train_df.copy()

by = 'user_id'
key = 'item_id' # key embedding
w2v_size = 4

# 获取二维序列数组key_list
# [[1,2,3], [4,5,6,7]...]
user_keys_df = data.groupby(by).apply(lambda x: list(x[key])).reset_index().rename(columns={0: f'{key}_list'})
key_list = list(user_keys_df[f'{key}_list'].values)
print('key_list len: ', len(key_list))

# 训练word2vec
model = Word2Vec(key_list,
                 vector_size=w2v_size,
                 window=10,
                 min_count=1,
                 workers=-1,
                 seed=1024,
                 sg=1,
                 epochs=10)
print('train done...')

# 保存embedding矩阵
vocab = list(model.wv.index_to_key)
w2v_arr = []
for w in vocab:
    w2v_arr.append(list(model.wv[w]))
    
w2v_df = pd.DataFrame()
w2v_df[key] = vocab
w2v_df['vector'] = w2v_arr

# 保存为dict字典
w2v_dict = dict(zip(w2v_df['item_id'].tolist(), w2v_df['vector'].tolist()))



key_list len:  6040




train done...


##### DeepWalk

DeepWalk是将随机游走和word2vec结合起来的图向量表示算法。

In [None]:
data = train_df.copy()
sentences = data.groupby(['user_id'])['item_id'].agg({list}).reset_index()['list'].values.tolist()

# 构建一个图，dict的健为节点，值为节点邻居。
graph_dict = {}
# 提取共现商品
for sentence in tqdm(sentences):
    sentence_len = len(sentence)
    for position, item_id in enumerate(sentence):
        # 提取窗口内的组合商品
        for i in range(position-1, position+2):
            if (i<0) | (i>=sentence_len) | (i==position):
                continue
            if item_id not in graph_dict:
                graph_dict[item_id] = [sentence[i]]
            else:
                graph_dict[item_id].append(sentence[i])
                
def get_random_walk(node, path_length=10):
    """ 从当前节点随机游走 """
    random_walk = [node]
#     while len(random_walk) <= path_length:
#         cur = random_walk[-1]
#         _neighbours = graph_dict[cur]
#         random_walk.append(random.choice(_neighbours))
#     return random_walk
    # 尝试游走过的节点不再游走，速度过慢，故放弃。
    while len(random_walk) <= path_length:
        cur = random_walk[-1]
        _neighbours = list(set(graph_dict[cur])-set(random_walk))
        if len(_neighbours) == 0:
            break
        random_walk.append(random.choice(_neighbours))
    return random_walk    

random_walks = []
# 遍历所有节点
nodes = list(graph_dict.keys())

# 每个节点随机游走5次，即从当前节点产生5条随机游走序列
for i in range(5): 
    random.shuffle(nodes)
    for node in tqdm(nodes):
        random_walks.append(get_random_walk(node, path_length=20))

print('random_walks len: ', len(random_walks))
w2v_size = 4

# 训练word2vec
model = Word2Vec(random_walks,
                 vector_size=w2v_size,
                 window=10,
                 min_count=1,
                 workers=-1,
                 seed=1024,
                 sg=1,
                 epochs=10)
print('train done...')

# 保存embedding矩阵
vocab = list(model.wv.index_to_key)
w2v_arr = []
for w in vocab:
    w2v_arr.append(list(model.wv[w]))
    
w2v_df = pd.DataFrame()
w2v_df['item_id'] = vocab
w2v_df['vector'] = w2v_arr

# 保存为dict字典
w2v_dict = dict(zip(w2v_df['item_id'].tolist(), w2v_df['vector'].tolist()))

In [55]:
# 2. 计算用户兴趣向量。
user_emb_list = np.array([[0.0]*4 for _ in range(test_df.shape[0])])
test_df['items'] = test_df['hist_item_id'].apply(lambda x: [int(i) for i in x.split(',')])
for idx, items in enumerate(test_df['items'].values):
    num = 0
    for item in items[-10:]:
        try:
            user_emb_list[idx] += np.array(w2v_dict[item])
            num += 1
            if w2v_dict[item][0] == 0:
                num -= 1
        except:
            pass
    if num != 0:
        user_emb_list[idx] /= num

In [56]:
# 3. 向量检索，faiss求topN相似物品。
embedding_size = 4
index = faiss.IndexFlatIP(embedding_size)
index.add(np.array(w2v_df['vector'].tolist()).astype('float32'))
D, I = index.search(user_emb_list.astype('float32'), 200)

In [57]:
# 4. 评估召回结果。
def get_recall(true_y, pred_y, top_n=50):
    """ 召回率 """
    return len(set(pred_y[:top_n])&set(true_y)) * 1.0 / len(true_y)

recall_list = []
for i, uid in tqdm(enumerate(test_df['user_id'])):
    preds = [w2v_df['item_id'].values[j] for j in I[i]]
    preds = [pred for pred in preds if pred not in test_df.iloc[idx]['items']]
    recall = get_recall([test_df['item_id'].values[i]], preds, top_n=50)
    recall_list.append(recall)

print('recall: ', np.mean(recall_list))

0it [00:00, ?it/s]

recall:  0.04486754966887417
