In [1]:
import os
import faiss
import random
import numpy as np
import pandas as pd
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

from gensim.models import Word2Vec 

import warnings
warnings.filterwarnings("ignore")



##### 获取数据

In [12]:
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
train_df = train_df[train_df.label==1]
train_df = train_df.sort_values(['user_id', 'click_timestamp'])

test_df = pd.read_csv('../data/ml-1m/test_df.csv')
test_df = test_df[test_df.label==1]
test_df.head()

Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
0,3453,"3484,2295,2330,1839,58,1063,3733,3024,3049,110...","143,3859,275,1506,2474,3105,3510,3248,2225,220...","2327,2633,3316,2694,3657,2386,656,1114,3382,25...",3090,1,2,979932664,50,1,4,80,"4351,19,12,4352,26,4353,3770,2015,4354,3691,0,...",8
3,1218,"769,1247,587,429,1394,2992,3460,2736,1257,1231...","1848,2596,596,398,2503,3111,3289,377,3824,3828...","1484,584,1492,514,1283,1360,2126,2177,424,3015...",2272,1,3,992633493,50,1,7,79,16783388208000000000000,14
4,4744,"3046,1233,958,1179,3091,1211,1212,1208,1191,94...","3099,2298,2861,711,3744,2086,587,591,2082,2724...","2849,1069,848,1248,477,1181,1190,1259,2727,334...",2272,1,4,1038923883,50,1,6,79,16783388208000000000000,14
8,5248,"2287,3290,1063,2106,3187,1016,2793,1596,2850,8...","7,1943,3382,1843,2402,2338,1530,1855,820,2303,...","1721,2356,2554,465,1203,1360,143,594,720,2702,...",2490,1,3,961365313,50,2,5,80,36271143000000000000,5
9,6014,"492,2694,1371,2008,967,2882,1203,1989,1569,337...","2878,2589,1010,1016,1184,3380,1196,3176,1657,1...","1269,907,927,1879,2903,1017,1023,585,2993,2873...",1664,1,3,956780780,50,1,5,78,1497132121536118911913880000000,5


##### DeepWalk

DeepWalk是将随机游走和word2vec结合起来的图向量表示算法。

In [8]:
data = train_df.copy()
sentences = data.groupby(['user_id'])['item_id'].agg({list}).reset_index()['list'].values.tolist()

# 构建一个图，dict的健为节点，值为节点邻居。
graph_dict = {}
# 提取共现商品
for sentence in tqdm(sentences):
    sentence_len = len(sentence)
    for position, item_id in enumerate(sentence):
        # 提取窗口内的组合商品
        for i in range(position-1, position+2):
            if (i<0) | (i>=sentence_len) | (i==position):
                continue
            if item_id not in graph_dict:
                graph_dict[item_id] = [sentence[i]]
            else:
                graph_dict[item_id].append(sentence[i])
                
def get_random_walk(node, path_length=10):
    """ 从当前节点随机游走 """
    random_walk = [node]
    while len(random_walk) <= path_length:
        cur = random_walk[-1]
        _neighbours = graph_dict[cur]
        random_walk.append(random.choice(_neighbours))
    return random_walk
    # 尝试游走过的节点不再游走
#     while len(random_walk) <= path_length:
#         cur = random_walk[-1]
#         _neighbours = list(set(graph_dict[cur])-set(random_walk))
#         if len(_neighbours) == 0:
#             break
#         random_walk.append(random.choice(_neighbours))
#     return random_walk    

random_walks = []
# 遍历所有节点
nodes = list(graph_dict.keys())

# 每个节点随机游走5次，即从当前节点产生5条随机游走序列
for i in range(5): 
    random.shuffle(nodes)
    for node in tqdm(nodes):
        random_walks.append(get_random_walk(node, path_length=20))

print('random_walks len: ', len(random_walks))
w2v_size = 4

# 训练word2vec
model = Word2Vec(random_walks,
                 vector_size=w2v_size,
                 window=10,
                 min_count=1,
                 workers=-1,
                 seed=1024,
                 sg=1,
                 epochs=10)
print('train done...')

# 保存embedding矩阵
vocab = list(model.wv.index_to_key)
w2v_arr = []
for w in vocab:
    w2v_arr.append(list(model.wv[w]))
    
w2v_df = pd.DataFrame()
w2v_df['item_id'] = vocab
w2v_df['vector'] = w2v_arr

# 保存为dict字典
w2v_dict = dict(zip(w2v_df['item_id'].tolist(), w2v_df['vector'].tolist()))

  0%|          | 0/6040 [00:00<?, ?it/s]

  0%|          | 0/3700 [00:00<?, ?it/s]

  0%|          | 0/3700 [00:00<?, ?it/s]

  0%|          | 0/3700 [00:00<?, ?it/s]

  0%|          | 0/3700 [00:00<?, ?it/s]

  0%|          | 0/3700 [00:00<?, ?it/s]

random_walks len:  18500
train done...


In [9]:
# 2. 计算用户兴趣向量。
user_emb_list = np.array([[0.0]*4 for _ in range(test_df.shape[0])])
test_df['items'] = test_df['hist_item_id'].apply(lambda x: [int(i) for i in x.split(',')])
for idx, items in enumerate(test_df['items'].values):
    num = 0
    for item in items[-10:]:
        try:
            user_emb_list[idx] += np.array(w2v_dict[item])
            num += 1
            if w2v_dict[item][0] == 0:
                num -= 1
        except:
            pass
    if num != 0:
        user_emb_list[idx] /= num

In [10]:
# 3. 向量检索，faiss求topN相似物品。
embedding_size = 4
index = faiss.IndexFlatIP(embedding_size)
index.add(np.array(w2v_df['vector'].tolist()).astype('float32'))
D, I = index.search(user_emb_list.astype('float32'), 200)

In [11]:
# 4. 评估召回结果。
def get_recall(true_y, pred_y, top_n=50):
    """ 召回率 """
    return len(set(pred_y[:top_n])&set(true_y)) * 1.0 / len(true_y)

recall_list = []
for i, uid in tqdm(enumerate(test_df['user_id'])):
    preds = [w2v_df['item_id'].values[j] for j in I[i]]
    preds = [pred for pred in preds if pred not in test_df.iloc[idx]['items']]
    recall = get_recall([test_df['item_id'].values[i]], preds, top_n=50)
    recall_list.append(recall)

print('recall: ', np.mean(recall_list))

0it [00:00, ?it/s]

recall:  0.04586092715231788
