


## 解题思路

使用深度学习模型构建隐式推荐算法模型，并构建负样本，最终按照模型输出的评分进行排序，做出最终的推荐。具体可以分为以下几个步骤：

- 步骤1：读取数据，对`用户`和`图书`进行编码；
- 步骤2：利用训练集构建负样本；
- 步骤3：使用Paddle构建打分模型；
- 步骤4：对测试集数据进行预测；

### 步骤1：读取数据集

首先我们使用`pandas`读取数据集，并对数据的字段进行编码。这里可以手动构造编码过程，也可以使用`LabelEncoder`来完成。

这一步骤的目的是将`用户`和`图书`的 ID 编码为连续的整数。原始取值并不连续，重新编码后可以减少后续模型所需要的空间。

### 步骤2：构建负样本

由于原始训练集中记录的都是用户已有的图书记录，并不存在负样本。而在预测阶段我们需要预测用户的下一本图书，此时的预测空间是用户与所有图书的关系。

这里构建负样本的操作非常粗暴：直接从用户在训练集中没有记录的图书中随机选择。也可以借助协同过滤的思路来构建负样本，即选择相似用户都没有记录的图书作为负样本。

### 步骤3：Paddle搭建打分模型

这里使用Paddle构建用户与图书的打分模型，借助`Embedding`层来完成具体的匹配过程。这里用最简单的dot来完成匹配，没有构建复杂的模型。

![](https://ai-studio-static-online.cdn.bcebos.com/1aa05bd8d08f44a5b8365c236994fd94d245b7de08ea459fb08b1ab9fe2d423e)

### 步骤4：对测试集进行预测

首先将测试集数据转为模型需要的格式，然后用一行代码完成预测，最后转换为提交格式。


In [1]:
# Copy test_dataset.csv from /home/aistudio/data/data114712 into the current directory.
# NOTE(review): only test_dataset.csv is copied here, while the following cells read
# train_dataset.csv — confirm that file is already present in the working directory.
!cp /home/aistudio/data/data114712/test_dataset.csv ./

In [1]:
import pandas as pd
# Load the training interactions and print a one-line summary.
# The user and book counts are reported as (largest id + 1).
df = pd.read_csv('train_dataset.csv')
print('共{}个用户，{}本图书，{}条记录'.format(
    df['user_id'].max() + 1,
    df['item_id'].max() + 1,
    len(df),
))

df.head()

共53424个用户，10000本图书，5869631条记录


Unnamed: 0,user_id,item_id
0,0,257
1,0,267
2,0,5555
3,0,3637
4,0,1795


In [2]:
import pandas as pd
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import Dataset

# Load the training interactions.
df = pd.read_csv('train_dataset.csv')

# Re-encode users and books as consecutive integers in order of first
# appearance (similar to label encoding); this shrinks the id space.
user_ids = df["user_id"].unique().tolist()
user2user_encoded = {raw_id: code for code, raw_id in enumerate(user_ids)}
userencoded2user = dict(enumerate(user_ids))

book_ids = df["item_id"].unique().tolist()
book2book_encoded = {raw_id: code for code, raw_id in enumerate(book_ids)}
book_encoded2book = dict(enumerate(book_ids))

# Apply the mappings. The encoded book column is named "movie".
df["user"] = df["user_id"].map(user2user_encoded)
df["movie"] = df["item_id"].map(book2book_encoded)

num_users = len(user2user_encoded)
num_books = len(book_encoded2book)

# For every encoded user, collect the list of encoded books in their records.
user_book_dict = df.groupby(['user'])['movie'].apply(list)

In [3]:
# Encoded user -> list of encoded books taken from that user's training records
user_book_dict

user
0        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1        [115, 116, 117, 118, 119, 120, 21, 121, 122, 1...
2        [30, 136, 171, 167, 165, 164, 159, 163, 166, 1...
3        [260, 261, 262, 263, 264, 53, 265, 266, 267, 2...
4        [333, 334, 335, 336, 337, 338, 339, 340, 341, ...
                               ...                        
53419    [2133, 941, 610, 2014, 559, 2973, 914, 1745, 1...
53420    [6, 378, 119, 197, 41, 20, 45, 137, 46, 48, 36...
53421    [5302, 2085, 2082, 2083, 2072, 2073, 2070, 208...
53422    [6792, 8991, 8584, 326, 5248, 4994, 6739, 6738...
53423    [7626, 6201, 6186, 8161, 8158, 8159, 5961, 609...
Name: movie, Length: 53424, dtype: object

# 构造负样本

In [4]:
# Build negative samples: for every user, randomly draw books that do not
# appear in that user's training records.
NUM_NEGATIVES_PER_USER = 100

neg_df = []
book_set = set(book_encoded2book)
for user_idx in user_book_dict.index:
    # Candidate negatives are all encoded books minus the user's own books.
    candidates = list(book_set - set(user_book_dict.loc[user_idx]))
    if not candidates:
        # np.random.choice raises on an empty population; a user who already
        # has every book simply contributes no negatives.
        continue
    # Draw without replacement so the same (user, book) pair is never emitted
    # twice, and never ask for more samples than there are candidates.
    n_samples = min(NUM_NEGATIVES_PER_USER, len(candidates))
    neg_book_idx = np.random.choice(candidates, n_samples, replace=False)
    neg_df.extend([user_idx, x] for x in neg_book_idx)

In [5]:
# Turn the sampled negative pairs into a frame and mark them with label 0.
neg_df = pd.DataFrame(neg_df, columns=['user', 'movie']).assign(label=0)

# Every observed interaction is a positive example (label 1).
df['label'] = 1

# Stack positives on top of negatives, then shuffle all rows.
train_df = pd.concat(
    [frame.loc[:, ['user', 'movie', 'label']] for frame in (df, neg_df)],
    axis=0,
).sample(frac=1)

In [6]:
# Drop the name bound to the raw interaction frame.
del df

# 自定义数据集

In [7]:
# Custom map-style dataset; map-style datasets subclass paddle.io.Dataset.
class SelfDefinedDataset(Dataset):
    """Index-addressable dataset over features and (optionally) labels.

    Args:
        data_x: Indexable collection of samples; its length is the dataset length.
        data_y: Indexable collection of labels aligned with ``data_x``. It is
            not read when ``mode`` is ``'predict'``.
        mode: ``'predict'`` makes ``__getitem__`` return only the sample; any
            other value returns a ``(sample, label)`` pair.
    """

    def __init__(self, data_x, data_y, mode = 'train'):
        super().__init__()
        self.data_x, self.data_y, self.mode = data_x, data_y, mode

    def __getitem__(self, idx):
        """Return the sample at ``idx``, paired with its label unless predicting."""
        sample = self.data_x[idx]
        if self.mode != 'predict':
            return sample, self.data_y[idx]
        return sample

    def __len__(self):
        """Return the number of samples."""
        return len(self.data_x)

In [8]:
# Split the labelled pairs into training and validation sets. No test_size or
# random_state is passed, so the library defaults apply.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    train_df[['user', 'movie']].to_numpy(),
    # Labels as a float32 column vector of shape (n, 1).
    train_df[['label']].to_numpy(dtype=np.float32),
)

In [9]:
# Wrap the training split and inspect its first raw sample.
traindataset = SelfDefinedDataset(x_train, y_train)
for data, label in traindataset:
    print(data.shape, label.shape)
    print(data, label)
    break

# Build the training loader and print the shapes of its first batch.
train_loader = paddle.io.DataLoader(
    traindataset,
    batch_size=1280 * 4,
    shuffle=True,
)
for batch_id, data in enumerate(train_loader):
    x_data, y_data = data[0], data[1]
    print(x_data.shape, y_data.shape, sep='\n')
    break

# Same check for the validation split.
val_dataset = SelfDefinedDataset(x_val, y_val)
val_loader = paddle.io.DataLoader(
    val_dataset,
    batch_size=1280 * 4,
    shuffle=True,
)
for batch_id, data in enumerate(val_loader):
    x_data, y_data = data[0], data[1]
    print(x_data.shape, y_data.shape, sep='\n')
    break

(2,) (1,)
[7268 6400] [1.]
[5120, 2]
[5120, 1]
[5120, 2]
[5120, 1]


# 定义模型

In [10]:
EMBEDDING_SIZE = 32


# Scoring model: embedding dot product plus per-user and per-item biases.
class RecommenderNet(nn.Layer):
    """Score (user, item) pairs with an embedding dot product.

    The score is ``sigmoid(<user_vec, item_vec> + user_bias + item_bias)``.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_movies: Number of rows in the item embedding tables.
        embedding_size: Width of the user and item embedding vectors.
    """

    def __init__(self, num_users, num_movies, embedding_size):
        super().__init__()
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size

        # Sub-layers are created in the same order as before (user embedding,
        # user bias, item embedding, item bias).
        self.user_embedding = nn.Embedding(
            num_users,
            embedding_size,
            weight_attr=self._embedding_attr(),
        )
        self.user_bias = nn.Embedding(num_users, 1)

        self.movie_embedding = nn.Embedding(
            num_movies,
            embedding_size,
            weight_attr=self._embedding_attr(),
        )
        self.movie_bias = nn.Embedding(num_movies, 1)

    @staticmethod
    def _embedding_attr():
        """Return a fresh ParamAttr (L2 decay 1e-6, Kaiming-normal init)."""
        return paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(1e-6),
            initializer=nn.initializer.KaimingNormal(),
        )

    def forward(self, inputs):
        """Return one sigmoid score per input row.

        Args:
            inputs: 2-D tensor whose column 0 holds encoded user ids and
                column 1 holds encoded item ids.

        Returns:
            Tensor of scores in (0, 1) with one row per input row and a
            trailing dimension of size 1.
        """
        user_ids = inputs[:, 0]
        item_ids = inputs[:, 1]

        user_vector = self.user_embedding(user_ids)
        user_bias = self.user_bias(user_ids)
        movie_vector = self.movie_embedding(item_ids)
        movie_bias = self.movie_bias(item_ids)

        # Row-wise inner product written as an explicit reduction with
        # keepdim=True. The output shape of paddle.dot for 2-D inputs is not
        # the same across Paddle releases; a [batch] result would broadcast
        # against the [batch, 1] biases into a [batch, batch] matrix.
        dot_user_movie = paddle.sum(
            user_vector * movie_vector, axis=1, keepdim=True
        )
        x = dot_user_movie + user_bias + movie_bias
        return nn.functional.sigmoid(x)

In [11]:
# Build the network and wrap it in Paddle's high-level Model API.
model = paddle.Model(RecommenderNet(num_users, num_books, EMBEDDING_SIZE))

# Optimizer, loss function and evaluation metric.
optimizer = paddle.optimizer.Adam(
    learning_rate=0.003,
    parameters=model.parameters(),
)
loss = nn.BCELoss()
metric = paddle.metric.Precision()

# VisualDL logging directory and callback.
log_dir = './visualdl'
callback = paddle.callbacks.VisualDL(log_dir=log_dir)

# Train for 5 epochs, evaluating on the validation loader and saving checkpoints.
model.prepare(optimizer=optimizer, loss=loss, metrics=metric)
model.fit(
    train_data=train_loader,
    eval_data=val_loader,
    epochs=5,
    save_dir='./checkpoints',
    verbose=1,
    callbacks=callback,
)

W1231 11:42:31.823208   171 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 12.0, Runtime API Version: 10.1
W1231 11:42:31.828727   171 device_context.cc:422] device: 0, cuDNN Version: 7.6.


The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5


  return (isinstance(seq, collections.Sequence) and


save checkpoint at /home/aistudio/checkpoints/0
Eval begin...
Eval samples: 2803008
Epoch 2/5
save checkpoint at /home/aistudio/checkpoints/1
Eval begin...
Eval samples: 2803008
Epoch 3/5
save checkpoint at /home/aistudio/checkpoints/2
Eval begin...
Eval samples: 2803008
Epoch 4/5
save checkpoint at /home/aistudio/checkpoints/3
Eval begin...
Eval samples: 2803008
Epoch 5/5
save checkpoint at /home/aistudio/checkpoints/4
Eval begin...
Eval samples: 2803008
save checkpoint at /home/aistudio/checkpoints/final


# 预测测试集

In [12]:
# NOTE(review): test_df is never read in this cell; it is kept in case a later
# cell depends on it.
test_df = []

# Number of users scored per prediction round; this bounds the size of the
# candidate table held in memory at once.
USERS_PER_CHUNK = 1000

# Prediction: for every user, score all books absent from their training
# records and write the highest-scoring one to sub.csv.
book_set = set(book_encoded2book)
all_users = user_book_dict.index

# Open the submission file once instead of re-opening it for every user.
with open('sub.csv', 'w') as up:
    up.write('user_id,item_id\n')

    # Stepping by chunk size never produces an empty trailing chunk, even when
    # the number of users is an exact multiple of USERS_PER_CHUNK.
    for start in range(0, len(all_users), USERS_PER_CHUNK):
        test_user_idx = []
        test_book_idx = []
        for user_idx in all_users[start:start + USERS_PER_CHUNK]:
            book_idx = list(book_set - set(user_book_dict.loc[user_idx]))
            test_user_idx += [user_idx] * len(book_idx)
            test_book_idx += book_idx

        if not test_user_idx:
            # No candidate pairs in this chunk, so there is nothing to score.
            continue

        # One row per candidate (user, book) pair.
        test_data = np.array([test_user_idx, test_book_idx]).T
        test_dataset = SelfDefinedDataset(test_data, data_y=None, mode='predict')
        test_loader = paddle.io.DataLoader(test_dataset, batch_size=1280, shuffle=False)

        # The loader already fixes the batch size, so none is passed here.
        test_predict = model.predict(test_loader)
        # Flatten to one score per row regardless of a trailing size-1 axis.
        test_predict = np.concatenate(test_predict[0], 0).reshape(-1)

        test_data = pd.DataFrame(test_data, columns=['user', 'book'])
        test_data['label'] = test_predict

        # Pick the top-scoring book for each user. Grouping by the plain column
        # name keeps group keys scalar (grouping by a one-element list yields
        # tuple keys on recent pandas, which would break the dict lookups).
        best = test_data.loc[test_data.groupby('user')['label'].idxmax()]
        # Rows are written without a space after the comma, matching the header.
        up.writelines(
            f'{userencoded2user[u]},{book_encoded2book[b]}\n'
            for u, b in zip(best['user'], best['book'])
        )
        # Flush after each chunk so finished results reach disk promptly.
        up.flush()

        del test_data, test_dataset, test_loader


import numpy as np
from paddle.io import DataLoader, Dataset

class RandomDataset(Dataset):
    def __getitem__(self, idx):
        data = np.random.random((2, 3)).astype('float32')

        return data

    def __len__(self):
        return 10

dataset = RandomDataset()
loader = DataLoader(dataset, batch_size=1)
data = next(loader())

In Paddle <= 2.0, data is in format '[Tensor(shape=(1, 2, 3), dtype=float32)]', and in Paddle >= 2.1, data is in format 'Tensor(shape=(1, 2, 3), dtype=float32)'



Predict begin...
step   42/7724 [..............................] - ETA: 58s - 8ms/stepstep  102/7724 [..............................] - ETA: 43s - 6ms/st
Predict samples: 9886184
Predict begin...
Predict samples: 9884390
Predict begin...
Predict samples: 9884674
Predict begin...
Predict samples: 9883296
Predict begin...
Predict samples: 9883657
Predict begin...
Predict samples: 9883750
Predict begin...
Predict samples: 9881710
Predict begin...
Predict samples: 9882276
Predict begin...
Predict samples: 9883007
Predict begin...
Predict samples: 9885012
Predict begin...
Predict samples: 9884635
Predict begin...
Predict samples: 9885505
Predict begin...
Predict samples: 9887310
Predict begin...
Predict samples: 9883799
Predict begin...
Predict samples: 9886890
Predict begin...
Predict samples: 9887735
Predict begin...
Predict samples: 9887762
Predict begin...
Predict samples: 9888197
Predict begin...
Predict samples: 9887347
Predict begin...
Predict samples: 9888719
Predict begin...
Predic