#### 调库

In [15]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sklearn.metrics import ndcg_score
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### FM

In [16]:
class FM(nn.Module):
    # latent_dim是离散特征隐向量的维度, feature_num是特征的数量
    def __init__(self, feature_num, latent_dim):
        super(FM, self).__init__()
        self.latent_dim = latent_dim
        # 下面定义了三个矩阵
        self.w0 = nn.Parameter(torch.zeros([1, ]))
        self.w1 = nn.Parameter(torch.rand([feature_num, 1]))
        self.w2 = nn.Parameter(torch.rand([feature_num, latent_dim]))

    def forward(self, Input):
        # 一阶交叉
        order_1st = self.w0 + torch.mm(Input, self.w1)
        # 二阶交叉
        order_2nd = 1 / 2 * torch.sum(
            torch.pow(torch.mm(Input, self.w2), 2) - torch.mm(torch.pow(Input, 2), torch.pow(self.w2, 2)), dim=1,
            keepdim=True)
        return order_1st + order_2nd

#### DNN

In [17]:
class DNN(nn.Module):
    def __init__(self, hidden, dropout=0):
        super(DNN, self).__init__()
        # 相邻的hidden层, Linear用于设置全连接层
        # ModuleList可以将nn.Module的子类加入到List中
        self.dnn = nn.ModuleList([nn.Linear(layer[0], layer[1]) for layer in list(zip(hidden[:-1], hidden[1:]))])
        # dropout用于训练, 代表前向传播中有多少概率神经元不被激活
        # 为了减少过拟合
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        for linear in self.dnn:
            x = linear(x)
            # relu激活函数
            x = F.relu(x)
        x = self.dropout(x)
        return x

#### DeepFM

In [18]:
class DeepFM(nn.Module):
    def __init__(self, hidden, feature_col, dropout=0):
        super(DeepFM, self).__init__()
        # 连续型特征和离散型特征
        self.dense_col, self.sparse_col = feature_col
        self.embedding_layer = nn.ModuleDict({"embedding" + str(i): nn.Embedding(num_embeddings=feature["feature_num"],
                                                                                 embedding_dim=feature["embedding_dim"])
                                              for i, feature in enumerate(self.sparse_col)})

        self.feature_num = len(self.dense_col) + len(self.sparse_col) * self.sparse_col[0]["embedding_dim"]
        # 将feature_num插入到hidden的开头
        hidden.insert(0, self.feature_num)

        self.fm = FM(self.feature_num, self.sparse_col[0]["embedding_dim"])
        self.dnn = DNN(hidden, dropout)
        # 最终输出, 将最后一层输入然后输出一维的结果
        self.final = nn.Linear(hidden[-1], 1)

    def forward(self, x):
        sparse_input, dense_input = x[:, :len(self.sparse_col)], x[:, len(self.sparse_col):]
        sparse_input = sparse_input.long()
        sparse_embed = [self.embedding_layer["embedding" + str(i)](sparse_input[:, i]) for i in range(sparse_input.shape[1])]
        # 按照最后一个维度拼接
        sparse_embed = torch.cat(sparse_embed, dim=-1)

        x = torch.cat([sparse_embed, dense_input], dim=-1)
        wide_output = self.fm(x)
        deep_output = self.final(self.dnn(x))
        return F.sigmoid(torch.add(wide_output, deep_output)) * 5

#### 特征提取

In [19]:
full_info = pd.read_csv('book_final_info.csv')

sparse_feature = ["Book", "User", "Time"]
dense_feature = ["raw_score", "Be_Reading", "Have_Read", "Wanna_Read"]
# 填充缺失值
full_info[sparse_feature] = full_info[sparse_feature].fillna('-1')
full_info[dense_feature] = full_info[dense_feature].fillna(0)
# 离散特征编码
for feature in sparse_feature:
    label = LabelEncoder()
    full_info[feature] = label.fit_transform(full_info[feature])
# 数值特征归一化
mms = MinMaxScaler()
full_info[dense_feature] = mms.fit_transform(full_info[dense_feature])
print(full_info[sparse_feature + dense_feature])

        Book  User  Time  raw_score  Be_Reading  Have_Read  Wanna_Read
0        700  1653    28   0.948454    0.087177   0.037739    0.206488
1        757  1653     1   0.845361    0.027265   0.010078    0.034117
2        834  1653    30   0.855670    0.018542   0.008116    0.033353
3        826  1653    25   0.855670    0.014965   0.019608    0.015190
4       1181  1653    20   0.886598    0.302413   0.154366    0.361876
...      ...   ...   ...        ...         ...        ...         ...
637249   458  4150     3   0.876289    0.005703   0.022671    0.010555
637250    14  4150     3   0.938144    0.034638   0.169296    0.100627
637251     8  4150     3   0.907216    0.009684   0.014037    0.021897
637252   136  4150     3   0.938144    0.006934   0.013486    0.006816
637253   880  4150    28   0.814433    0.012063   0.042840    0.016488

[637254 rows x 7 columns]


#### 划分数据集

In [20]:
x_train, x_test, y_train, y_test = train_test_split(full_info[sparse_feature + dense_feature].values.astype(np.float32), full_info["Rate"].values.astype(np.float32),
                                                    test_size=0.2,
                                                    random_state=2023)
# 构建数据管道
train_dataset = TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float())
test_dataset = TensorDataset(torch.tensor(x_test).float(), torch.tensor(y_test).float())
train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=True)

#### 训练

In [21]:
hidden = [128, 64, 32]
dropout = 0
feature_col = [[{"feature": feature_} for feature_ in dense_feature]] + [[{"feature": feature_, "feature_num": full_info[feature_].nunique(), "embedding_dim": 3} for feature_ in sparse_feature]]
model = DeepFM(hidden, feature_col, dropout)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    # 训练
    model.train()
    loss_sum_train, loss_sum_test = 0.0, 0.0
    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        y_prediction = model(x)
        loss = criterion(y_prediction, y)
        loss.backward()
        optimizer.step()
        loss_sum_train += loss.item()
        output_loss_train = loss_sum_train / (i + 1)

    model.eval()
    y_list, y_prediction_list = [], []
    y_array, y_prediction_array = np.array([]), np.array([])
    with torch.no_grad():
        for index, (x_, y_) in enumerate(test_loader):
            y_prediction_ = model(x_)
            loss = criterion(y_prediction_, y_)
            y_list = torch.tensor(y_).numpy()
            y_array = np.append(y_array, y_list)
            y_prediction_list = torch.tensor(y_prediction_).numpy()
            y_prediction_array = np.append(y_prediction_array, y_prediction_)
            loss_sum_test += loss.item()
            output_loss_test = loss_sum_test / (i + 1)

    y_array = y_array.tolist()
    y_prediction_array = y_prediction_array.tolist()
    ndcgscore = ndcg_score([y_array], [y_prediction_array])
    print("epoch: {}, train_loss: {}, test_loss: {}, ndcg_score: {}".format(epoch + 1, output_loss_train,
                                                                            output_loss_test, ndcgscore))


epoch: 1, train_loss: 7.174833744049073, test_loss: 1.4051940269470216, ndcg_score: 0.9408075167613705
epoch: 2, train_loss: 4.59050658416748, test_loss: 1.1189309616088867, ndcg_score: 0.9404307256168711
epoch: 3, train_loss: 4.308561996459961, test_loss: 1.0945753288269042, ndcg_score: 0.9403987023577233
epoch: 4, train_loss: 4.260613311767578, test_loss: 1.0880211143493652, ndcg_score: 0.9404357605895934
epoch: 5, train_loss: 4.245028644561768, test_loss: 1.0880177268981934, ndcg_score: 0.9400013421923703
epoch: 6, train_loss: 4.239394046783447, test_loss: 1.08648876953125, ndcg_score: 0.9406631805996507
epoch: 7, train_loss: 4.238051387786865, test_loss: 1.087811222076416, ndcg_score: 0.9398094358556313
epoch: 8, train_loss: 4.233310882568359, test_loss: 1.0831170616149903, ndcg_score: 0.9404004727488062
epoch: 9, train_loss: 4.233950477600097, test_loss: 1.083015567779541, ndcg_score: 0.9401586607093397
epoch: 10, train_loss: 4.225923812866211, test_loss: 1.0829881477355956, ndcg_