#### 调库

In [6]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sklearn.metrics import ndcg_score
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### FM

In [7]:
class FM(nn.Module):
    # latent_dim是离散特征隐向量的维度, feature_num是特征的数量
    def __init__(self, feature_num, latent_dim):
        super(FM, self).__init__()
        self.latent_dim = latent_dim
        # 下面定义了三个矩阵
        self.w0 = nn.Parameter(torch.zeros([1, ]))
        self.w1 = nn.Parameter(torch.rand([feature_num, 1]))
        self.w2 = nn.Parameter(torch.rand([feature_num, latent_dim]))

    def forward(self, Input):
        # 一阶交叉
        order_1st = self.w0 + torch.mm(Input, self.w1)
        # 二阶交叉
        order_2nd = 1 / 2 * torch.sum(
            torch.pow(torch.mm(Input, self.w2), 2) - torch.mm(torch.pow(Input, 2), torch.pow(self.w2, 2)), dim=1,
            keepdim=True)
        return order_1st + order_2nd

#### DNN

In [8]:
class DNN(nn.Module):
    def __init__(self, hidden, dropout=0):
        super(DNN, self).__init__()
        # 相邻的hidden层, Linear用于设置全连接层
        # ModuleList可以将nn.Module的子类加入到List中
        self.dnn = nn.ModuleList([nn.Linear(layer[0], layer[1]) for layer in list(zip(hidden[:-1], hidden[1:]))])
        # dropout用于训练, 代表前向传播中有多少概率神经元不被激活
        # 为了减少过拟合
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        for linear in self.dnn:
            x = linear(x)
            # relu激活函数
            x = F.relu(x)
        x = self.dropout(x)
        return x

#### DeepFM

In [9]:
class DeepFM(nn.Module):
    def __init__(self, hidden, feature_col, dropout=0):
        super(DeepFM, self).__init__()
        # 连续型特征和离散型特征
        self.dense_col, self.sparse_col = feature_col
        self.embedding_layer = nn.ModuleDict({"embedding" + str(i): nn.Embedding(num_embeddings=feature["feature_num"],
                                                                                 embedding_dim=feature["embedding_dim"])
                                              for i, feature in enumerate(self.sparse_col)})

        self.feature_num = len(self.dense_col) + len(self.sparse_col) * self.sparse_col[0]["embedding_dim"]
        # 将feature_num插入到hidden的开头
        hidden.insert(0, self.feature_num)

        self.fm = FM(self.feature_num, self.sparse_col[0]["embedding_dim"])
        self.dnn = DNN(hidden, dropout)
        # 最终输出, 将最后一层输入然后输出一维的结果
        self.final = nn.Linear(hidden[-1], 1)

    def forward(self, x):
        sparse_input, dense_input = x[:, :len(self.sparse_col)], x[:, len(self.sparse_col):]
        sparse_input = sparse_input.long()
        sparse_embed = [self.embedding_layer["embedding" + str(i)](sparse_input[:, i]) for i in range(sparse_input.shape[1])]
        # 按照最后一个维度拼接
        sparse_embed = torch.cat(sparse_embed, dim=-1)

        x = torch.cat([sparse_embed, dense_input], dim=-1)
        wide_output = self.fm(x)
        deep_output = self.final(self.dnn(x))
        return F.sigmoid(torch.add(wide_output, deep_output)) * 5

#### 特征提取

In [10]:
full_info = pd.read_csv('../Dataset/movie_final_info.csv')

sparse_feature = ["Movie", "User", "Time"]
dense_feature = ["favor", "watched"]
# 填充缺失值
full_info[sparse_feature] = full_info[sparse_feature].fillna('-1')
full_info[dense_feature] = full_info[dense_feature].fillna(0)
# 离散特征编码
for feature in sparse_feature:
    label = LabelEncoder()
    full_info[feature] = label.fit_transform(full_info[feature])
# 数值特征归一化
mms = MinMaxScaler()
full_info[dense_feature] = mms.fit_transform(full_info[dense_feature])
print(full_info[sparse_feature + dense_feature])

        Movie  User  Time     favor   watched
0        1181   383    27  0.065757  0.105539
1         898   383    24  0.014995  0.009875
2        1186   383    13  0.323543  0.386266
3         804   383    10  0.321468  0.397589
4        1169   383     9  0.206937  0.359602
...       ...   ...   ...       ...       ...
715019     40   371    11  0.000000  0.000000
715020      2   371    11  0.000000  0.000000
715021    852   371    11  0.000000  0.000000
715022    711   371    11  0.000000  0.000000
715023    678   371    10  0.000000  0.000000

[715024 rows x 5 columns]


#### 划分数据集

In [11]:
x_train, x_test, y_train, y_test = train_test_split(full_info[sparse_feature + dense_feature].values.astype(np.float32), full_info["Rate"].values.astype(np.float32),
                                                    test_size=0.2,
                                                    random_state=2023)
# 构建数据管道
train_dataset = TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float())
test_dataset = TensorDataset(torch.tensor(x_test).float(), torch.tensor(y_test).float())
train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=True)

#### 训练

In [12]:
hidden = [128, 64, 32]
dropout = 0
feature_col = [[{"feature": feature_} for feature_ in dense_feature]] + [[{"feature": feature_, "feature_num": full_info[feature_].nunique(), "embedding_dim": 3} for feature_ in sparse_feature]]
model = DeepFM(hidden, feature_col, dropout)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    # 训练
    model.train()
    loss_sum_train, loss_sum_test = 0.0, 0.0
    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        y_prediction = model(x)
        loss = criterion(y_prediction, y)
        loss.backward()
        optimizer.step()
        loss_sum_train += loss.item()
        output_loss_train = loss_sum_train / (i + 1)

    model.eval()
    y_list, y_prediction_list = [], []
    y_array, y_prediction_array = np.array([]), np.array([])
    with torch.no_grad():
        for index, (x_, y_) in enumerate(test_loader):
            y_prediction_ = model(x_)
            loss = criterion(y_prediction_, y_)
            y_list = torch.tensor(y_).numpy()
            y_array = np.append(y_array, y_list)
            y_prediction_list = torch.tensor(y_prediction_).numpy()
            y_prediction_array = np.append(y_prediction_array, y_prediction_)
            loss_sum_test += loss.item()
            output_loss_test = loss_sum_test / (i + 1)

    y_array = y_array.tolist()
    y_prediction_array = y_prediction_array.tolist()
    ndcgscore = ndcg_score([y_array], [y_prediction_array])
    print("epoch: {}, train_loss: {}, test_loss: {}, ndcg_score: {}".format(epoch + 1, output_loss_train,
                                                                            output_loss_test, ndcgscore))


epoch: 1, train_loss: 5.481974038055965, test_loss: 0.9939663989203317, ndcg_score: 0.9503540688848905
epoch: 2, train_loss: 3.5784161124910625, test_loss: 0.861128158228738, ndcg_score: 0.9505360494468101
epoch: 3, train_loss: 3.424962229388101, test_loss: 0.8491232071604048, ndcg_score: 0.9507985933566854
epoch: 4, train_loss: 3.3989645430019926, test_loss: 0.8447950312069484, ndcg_score: 0.950722745104761
epoch: 5, train_loss: 3.389598447935922, test_loss: 0.8431819404874529, ndcg_score: 0.9508207253202141
epoch: 6, train_loss: 3.3827703424862454, test_loss: 0.8429012434823172, ndcg_score: 0.9507255888889335
epoch: 7, train_loss: 3.3805575013160705, test_loss: 0.8444405674934388, ndcg_score: 0.9504600422285151
epoch: 8, train_loss: 3.3792915463447573, test_loss: 0.842950907775334, ndcg_score: 0.9504324799403038
epoch: 9, train_loss: 3.377736152921404, test_loss: 0.8410760709217616, ndcg_score: 0.9508659878919993
epoch: 10, train_loss: 3.375027227401733, test_loss: 0.8418848548616682