In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [112]:
# 加载数据
ratings = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
movies = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\movies.dat', sep='::', names=['movie_id', 'title', 'genres'], engine='python', encoding='latin-1')
users = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\users.dat', sep='::', names=['user_id', 'gender', 'age', 'occupation', 'zipcode'], engine='python')

In [113]:
# 过滤正样本并生成序列
positive_ratings=ratings[ratings['rating']>=4].reset_index(drop=True)
user_sequences = defaultdict(list)
for user_id, group in positive_ratings.groupby('user_id'):
    sorted_group = group.sort_values('timestamp')
    user_sequences[user_id] = sorted_group['movie_id'].tolist()

In [114]:
# 构建样本
samples = []
for user_id, seq in tqdm(user_sequences.items()):
    if len(seq) < 2:
        continue
    for i in range(1, len(seq)):
        hist = seq[:i]
        target_pos = seq[i]
        # 正样本
        samples.append({'user_id': user_id, 'hist': hist, 'target': target_pos, 'label': 1})
        # 负样本
        all_movies = ratings['movie_id'].unique()
        neg_movie = np.random.choice([m for m in all_movies if m not in seq])
        samples.append({'user_id': user_id, 'hist': hist, 'target': neg_movie, 'label': 0})
    
    if user_id>10:
        break
        
# 划分数据集
train_df, test_df = train_test_split(pd.DataFrame(samples), test_size=0.2)
# 索引映射 这一步的目的是使得所有的用户id和movieid 全部映射为连续的 便于embedding输入
user_to_idx = {uid: i+1 for i, uid in enumerate(ratings['user_id'].unique())}
movie_to_idx = {mid: i+1 for i, mid in enumerate(ratings['movie_id'].unique())}

  0%|          | 10/6038 [00:18<3:03:38,  1.83s/it]


In [121]:
class MovieLensDataset(Dataset):
    def __init__(self, df, max_seq_length):
        self.df = df
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        user_idx = user_to_idx[row['user_id']]
        hist = [movie_to_idx.get(m, 0) for m in row['hist']]
        # 截断
        hist=hist[:self.max_seq_length]
        # 填充 如果超过最大值则最大值 否则填充
        padded_hist = hist[:self.max_seq_length] + [0]*(self.max_seq_length - len(hist))
        mask = [1]*len(hist) + [0]*(self.max_seq_length - len(hist))
        return {
            'user_id': torch.tensor(user_idx, dtype=torch.long),
            'hist': torch.tensor(padded_hist, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.float),
            'target': torch.tensor(movie_to_idx.get(row['target'], 0), dtype=torch.long),
            'label': torch.tensor(row['label'], dtype=torch.float)
        }

max_seq_length = 20
train_dataset = MovieLensDataset(train_df, max_seq_length)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataset = MovieLensDataset(test_df, max_seq_length)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [None]:
## AUGRU实现
class AUGRUCell(nn.Module):
    """AUGRU cell for attention update
       input_size是嵌入向量维度
       hidden_size自定义
    """
    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        # (Wxr|Wxz|Wxh)
        self.weight_xrzh = nn.Parameter(
            torch.ones(input_size,3 * hidden_size,dtype=torch.float32))
        # (Hxr|Hxz|Hxh)
        self.weight_hrzh = nn.Parameter(
            torch.ones(hidden_size,3 * hidden_size,dtype=torch.float32))
        if bias:
            # (b)
            self.bias_r = nn.Parameter(torch.zeros(hidden_size))
            self.bias_z = nn.Parameter(torch.zeros(hidden_size))
            self.bias_h = nn.Parameter(torch.zeros(hidden_size))
        else:
            self.register_parameter('bias_r', None)
            self.register_parameter('bias_z', None)
            self.register_parameter('bias_h', None)
        self.reset_parameters()
 
    def reset_parameters(self):
        stdv = 1.0 / self.hidden_size ** 0.5
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.uniform_(param, -stdv, stdv)
            elif 'bias' in name:
                nn.init.zeros_(param)

    # att_score 是 batch_size*1
    # x是上一层gru的输出应该是batch_size,input_size
    def forward(self, x, hidden_state, att_score):
        W_xr,W_xz,W_xh = self.weight_xrzh.chunk(3, 1)
        W_hr,W_hz,W_hh = self.weight_hrzh.chunk(3, 1)
 
        reset_gate = torch.sigmoid(torch.matmul(x,W_xr) + torch.matmul(hidden_state,W_hr)+self.bias_r)
        # batch_size *hidden_size
        update_gate_pre = torch.sigmoid(torch.matmul(x,W_xz) +torch.matmul(hidden_state,W_hz) +self.bias_z)
        update_gate = att_score.reshape(-1, 1) * update_gate_pre
        hidden_gate = torch.tanh(torch.matmul(x,W_xh) + torch.matmul((reset_gate * hidden_state),W_hh) +self.bias_h)
        hidden_state = (1-update_gate)*hidden_state +  update_gate*hidden_gate
        return hidden_state

In [132]:
class DIEN(nn.Module):
    def __init__(self, user_num, movie_num, embed_dim, hidden_size):
        """
        user_num 用户id去重数量用以生成embedding
        movie_num 电影id去重数量用以生成embedding
        embed_din 生成的嵌入向量维度
        """
        super().__init__()
        # 嵌入部分
        self.user_embed = nn.Embedding(user_num, embed_dim)
        self.movie_embed = nn.Embedding(movie_num, embed_dim)
        # 第一层gru 兴趣抽取层
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        # 第二层 兴趣演化层
        self.augru_cell = AUGRUCell(hidden_size, hidden_size)
        # 注意力计算
        self.attention = nn.Linear(hidden_size + embed_dim, 1)
        # MLP部分
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim * 2 + hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid())

    def forward(self, user_ids, hist, target, mask):
        # 嵌入层
        user_emb = self.user_embed(user_ids)  # (B, E)
        hist_emb = self.movie_embed(hist)  # (B, L, E)
        target_emb = self.movie_embed(target)  # (B, E)

        # gru层
        gru_out, _ = self.gru(hist_emb)  # (B, L, H)
        
        #注意力计算
        target_expanded = target_emb.unsqueeze(1).expand(-1, gru_out.size(1), -1)  # (B, L, E)
        att_input = torch.cat([gru_out, target_expanded], dim=-1)
        att_scores = torch.softmax(self.attention(att_input).squeeze(-1).masked_fill(mask == 0, -1e9), dim=1)
        
        # augru层
        h = torch.zeros(user_ids.size(0), gru_out.size(2)).to(user_ids.device)
        for t in range(gru_out.size(1)):
            h = self.augru_cell(gru_out[:, t, :], h, att_scores[:, t])
        
        combined = torch.cat([user_emb, target_emb, h], dim=1)
        return self.mlp(combined).squeeze()

In [133]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DIEN(len(user_to_idx)+1, len(movie_to_idx)+1, 32, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [134]:
for epoch in range(100):
    model.train()
    for batch in train_loader:
        user_ids = batch['user_id'].to(device)
        hist = batch['hist'].to(device)
        mask = batch['mask'].to(device)
        target = batch['target'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(user_ids, hist, target, mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

Epoch 1, Loss: 0.7018
Epoch 2, Loss: 0.6909
Epoch 3, Loss: 0.6815
Epoch 4, Loss: 0.6759
Epoch 5, Loss: 0.6757
Epoch 6, Loss: 0.6669
Epoch 7, Loss: 0.6688
Epoch 8, Loss: 0.6581
Epoch 9, Loss: 0.6394
Epoch 10, Loss: 0.6304
Epoch 11, Loss: 0.6215
Epoch 12, Loss: 0.6585
Epoch 13, Loss: 0.5907
Epoch 14, Loss: 0.6239
Epoch 15, Loss: 0.5410
Epoch 16, Loss: 0.6190
Epoch 17, Loss: 0.5541
Epoch 18, Loss: 0.5881
Epoch 19, Loss: 0.5357
Epoch 20, Loss: 0.5405
Epoch 21, Loss: 0.6021
Epoch 22, Loss: 0.5326
Epoch 23, Loss: 0.4980
Epoch 24, Loss: 0.4573
Epoch 25, Loss: 0.4902
Epoch 26, Loss: 0.4852
Epoch 27, Loss: 0.4143
Epoch 28, Loss: 0.4369
Epoch 29, Loss: 0.4451
Epoch 30, Loss: 0.3727
Epoch 31, Loss: 0.3860
Epoch 32, Loss: 0.3882
Epoch 33, Loss: 0.3002
Epoch 34, Loss: 0.3383
Epoch 35, Loss: 0.3508
Epoch 36, Loss: 0.3566
Epoch 37, Loss: 0.3466
Epoch 38, Loss: 0.3277
Epoch 39, Loss: 0.2751
Epoch 40, Loss: 0.2411
Epoch 41, Loss: 0.2532
Epoch 42, Loss: 0.2098
Epoch 43, Loss: 0.2332
Epoch 44, Loss: 0.27

In [135]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        user_ids = batch['user_id'].to(device)
        hist = batch['hist'].to(device)
        mask = batch['mask'].to(device)
        target = batch['target'].to(device)
        labels = batch['label'].to(device)
        outputs = model(user_ids, hist, target, mask)
        predicted = (outputs >= 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy: {correct / total:.4f}')

Accuracy: 0.6283
