In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# 数据预处理
# 读取数据
ratings = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
movies = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin-1')
users = pd.read_csv(r'D:\code_file\DINandDIEN\ml-1m\users.dat', sep='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

In [49]:
# 处理电影类型
all_genres = set()
for row in movies['Genres']:
    for genre in row.split('|'):
        all_genres.add(genre)
genre_to_idx = {genre: idx for idx, genre in enumerate(all_genres)}
num_genres = len(all_genres)
movies['genre_multi_hot'] = movies['Genres'].apply(lambda x: [1 if genre in x.split('|') else 0 for genre in all_genres])
genre_multi_hot = np.stack(movies['genre_multi_hot'].values)
movie_id_to_genre = dict(zip(movies['MovieID'], genre_multi_hot))

In [53]:
# 处理用户数据
gender_to_idx = {'F':0, 'M':1}
users['Gender'] = users['Gender'].map(gender_to_idx)
users['Age'] = users['Age'] - 1  # 原始年龄为1-7
user_dict = users.set_index('UserID').to_dict('index')


In [60]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [64]:
(ratings.groupby('UserID').apply(lambda x: x.sort_values('Timestamp')))[1]

KeyError: 1

In [56]:
# 生成样本
max_len = 50
samples = []
for user_id, group in ratings.groupby('UserID').apply(lambda x: x.sort_values('Timestamp')):
    user_data = user_dict[user_id]
    gender = user_data['Gender']
    age = user_data['Age']
    occupation = user_data['Occupation']
    history = []
    for idx, row in group.iterrows():
        movie_id = row['MovieID']
        label = 1 if row['Rating'] >= 4 else 0
        current_history = history[-max_len:]
        padded_history = current_history + [0]*(max_len - len(current_history))
        mask = [1]*len(current_history) + [0]*(max_len - len(current_history))
        candidate_genre = movie_id_to_genre.get(movie_id, np.zeros(num_genres))
        history_genres = []
        for hist_id in padded_history:
            if hist_id == 0:
                history_genres.append(np.zeros(num_genres))
            else:
                history_genres.append(movie_id_to_genre.get(hist_id, np.zeros(num_genres)))
        samples.append({
            'user_gender': gender,
            'user_age': age,
            'user_occupation': occupation,
            'candidate_movie_id': movie_id,
            'candidate_genre': candidate_genre,
            'history_movie_ids': padded_history,
            'history_genres': np.array(history_genres),
            'history_mask': mask,
            'label': label
        })
        history.append(movie_id)
        history = history[-max_len:]

samples_df = pd.DataFrame(samples)

ValueError: too many values to unpack (expected 2)

ValueError: too many values to unpack (expected 2)

In [None]:







# 数据集划分
train_df, test_df = train_test_split(samples_df, test_size=0.2, random_state=42)

# 自定义Dataset
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            'user_gender': torch.tensor(row['user_gender'], dtype=torch.long),
            'user_age': torch.tensor(row['user_age'], dtype=torch.long),
            'user_occupation': torch.tensor(row['user_occupation'], dtype=torch.long),
            'candidate_movie_id': torch.tensor(row['candidate_movie_id'], dtype=torch.long),
            'candidate_genre': torch.tensor(row['candidate_genre'], dtype=torch.float32),
            'history_movie_ids': torch.tensor(row['history_movie_ids'], dtype=torch.long),
            'history_genres': torch.tensor(row['history_genres'], dtype=torch.float32),
            'history_mask': torch.tensor(row['history_mask'], dtype=torch.float32),
            'label': torch.tensor(row['label'], dtype=torch.float32)
        }

train_dataset = MovieLensDataset(train_df)
test_dataset = MovieLensDataset(test_df)

# 模型定义
class UserEncoder(nn.Module):
    def __init__(self, gender_size=2, age_size=7, occupation_size=21, embed_dim=16):
        super().__init__()
        self.gender_embed = nn.Embedding(gender_size, embed_dim)
        self.age_embed = nn.Embedding(age_size, embed_dim)
        self.occupation_embed = nn.Embedding(occupation_size, embed_dim)
    
    def forward(self, gender, age, occupation):
        return torch.cat([
            self.gender_embed(gender),
            self.age_embed(age),
            self.occupation_embed(occupation)
        ], dim=1)

class MovieEncoder(nn.Module):
    def __init__(self, movie_size=3953, genre_size=18, embed_dim=32):
        super().__init__()
        self.movie_embed = nn.Embedding(movie_size, embed_dim)
        self.genre_fc = nn.Linear(genre_size, embed_dim)
    
    def forward(self, movie_id, genre):
        return torch.cat([
            self.movie_embed(movie_id),
            self.genre_fc(genre)
        ], dim=1)

class DIN(nn.Module):
    def __init__(self, user_encoder, movie_encoder, hidden_dim=128):
        super().__init__()
        self.user_encoder = user_encoder
        self.movie_encoder = movie_encoder
        self.attention = nn.Sequential(
            nn.Linear(3*64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.mlp = nn.Sequential(
            nn.Linear(64*2 + 48, 256),  # 64*2来自用户兴趣和候选物品，48来自用户特征
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    
    def forward(self, user_gender, user_age, user_occupation, candidate_movie, candidate_genre, history_movies, history_genres, mask):
        # 用户特征
        user_feat = self.user_encoder(user_gender, user_age, user_occupation)
        
        # 候选物品
        candidate_feat = self.movie_encoder(candidate_movie, candidate_genre)
        
        # 历史行为
        batch_size, seq_len = history_movies.size()
        history_feat = self.movie_encoder(
            history_movies.view(-1),
            history_genres.view(-1, history_genres.size(-1))
        ).view(batch_size, seq_len, -1)
        
        # 注意力机制
        candidate_expanded = candidate_feat.unsqueeze(1).expand_as(history_feat)
        interaction = history_feat * candidate_expanded
        attention_input = torch.cat([history_feat, candidate_expanded, interaction], dim=-1)
        attention_scores = self.attention(attention_input).squeeze(-1)
        attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_weights = F.softmax(attention_scores, dim=1)
        
        # 用户兴趣表示
        user_interest = torch.bmm(attention_weights.unsqueeze(1), history_feat).squeeze(1)
        
        # 最终预测
        concat = torch.cat([user_interest, candidate_feat, user_feat], dim=1)
        return self.mlp(concat)

# 初始化模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
user_encoder = UserEncoder()
movie_encoder = MovieEncoder()
model = DIN(user_encoder, movie_encoder).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练循环
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = {k: v.to(device).unsqueeze(1) if k in ['user_gender', 'user_age', 'user_occupation'] 
                 else v.to(device) for k, v in batch.items()}
        outputs = model(
            inputs['user_gender'],
            inputs['user_age'],
            inputs['user_occupation'],
            inputs['candidate_movie_id'],
            inputs['candidate_genre'],
            inputs['history_movie_ids'],
            inputs['history_genres'],
            inputs['history_mask']
        )
        loss = criterion(outputs.squeeze(), inputs['label'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(batch)
    return total_loss / len(dataloader.dataset)

# 测试循环
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device).unsqueeze(1) if k in ['user_gender', 'user_age', 'user_occupation'] 
                     else v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = criterion(outputs.squeeze(), inputs['label'])
            total_loss += loss.item() * len(batch)
            preds = (outputs > 0.5).float()
            correct += (preds == inputs['label']).sum().item()
    return total_loss / len(dataloader.dataset), correct / len(dataloader.dataset)

# 数据加载
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 训练过程
for epoch in range(10):
    train_loss = train(model, train_loader, criterion, optimizer)
    test_loss, test_acc = evaluate(model, test_loader)
    print(f'Epoch {epoch+1:02}')
    print(f'Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}')