In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import math

#########################################
# 1. Data preparation and outlier handling
#########################################

# One seed for every stochastic step in this cell (negative sampling, shuffle,
# splits) so that Restart & Run All reproduces the same data sets.
RANDOM_SEED = 42
# Number of negative (label 0) movies drawn per user.
NEG_PER_USER = 5

# Load the CSV inputs (paths are relative to the working directory).
user_df = pd.read_csv('user_features.csv')
item_df = pd.read_csv('movie_features.csv')
ratings_df = pd.read_csv('ratings_cleaned.csv')

# Replace +/-inf and NaN with 0 in all three tables.
user_df = user_df.replace([np.inf, -np.inf], np.nan).fillna(0)
item_df = item_df.replace([np.inf, -np.inf], np.nan).fillna(0)
ratings_df = ratings_df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Build userId / movieId -> contiguous index maps and attach them as
# user_idx / item_idx when the files do not already provide those columns.
user_ids = user_df['userId'].unique()
item_ids = item_df['movieId'].unique()
user2index = {u: i for i, u in enumerate(user_ids)}
item2index = {m: i for i, m in enumerate(item_ids)}

if 'user_idx' not in user_df.columns:
    user_df['user_idx'] = user_df['userId'].map(user2index)
if 'item_idx' not in item_df.columns:
    item_df['item_idx'] = item_df['movieId'].map(item2index)

# Every column except the identifiers (and, for movies, title/genres/year) is
# treated as a numeric feature; the columns must already be numeric.
user_feature_cols = [col for col in user_df.columns if col not in ['userId','user_idx']]
item_feature_cols = [col for col in item_df.columns if col not in ['movieId','item_idx','title','genres','year']]

# Keep only ratings whose user AND movie have a feature row. Later cells look
# features up by id and would fail on an id that is missing from the tables.
has_features = ratings_df['userId'].isin(user_ids) & ratings_df['movieId'].isin(item_ids)
ratings_df = ratings_df[has_features].reset_index(drop=True)

# Binarise the rating: rating >= 3.5 -> 1, otherwise 0.
ratings_df['label'] = (ratings_df['rating'] >= 3.5).astype(int)
ratings_df['label'] = ratings_df['label'].clip(0,1)  # keep the label inside [0, 1]

# Positive samples: every rating at or above the threshold.
pos_df = ratings_df[ratings_df['label'] == 1].copy()

# Negative samples: for each user draw up to NEG_PER_USER movies from the
# catalogue that the user has NOT rated >= 3.5. The pool therefore contains
# both movies the user never rated and movies the user rated below the
# threshold. Explicit low ratings in ratings_df are not used directly.
rng = np.random.default_rng(RANDOM_SEED)
# One pass over pos_df instead of re-filtering the whole frame for every user.
liked_by_user = pos_df.groupby('userId')['movieId'].unique().to_dict()
neg_samples = []
all_item_set = set(item_ids)
for u in ratings_df['userId'].unique():
    liked = liked_by_user.get(u)
    # item_ids keeps its file order, so the candidate order (and hence the
    # seeded draw) is deterministic.
    neg_candidates = item_ids if liked is None else item_ids[~np.isin(item_ids, liked)]
    n_draw = min(NEG_PER_USER, len(neg_candidates))
    if n_draw == 0:
        continue  # this user liked the whole catalogue; nothing to sample
    for i in rng.choice(neg_candidates, size=n_draw, replace=False):
        neg_samples.append((u, i, 0))
neg_df = pd.DataFrame(neg_samples, columns=['userId','movieId','label'])

# Merge positives and negatives, then shuffle.
data_df = pd.concat([pos_df[['userId','movieId','label']], neg_df], ignore_index=True)
data_df = data_df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

# Split into train / validation / test (80/20, then 80/20 of the train part).
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=RANDOM_SEED)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_SEED)

print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))


In [None]:

#########################################
# 2. Dataset definition (with DIN behaviour history)
#########################################

# user_history: userId -> list of item feature vectors, one per positive
# rating of that user, in pos_df row order. Movies without a row in item_df
# are skipped.
# NOTE(review): the history is built from pos_df, i.e. from all positives
# before the train/val/test split, so it can contain held-out items and the
# target item of a positive sample itself. Confirm whether that leakage is
# acceptable for the DIN evaluation.

# Resolve each movieId to its feature vector once (first row wins when a
# movieId is duplicated) instead of scanning item_df for every rating.
_item_rows = item_df.drop_duplicates(subset='movieId', keep='first')
_item_feat_by_id = dict(zip(_item_rows['movieId'].tolist(),
                            _item_rows[item_feature_cols].values))

user_history = {}
for u, liked_movies in pos_df.groupby('userId', sort=False)['movieId']:
    user_history[u] = [_item_feat_by_id[m] for m in liked_movies.tolist()
                       if m in _item_feat_by_id]

# Base dataset: yields (user_feat, item_feat, label), plus a history tensor for DIN.
class RankingDataset(Dataset):
    """Torch dataset over a frame with `userId`, `movieId` and `label` columns.

    Features are read from the module-level `user_df` / `item_df` using
    `user_feature_cols` / `item_feature_cols`. The id -> feature-vector
    lookups are built once in `__init__`, so `__getitem__` does not scan the
    feature tables for every sample.

    Args:
        df: samples to serve; its index is reset, the caller's frame is untouched.
        model_type: 'din' adds a padded behaviour-history tensor to each
            sample; any other value yields the 3-tuple form.
        max_hist: number of history slots returned per sample for 'din'.
    """

    def __init__(self, df, model_type='base', max_hist=5):
        self.df = df.reset_index(drop=True)
        self.model_type = model_type
        self.max_hist = max_hist
        self.user_history = user_history if model_type=='din' else None
        self._user_feats = self._first_row_lookup(user_df, 'userId', user_feature_cols)
        self._item_feats = self._first_row_lookup(item_df, 'movieId', item_feature_cols)

    @staticmethod
    def _first_row_lookup(frame, key_col, feature_cols):
        """Map each key to the feature vector of its first row in `frame`."""
        first_rows = frame.drop_duplicates(subset=key_col, keep='first')
        return dict(zip(first_rows[key_col].tolist(), first_rows[feature_cols].to_numpy()))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return float32 tensors for sample `idx`.

        Returns:
            (user_feat, item_feat, label) or, for model_type 'din',
            (user_feat, item_feat, hist, label) with hist shaped
            [max_hist, item_dim]. The label is clamped to [0, 1].

        Raises:
            KeyError: the sample's userId or movieId has no feature row.
        """
        row = self.df.iloc[idx]
        u_id = row['userId']
        i_id = row['movieId']
        if u_id not in self._user_feats:
            raise KeyError(f"userId {u_id!r} has no row in user_df")
        if i_id not in self._item_feats:
            raise KeyError(f"movieId {i_id!r} has no row in item_df")
        u_feat = torch.tensor(self._user_feats[u_id], dtype=torch.float32)
        i_feat = torch.tensor(self._item_feats[i_id], dtype=torch.float32)
        label = torch.tensor(row['label'], dtype=torch.float32).clamp(0,1)
        if self.model_type != 'din':
            return (u_feat, i_feat, label)

        # DIN: keep the most recent max_hist vectors and zero-pad the tail.
        # Slicing yields a new list, so the shared user_history entry is never
        # modified (appending padding to it in place would leave the zero
        # vectors in the global dict for every later reader).
        stored = self.user_history.get(u_id, [])
        recent = stored[-self.max_hist:] if self.max_hist > 0 else []
        hist = np.zeros((self.max_hist, i_feat.shape[0]), dtype=np.float32)
        if len(recent) > 0:
            hist[:len(recent)] = np.asarray(recent, dtype=np.float32)
        return (u_feat, i_feat, torch.tensor(hist, dtype=torch.float32), label)

# One dataset per split, first in the base (3-tuple) format and then in the
# DIN format that also carries the behaviour history.
train_dataset_base, val_dataset_base, test_dataset_base = (
    RankingDataset(split_df, model_type='base')
    for split_df in (train_df, val_df, test_df)
)

train_dataset_din, val_dataset_din, test_dataset_din = (
    RankingDataset(split_df, model_type='din')
    for split_df in (train_df, val_df, test_df)
)

# Only the training loaders shuffle; validation and test keep row order.
batch_size = 256
train_loader_base = DataLoader(train_dataset_base, shuffle=True, batch_size=batch_size)
val_loader_base, test_loader_base = (
    DataLoader(split_ds, shuffle=False, batch_size=batch_size)
    for split_ds in (val_dataset_base, test_dataset_base)
)

train_loader_din = DataLoader(train_dataset_din, shuffle=True, batch_size=batch_size)
val_loader_din, test_loader_din = (
    DataLoader(split_ds, shuffle=False, batch_size=batch_size)
    for split_ds in (val_dataset_din, test_dataset_din)
)

# Feature widths; input_dim is the concatenated width fed to DCN and DeepFM.
user_dim, item_dim = len(user_feature_cols), len(item_feature_cols)
input_dim = user_dim + item_dim


In [None]:

#########################################
# 3. Model definitions (outputs are clamped to [1e-7, 1 - 1e-7])
#########################################

# 3.1 DCN model
class CrossLayer(nn.Module):
    """Single cross layer: out = x0 * <xl, weight> + bias + xl.

    `weight` and `bias` are learnable vectors of length `input_dim`, both
    drawn from a standard normal distribution at construction.
    """

    def __init__(self, input_dim):
        super().__init__()
        initial_weight = torch.randn(input_dim)
        initial_bias = torch.randn(input_dim)
        self.weight = nn.Parameter(initial_weight)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x0, xl):
        """Cross the layer input `xl` with the original input `x0`.

        The per-row dot product of `xl` with `weight` is kept as a column
        (keepdim) so it broadcasts over the features of `x0`.
        """
        projection = (xl * self.weight).sum(dim=1, keepdim=True)
        crossed = x0 * projection
        return crossed + self.bias + xl

class DCN(nn.Module):
    """Deep & Cross Network over a single concatenated feature vector.

    A stack of `num_cross` CrossLayer modules and a ReLU MLP (`deep_hidden`
    widths) both read the raw input; their outputs are concatenated and
    mapped to one logit by `fc`.
    """

    def __init__(self, input_dim, num_cross=2, deep_hidden=[64,32]):
        super().__init__()
        crosses = []
        for _ in range(num_cross):
            crosses.append(CrossLayer(input_dim))
        self.cross_layers = nn.ModuleList(crosses)

        # Consecutive width pairs describe every Linear of the deep tower.
        widths = [input_dim] + list(deep_hidden)
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.deep = nn.Sequential(*tower)

        self.fc = nn.Linear(input_dim + deep_hidden[-1], 1)

    def forward(self, x):
        """Return one probability per row of `x`, shape [batch]."""
        crossed = x
        for cross in self.cross_layers:
            crossed = cross(x, crossed)
        combined = torch.cat([crossed, self.deep(x)], dim=1)
        prob = torch.sigmoid(self.fc(combined))
        # Keep the output strictly inside (0, 1).
        return prob.clamp(1e-7, 1-1e-7).squeeze(-1)

# 3.2 DIN model (simplified)
class AttentionUnit(nn.Module):
    """Scores each history item against the target and pools the history.

    An MLP reads the concatenation [history item, target item] and emits one
    score per history position; the scores are softmax-normalised over the
    history axis and used as weights for a sum of the history vectors.
    """

    def __init__(self, item_dim, hidden_units=[32,16]):
        super().__init__()
        widths = [item_dim * 2] + list(hidden_units)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, hist, target):
        """Pool `hist` into one vector per row.

        `hist` is indexed as [batch, position, feature] and `target` as
        [batch, feature]; the result has the shape of `target`.
        """
        seq_len = hist.size(1)
        tiled_target = target.unsqueeze(1).expand(-1, seq_len, -1)
        pairs = torch.cat([hist, tiled_target], dim=2)
        raw_scores = self.mlp(pairs).squeeze(-1)
        weights = torch.softmax(raw_scores, dim=1).unsqueeze(-1)
        return (hist * weights).sum(dim=1)

class DIN(nn.Module):
    """Simplified Deep Interest Network.

    The behaviour history is pooled by an AttentionUnit conditioned on the
    target item; user features, item features and the pooled history are
    concatenated and passed through a ReLU MLP that ends in one logit.
    `hist_len` is accepted but not used by this implementation.
    """

    def __init__(self, user_dim, item_dim, hidden_units=[64,32], hist_len=5):
        super().__init__()
        self.attention = AttentionUnit(item_dim)
        # user vector + target item vector + attention-pooled history vector
        widths = [user_dim + item_dim + item_dim] + list(hidden_units)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, user_feat, item_feat, hist):
        """Return one probability per row, shape [batch], clamped away from 0 and 1."""
        pooled_hist = self.attention(hist, item_feat)
        features = torch.cat([user_feat, item_feat, pooled_hist], dim=1)
        prob = torch.sigmoid(self.mlp(features))
        return prob.clamp(1e-7, 1-1e-7).squeeze(-1)

# 3.3 DeepFM model
class DeepFM(nn.Module):
    """DeepFM over a dense feature vector.

    The logit is the sum of three parts: a linear term, a second-order
    factorisation-machine term built from the factor matrix `V`
    ([input_dim, factor_dim], standard-normal init), and a ReLU MLP.
    """

    def __init__(self, input_dim, factor_dim=16, hidden_units=[64,32]):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
        self.V = nn.Parameter(torch.randn(input_dim, factor_dim))
        widths = [input_dim] + list(hidden_units)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        stack.append(nn.Linear(widths[-1], 1))
        self.deep = nn.Sequential(*stack)

    def forward(self, x):
        """Return one probability per row of `x`, shape [batch]."""
        first_order = self.linear(x)

        # FM pairwise term: 0.5 * sum_f [ (x V)_f^2 - (x^2 V^2)_f ]
        projected = torch.matmul(x, self.V)
        square_of_sum = projected * projected
        sum_of_square = torch.matmul(x * x, self.V * self.V)
        second_order = 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)

        deep_term = self.deep(x)
        logit = first_order + second_order + deep_term
        return torch.sigmoid(logit).clamp(1e-7, 1-1e-7).squeeze(-1)


In [None]:

#########################################
# 4. Training and evaluation functions (labels are clamped as well)
#########################################

def train_model(model, train_loader, val_loader, epochs=5, lr=1e-3, model_type='base'):
    """Train `model` with Adam and binary cross-entropy, reporting per epoch.

    Args:
        model: module returning probabilities (BCELoss is applied directly).
        train_loader: batches of (user, item, label), or
            (user, item, hist, label) when model_type is 'din'.
        val_loader: loader handed to evaluate_auc after every epoch.
        epochs: number of passes over train_loader.
        lr: Adam learning rate.
        model_type: 'din' selects the 4-tuple batch format.

    Returns:
        The same model instance, trained in place.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            if model_type == 'din':
                user_feat, item_feat, hist, label = batch
                pred = model(user_feat, item_feat, hist)
            else:
                user_feat, item_feat, label = batch
                if isinstance(model, (DCN, DeepFM)):
                    # These two models take one concatenated feature vector.
                    pred = model(torch.cat([user_feat, item_feat], dim=1))
                else:
                    pred = model(user_feat, item_feat)
            # Labels are clamped to [0, 1] as well before the loss.
            target = label.clamp(0, 1)
            optimizer.zero_grad()
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        val_auc = evaluate_auc(model, val_loader, model_type)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Val AUC: {val_auc:.4f}")
    return model

def evaluate_auc(model, loader, model_type='base'):
    """Score every batch of `loader` in eval mode and return the ROC AUC.

    Batches are (user, item, label), or (user, item, hist, label) when
    model_type is 'din'. DCN and DeepFM receive the user and item features
    concatenated along dim 1; other models receive them as separate arguments.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            if model_type == 'din':
                user_feat, item_feat, hist, label = batch
                out = model(user_feat, item_feat, hist)
            else:
                user_feat, item_feat, label = batch
                if isinstance(model, (DCN, DeepFM)):
                    out = model(torch.cat([user_feat, item_feat], dim=1))
                else:
                    out = model(user_feat, item_feat)
            pred_chunks.append(out.cpu().numpy())
            label_chunks.append(label.cpu().numpy())
    all_preds = np.concatenate(pred_chunks)
    all_labels = np.concatenate(label_chunks)
    return roc_auc_score(all_labels, all_preds)

def _first_row_lookup(frame, key_col, feature_cols):
    """Map each key to the feature vector of its first row in `frame`."""
    first_rows = frame.drop_duplicates(subset=key_col, keep='first')
    return dict(zip(first_rows[key_col].tolist(), first_rows[feature_cols].to_numpy()))


def _padded_history(history, max_hist, item_dim):
    """Return a new [max_hist, item_dim] float32 array: the last `max_hist`
    vectors of `history` followed by zero rows. `history` is not modified."""
    padded = np.zeros((max_hist, item_dim), dtype=np.float32)
    recent = history[-max_hist:] if max_hist > 0 else []
    if len(recent) > 0:
        padded[:len(recent)] = np.asarray(recent, dtype=np.float32)
    return padded


def evaluate_ndcg(model, df, model_type='base', top_k=10, max_hist=5):
    """Mean NDCG@top_k over the users in `df`, with binary relevance.

    Each user's rows are scored by `model`, ranked by descending score, and
    the DCG of the first `top_k` is divided by the ideal DCG. Users whose
    labels are all 0 or all 1 are skipped.

    Args:
        model: 'din' models are called as model(user, item, hist); all other
            models are called with user and item features concatenated on dim 1.
        df: frame with `userId`, `movieId` and `label` columns.
        model_type: 'din' or anything else.
        top_k: ranking cut-off.
        max_hist: history slots passed to a 'din' model (most recent kept,
            zero-padded at the end).

    Returns:
        The mean NDCG, or 0.0 when no user qualifies.

    Raises:
        KeyError: a userId / movieId in `df` has no feature row.
    """
    model.eval()
    # Resolve ids to feature vectors once instead of scanning the feature
    # tables for every (user, item) pair.
    user_lookup = _first_row_lookup(user_df, 'userId', user_feature_cols)
    item_lookup = _first_row_lookup(item_df, 'movieId', item_feature_cols)
    ndcg_list = []
    with torch.no_grad():
        for uid, group in df.groupby('userId'):
            items = group['movieId'].values
            true_labels = group['label'].values
            n_pos = np.sum(true_labels)
            if n_pos == 0 or n_pos == len(true_labels):
                continue  # ranking is undefined when every label is identical

            # Score all of this user's items in one batched forward pass.
            n_items = len(items)
            u_feat = torch.tensor(user_lookup[uid], dtype=torch.float32).unsqueeze(0).expand(n_items, -1)
            i_feat = torch.tensor(np.stack([item_lookup[i] for i in items]), dtype=torch.float32)
            if model_type == 'din':
                # Built once per user from a copy: padding must not be
                # appended to the list stored in the shared user_history.
                hist = _padded_history(user_history.get(uid, []), max_hist, i_feat.size(1))
                hist = torch.tensor(hist, dtype=torch.float32).unsqueeze(0).expand(n_items, -1, -1)
                out = model(u_feat, i_feat, hist)
            else:
                out = model(torch.cat([u_feat, i_feat], dim=1))
            scores = out.reshape(-1).cpu().numpy()

            order = np.argsort(scores)[::-1][:top_k]
            dcg = 0.0
            for rank, idx in enumerate(order, start=1):
                if true_labels[idx] > 0:
                    dcg += 1.0 / math.log2(rank + 1)
            ideal_labels = sorted(true_labels, reverse=True)[:top_k]
            idcg = 0.0
            for rank, rel in enumerate(ideal_labels, start=1):
                if rel > 0:
                    idcg += 1.0 / math.log2(rank + 1)
            if idcg > 0:
                ndcg_list.append(dcg / idcg)
    return np.mean(ndcg_list) if ndcg_list else 0.0


In [None]:

#########################################
# 5. Model training and evaluation
#########################################

# The torch RNG is re-seeded before each model so that weight initialisation
# and the training loaders' shuffling are reproducible after Restart & Run
# All, and so that every model starts from the same RNG state.
TRAIN_SEED = 42

# 5.1 Train DCN (base dataset)
print("\nTraining DCN ...")
torch.manual_seed(TRAIN_SEED)
dcn_model = DCN(input_dim=input_dim, num_cross=2, deep_hidden=[64,32])
dcn_model = train_model(dcn_model, train_loader_base, val_loader_base, epochs=5, lr=1e-3, model_type='base')
auc_dcn = evaluate_auc(dcn_model, test_loader_base, model_type='base')
ndcg_dcn = evaluate_ndcg(dcn_model, test_df, model_type='base', top_k=10)
print(f"DCN Test AUC: {auc_dcn:.4f}, NDCG@10: {ndcg_dcn:.4f}")

# 5.2 Train DIN (DIN dataset with behaviour history)
print("\nTraining DIN ...")
torch.manual_seed(TRAIN_SEED)
din_model = DIN(user_dim=user_dim, item_dim=item_dim, hidden_units=[64,32], hist_len=5)
din_model = train_model(din_model, train_loader_din, val_loader_din, epochs=5, lr=1e-3, model_type='din')
auc_din = evaluate_auc(din_model, test_loader_din, model_type='din')
ndcg_din = evaluate_ndcg(din_model, test_df, model_type='din', top_k=10)
print(f"DIN Test AUC: {auc_din:.4f}, NDCG@10: {ndcg_din:.4f}")

# 5.3 Train DeepFM (base dataset)
print("\nTraining DeepFM ...")
torch.manual_seed(TRAIN_SEED)
deepfm_model = DeepFM(input_dim=input_dim, factor_dim=16, hidden_units=[64,32])
deepfm_model = train_model(deepfm_model, train_loader_base, val_loader_base, epochs=5, lr=1e-3, model_type='base')
auc_deepfm = evaluate_auc(deepfm_model, test_loader_base, model_type='base')
ndcg_deepfm = evaluate_ndcg(deepfm_model, test_df, model_type='base', top_k=10)
print(f"DeepFM Test AUC: {auc_deepfm:.4f}, NDCG@10: {ndcg_deepfm:.4f}")
