In [1]:
# SeqRecon-AD 消融实验：Full（finetune embedding + self-clean）vs NoClean、MeanAgg、RandEmb、FreezeEmb

import math
import gc
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score, precision_score, recall_score

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using device:', device)

Using device: cuda


In [2]:
# 1. 数据加载（与 SeqRecon-AD 一致）

card_item = pd.read_csv('card_item.csv')
card_feats = pd.read_csv('card_feats.csv', usecols=['label','card_id','name','身份证号','age'])
dataset = pd.concat([card_item, card_feats], axis=1)

import ast
if isinstance(dataset['明细项目名称'].iloc[0], str):
    dataset['明细项目名称'] = dataset['明细项目名称'].apply(ast.literal_eval)

with open('item2id.json', 'r', encoding='utf-8') as f:
    item2id = json.load(f)
id2item = {v: k for k, v in item2id.items()}
num_items = len(item2id)

def map_items_to_ids(items, item2id):
    return [item2id[item] for item in items if item in item2id]
dataset['明细项目ID'] = dataset['明细项目名称'].apply(lambda x: map_items_to_ids(x, item2id))

train_df, temp_df = train_test_split(dataset, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print(f"train={len(train_df)}, val={len(val_df)}, test={len(test_df)}, num_items={num_items}")

train=7133, val=892, test=892, num_items=4119


In [3]:
# 2. 位置感知 Transformer + 消融参数（MeanAgg / RandEmb / FreezeEmb）；异常分数不做账户内 z-score

class RelativePositionalEncoding(nn.Module):
    def __init__(self, num_heads, max_len=512):
        super().__init__()
        self.rel_pos_table = nn.Parameter(torch.randn(2 * max_len - 1, num_heads))
        self.max_len = max_len
    def forward(self, q_len, k_len):
        range_q = torch.arange(q_len)[:, None]
        range_k = torch.arange(k_len)[None, :]
        distance_mat = range_k - range_q
        distance_mat = distance_mat.clamp(-self.max_len + 1, self.max_len - 1) + self.max_len - 1
        rel_bias = self.rel_pos_table[distance_mat].permute(2, 0, 1)
        return rel_bias

class RelativeMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.2):
        super().__init__()
        self.embed_dim, self.num_heads, self.dropout = embed_dim, num_heads, dropout
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None, pos_bias=None):
        B, L, D = query.shape
        H, d = self.num_heads, D // self.num_heads
        q = self.q_proj(query).view(B, L, H, d).transpose(1, 2)
        k = self.k_proj(key).view(B, L, H, d).transpose(1, 2)
        v = self.v_proj(value).view(B, L, H, d).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d)
        if pos_bias is not None: scores = scores + pos_bias.unsqueeze(0)
        if attn_mask is not None: scores = scores + attn_mask.unsqueeze(0).unsqueeze(0)
        if key_padding_mask is not None: scores = scores.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), float('-inf'))
        attn_weights = F.dropout(torch.softmax(scores, dim=-1), p=self.dropout, training=self.training)
        out = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(B, L, D)
        return self.out_proj(out)

class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.2):
        super().__init__()
        self.self_attn = RelativeMultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model * 4, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos_bias=None):
        src2 = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask, pos_bias=pos_bias)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src = src + self.dropout2(self.linear2(F.relu(self.linear1(src))))
        return self.norm2(src)

class TransformerAnomalyDetectorAblation(nn.Module):
    """支持消融：use_topk_agg(MeanAgg), freeze_embedding(FreezeEmb)；use_score_norm 已统一为 False。"""
    def __init__(self, embedding_matrix, d_model=512, nhead=4, num_layers=6, dropout=0.2, pad_idx=0,
                 use_score_norm=False, use_topk_agg=True, freeze_embedding=True):
        super().__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx
        self.use_score_norm = use_score_norm
        self.use_topk_agg = use_topk_agg
        num_items, embedding_dim = embedding_matrix.size()
        emb = (embedding_matrix - embedding_matrix.mean()) / (embedding_matrix.std() + 1e-8)
        self.embedding = nn.Embedding(num_items, embedding_dim)
        self.embedding.weight = nn.Parameter(emb.clone().detach())
        self.embedding.weight.requires_grad = not freeze_embedding
        self.embed_proj = nn.Linear(embedding_dim, d_model)
        self.pos_encoder = RelativePositionalEncoding(num_heads=nhead, max_len=512)
        self.layers = nn.ModuleList([CustomTransformerEncoderLayer(d_model, nhead, dropout) for _ in range(num_layers)])
        self.final_norm = nn.LayerNorm(d_model)
        self.predictor = nn.Linear(d_model, num_items)
        self._init_weights()

    def _init_weights(self):
        initrange = 0.1
        self.embed_proj.bias.data.zero_()
        self.embed_proj.weight.data.uniform_(-initrange, initrange)
        nn.init.xavier_uniform_(self.predictor.weight)
        self.predictor.bias.data.zero_()

    def generate_mask(self, seq_len, device):
        m = (torch.triu(torch.ones(seq_len, seq_len)) == 1).transpose(0, 1)
        m = m.float().masked_fill(m == 0, float('-inf')).masked_fill(m == 1, 0.0)
        return m.to(device)

    def forward(self, src, src_mask=None):
        B, L = src.size()
        src_emb = self.embedding(src)
        src_emb = self.embed_proj(src_emb) * math.sqrt(self.d_model)
        src_emb = F.layer_norm(src_emb, src_emb.shape[-1:])
        pos_bias = self.pos_encoder(L, L)
        pad_mask = (src == self.pad_idx)
        out = src_emb
        for layer in self.layers:
            out = layer(out, src_mask=src_mask, src_key_padding_mask=pad_mask, pos_bias=pos_bias)
        return self.predictor(self.final_norm(out))

    def compute_loss(self, src, tgt, mask=None):
        seq_len = src.size(1)
        causal_mask = self.generate_mask(seq_len, src.device)
        predictions = self.forward(src, src_mask=causal_mask)[:, :-1, :].contiguous()
        tgt = tgt[:, 1:].contiguous()
        if mask is not None:
            mask = mask[:, 1:].contiguous()
            loss = F.cross_entropy(predictions.view(-1, predictions.size(-1)), tgt.view(-1), reduction='none')
            loss = loss[mask.view(-1) == 1].mean()
        else:
            loss = F.cross_entropy(predictions.view(-1, predictions.size(-1)), tgt.view(-1), ignore_index=self.pad_idx)
        if torch.isnan(loss) or torch.isinf(loss):
            return torch.tensor(0.0, requires_grad=True).to(loss.device)
        return loss

    def compute_anomaly_score(self, sequences, mask=None, topk_ratio=0.2, return_token_level=False):
        with torch.no_grad():
            seq_len = sequences.size(1)
            causal_mask = self.generate_mask(seq_len, sequences.device)
            predictions = self.forward(sequences, src_mask=causal_mask)[:, :-1, :].contiguous()
            targets = sequences[:, 1:].contiguous()
            per_position_loss = F.cross_entropy(
                predictions.view(-1, predictions.size(-1)), targets.view(-1), reduction='none'
            ).view_as(targets)
            if mask is not None:
                mask_cut = mask[:, 1:].contiguous()
                per_position_loss = per_position_loss * mask_cut
            if self.use_score_norm:
                mean = per_position_loss.mean(dim=1, keepdim=True)
                std = per_position_loss.std(dim=1, keepdim=True) + 1e-8
                normalized_loss = (per_position_loss - mean) / std
            else:
                normalized_loss = per_position_loss
            if self.use_topk_agg:
                k = max(1, int(topk_ratio * (seq_len - 1)))
                topk_values, _ = torch.topk(normalized_loss, k=k, dim=1)
                scores = topk_values.mean(dim=1)
            else:
                if mask is not None:
                    scores = (normalized_loss * mask_cut).sum(dim=1) / (mask_cut.sum(dim=1) + 1e-8)
                else:
                    scores = normalized_loss.mean(dim=1)
            if return_token_level:
                return scores.cpu().numpy(), normalized_loss.cpu().numpy()
            return scores.cpu().numpy()

print('TransformerAnomalyDetectorAblation ready.')

TransformerAnomalyDetectorAblation ready.


In [4]:
# 3. Dataset / DataLoader

class PrescriptionDataset(Dataset):
    def __init__(self, dataframe, max_length=517, pad_idx=0):
        self.data = dataframe.reset_index(drop=True)
        self.max_length = max_length
        self.pad_idx = pad_idx
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        sequence = self.data.iloc[idx]['明细项目ID']
        if len(sequence) > self.max_length:
            sequence = sequence[:self.max_length]
            original_len = self.max_length
        else:
            original_len = len(sequence)
            sequence = sequence + [self.pad_idx] * (self.max_length - len(sequence))
        mask = [1] * original_len + [0] * (self.max_length - original_len)
        return {
            'input_seq': torch.tensor(sequence, dtype=torch.long),
            'target_seq': torch.tensor(sequence, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.float),
            'label': torch.tensor(self.data.iloc[idx]['label'], dtype=torch.float),
        }

def custom_collate_fn(batch):
    return {
        'input_seq': torch.stack([b['input_seq'] for b in batch]),
        'target_seq': torch.stack([b['target_seq'] for b in batch]),
        'mask': torch.stack([b['mask'] for b in batch]),
        'label': torch.stack([b['label'] for b in batch]),
    }

train_dataset = PrescriptionDataset(train_df)
val_dataset = PrescriptionDataset(val_df)
test_dataset = PrescriptionDataset(test_df)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)

In [5]:
# 4. 评估函数：Recall@10 / NDCG@10 & 异常检测（AUC, PR-AUC, F1）

def evaluate_retrieval(model, data_loader, device, k=10):
    model.eval()
    total_recall, total_ndcg, total_cnt = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in data_loader:
            input_seq = batch['input_seq'].to(device)
            target_seq = batch['target_seq'].to(device)
            mask = batch['mask'].to(device)
            B, L = input_seq.size()
            causal_mask = model.generate_mask(L, device)
            logits = model(input_seq, src_mask=causal_mask)
            last_logits = logits[:, -1, :].clone()
            last_logits[:, 0] = -float('inf')
            _, topk_indices = torch.topk(last_logits, k=k, dim=-1)
            lengths = mask.sum(dim=1).long()
            next_indices = (lengths - 1).clamp(min=0)
            next_item = target_seq.gather(1, next_indices.view(-1, 1)).squeeze(1)
            gt = next_item.cpu().numpy()
            pred = topk_indices.cpu().numpy()
            for g, p in zip(gt, pred):
                if np.any(p == g):
                    total_recall += 1.0
                    rank = np.where(p == g)[0][0] + 1
                    total_ndcg += 1.0 / math.log2(rank + 1)
                total_cnt += 1
    return total_recall / total_cnt, total_ndcg / total_cnt

def evaluate_model_top(model, loader, device, top_percent=0.2):
    all_scores, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Eval'):
            scores = model.compute_anomaly_score(batch['input_seq'].to(device), batch['mask'].to(device), topk_ratio=0.2)
            scores = np.nan_to_num(scores, nan=0.0)
            all_scores.extend(scores)
            all_labels.extend(batch['label'].numpy())
    all_scores = np.asarray(all_scores)
    all_labels = np.asarray(all_labels)
    cutoff = np.percentile(all_scores, 100 * (1 - top_percent))
    pred_labels = (all_scores >= cutoff).astype(int)
    prec, rec, _ = precision_recall_curve(all_labels, all_scores)
    return {
        'auc': roc_auc_score(all_labels, all_scores),
        'pr_auc': auc(rec, prec),
        'f1': f1_score(all_labels, pred_labels, zero_division=0),
        'precision': precision_score(all_labels, pred_labels, zero_division=0),
        'recall': recall_score(all_labels, pred_labels, zero_division=0),
    }

In [6]:
# 5. 训练循环：带自清洗 (Full/MeanAgg/RandEmb/FreezeEmb) 与 无自清洗 (NoClean)

import copy

def self_cleaning_training_loopv3(model, original_dataset, original_val_dataset, device,
                                  max_epochs=150, clean_start_epoch=20, clean_ratio=0.2,
                                  patience=5, batch_size=128, k=10, eval_fn=evaluate_retrieval,
                                  save_path='ablation.pt'):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
    best_recall, best_ndcg, epochs_no_improve = 0.0, 0.0, 0
    clean_mode, clean_count = False, 0
    current_indices = list(range(len(original_dataset)))
    current_dataset = Subset(original_dataset, current_indices)
    train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
    val_loader = DataLoader(original_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)
    best_result = None
    for epoch in range(max_epochs):
        model.train()
        total_loss, total_batches = 0.0, 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs}'):
            input_seq = batch['input_seq'].to(device)
            target_seq = batch['target_seq'].to(device)
            mask = batch['mask'].to(device)
            optimizer.zero_grad()
            loss = model.compute_loss(input_seq, target_seq, mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
            total_batches += 1
        scheduler.step()
        avg_loss = total_loss / total_batches
        model.eval()
        recall_k, ndcg_k = eval_fn(model, val_loader, device, k=k)
        print(f'Epoch {epoch+1} Loss: {avg_loss:.4f} Val R@{k}: {recall_k:.4f} NDCG@{k}: {ndcg_k:.4f}')
        if recall_k > best_recall:
            best_recall, best_ndcg = recall_k, ndcg_k
            best_result = {'epoch': epoch+1, 'avg_loss': avg_loss, 'recall': recall_k, 'ndcg': ndcg_k}
            torch.save(model.state_dict(), save_path)
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
        if (epoch + 1 >= clean_start_epoch or clean_mode) and epochs_no_improve >= patience:
            print(f'Self-cleaning round {clean_count+1}...')
            model.eval()
            all_scores = []
            for batch in DataLoader(Subset(original_dataset, current_indices), batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn):
                all_scores.append(model.compute_anomaly_score(batch['input_seq'].to(device), batch['mask'].to(device)))
            all_scores = np.concatenate(all_scores)
            valid = (~np.isnan(all_scores)) & (all_scores > 0)
            if valid.sum() > 0:
                thresh = np.quantile(all_scores[valid], 1 - clean_ratio)
                keep = np.where(all_scores < thresh)[0]
                current_indices = [current_indices[i] for i in keep]
                current_dataset = Subset(original_dataset, current_indices)
                train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
                print(f'Retained {len(current_indices)} / {len(original_dataset)} samples.')
            clean_mode, clean_count, epochs_no_improve = True, clean_count + 1, 0
    model.best_result = best_result
    return model

def train_no_cleaning(model, train_loader, val_loader, device, max_epochs=150, patience=5, batch_size=128, k=10,
                      eval_fn=evaluate_retrieval, save_path='ablation.pt'):
    """NoClean：无自清洗，仅按 Val Recall@k 早停。"""
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
    best_recall, best_ndcg, best_result, epochs_no_improve = 0.0, 0.0, None, 0
    for epoch in range(max_epochs):
        model.train()
        total_loss, total_batches = 0.0, 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs}'):
            input_seq = batch['input_seq'].to(device)
            target_seq = batch['target_seq'].to(device)
            mask = batch['mask'].to(device)
            optimizer.zero_grad()
            loss = model.compute_loss(input_seq, target_seq, mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
            total_batches += 1
        scheduler.step()
        avg_loss = total_loss / total_batches
        model.eval()
        recall_k, ndcg_k = eval_fn(model, val_loader, device, k=k)
        print(f'Epoch {epoch+1} Loss: {avg_loss:.4f} Val R@{k}: {recall_k:.4f} NDCG@{k}: {ndcg_k:.4f}')
        if recall_k > best_recall:
            best_recall, best_ndcg = recall_k, ndcg_k
            best_result = {'epoch': epoch+1, 'avg_loss': avg_loss, 'recall': recall_k, 'ndcg': ndcg_k}
            torch.save(model.state_dict(), save_path)
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f'Early stop at epoch {epoch+1}.')
                break
    model.best_result = best_result
    return model

In [7]:
# 6. 加载 PULSE 嵌入与消融配置

embedding_dim = 4096
pretrained_emb = nn.Embedding(num_items, embedding_dim)
pretrained_emb.load_state_dict(torch.load('item_embedding.pt'))
with torch.no_grad():
    embedding_matrix_pulse = pretrained_emb.weight.clone().detach()

# RandEmb 用随机初始化（同 shape）
torch.manual_seed(42)
embedding_matrix_rand = torch.randn(num_items, embedding_dim) * 0.02

# Full = finetune embedding + self-clean；对比：NoClean、MeanAgg、RandEmb、FreezeEmb
ABLATION_CONFIGS = [
    {'name': 'Full',     'use_cleaning': True,  'use_topk_agg': True,  'embedding': embedding_matrix_pulse, 'freeze_embedding': False},
    {'name': 'NoClean',  'use_cleaning': False, 'use_topk_agg': True,  'embedding': embedding_matrix_pulse, 'freeze_embedding': False},
    {'name': 'MeanAgg',  'use_cleaning': True,  'use_topk_agg': False, 'embedding': embedding_matrix_pulse, 'freeze_embedding': False},
    {'name': 'RandEmb',  'use_cleaning': True,  'use_topk_agg': True,  'embedding': embedding_matrix_rand,  'freeze_embedding': True},
    {'name': 'FreezeEmb','use_cleaning': True,  'use_topk_agg': True,  'embedding': embedding_matrix_pulse, 'freeze_embedding': True},
]
print('Configs:', [c['name'] for c in ABLATION_CONFIGS])

  pretrained_emb.load_state_dict(torch.load('item_embedding.pt'))


Configs: ['Full', 'NoClean', 'MeanAgg', 'RandEmb', 'FreezeEmb']


In [None]:
# 7. 逐组训练并评测，汇总结果表

max_epochs = 150
clean_start_epoch = 20
clean_ratio = 0.1
patience = 5
k = 10

results_list = []

for cfg in ABLATION_CONFIGS:
    name = cfg['name']
    save_path = f'ablation_{name}.pt'
    print(f'\\n========== {name} ==========')
    model = TransformerAnomalyDetectorAblation(
        cfg['embedding'],
        d_model=512, nhead=2, num_layers=8, dropout=0.1, pad_idx=0,
        use_score_norm=False,
        use_topk_agg=cfg['use_topk_agg'],
        freeze_embedding=cfg['freeze_embedding'],
    ).to(device)
    if cfg['use_cleaning']:
        model = self_cleaning_training_loopv3(
            model, train_dataset, val_dataset, device,
            max_epochs=max_epochs, clean_start_epoch=clean_start_epoch, clean_ratio=clean_ratio,
            patience=patience, batch_size=batch_size, k=k, save_path=save_path,
        )
    else:
        model = train_no_cleaning(
            model, train_loader, val_loader, device,
            max_epochs=max_epochs, patience=patience, batch_size=batch_size, k=k, save_path=save_path,
        )
    # 加载最佳权重并在测试集上评测
    model.load_state_dict(torch.load(save_path))
    model.eval()
    test_loss_sum, test_n = 0.0, 0
    with torch.no_grad():
        for batch in test_loader:
            x = batch['input_seq'].to(device)
            y = batch['target_seq'].to(device)
            m = batch['mask'].to(device)
            test_loss_sum += model.compute_loss(x, y, m).item() * x.size(0)
            test_n += x.size(0)
    avg_loss = test_loss_sum / max(test_n, 1)
    recall10, ndcg10 = evaluate_retrieval(model, test_loader, device, k=k)
    ad_metrics = evaluate_model_top(model, test_loader, device, top_percent=0.2)
    results_list.append({
        'Variant': name,
        'Avg_Loss': round(avg_loss, 4),
        'Recall@10': round(recall10, 4),
        'NDCG@10': round(ndcg10, 4),
        'AUC': round(ad_metrics['auc'], 4),
        'PR-AUC': round(ad_metrics['pr_auc'], 4),
        'Precision': round(ad_metrics['precision'], 4),
        'Recall': round(ad_metrics['recall'], 4),
        'F1': round(ad_metrics['f1'], 4),
    })
    # 每组跑完后释放显存，避免多组累积 OOM
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

# 最终评测指标：Avg_Loss, Recall@10, NDCG@10, AUC, PR-AUC, Precision, Recall, F1
ablation_df = pd.DataFrame(results_list)
cols = ['Variant', 'Avg_Loss', 'Recall@10', 'NDCG@10', 'AUC', 'PR-AUC', 'Precision', 'Recall', 'F1']
ablation_df = ablation_df[cols]
print('\\n========== 消融结果汇总（Full=finetune emb+self-clean）==========')
display(ablation_df)
ablation_df.to_csv('ablation_results.csv', index=False, encoding='utf-8-sig')



Epoch 1/150: 100%|██████████| 56/56 [00:42<00:00,  1.31it/s]


Epoch 1 Loss: 6.7690 Val R@10: 0.1166 NDCG@10: 0.0617


Epoch 2/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 2 Loss: 5.8343 Val R@10: 0.1603 NDCG@10: 0.0939


Epoch 3/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 3 Loss: 5.6393 Val R@10: 0.1648 NDCG@10: 0.1025


Epoch 4/150: 100%|██████████| 56/56 [00:42<00:00,  1.31it/s]


Epoch 4 Loss: 5.5462 Val R@10: 0.1827 NDCG@10: 0.1077


Epoch 5/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 5 Loss: 5.4647 Val R@10: 0.1973 NDCG@10: 0.1172


Epoch 6/150: 100%|██████████| 56/56 [00:41<00:00,  1.34it/s]


Epoch 6 Loss: 5.3940 Val R@10: 0.2040 NDCG@10: 0.1261


Epoch 7/150: 100%|██████████| 56/56 [00:41<00:00,  1.34it/s]


Epoch 7 Loss: 5.3263 Val R@10: 0.2108 NDCG@10: 0.1323


Epoch 8/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 8 Loss: 5.2658 Val R@10: 0.2242 NDCG@10: 0.1378


Epoch 9/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 9 Loss: 5.2091 Val R@10: 0.2466 NDCG@10: 0.1472


Epoch 10/150: 100%|██████████| 56/56 [00:40<00:00,  1.37it/s]


Epoch 10 Loss: 5.1604 Val R@10: 0.2466 NDCG@10: 0.1501


Epoch 11/150: 100%|██████████| 56/56 [00:42<00:00,  1.31it/s]


Epoch 11 Loss: 5.1083 Val R@10: 0.2478 NDCG@10: 0.1505


Epoch 12/150: 100%|██████████| 56/56 [00:42<00:00,  1.31it/s]


Epoch 12 Loss: 5.0664 Val R@10: 0.2668 NDCG@10: 0.1566


Epoch 13/150: 100%|██████████| 56/56 [00:42<00:00,  1.31it/s]


Epoch 13 Loss: 5.0257 Val R@10: 0.2668 NDCG@10: 0.1593


Epoch 14/150: 100%|██████████| 56/56 [00:42<00:00,  1.32it/s]


Epoch 14 Loss: 4.9869 Val R@10: 0.2836 NDCG@10: 0.1667


Epoch 15/150: 100%|██████████| 56/56 [00:42<00:00,  1.32it/s]


Epoch 15 Loss: 4.9539 Val R@10: 0.2892 NDCG@10: 0.1696


Epoch 16/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 16 Loss: 4.9179 Val R@10: 0.2848 NDCG@10: 0.1712


Epoch 17/150: 100%|██████████| 56/56 [00:42<00:00,  1.32it/s]


Epoch 17 Loss: 4.8897 Val R@10: 0.2993 NDCG@10: 0.1761


Epoch 18/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 18 Loss: 4.8599 Val R@10: 0.3027 NDCG@10: 0.1784


Epoch 19/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 19 Loss: 4.8250 Val R@10: 0.2971 NDCG@10: 0.1785


Epoch 20/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 20 Loss: 4.7990 Val R@10: 0.3094 NDCG@10: 0.1805


Epoch 21/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 21 Loss: 4.7770 Val R@10: 0.3004 NDCG@10: 0.1773


Epoch 22/150: 100%|██████████| 56/56 [00:40<00:00,  1.37it/s]


Epoch 22 Loss: 4.7560 Val R@10: 0.3195 NDCG@10: 0.1854


Epoch 23/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 23 Loss: 4.7303 Val R@10: 0.3027 NDCG@10: 0.1834


Epoch 24/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 24 Loss: 4.7060 Val R@10: 0.3195 NDCG@10: 0.1863


Epoch 25/150: 100%|██████████| 56/56 [00:42<00:00,  1.33it/s]


Epoch 25 Loss: 4.6861 Val R@10: 0.3229 NDCG@10: 0.1902


Epoch 26/150: 100%|██████████| 56/56 [00:42<00:00,  1.32it/s]


Epoch 26 Loss: 4.6647 Val R@10: 0.3184 NDCG@10: 0.1898


Epoch 27/150: 100%|██████████| 56/56 [00:41<00:00,  1.34it/s]


Epoch 27 Loss: 4.6445 Val R@10: 0.3128 NDCG@10: 0.1860


Epoch 28/150: 100%|██████████| 56/56 [00:41<00:00,  1.35it/s]


Epoch 28 Loss: 4.6253 Val R@10: 0.3229 NDCG@10: 0.1902


Epoch 29/150: 100%|██████████| 56/56 [00:41<00:00,  1.34it/s]


Epoch 29 Loss: 4.6059 Val R@10: 0.3083 NDCG@10: 0.1798


Epoch 30/150: 100%|██████████| 56/56 [00:40<00:00,  1.38it/s]


Epoch 30 Loss: 4.5916 Val R@10: 0.3195 NDCG@10: 0.1873
Self-cleaning round 1...
Retained 6420 / 7133 samples.


Epoch 31/150: 100%|██████████| 51/51 [00:37<00:00,  1.38it/s]


Epoch 31 Loss: 4.5588 Val R@10: 0.3173 NDCG@10: 0.1856


Epoch 32/150: 100%|██████████| 51/51 [00:37<00:00,  1.35it/s]


Epoch 32 Loss: 4.5347 Val R@10: 0.3184 NDCG@10: 0.1877


Epoch 34/150: 100%|██████████| 51/51 [00:37<00:00,  1.34it/s]


Epoch 34 Loss: 4.4992 Val R@10: 0.3240 NDCG@10: 0.1876


Epoch 35/150: 100%|██████████| 51/51 [00:38<00:00,  1.33it/s]


Epoch 35 Loss: 4.4777 Val R@10: 0.3274 NDCG@10: 0.1888


Epoch 36/150:  41%|████      | 21/51 [00:16<00:23,  1.30it/s]