In [11]:
import pandas as pd
import torch
import numpy as np
import random
import copy
import math
from scipy.sparse import csr_matrix

In [3]:
data_file = '../data/Toys_and_Games.txt'

# Each line has the form "<user> <item> <item> ...". Read it through a context
# manager so the file handle is closed as soon as the lines are in memory.
with open(data_file) as f:
    lines = f.readlines()

user_seq = []      # user_seq[i] is the list of item ids on the i-th line
item_set = set()   # every distinct item id seen in the file
for line in lines:
    # Split off the leading user token; the remainder is the item list.
    user, items = line.strip().split(' ', 1)
    items = [int(item) for item in items.split(' ')]
    user_seq.append(items)
    # Update in place instead of building a brand-new set for every line.
    item_set.update(items)

# observed value: 11924
max_item = max(item_set)  # largest item id

# observed value: 19412
num_users = len(lines)  # number of users (one line per user)

# observed value: 11926
# Why +2: elsewhere in this notebook id 0 is treated as padding and
# max_item + 1 is used as the mask id, so valid ids run 0 .. max_item + 1,
# i.e. max_item + 2 distinct values.
num_items = max_item + 2

In [13]:
def generate_rating_matrix_train(user_seq, num_users, num_items):
    """Build the sparse user-item interaction matrix for the training split.

    For every user (row index = position in ``user_seq``) all items except the
    last two of the sequence are marked with 1. The last two items are left
    out here (presumably held back for validation / test - confirm with the
    evaluation code).

    Args:
        user_seq: iterable of per-user item-id lists.
        num_users: number of rows of the resulting matrix.
        num_items: number of columns of the resulting matrix.

    Returns:
        scipy.sparse.csr_matrix of shape ``(num_users, num_items)``.
    """
    # Flatten to (user, item) coordinate pairs, dropping each sequence's tail.
    pairs = [
        (uid, item_id)
        for uid, seq in enumerate(user_seq)
        for item_id in seq[:-2]
    ]
    row = np.array([uid for uid, _ in pairs])
    col = np.array([item_id for _, item_id in pairs])
    data = np.array([1] * len(pairs))
    return csr_matrix((data, (row, col)), shape=(num_users, num_items))

## 载入数据
`user_seq`：用户交互序列。第 i 行（从 0 开始计数）对应 userId = i + 1 的用户，行内元素是该用户交互过的 item 的 id。


`max_item`：所有交互序列中出现的最大 item id。


`num_users`：用户数量，即 `user_seq` 的行数（每个用户一条交互序列）。

In [42]:
# Load the interaction data through the project helper (presumably the same
# parsing as the manual loading cell - confirm against data_handle.utils.runner).
# NOTE(review): this rebinds user_seq, max_item and num_users, which the manual
# loading cell also defines; whichever cell ran last wins.
from data_handle.utils.runner import get_user_seqs
user_seq, max_item, num_users = get_user_seqs('../data/Toys_and_Games.txt')

In [43]:
# Item-id layout used by the rest of the notebook:
#   0             treated as padding
#   max_item + 1  extra id used as the mask token
# (the ids read from the data are at most max_item)
mask_id = max_item + 1
# Vocabulary size: every id from 0 up to and including mask_id must be a valid index.
item_size = mask_id + 1  # == max_item + 2
# num_users is already bound by the loading cell; the former
# `num_users = num_users` self-assignment was a no-op and has been dropped.

## 创建 DataLoader

In [49]:
from data_handle.utils.DataSet import CDDRecDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from data_handle.model.DemoRecModel import DemoRecModel

# Hyper-parameters shared by the dataset and the model.
MAX_SEQ_LENGTH = 25
BATCH_SIZE = 128

# Training set: batches are drawn in random order via RandomSampler.
train_dataset = CDDRecDataset(
    user_seq=user_seq,
    max_seq_length=MAX_SEQ_LENGTH,
    item_size=item_size,
    test_neg_items=None,
    data_type='train',
)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

# TODO: validation set and test set

# Model configuration collected in one place, then unpacked into the constructor.
# NOTE(review): T has the same value as the sequence length but is passed as a
# separate argument - confirm whether the two are meant to be tied.
model_config = dict(
    T=25,
    hidden_size=128,
    item_size=item_size,
    max_seq_length=MAX_SEQ_LENGTH,
    num_attention_heads=2,
    attention_probs_dropout_prob=0.2,
    hidden_act='gelu',
    hidden_dropout_prob=0.0,
    linear_infonce=False,
    initializer_range=0.02,
)
model = DemoRecModel(**model_config)

In [50]:
# Display the module tree of the instantiated model.
model

DemoRecModel(
  (time_embeddings): Embedding(25, 128)
  (item_embeddings): Embedding(11926, 128, padding_idx=0)
  (position_embeddings): Embedding(25, 128)
  (decoder): TransformerDecoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (linear1): Linear(in_features=128, out_features=128, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=128, out_features=128, bias=True)
    (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (dropout3): Dropout(p=0.2, inplace=False)
  )

In [52]:
# Number of distinct item ids; item_set is only built by the manual loading cell,
# so that cell must have been run for this to work.
len(item_set)

11924

In [56]:
# Popularity table: item id -> number of times the item occurs across all
# user sequences (every position of every sequence is counted).
# NOTE(review): this counts the last two items of each sequence as well, which
# generate_rating_matrix_train leaves out of the training matrix - confirm
# that including them here is intended.
frequency_items_dict = dict()
for items in user_seq:
    for item in items:
        # dict.get supplies 0 the first time an item is seen.
        frequency_items_dict[item] = frequency_items_dict.get(item, 0) + 1

In [116]:
# Spot check: occurrence count of item 362 across all sequences.
frequency_items_dict[362]

6

In [79]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [152]:
# Popularity threshold: mean occurrence count per distinct item, rounded up.
item_counts = frequency_items_dict.values()
frequency_threshold = math.ceil(sum(item_counts) / len(item_counts))


# Alternative (disabled): midpoint between the largest count and the mean.
# frequency_threshold = math.ceil((max(frequency_items_dict.values()) + frequency_threshold)/2)


frequency_threshold

15

In [155]:
# Pull exactly one batch for inspection. The previous loop walked the whole
# DataLoader but only ever used the batch at index 0.
data = next(iter(train_dataloader))
batch = tuple(t.to(device) for t in data)
userId, input_ids, target_pos, target_neg, answer = batch
print("userId: ", userId.shape)        # user id belonging to each interaction sequence
print("input_ids: ", input_ids.shape)  # the interaction sequences themselves

# Popular-item mask with the same shape and dtype as input_ids:
#   1 -> position holds padding (id 0) or an item that occurs more than
#        frequency_threshold times
#   0 -> everything else
# tolist() copies the batch to the host once; calling .item() per element
# forced a separate device-to-host transfer for every lookup.
mask_rows = [
    [
        1 if item == 0 or frequency_items_dict[item] > frequency_threshold else 0
        for item in row
    ]
    for row in input_ids.tolist()
]
# Created directly on input_ids' device, so no extra .to(device) is required;
# reshape keeps the 2-D shape even for an empty batch.
frequency_items_mask = torch.tensor(
    mask_rows, dtype=input_ids.dtype, device=input_ids.device
).reshape(input_ids.shape)

userId:  torch.Size([128])
input_ids:  torch.Size([128, 25])


In [158]:
# Inspect the mask of the inspected batch: 1 = padding or above-threshold item, 0 = otherwise.
frequency_items_mask

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')

In [157]:
# Spot check one row of the inspected batch; relies on input_ids still being
# bound from the batch-inspection cell. Entries equal to 0 are the ones the
# mask-building code treats as padding.
input_ids[5]

tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0, 10753,
         8100,  2399,  2594,  4119,  1830], device='cuda:0')

In [149]:
# Spot check: occurrence count of item 9168, for comparison with frequency_threshold.
frequency_items_dict[9168]

21