In [1]:
import pandas as pd
import os
import random
import torch
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed Python's built-in `random` module so any draws from it are repeatable.
# NOTE(review): none of the visible cells draws from `random`, and torch is not
# seeded here — confirm whether additional seeding is needed.
random.seed(2024)

# Load original dataset

### Selecting datasets

In [3]:
# Directory holding the raw ratings CSVs. The trailing slash matters: the load
# step builds file paths by plain string concatenation.
root = '/root/dataset/AmazonRaw/'
# Base directory for generated artifacts (one sub-folder per output name).
output_path = './amazon-toys-noise-50/'
# Raw file stems (without '.csv') to process. A dataset's position in this list
# becomes the value of its `domain` column.
dataset_name_list = [
    # 'ratings_Books',
    # 'ratings_Clothing_Shoes_and_Jewelry',
    # 'ratings_Movies_and_TV',
    # 'ratings_Sports_and_Outdoors',
    'ratings_Toys_and_Games',
    # 'ratings_Beauty',
]
# Output sub-folder names; must stay aligned index-by-index with
# `dataset_name_list`, because both are addressed with the same index.
output_dataset_name_list = [
    # 'book',
    # 'cloth',
    # 'movie',
    # 'sport',
    'toy',
    # 'beauty'
]
# Minimum number of interactions a user / an item needs to survive filtering.
user_threshold = 5
item_threshold = 5
# Fixed length every history is truncated or padded to.
max_seq_len = 50

### Load

In [4]:
# Read each selected ratings file into a DataFrame and tag every row with the
# position of its dataset in `dataset_name_list`.
dataset_list = []
for idx, dataset_name in enumerate(dataset_name_list):
    path = '{}{}.csv'.format(root, dataset_name)
    # header=None: the first row is read as data, so columns are named here.
    dataset = pd.read_csv(path, header=None)
    dataset.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # dataset = dataset[dataset.rating > 3]
    dataset['domain'] = idx
    dataset_list += [dataset]

# Filter

### Filter by number of interactions

In [5]:
def _keep_frequent(frame, column, min_count):
    """Keep the rows of `frame` whose `column` value occurs at least `min_count` times."""
    occurrences = frame[column].map(frame[column].value_counts())
    return frame[occurrences >= min_count]


filtered_dataset_list = []
for dataset in dataset_list:
    filtered_dataset = dataset.copy()
    ori_len = None
    # Removing sparse items can push users back under their threshold (and the
    # other way round), so both passes repeat until the row count is stable.
    while ori_len != len(filtered_dataset):
        ori_len = len(filtered_dataset)
        filtered_dataset = _keep_frequent(filtered_dataset, 'user_id', user_threshold)
        filtered_dataset = _keep_frequent(filtered_dataset, 'item_id', item_threshold)
    print('done!')
    filtered_dataset_list.append(filtered_dataset)

done!


# Map all ids

In [6]:
# Stack every filtered dataset into one frame so that the raw ids of all
# domains are enumerated together.
all_filtered_dataset = pd.concat(filtered_dataset_list)
all_user = all_filtered_dataset['user_id']
all_item = all_filtered_dataset['item_id']

In [7]:
# Enumerate the distinct raw ids. Integer id 0 is reserved for PAD, so real ids
# start at 1 and the vocabulary sizes are one larger than the unique counts.
user_id, user_token = pd.factorize(all_user)
item_id, item_token = pd.factorize(all_item)
num_users = 1 + len(user_token)
num_items = 1 + len(item_token)
user_mapping_dict = dict(zip(user_token, range(1, num_users)))
item_mapping_dict = dict(zip(item_token, range(1, num_items)))
for tokens in (user_token, item_token):
    print(tokens.shape)

(19412,)
(11924,)


In [8]:
# Swap raw ids for their 1-based integer ids; an id missing from the mapping
# raises KeyError. Re-running this cell on already-mapped ids is not supported.
all_filtered_dataset['user_id'] = all_filtered_dataset['user_id'].map(lambda raw_user: user_mapping_dict[raw_user])
all_filtered_dataset['item_id'] = all_filtered_dataset['item_id'].map(lambda raw_item: item_mapping_dict[raw_item])

In [9]:
# Split the id-mapped frame back into one frame per domain index.
domain_column = all_filtered_dataset['domain']
mapped_dataset_list = [
    all_filtered_dataset.loc[domain_column == domain_idx]
    for domain_idx in range(len(filtered_dataset_list))
]

# Generate sequences

### Define padding token id

In [10]:
# Id appended to the right of short sequences. Mapped user/item ids start at 1,
# so 0 never collides with a real id.
PAD = 0

### In-domain sequences

In [11]:
def to_list(x):
    """Materialise `x` as a list and drop its final two elements."""
    items = list(x)
    del items[-2:]
    return items
# Save, for the first domain, every user's chronological item history with the
# last two items removed by `to_list`.
dataset = mapped_dataset_list[0].sort_values(by=['user_id', 'timestamp'])
user_group = dataset.groupby('user_id')['item_id'].apply(to_list)
pattern_sequences = user_group.tolist()
torch.save(pattern_sequences, 'pattern-toys-noise-50.pth')

In [12]:


def truncate_or_pad(seq, max_len=None, pad_value=None):
    """Bring `seq` to exactly `max_len` entries.

    Sequences longer than `max_len` keep only their last `max_len` entries;
    shorter ones are right-padded with `pad_value`.

    Args:
        seq: Iterable of item ids (any iterable; it is copied into a list).
        max_len: Target length. Defaults to the notebook-level `max_seq_len`.
        pad_value: Filler for short sequences. Defaults to the notebook-level `PAD`.

    Returns:
        A `(sequence, length)` tuple: the new list of `max_len` entries and the
        number of real (non-padding) entries it holds.
    """
    # Resolve the defaults at call time so existing one-argument calls keep
    # following the notebook configuration.
    if max_len is None:
        max_len = max_seq_len
    if pad_value is None:
        pad_value = PAD
    seq = list(seq)
    cur_seq_len = len(seq)
    if cur_seq_len > max_len:
        # Slice from an explicit start index: `seq[-max_len:]` would return the
        # whole sequence when max_len is 0.
        return seq[cur_seq_len - max_len:], max_len
    return seq + [pad_value] * (max_len - cur_seq_len), cur_seq_len

# Build leave-one-out train / val / test samples for every domain and save them
# under `output_path/<output name>/`.
for idx, dataset in enumerate(mapped_dataset_list):
    # Ascending timestamp per user, so the end of each list is the most recent.
    dataset = dataset.sort_values(by=['user_id', 'timestamp'])
    user_group = dataset.groupby('user_id')['item_id'].apply(list)
    train, val, test = [], [], []
    for user_id, user_seq in zip(user_group.index, user_group.tolist()):
        # Only the most recent `max_seq_len` interactions are considered.
        user_seq = user_seq[-max_seq_len:]
        # iterate on each user sequence
        # ------ test sample ------------
        # Predict the last item from everything before it.
        # NOTE(review): `history` is stored twice in test/val rows (positions 1
        # and 6) — presumably a slot for a second sequence variant; confirm
        # with the consumer of these files.
        history, seq_len = truncate_or_pad(user_seq[:-1])
        target_data = user_seq[-1]
        label = 1
        domain_id = [idx] * max_seq_len
        test.append([user_id, history, target_data, seq_len, label, domain_id, history])
        # ------ val sample -------------
        # Predict the second-to-last item from everything before it.
        history, seq_len = truncate_or_pad(user_seq[:-2])
        target_data = user_seq[-2]
        label = 1
        domain_id = [idx] * max_seq_len
        val.append([user_id, history, target_data, seq_len, label, domain_id, history])
        # ------ train sample -----------
        # Inputs are user_seq[:-3]; targets are the last `seq_len` items of
        # user_seq[:-2], i.e. the inputs shifted one step forward.
        history, seq_len = truncate_or_pad(user_seq[:-3])
        target_data, _ = truncate_or_pad(user_seq[-seq_len-2:-2])
        # 1 marks a real position, PAD marks padding.
        label = [1] * seq_len + [PAD] * (max_seq_len - seq_len)
        domain_id = [idx] * max_seq_len
        train.append([user_id, history, target_data, seq_len, label, domain_id])
    # torch.save does not create missing parent directories, so make sure the
    # per-dataset folder exists before writing into it.
    save_dir = os.path.join(output_path, output_dataset_name_list[idx])
    os.makedirs(save_dir, exist_ok=True)
    torch.save(train, os.path.join(save_dir, 'train.pth'))
    torch.save(val, os.path.join(save_dir, 'val.pth'))
    torch.save(test, os.path.join(save_dir, 'test.pth'))
    print('{} done!'.format(output_dataset_name_list[idx]))

toy done!


### Mixed sequence

# Save

In [13]:
# Report the number of distinct users, distinct items and interactions in the
# first domain.
first_domain = mapped_dataset_list[0]
for id_column in ('user_id', 'item_id'):
    print(first_domain[id_column].unique().shape)
print(len(first_domain))

(19412,)
(11924,)
167597


In [14]:
# Write each domain's id-mapped interactions next to its train/val/test files.
for d_name, d in zip(output_dataset_name_list, mapped_dataset_list):
    csv_path = os.path.join(output_path, d_name, 'inter.csv')
    d.to_csv(csv_path, sep=',', index=None)

# Condense

In [12]:

# Condense the training set: greedily pack short sequences behind long ones so
# that every packed row holds at most `max_seq_len` real items.
# NOTE(review): `train` is the list left behind by the last iteration of the
# sequence-generation loop, so this cell only works after that cell has run.
user_id = torch.tensor([_[0] for _ in train])  # not read below
user_seq = torch.tensor([_[1] for _ in train])
target_item = torch.tensor([_[2] for _ in train])
seq_len = torch.tensor([_[3] for _ in train])
label = torch.tensor([_[4] for _ in train])  # not read below
domain_id = torch.tensor([_[5] for _ in train])  # not read below
# Longest sequences first: the front pointer walks the long sequences, the back
# pointer walks the short ones that get appended to them.
sorted_seq_len, sorted_index = torch.sort(seq_len, descending=True)
sorted_seq_len = sorted_seq_len.tolist()
sorted_seq = user_seq[sorted_index].tolist()
sorted_target_item = target_item[sorted_index].tolist()
# Columns: user id, packed inputs, packed targets, packed length, label, domain.
merged_data = [[], [], [], [], [], []]
pre_pointer, post_pointer = 0, len(user_seq) - 1
while(pre_pointer <= post_pointer):
    cur_seq, cur_seq_len, cur_target_item = sorted_seq[pre_pointer], sorted_seq_len[pre_pointer], sorted_target_item[pre_pointer]
    # Find sequences to be appended: keep taking from the back while the next
    # candidate still fits. The loop is always left through the `break` below.
    while cur_seq_len <= max_seq_len:
        post_seq_len = sorted_seq_len[post_pointer]
        if (cur_seq_len + post_seq_len <= max_seq_len) and pre_pointer != post_pointer:
            # Strip the padding of both parts before gluing them together.
            cur_seq = cur_seq[:cur_seq_len] + sorted_seq[post_pointer][:post_seq_len]
            cur_target_item = cur_target_item[:cur_seq_len] + sorted_target_item[post_pointer][:post_seq_len]
            cur_seq_len = cur_seq_len + post_seq_len
            post_pointer -= 1
        else:# Add padding and record this sequence
            cur_seq = torch.tensor(cur_seq[:cur_seq_len] + [0] * (max_seq_len - cur_seq_len))
            cur_target_item = torch.tensor(cur_target_item[:cur_seq_len] + [0] * (max_seq_len - cur_seq_len))
            cur_seq_len = torch.tensor([cur_seq_len])
            # A packed row mixes several users, so these fields are zeroed.
            user_id_, label_, domain_id_ = torch.tensor([0]), torch.tensor([0]), torch.zeros_like(cur_seq)
            merged_data[0].append(user_id_) # useless
            merged_data[1].append(cur_seq)
            merged_data[2].append(cur_target_item)
            merged_data[3].append(cur_seq_len)
            merged_data[4].append(label_) # useless
            merged_data[5].append(domain_id_) # useless
            pre_pointer += 1
            break
# Only drop the trailing size-1 axis of the scalar columns. A bare `.squeeze()`
# would also remove the row axis when a single packed row is produced, which
# breaks `len(merged_data[0])` and per-row indexing afterwards.
merged_data = [torch.stack(_).squeeze(-1) for _ in merged_data]

In [15]:
# Expand every packed row into one sample per position: the input is the prefix
# up to that position (padded to full length) and the target is the item
# stored at the same position of the packed target row.
condensed_data = []
for idx in range(len(merged_data[0])):
    # User id, label and domain id are constants for condensed samples.
    user_id, label, domain_id = 0, 1, 0
    user_seq = merged_data[1][idx].tolist()
    target_item = merged_data[2][idx].tolist()
    seq_len = merged_data[3][idx].item()
    for prefix_end in range(1, seq_len + 1):
        new_seq, new_seq_len = truncate_or_pad(user_seq[:prefix_end])
        sample = [user_id, new_seq, target_item[prefix_end - 1], new_seq_len, label, domain_id]
        condensed_data.append(sample)

In [16]:
# Number of per-position samples produced by the condensing step.
len(condensed_data)

106254

In [43]:
# Load a reference training set to compare sizes against the condensed one.
# NOTE(review): this directory ('./amazon-toys-seq-noise-50/') is not the
# `output_path` ('./amazon-toys-noise-50/') written by this notebook — confirm
# that this is the intended comparison file.
# torch.load unpickles its input, so only point it at trusted files.
original = torch.load('./amazon-toys-seq-noise-50/toy/train.pth')

In [46]:
# Sample count of the reference training file, for comparison with the
# condensed sample count.
len(original)

109361

In [17]:
# Persist the condensed samples. The relative file name means the file lands in
# the current working directory, not under `output_path`.
torch.save(condensed_data, 'train_condense.pth')