In [1]:
import pandas as pd
import os
import random
import torch
import yaml

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Seed Python's built-in `random` module so any draw from it is repeatable.
# NOTE(review): no visible cell draws from `random`, and the numpy / torch RNGs
# are not seeded here -- confirm whether this seed is still needed.
random.seed(2024)

# Load original dataset

### Selecting datasets

In [3]:
# --- Configuration ------------------------------------------------------------
# Directory holding the raw Amazon rating CSVs. The default is the original
# location; set the AMAZON_RAW_ROOT environment variable to run on another
# machine. os.path.join(..., '') guarantees a trailing separator, so both
# `root + name` and `os.path.join(root, name)` yield a valid path.
root = os.path.join(os.environ.get('AMAZON_RAW_ROOT', '/data/mjyin/dataset/AmazonRaw/'), '')
# Directory under which one sub-folder per output dataset is written.
output_path = './amazon-toys-seq/'
# Base names (without '.csv') of the raw files to process; uncomment to add domains.
dataset_name_list = [
    # 'ratings_Books',
    # 'ratings_Clothing_Shoes_and_Jewelry',
    # 'ratings_Movies_and_TV',
    # 'ratings_Sports_and_Outdoors',
    'ratings_Toys_and_Games',
    # 'ratings_Beauty',
]
# Output sub-folder names, position-aligned with dataset_name_list.
output_dataset_name_list = [
    # 'book',
    # 'cloth',
    # 'movie',
    # 'sport',
    'toy',
    # 'beauty'
]
# The two lists are consumed in lock-step (by position), so their lengths must agree.
assert len(dataset_name_list) == len(output_dataset_name_list), \
    'dataset_name_list and output_dataset_name_list must have the same length'
user_threshold = 5  # minimum number of interactions for a user to be kept
item_threshold = 5  # minimum number of interactions for an item to be kept
max_seq_len = 20    # fixed history length that sequences are truncated / padded to

### Load

In [4]:
# Load every selected ratings CSV, keep only positive feedback and tag each row
# with the index of the dataset (domain) it came from.
RATING_COLUMNS = ['user_id', 'item_id', 'rating', 'timestamp']
dataset_list = []
for idx, dataset_name in enumerate(dataset_name_list):
    # os.path.join works whether or not `root` ends with a path separator.
    path = os.path.join(root, dataset_name + '.csv')
    # header=None: the first line of the file is read as data, not as column names.
    dataset = pd.read_csv(path, header=None)
    dataset.columns = RATING_COLUMNS
    # Keep ratings strictly above 3. `.assign` returns a new frame, so the
    # `domain` column is not written into a filtered slice of the raw frame
    # (the original in-place assignment triggered SettingWithCopyWarning).
    dataset = dataset[dataset['rating'] > 3].assign(domain=idx)
    dataset_list.append(dataset)

In [5]:
# Number of rows kept for the first loaded dataset.
dataset_list[0].shape[0]

1750036

# Filter

### Filter by number of interactions

In [6]:
# Repeatedly drop users and items with too few interactions until a full pass
# removes nothing (removing items can push users below the threshold and vice versa).
filtered_dataset_list = []
for raw_frame in dataset_list:
    current = raw_frame.copy()
    converged = False
    while not converged:
        rows_before = len(current)
        # Users first: keep rows whose user has at least `user_threshold` rows.
        user_counts = current['user_id'].value_counts()
        current = current[current['user_id'].map(user_counts) >= user_threshold]
        # Then items, counted on the already user-filtered frame.
        item_counts = current['item_id'].value_counts()
        current = current[current['item_id'].map(item_counts) >= item_threshold]
        converged = len(current) == rows_before
    print('done!')
    filtered_dataset_list.append(current)

done!


In [7]:
# Number of rows left in the first dataset after filtering.
filtered_dataset_list[0].shape[0]

95420

# Map all ids

In [8]:
# Stack the filtered frames into one table and pull out the raw id columns.
all_filtered_dataset = pd.concat(filtered_dataset_list)
all_user, all_item = (all_filtered_dataset[column] for column in ('user_id', 'item_id'))

In [9]:
# Enumerate the distinct raw ids; `*_token` holds the unique raw values.
user_id, user_token = pd.factorize(all_user)
item_id, item_token = pd.factorize(all_item)
# Id 0 is reserved for PAD, so real ids start at 1 and each vocabulary is one larger.
num_users = 1 + len(user_token)
num_items = 1 + len(item_token)
# raw id -> integer id in 1..N, following the order of `*_token`.
user_mapping_dict = dict(zip(user_token, range(1, num_users)))
item_mapping_dict = dict(zip(item_token, range(1, num_items)))
for tokens in (user_token, item_token):
    print(tokens.shape)

(11268,)
(7309,)


In [10]:
# Replace raw ids with their integer ids; an id missing from the mapping raises KeyError.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# presumably fails because the columns then hold mapped ids -- verify.
all_filtered_dataset['user_id'] = all_filtered_dataset['user_id'].map(user_mapping_dict.__getitem__)
all_filtered_dataset['item_id'] = all_filtered_dataset['item_id'].map(item_mapping_dict.__getitem__)

In [11]:
# Split the remapped table back into one frame per domain index.
mapped_dataset_list = [
    all_filtered_dataset.loc[all_filtered_dataset['domain'].eq(domain_idx)]
    for domain_idx in range(len(filtered_dataset_list))
]

# Generate sequences

### Define padding token id

In [12]:
# Padding token id used to fill histories up to the fixed sequence length.
# Real user/item ids are assigned starting from 1, so 0 never collides with them.
PAD = 0

In [13]:
def truncate_or_pad(seq, max_len=None, pad_value=None):
    """Fit ``seq`` to a fixed length, keeping its last items.

    Args:
        seq: Sequence of ids (any iterable; it is copied into a list).
        max_len: Target length. Defaults to the notebook-level ``max_seq_len``,
            looked up at call time.
        pad_value: Filler appended on the right when ``seq`` is too short.
            Defaults to the notebook-level ``PAD``.

    Returns:
        A ``(fitted, length)`` tuple. ``fitted`` is a new list of exactly
        ``max_len`` entries; ``length`` is how many of them come from ``seq``.
    """
    if max_len is None:
        max_len = max_seq_len
    if pad_value is None:
        pad_value = PAD
    seq = list(seq)
    cur_seq_len = len(seq)
    if cur_seq_len > max_len:
        # Slice from an explicit start index: seq[-max_len:] would return the
        # whole sequence when max_len is 0.
        return seq[cur_seq_len - max_len:], max_len
    return seq + [pad_value] * (max_len - cur_seq_len), cur_seq_len


In [14]:

def _holdout_sample(uid, user_seq, target_pos, domain_idx):
    """Build one val/test record whose target is ``user_seq[target_pos]``.

    ``target_pos`` is a negative index; every item before it forms the history.
    Record layout: [user_id, padded history, target, seq_len, label,
    per-position domain ids, untruncated history].
    """
    user_hist = user_seq[:target_pos]
    history, seq_len = truncate_or_pad(user_hist)
    # label is always 1; the domain id is repeated once per history position.
    return [uid, history, user_seq[target_pos], seq_len, 1, [domain_idx] * max_seq_len, user_hist]


for idx, dataset in enumerate(mapped_dataset_list):
    # Order each user's interactions by timestamp and collect them into one list per user.
    dataset = dataset.sort_values(by=['user_id', 'timestamp'])
    user_group = dataset.groupby('user_id')['item_id'].apply(list)
    train, val, test = [], [], []
    for user_id, user_seq in zip(user_group.index, user_group.tolist()):
        # Leave-one-out split: last item is the test target, second-to-last the val target.
        test.append(_holdout_sample(user_id, user_seq, -1, idx))
        val.append(_holdout_sample(user_id, user_seq, -2, idx))
        # ------ train samples ----------
        # Everything before the two held-out items; each position from the second
        # item on becomes a target, with up to `max_seq_len` preceding items as history.
        train_seq = user_seq[:-2]
        assert len(train_seq) >= 2
        for target_pos in range(1, len(train_seq)):
            window_start = max(0, target_pos - max_seq_len)
            history, seq_len = truncate_or_pad(train_seq[window_start:target_pos])
            # Train records carry a scalar domain id (val/test carry a list).
            train.append([user_id, history, train_seq[target_pos], seq_len, 1, idx])
    # torch.save does not create missing directories, so make the dataset folder first.
    out_dir = os.path.join(output_path, output_dataset_name_list[idx])
    os.makedirs(out_dir, exist_ok=True)
    torch.save(train, os.path.join(out_dir, 'train.pth'))
    torch.save(val, os.path.join(out_dir, 'val.pth'))
    torch.save(test, os.path.join(out_dir, 'test.pth'))
    print('{} done!'.format(output_dataset_name_list[idx]))

toy done!


## Condense

# Save

In [15]:
# Sanity numbers for the first domain: distinct users, distinct items,
# interaction count, and the number of training samples from the last processed domain.
first_domain = mapped_dataset_list[0]
for column in ('user_id', 'item_id'):
    print(first_domain[column].unique().shape)
print(len(first_domain))
print(len(train))

(11268,)
(7309,)
95420
61616


In [16]:
# Write each domain's remapped interaction table to <output_path>/<name>/inter.csv.
for d_name, d in zip(output_dataset_name_list, mapped_dataset_list):
    out_dir = os.path.join(output_path, d_name)
    # to_csv does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    # index=False is the documented boolean for omitting the row index.
    d.to_csv(os.path.join(out_dir, 'inter.csv'), sep=',', index=False)

# Concatenate datasets

In [42]:
# Load two previously saved training splits.
# NOTE(review): torch.load unpickles the file; only load files you produced yourself.
a, b = (
    torch.load(split_path)
    for split_path in ('./amazon-toys-seq-5/toy/train.pth', './amazon-toys-seq-10/toy/train.pth')
)

In [43]:
# Peek at one record of `a`; the re-padding cell unpacks each record as
# [user_id, history, target_data, seq_len, label, domain_id].
a[0]

[1, [4521, 0, 0, 0, 0], 6081, 1, 1, 0]

In [45]:
# Re-pad the histories in `a` to the history length used by `b` so both sample
# lists can be concatenated. The target length is read from `b` itself rather
# than from the global `max_seq_len`, whose value depends on which
# configuration the kernel last ran.
target_len = len(b[0][1]) if b else max_seq_len
new_a = []
for user_id, history, target_data, seq_len, label, domain_id in a:
    if len(history) > target_len:
        # Too long: drop existing padding first, then keep the last real items.
        real_items = history[:seq_len]
        history = real_items[max(0, len(real_items) - target_len):]
        seq_len = len(history)
    # Right-pad with PAD up to the target length (no-op when already long enough).
    history = history + [PAD] * (target_len - len(history))
    new_a.append([user_id, history, target_data, seq_len, label, domain_id])

In [83]:
# Write the merged training set. torch.save does not create missing
# directories, so make the target folder first.
merged_path = './amazon-toys-seq-5-10/toy/train.pth'
os.makedirs(os.path.dirname(merged_path), exist_ok=True)
torch.save(new_a + b, merged_path)