In [16]:
import pandas as pd
import os
import random
import torch

In [17]:
# Seed Python's built-in RNG with a named constant so the value is easy to find and change.
SEED = 2024
random.seed(SEED)

# Load original dataset

### Selecting datasets

In [18]:
# Dataset selection.
#
# `root` is the directory holding the raw Amazon rating CSVs and `output_path`
# is where the processed files are written. Both keep their original defaults
# but can be overridden through environment variables, so the notebook is not
# tied to one machine's directory layout.
# os.path.join(..., '') guarantees a trailing separator, which matters for
# code that builds file paths by plain string concatenation.
root = os.path.join(os.environ.get('AMAZON_RAW_ROOT', '/data/mjyin/dataset/AmazonRaw/'), '')
output_path = os.path.join(os.environ.get('AMAZON_OUTPUT_PATH', './debug2/'), '')

# Raw file stems (without '.csv'). Entry i pairs with output_dataset_name_list[i].
# Alternative selections previously configured in this cell:
#   'ratings_Books' -> 'book'
#   'ratings_Clothing_Shoes_and_Jewelry' -> 'cloth'
#   'ratings_Movies_and_TV' -> 'movie'
dataset_name_list = [
    'ratings_Toys_and_Games',
    'ratings_Sports_and_Outdoors',
]
output_dataset_name_list = [
    'toy',
    'sport',
]

# The two lists are consumed in lockstep (by index / zip); fail early on a mismatch.
assert len(dataset_name_list) == len(output_dataset_name_list), \
    'dataset_name_list and output_dataset_name_list must have the same length'

### Load

In [19]:
# Read every selected rating file, keep only rows with rating >= 3 and tag
# each row with the index of the domain (position in dataset_name_list) it
# came from.
dataset_list = []
for idx, dataset_name in enumerate(dataset_name_list):
    path = os.path.join(root, dataset_name + '.csv')
    # The raw files have no header row; column names are assigned explicitly.
    dataset = pd.read_csv(path, header=None)
    dataset.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # .copy() detaches the filtered rows from the full frame so that adding
    # the 'domain' column below does not write into a slice of another frame
    # (which triggers SettingWithCopyWarning on pandas < 3).
    dataset = dataset[dataset.rating >= 3].copy()
    dataset['domain'] = idx
    dataset_list.append(dataset)

# Filter

### Filter by number of interactions

In [20]:
# Repeatedly drop users with fewer than `user_threshold` rows and items with
# fewer than `item_threshold` rows. Removing items can push users back under
# their threshold (and vice versa), so each domain is filtered until a full
# pass no longer changes the number of rows.
user_threshold = 5
item_threshold = 5
filtered_dataset_list = []
for dataset in dataset_list:
    filtered_dataset = dataset.copy()
    previous_len = -1  # sentinel: forces at least one filtering pass
    while len(filtered_dataset) != previous_len:
        previous_len = len(filtered_dataset)
        # Per-row interaction count of the row's user, then keep frequent users.
        user_counts = filtered_dataset['user_id'].value_counts()
        enough_user_rows = filtered_dataset['user_id'].map(user_counts) >= user_threshold
        filtered_dataset = filtered_dataset[enough_user_rows]
        # Same for items, counted on the already user-filtered frame.
        item_counts = filtered_dataset['item_id'].value_counts()
        enough_item_rows = filtered_dataset['item_id'].map(item_counts) >= item_threshold
        filtered_dataset = filtered_dataset[enough_item_rows]
    print('done!')
    filtered_dataset_list.append(filtered_dataset)

done!
done!


# Map all ids

In [21]:
# Stack the filtered per-domain frames so ids can be mapped over one shared
# user/item vocabulary. The original row index of each frame is kept as-is.
all_filtered_dataset = pd.concat(filtered_dataset_list)
all_user, all_item = all_filtered_dataset['user_id'], all_filtered_dataset['item_id']

In [22]:
# factorize returns (integer codes, unique tokens in order of first appearance).
user_id, user_token = pd.factorize(all_user)
item_id, item_token = pd.factorize(all_item)
# raw token -> its position among the unique tokens (0-based)
user_mapping_dict = dict(zip(user_token, range(len(user_token))))
item_mapping_dict = dict(zip(item_token, range(len(item_token))))
# Number of distinct users / items across all selected domains.
print(user_token.shape)
print(item_token.shape)

(43422,)
(25091,)


In [23]:
# Replace raw user/item tokens with their integer ids, in place on the combined frame.
# dict.__getitem__ (rather than passing the dict itself) keeps the KeyError on an
# unknown token instead of silently producing NaN.
all_filtered_dataset['user_id'] = all_filtered_dataset['user_id'].map(user_mapping_dict.__getitem__)
all_filtered_dataset['item_id'] = all_filtered_dataset['item_id'].map(item_mapping_dict.__getitem__)

In [24]:
# Split the id-mapped frame back into one frame per domain, in domain-index order.
mapped_dataset_list = []
for domain_idx in range(len(filtered_dataset_list)):
    in_domain = all_filtered_dataset['domain'] == domain_idx
    mapped_dataset_list.append(all_filtered_dataset[in_domain])

# Generate sequences

### Define padding token id

In [25]:
# Padding token id = number of distinct item ids in the combined frame.
# NOTE(review): this is a free id only if item ids are contiguous 0..n-1 at this point - confirm.
PAD = len(all_filtered_dataset['item_id'].unique())

### In-domain sequences

In [26]:
max_seq_len = 50

def truncate_or_pad(seq):
    """Bring `seq` to exactly `max_seq_len` items.

    Sequences longer than `max_seq_len` keep only their last `max_seq_len`
    items; shorter ones are right-padded with the module-level `PAD` value.

    Args:
        seq: list of item ids.

    Returns:
        (fixed_length_list, valid_length), where valid_length is the number
        of non-padding items, i.e. min(len(seq), max_seq_len).
    """
    n_items = len(seq)
    if n_items <= max_seq_len:
        padding = [PAD] * (max_seq_len - n_items)
        return seq + padding, n_items
    return seq[-max_seq_len:], max_seq_len

# Build leave-one-out train/val/test samples for every domain and save them.
# Every sample is [user id, history, target, seq_len, label, domain_id]:
#   test  - history = all but the last item,        target = last item
#   val   - history = all but the last two items,   target = second-to-last item
#   train - history = all but the last three items; target is that history
#           shifted one step forward (the next item at every position) and
#           label marks real (1) vs. padded (0) positions
# val/test index user_seq[-2] / user_seq[-1], so every user needs at least two
# interactions in the domain.
for idx, dataset in enumerate(mapped_dataset_list):
    # Chronological order within each user before collecting item lists.
    dataset = dataset.sort_values(by=['user_id', 'timestamp'])
    user_group = dataset.groupby('user_id')['item_id'].apply(list)
    train, val, test = [], [], []
    # The loop variable is deliberately not called `user_id`, so it does not
    # overwrite the notebook-level `user_id` produced by pd.factorize.
    for uid, user_seq in zip(user_group.index, user_group.tolist()):
        # Only the most recent max_seq_len interactions are used.
        user_seq = user_seq[-max_seq_len:]
        # ------ test sample ------------
        history, seq_len = truncate_or_pad(user_seq[:-1])
        target_data = user_seq[-1]
        label = 1
        domain_id = [idx] * max_seq_len
        test.append([uid, history, target_data, seq_len, label, domain_id])
        # ------ val sample -------------
        history, seq_len = truncate_or_pad(user_seq[:-2])
        target_data = user_seq[-2]
        label = 1
        domain_id = [idx] * max_seq_len
        val.append([uid, history, target_data, seq_len, label, domain_id])
        # ------ train sample -----------
        history, seq_len = truncate_or_pad(user_seq[:-3])
        # seq_len items ending just before the val target: history shifted by one.
        # Indexing [0] instead of unpacking into `_` keeps IPython's `_` intact.
        target_data = truncate_or_pad(user_seq[-seq_len-2:-2])[0]
        label = [1] * seq_len + [0] * (max_seq_len - seq_len)
        domain_id = [idx] * max_seq_len
        train.append([uid, history, target_data, seq_len, label, domain_id])
    # torch.save does not create missing directories; make sure the
    # per-domain output folder exists so a fresh run does not fail here.
    domain_dir = os.path.join(output_path, output_dataset_name_list[idx])
    os.makedirs(domain_dir, exist_ok=True)
    torch.save(train, os.path.join(domain_dir, 'train.pth'))
    torch.save(val, os.path.join(domain_dir, 'val.pth'))
    torch.save(test, os.path.join(domain_dir, 'test.pth'))
    print('{} done!'.format(output_dataset_name_list[idx]))

toy done!
sport done!


### Mixed sequence

# Save

In [27]:
# Write each domain's id-mapped interaction table next to its sequence files.
for domain_name, domain_frame in zip(output_dataset_name_list, mapped_dataset_list):
    csv_path = os.path.join(output_path, domain_name, 'inter.csv')
    # index=False: the row index is not written to the file.
    domain_frame.to_csv(csv_path, sep=',', index=False)

In [28]:
# Number of distinct users and items in the first domain (shown as 1-tuples).
first_domain = mapped_dataset_list[0]
print(first_domain['user_id'].unique().shape)
print(first_domain['item_id'].unique().shape)

(15528,)
(9696,)


In [29]:
# Number of distinct users and items in the second domain (shown as 1-tuples).
second_domain = mapped_dataset_list[1]
print(second_domain['user_id'].unique().shape)
print(second_domain['item_id'].unique().shape)

(29277,)
(15396,)
