# Data Processer

## Data Source:
* Yelp: https://www.yelp.com/dataset/
* Amazon-Books: https://jmcauley.ucsd.edu/data/amazon/
* MovieLens: https://grouplens.org/datasets/movielens/1m/
* KuaiRec: https://kuairec.com/

## Directories Tree
```bash
raw
├── books
│   ├── Books.jsonl
│   └── meta_Books.jsonl
├── kuairec
│   └── data
│       ├── big_matrix.csv
│       ├── item_categories.csv
│       └── small_matrix.csv
├── movielens
│   ├── movies.dat
│   └── ratings.dat
└── yelp
    ├── yelp_academic_dataset_business.json
    └── yelp_academic_dataset_review.json
```


In [1]:
import datetime
import json
import os

from tqdm import tqdm

valid_rate, test_rate = 0.1, 0.2

def tqdm_(data_iter, **kwargs):
    return tqdm(data_iter, bar_format='{l_bar}{r_bar}', **kwargs)

In [2]:
def trunc_categories(items, all_categories, k=100):
    category_set = set(x[0] for x in sorted(all_categories.items(), key=lambda x: x[1], reverse=True)[:k])
    trunc_items = {}
    for iid, categories in tqdm_(items.items()):
        cs = category_set.intersection(categories)
        if len(cs) == 0:
            continue
        trunc_items[iid] = cs
    return trunc_items, category_set

def get_split_nums(n):
    n_valid = round(n * valid_rate)
    n_test = round(n * test_rate)
    n_train = n - n_valid - n_test
    return n_train, n_valid, n_test

def _write(out, uid, iids, train_iids):
    iids = [x for x in iids if x in train_iids]
    if len(iids) == 0:
        return 1
    out.write(f'{uid}\t{",".join(map(lambda x: str(train_iids[x]), iids))}\n')
    return 0


def split_write(root, items, data, min_len=None):
    train_iids = {}
    trunc_data = {}
    for uid, iids in tqdm_(data.items(), desc='[trunc_data]'):
        iids = [x[0] for x in sorted(iids.items(), key=lambda x: x[1]) if x[0] in items]
        n_train, n_valid, n_test = get_split_nums(len(iids))
        if min(n_train, n_valid, n_test) <= 0:
            continue
        if min_len is not None and len(iids) < min_len:
            continue
        trunc_data[uid] = iids
        for iid in iids[:n_train]:
            train_iids[iid] = train_iids.get(iid, len(train_iids))
    
    print('# of trunc_data:', len(trunc_data))
    print('# of train_iids:', len(train_iids))
    
    with open(os.path.join(root, 'item_categories.txt'), 'w', encoding='utf8') as out:
        for iid in tqdm_(train_iids, desc='[item_categories]'):
            out.write(f'{train_iids[iid]}\t{"|".join(items[iid])}\n')

    n_user, n_interactions, n_not_so_good = 0, 0, 0
    with open(os.path.join(root, 'train.txt'), 'w', encoding='utf8') as out_train:
        with open(os.path.join(root, 'valid.txt'), 'w', encoding='utf8') as out_valid:
            with open(os.path.join(root, 'test.txt'), 'w', encoding='utf8') as out_test:
                for uid, iids in tqdm_(trunc_data.items(), desc='[train_valid_test]'):
                    n_train, n_valid, n_test = get_split_nums(len(iids))
                    assert min(n_train, n_valid, n_test) > 0
                    n_interactions += len([x for x in iids if x in train_iids])
                    n_not_so_good += _write(out_train, n_user, iids[:n_train], train_iids)
                    n_not_so_good += _write(out_valid, n_user, iids[n_train:-n_test], train_iids)
                    n_not_so_good += _write(out_test, n_user, iids[-n_test:], train_iids)
                    n_user += 1

    print(f'[{root}]')
    print(f'# of users: {n_user}')
    print(f'# of items: {len(train_iids)}')
    print(f'# of interactions: {n_interactions}')
    print(f'n_not_so_good: {n_not_so_good}')

# Yelp

In [3]:
raw_root = 'raw/yelp'
root = 'data/yelp'
os.makedirs(root, exist_ok=True)

In [4]:
items = {}
all_categories = {}
with open(os.path.join(raw_root, 'yelp_academic_dataset_business.json'), encoding='utf8') as fin:
    for line in tqdm_(fin):
        item = json.loads(line)
        if item['categories'] is None:
            continue
        categories = set(map(str.strip, item['categories'].split(',')))
        for c in categories:
            all_categories[c] = all_categories.get(c, 0) + 1
        items[item['business_id']] = categories

|| 0/? [00:00<?, ?it/s]

|| 150346/? [00:01<00:00, 97810.91it/s] 


In [5]:
data = {}
with open(os.path.join(raw_root, 'yelp_academic_dataset_review.json'), encoding='utf8') as fin:
    for line in tqdm_(fin):
        edge = json.loads(line)
        uid, iid, rating, t = edge['user_id'], edge['business_id'], edge['stars'], datetime.datetime.strptime(edge['date'], '%Y-%m-%d %H:%M:%S').timestamp()
        if rating >= 4:
            if uid not in data:
                data[uid] = {}
            data[uid][iid] = t

|| 6990280/? [01:15<00:00, 92658.09it/s]


In [6]:
trunc_items, category_set = trunc_categories(items, all_categories)

100%|| 150243/150243 [00:00<00:00, 465232.12it/s]


In [7]:
split_write(root, trunc_items, data)

[trunc_data]: 100%|| 1464850/1464850 [00:04<00:00, 335780.46it/s]


# of trunc_data: 146949
# of train_iids: 125879


[item_categories]: 100%|| 125879/125879 [00:00<00:00, 591608.83it/s]
[train_valid_test]: 100%|| 146949/146949 [00:01<00:00, 74786.49it/s] 


[data/yelp]
# of users: 146949
# of items: 125879
# of interactions: 2397886
n_not_so_good: 4624


# Amazon-Books

In [8]:
raw_root = 'raw/books'
root = 'data/books'
os.makedirs(root, exist_ok=True)

In [9]:
data = {}
with open(os.path.join(raw_root, 'Books.jsonl'), encoding='utf8') as fin:
    for line in tqdm_(fin):
        d = json.loads(line)
        uid, iid, rating, t = d['user_id'], d['parent_asin'], d['rating'], d['timestamp']
        if rating >= 4:
            if uid not in data:
                data[uid] = {}
            data[uid][iid] = t

|| 29475453/? [02:10<00:00, 225898.44it/s]


In [10]:
items = {}
all_categories = {}
with open(os.path.join(raw_root, 'meta_Books.jsonl'), encoding='utf8') as fin:
    for line in tqdm_(fin):
        d = json.loads(line)
        iid = d['parent_asin']
        categories = d['categories']
        if len(categories) == 0:
            continue
        items[iid] = set(categories)
        for c in categories:
            all_categories[c] = all_categories.get(c, 0) + 1
print(len(items), len(all_categories))


|| 4448181/? [01:10<00:00, 62672.06it/s]

3919508 2723





In [11]:
trunc_items, category_set = trunc_categories(items, all_categories)
print(len(trunc_items))

100%|| 3919508/3919508 [00:07<00:00, 500583.76it/s]

3917787





In [12]:
split_write(root, trunc_items, data)

[trunc_data]: 100%|| 9244317/9244317 [00:37<00:00, 247157.13it/s]


# of trunc_data: 712802
# of train_iids: 1874004


[item_categories]: 100%|| 1874004/1874004 [00:03<00:00, 540980.94it/s]
[train_valid_test]: 100%|| 712802/712802 [00:09<00:00, 78764.01it/s] 


[data/books]
# of users: 712802
# of items: 1874004
# of interactions: 9838152
n_not_so_good: 165223


# MovieLens

In [13]:
raw_root = 'raw/movielens'
root = 'data/movielens'
os.makedirs(root, exist_ok=True)

In [14]:
items = {}
all_categories = {}

with open(os.path.join(raw_root, 'movies.dat'), encoding='ISO-8859-1') as fin:
    for line in tqdm_(fin, desc='[movies.dat]'):
        iid, _, categories = line.strip().split('::')
        if len(categories) == 0:
            continue
        categories = categories.split('|')
        items[iid] = set(categories)
        for c in categories:
            all_categories[c] = all_categories.get(c, 0) + 1
print(len(items), len(all_categories))


[movies.dat]: || 3883/? [00:00<00:00, 233287.24it/s]

3883 18





In [15]:
data = {}
with open(os.path.join(raw_root, 'ratings.dat')) as fin:
    for line in tqdm_(fin, desc='[ratings.dat]'):
        uid, iid, rating, t = line.strip().split('::')
        rating = float(rating)
        if rating >= 4:
            if uid not in data:
                data[uid] = {}
            data[uid][iid] = int(t)

[ratings.dat]: || 1000209/? [00:00<00:00, 1149570.84it/s]


In [16]:
split_write(root, items, data)

[trunc_data]: 100%|| 6038/6038 [00:00<00:00, 29000.76it/s]


# of trunc_data: 6028
# of train_iids: 3422


[item_categories]: 100%|| 3422/3422 [00:00<00:00, 891706.53it/s]
[train_valid_test]: 100%|| 6028/6028 [00:00<00:00, 24131.45it/s]

[data/movielens]
# of users: 6028
# of items: 3422
# of interactions: 575056
n_not_so_good: 1





# KuaiRec

In [17]:
raw_root = 'raw/kuairec'
root = 'data/kuairec'
os.makedirs(root, exist_ok=True)

In [18]:
all_data = {}
with open(os.path.join(raw_root, 'data/big_matrix.csv')) as fin:
    fin.readline()
    for line in tqdm_(fin, desc='[big_matrix.csv]'):
        user_id, video_id, play_duration, video_duration, time, date, timestamp, watch_ratio = line.strip().split(',')
        watch_ratio = float(watch_ratio)
        timestamp = float(timestamp)
        if user_id not in all_data:
            all_data[user_id] = {}
        all_data[user_id][video_id] = (timestamp, all_data[user_id].get(video_id, (0, 0))[1] + watch_ratio)

[big_matrix.csv]: || 12530806/? [00:19<00:00, 651126.94it/s]


In [19]:
train_data = {}
train_iids = {}
for uid in tqdm_(all_data, desc='[train_data]'):
    d = []
    for iid in all_data[uid]:
        if all_data[uid][iid][1] >= 2.0:
            d.append((iid, all_data[uid][iid][0]))
            train_iids[iid] = train_iids.get(iid, len(train_iids))
    if len(d) > 0:
        train_data[uid] = d

print(f'# of train_idis: {len(train_iids)}')
print(f'# of train_data: {len(train_data)}')

[train_data]:   0%|| 0/7176 [00:00<?, ?it/s]

[train_data]: 100%|| 7176/7176 [00:01<00:00, 4307.17it/s]

# of train_idis: 10722
# of train_data: 7176





In [20]:
test_data = {}
n_skip_uid, n_skip_iid, n_skip_timestamp, n_skip_wr = 0, 0, 0, 0

with open(os.path.join(raw_root, 'data/small_matrix.csv')) as fin:
    fin.readline()
    for line in tqdm_(fin, desc='[small_matrix.csv]'):
        user_id, video_id, play_duration, video_duration, time, date, timestamp, watch_ratio = line.strip().split(',')
        watch_ratio = float(watch_ratio)
        if user_id not in train_data:
            n_skip_iid += 1
            continue
        if video_id not in train_iids:
            n_skip_iid += 1
            continue
        if watch_ratio < 2.0:
            n_skip_wr += 1
            continue
        if timestamp == '':
            n_skip_timestamp += 1
            continue
        assert video_id not in all_data[user_id]
        if user_id not in test_data:
            test_data[user_id] = []
        test_data[user_id].append((video_id, float(timestamp)))

print('n_skip_uid:', n_skip_uid)
print('n_skip_iid:', n_skip_timestamp)
print('n_skip_timestamp:', n_skip_timestamp)
print('n_skip_wr:', n_skip_wr)

[small_matrix.csv]: || 0/? [00:00<?, ?it/s]

[small_matrix.csv]: || 4676570/? [00:05<00:00, 801536.64it/s]

n_skip_uid: 0
n_skip_iid: 7613
n_skip_timestamp: 7613
n_skip_wr: 4459341





In [21]:
items = {}
all_categories = {}
with open(os.path.join(raw_root, 'data/item_categories.csv')) as fin:
    fin.readline()
    for line in tqdm_(fin, desc='[item_categories.csv]'):
        iid, feat = line.strip().split(',', 1)
        if iid not in train_iids:
            continue
        feat = eval(feat.strip('"'))
        assert isinstance(feat, list)        
        for c in feat:
            all_categories[c] = all_categories.get(c, 0) + 1
        assert iid not in items
        items[iid] = feat

with open(os.path.join(root, 'item_categories.txt'), 'w') as out:
    for iid in tqdm_(items, desc='[item_categories.txt]'):
        out.write(f'{iid}\t{"|".join(map(str, items[iid]))}\n')

print('# of all_categories:', len(all_categories))

[item_categories.csv]: || 0/? [00:00<?, ?it/s]

[item_categories.csv]: || 10728/? [00:00<00:00, 165742.10it/s]
[item_categories.txt]: 100%|| 10722/10722 [00:00<00:00, 974607.80it/s]

# of all_categories: 31





In [22]:
n_not_so_good_1 = 0
n_not_so_good_2 = 0
kuairec_valid_rate = 1 / 3
n_interactions = 0
with open(os.path.join(root, 'train.txt'), 'w') as out_train:
    with open(os.path.join(root, 'valid.txt'), 'w') as out_valid:
        with open(os.path.join(root, 'test.txt'), 'w') as out_test:
            for uid in tqdm_(train_data, desc='[splits]'):
                iids = [x[0] for x in sorted(train_data[uid], key=lambda x: x[1])]
                out_train.write(f'{uid}\t{",".join(iids)}\n')
                n_interactions += len(iids)

                if uid in test_data:
                    iids = [x[0] for x in sorted(test_data[uid], key=lambda x: x[1])]
                    n_interactions += len(iids)
                    n_valid = round(len(iids) * kuairec_valid_rate)
                    n_test = len(iids) - n_valid
                    if n_valid <= 0 or n_test <= 0:
                        n_not_so_good_1 += 1
                    else:
                        out_valid.write(f'{uid}\t{",".join(iids[:n_valid])}\n')
                        out_test.write(f'{uid}\t{",".join(iids[n_valid:])}\n')
                else:
                    n_not_so_good_2 += 1

print(f'[{root}]')
print(f'# of users: {len(train_data)}')
print(f'# of items: {len(train_iids)}')
print(f'# of interactions: {n_interactions}')
print(f'n_not_so_good_1: {n_not_so_good_1}')
print(f'n_not_so_good_2: {n_not_so_good_2}')

[splits]: 100%|| 7176/7176 [00:00<00:00, 22977.35it/s]

[data/kuairec]
# of users: 7176
# of items: 10722
# of interactions: 1514281
n_not_so_good_1: 0
n_not_so_good_2: 5765



