In [56]:
import json
import random
import pandas as pd
import numpy as np

class Args:
    """Preprocessing settings, read through the module-level `args` instance."""

    # Probability of keeping each ratings line while counting user/book frequencies.
    data_frac = 0.3
    # Minimum (sampled) rating count for a user to be kept.
    min_user_freq = 10
    # Minimum (sampled) rating count for a book to be kept.
    min_book_freq = 10
    # Maximum (sampled) rating count for a user to be kept.
    max_user_freq = 200
    # Fraction of each user's ratings sampled into the training split.
    train_frac = 0.95
    # Fixed length of a history sequence: longer ones are cut, shorter ones zero-padded.
    his_len = 100
    # Number of negative targets drawn for every positive sample.
    n_neg = 10


args = Args()


In [34]:
def get_valid_user_and_book_set(data_frac, min_user_freq=None, max_user_freq=None,
                                min_book_freq=None, path='data/ratings_Books.csv'):
    """Select users and books by rating frequency, counted on a random sample.

    Each line of the ratings CSV (user, item, rating, timestamp) is kept with
    probability ``data_frac``; the counts are therefore frequencies within
    the sampled subset, not within the full file.

    Args:
        data_frac: probability of keeping a line (``1.0`` keeps every line).
        min_user_freq: minimum sampled count for a user; defaults to
            ``args.min_user_freq``.
        max_user_freq: maximum sampled count for a user; defaults to
            ``args.max_user_freq``.
        min_book_freq: minimum sampled count for a book; defaults to
            ``args.min_book_freq``.
        path: location of the ratings CSV.

    Returns:
        ``(valid_users, valid_books)`` as two sets of id strings.
    """
    # Fall back to the notebook-level configuration for thresholds not given.
    if min_user_freq is None:
        min_user_freq = args.min_user_freq
    if max_user_freq is None:
        max_user_freq = args.max_user_freq
    if min_book_freq is None:
        min_book_freq = args.min_book_freq

    user_freq, book_freq = {}, {}
    with open(path, 'r') as f:
        # Stream the file line by line; f.readlines() would hold the whole CSV in memory.
        for line in f:
            if random.random() < data_frac:
                # user, item, rating, timestamp
                splitted = line.split(",")
                if len(splitted) < 2:
                    # Blank or malformed line: nothing to count.
                    continue
                user_id, book_id = splitted[0], splitted[1]
                user_freq[user_id] = user_freq.get(user_id, 0) + 1
                book_freq[book_id] = book_freq.get(book_id, 0) + 1

    valid_users = {user_id for user_id, cnt in user_freq.items()
                   if min_user_freq <= cnt <= max_user_freq}
    valid_books = {book_id for book_id, cnt in book_freq.items()
                   if cnt >= min_book_freq}

    return valid_users, valid_books
    

def parse_rating_data(valid_users, valid_books, path='data/ratings_Books.csv'):
    """Collect the ratings of the selected users/books, grouped per user.

    Args:
        valid_users: container of user ids to keep.
        valid_books: container of book ids to keep.
        path: location of the ratings CSV; the first field of a line is the
            user id, the second the book id and the last an integer timestamp.

    Returns:
        dict mapping user id to a DataFrame with columns
        ``['user_id', 'book_id', 'timestamp']`` sorted by ascending timestamp.
    """
    user_rates = {}
    with open(path, 'r') as f:
        # Stream the file line by line; f.readlines() would hold the whole CSV in memory.
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would drop a timestamp digit on a final line without a newline.
            splitted = line.rstrip("\r\n").split(",")
            if len(splitted) < 2:
                # Blank or malformed line.
                continue
            user_id, book_id = splitted[0], splitted[1]
            if user_id in valid_users and book_id in valid_books:
                rate = [user_id, book_id, int(splitted[-1])]
                user_rates.setdefault(user_id, []).append(rate)

    # convert user ratings to pd.DataFrame for efficient history querying
    return {
        user_id: pd.DataFrame(
            rates, columns=['user_id', 'book_id', 'timestamp']
        ).sort_values(by='timestamp', ignore_index=True)
        for user_id, rates in user_rates.items()
    }


# Pass 1 over the ratings file: pick the users/books to keep, based on a random sample of lines.
valid_users, valid_books = get_valid_user_and_book_set(args.data_frac)
# Pass 2 over the ratings file: gather the ratings of those users/books as per-user DataFrames.
all_user_rates = parse_rating_data(valid_users, valid_books)


In [3]:
def read_book_meta(valid_books, path="data/meta_Books.json"):
    """Read the 'category' field of every selected book from a JSON-lines file.

    Args:
        valid_books: container of book ids ('asin' values) to keep.
        path: location of the metadata file, one JSON object per line.

    Returns:
        dict mapping 'asin' to that record's 'category' value.

    Raises:
        KeyError: if a record lacks 'asin', or a selected record lacks 'category'.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    book_cates = {}
    with open(path, "r") as f:
        # Stream the file line by line; f.readlines() would hold the whole dump in memory.
        for line in f:
            # Strip whitespace instead of slicing off the last character, which
            # would cut the closing brace of a final line without a newline.
            line = line.strip()
            if not line:
                continue
            meta = json.loads(line)
            if meta['asin'] in valid_books:
                book_cates[meta['asin']] = meta['category']

    return book_cates

# Map each kept book id ('asin') to the 'category' field of its metadata record.
book_cates = read_book_meta(valid_books)



In [65]:
def encode_users(valid_users):
    """Assign consecutive integer ids, starting at 0, to the given user ids.

    Ids follow the iteration order of ``valid_users``; repeated ids keep the
    number they received first.

    Returns:
        ``(encoder, decoder)``: a dict from user id to integer and the list
        that maps each integer back to its user id.
    """
    decoder = []
    encoder = {}
    for raw_id in valid_users:
        if raw_id in encoder:
            continue
        # The next free id always equals the number of ids issued so far.
        encoder[raw_id] = len(decoder)
        decoder.append(raw_id)

    return encoder, decoder

def encode_books(valid_books):
    """Assign consecutive integer ids to book ids, reserving 0 for '<pad>'.

    Ids start at 1 and follow the iteration order of ``valid_books``; an id
    that is already present (including the literal '<pad>') is skipped.

    Returns:
        ``(encoder, decoder)``: a dict from book id to integer and the list
        that maps each integer back to its book id.
    """
    decoder = ['<pad>']
    encoder = {'<pad>': 0}
    for raw_id in valid_books:
        if raw_id in encoder:
            continue
        # The next free id always equals the number of entries issued so far.
        encoder[raw_id] = len(decoder)
        decoder.append(raw_id)

    return encoder, decoder

def _flatten_cates(entry):
    """Yield the individual categories held by ``entry`` (a value, list or set)."""
    if isinstance(entry, (list, set)):
        for item in entry:
            yield from _flatten_cates(item)
    else:
        yield entry


def encode_cates(book_cates):
    """Assign consecutive integer ids to categories, reserving 0 for '<pad>'.

    Args:
        book_cates: either a mapping whose values are categories (a single
            category or a list/set of them per key), or a plain iterable of
            categories.

    Returns:
        ``(encoder, decoder)``: a dict from category to integer (starting at
        1, in first-seen order) and the list mapping integers back.
    """
    # Iterating a dict directly would encode its keys instead of the
    # categories stored as values, so take the values of a mapping.
    entries = book_cates.values() if isinstance(book_cates, dict) else book_cates

    encoder, decoder = {'<pad>': 0}, ['<pad>']
    for entry in entries:
        for cate in _flatten_cates(entry):
            if cate not in encoder:
                encoder[cate] = len(decoder)
                decoder.append(cate)

    return encoder, decoder


user_encoder, user_decoder = encode_users(valid_users)
book_encoder, book_decoder = encode_books(valid_books)

# `valid_cates` was not defined by any earlier cell, so this cell raised a
# NameError on a fresh kernel. Build it from the categories of the kept books.
# NOTE(review): assumes each value of `book_cates` is one category string or a
# flat list of them — confirm against the metadata file.
valid_cates = set()
for cates in book_cates.values():
    if isinstance(cates, (list, tuple, set)):
        valid_cates.update(cates)
    else:
        valid_cates.add(cates)
cate_encoder, cate_decoder = encode_cates(valid_cates)


In [70]:
# Scratch check of the category encoding: look up index 89 in the decoder and
# 'Certification' in the encoder. Only the last expression is displayed.
cate_decoder[89]
cate_encoder['Certification']

89

1. Each rating action reflects the user's interest in that book, so each rating action is one positive sample.
2. Split all the rating actions into training and test sets at a 19:1 ratio, as described in the paper.
3. For each rating action, use that user's rating actions before the current rating time as the history behaviors.
4. For negative samples, keep the same history behaviors as the positive sample, but randomly draw the target books from the whole book pool.

In [57]:
def split_train_and_test_rates(user_rates, train_frac, random_state=None):
    """Split every user's ratings into train and test rows.

    Args:
        user_rates: dict mapping user id to a DataFrame of that user's ratings
            (the DataFrame index must be unique).
        train_frac: fraction of each user's rows sampled, without replacement,
            into the training set; the remaining rows form the test set.
        random_state: optional int seed (or numpy ``RandomState``) making the
            split reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(all_train_ratings, all_test_ratings)``: two lists of row arrays
        (``DataFrame.values`` rows) pooled over all users.
    """
    # One generator shared by all users, so a single seed reproduces the whole
    # split without every user being sampled from an identical stream.
    if isinstance(random_state, int):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    all_train_ratings, all_test_ratings = [], []
    for ratings in user_rates.values():
        train_rates = ratings.sample(frac=train_frac, replace=False, random_state=rng)
        test_rates = ratings.drop(train_rates.index)

        all_train_ratings.extend(train_rates.values)
        all_test_ratings.extend(test_rates.values)

    return all_train_ratings, all_test_ratings

# Split every user's ratings into train/test rows according to args.train_frac.
all_train_rates, all_test_rates = split_train_and_test_rates(all_user_rates, args.train_frac)

In [35]:
def pad_or_cut(seq, length):
    """Bring a 1-D sequence to exactly ``length`` elements.

    Args:
        seq: 1-D sequence (list or numpy array).
        length: target number of elements.

    Returns:
        The last ``length`` elements of ``seq`` when it is too long (the
        front is cut), otherwise ``seq`` followed by integer zeros up to
        ``length``. A sequence that already has the right length is returned
        as an array with its element type unchanged.
    """
    n = len(seq)
    if n > length:
        # cut the front
        # len(seq) - 1 - x + 1 = length --> x = len(seq) - length
        return seq[n - length:]
    if n == length:
        # Nothing to pad: concatenating with an empty default (float64)
        # array would silently turn integer sequences into floats.
        return np.asarray(seq)
    return np.concatenate((seq, np.zeros(length - n, dtype=int)))


def query_history_books(user_id, timestamp, user_rates, his_len):
    """Return the books a user rated strictly before ``timestamp``.

    Args:
        user_id: unused; kept so existing callers keep working.
        timestamp: only ratings with a smaller timestamp count as history.
        user_rates: DataFrame of one user's ratings with 'book_id' and
            'timestamp' columns; row order determines history order.
        his_len: length of the returned history.

    Returns:
        1-D array of ``his_len`` entries: the history's book ids, cut or
        padded by ``pad_or_cut`` when its length differs from ``his_len``.
    """
    is_earlier = user_rates['timestamp'] < timestamp
    # Index the DataFrame (not the 'timestamp' Series) and select the single
    # 'book_id' column so the result is a 1-D array that can be padded.
    history_books = user_rates.loc[is_earlier, 'book_id'].values
    if len(history_books) != his_len:
        history_books = pad_or_cut(history_books, his_len)

    return history_books


def build_samples(rates, all_user_rates, valid_books, args):
    """Turn rating rows into labelled (history, target) samples.

    For every row in ``rates`` (indexed as user id, book id, timestamp) one
    positive sample with label 1 is emitted, followed by ``args.n_neg``
    negative samples with label 0 that share the same history and whose
    targets are drawn with replacement from ``valid_books``.

    Returns:
        list of dicts with keys 'his', 'tar' and 'label'.
    """
    book_pool = list(valid_books)
    collected = []
    for rate in rates:
        uid, pos_book, rated_at = rate[0], rate[1], rate[2]
        history = query_history_books(uid, rated_at, all_user_rates[uid], args.his_len)
        collected.append({'his': history, 'tar': pos_book, 'label': 1})
        # negative sampling
        collected.extend(
            {'his': history, 'tar': neg_book, 'label': 0}
            for neg_book in random.choices(book_pool, k=args.n_neg)
        )

    return collected

# build_samples takes four positional arguments; the training call previously
# omitted `valid_books` and `args` and raised a TypeError.
train_samples = build_samples(all_train_rates, all_user_rates, valid_books, args)
test_samples = build_samples(all_test_rates, all_user_rates, valid_books, args)

        

        

In [55]:
# Scratch check of the negative sampler: draw 10 book ids, with replacement, from the valid books.
random.choices(list(valid_books), k = 10)

['030726646X',
 '1491521775',
 'B009VSDN42',
 '0979622808',
 'B0073WA85U',
 '1607148439',
 '0823014010',
 '1555838197',
 '141698271X',
 '0989180204']

In [48]:
# Scratch check: the integer zero pad appended by pad_or_cut.
# (A stray bare `k`, an undefined name on a fresh kernel, was removed.)
pad = np.array([0] * 5)
pad

array([0, 0, 0, 0, 0])

In [49]:
# Scratch check: append the integer zero pad to an existing array.
# NOTE(review): `a` is not defined in any visible cell — presumably an object
# array of book ids left over from an earlier session; define it before running.
np.concatenate((a, pad))

array(['0099994003', '0684844419', '1585670111', '0375701613',
       '0465057128', 0, 0, 0, 0, 0], dtype=object)