In [None]:
import os
import json
import random
import pickle
from tqdm import tqdm
from collections import defaultdict

## Stage 1 preprocessing

In [None]:
# Fields copied from every raw Goodreads book record into the trimmed dataset.
# Each key appears exactly once ('edition_information' used to be listed twice,
# which only caused the same field to be copied twice per record).
target_keys = [
    'similar_books', 'description', 'authors', 'publisher', 'book_id',
    'title_without_series', 'title', 'format', 'popular_shelves',
    'edition_information', 'series', 'language_code', 'country_code',
]

In [None]:
# read book data
# One JSON record per line; only the fields named in target_keys are kept.
# The file object is iterated lazily instead of being materialised with
# readlines(), so the raw text of the whole dump is never held in memory
# alongside the parsed records.
data = []

with open('goodreads_books.json', encoding='utf-8') as f:
    for line in tqdm(f):
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        tmp = json.loads(line)
        # a record missing one of the target keys raises KeyError, as before
        data.append({k: tmp[k] for k in target_keys})

In [None]:
# Number of book records read from goodreads_books.json.
len(data)

In [None]:
# Keep only the books that list at least one similar book.
filter_data = [book for book in tqdm(data) if len(book['similar_books']) != 0]

In [None]:
# Number of books that have a non-empty similar_books list.
len(filter_data)

In [None]:
## Sanity check: can a book with an empty similar_books list still be referenced
## from another book's similar_books?
## If the printed count is non-zero, the answer is yes.

# every book id that is referenced by at least one kept book
appear_in_similar_book_set = set()
for book in tqdm(filter_data):
    appear_in_similar_book_set.update(book['similar_books'])

# books with no similar books of their own that are nevertheless referenced
cnt = sum(
    1
    for book in tqdm(data)
    if len(book['similar_books']) == 0 and book['book_id'] in appear_in_similar_book_set
)

print(cnt)

In [None]:
# Drop every id in similar_books that does not refer to a book kept in filter_data.
# The records are updated in place; cnt counts how many records were changed.

books = {record['book_id'] for record in tqdm(filter_data)}

cnt = 0
for record in tqdm(filter_data):
    kept_similar = [bid for bid in record['similar_books'] if bid in books]
    if len(kept_similar) != len(record['similar_books']):
        record['similar_books'] = kept_similar
        cnt += 1

print(cnt)

In [None]:
# read book genres
# Maps book_id -> list of the entries obtained by iterating the record's
# 'genres' field (its keys, if that field is a mapping).
# The file is streamed line by line rather than loaded with readlines().
book_genres_dict = {}

with open('goodreads_book_genres_initial.json', encoding='utf-8') as f:
    for line in tqdm(f):
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        tmp = json.loads(line)
        # each book is expected to appear only once in the genres file
        assert tmp['book_id'] not in book_genres_dict, f"duplicate book_id {tmp['book_id']!r} in genres file"
        book_genres_dict[tmp['book_id']] = list(tmp['genres'])

print(len(book_genres_dict))

In [None]:
# Attach to every kept book its genre list from book_genres_dict, under the 'genres' key.
# The assert fails if a kept book has no entry in the genres file.

for record in tqdm(filter_data):
    record_id = record['book_id']
    assert record_id in book_genres_dict
    record['genres'] = book_genres_dict[record_id]

In [None]:
# save the first stage processed data
# A context manager guarantees the file is flushed and closed before Stage 2
# reads it back (the bare open() handle was never closed explicitly).
with open('books_filtered.pkl', 'wb') as fout:
    pickle.dump(filter_data, fout)

## Stage 2 preprocessing

In [None]:
# load the first stage processed data
# NOTE: pickle.load can execute arbitrary code; only load a file that was
# produced by the Stage 1 cells of this notebook.
with open('books_filtered.pkl', 'rb') as fin:
    data = pickle.load(fin)

In [None]:
# Number of records loaded from books_filtered.pkl.
len(data)

In [None]:
# Index the records by id: book_dict[book_id] -> that book's info dict.
# The assert guards against the same book_id appearing twice.

book_dict = {}

for record in tqdm(data):
    record_id = record['book_id']
    assert record_id not in book_dict
    book_dict[record_id] = record

In [None]:
# Dataset statistics: one frequency table per metadata attribute, plus
# missing-value counts and a few rough averages over all books.
author_degree_dict = defaultdict(int)
publisher_degree_dict = defaultdict(int)
genres_degree_dict = defaultdict(int)
shelves_degree_dict = defaultdict(int)
country_code_degree_dict = defaultdict(int)
language_code_degree_dict = defaultdict(int)
format_degree_dict = defaultdict(int)
edition_information_degree_dict = defaultdict(int)

no_author_cnt = 0
no_publisher_cnt = 0
no_genres_cnt = 0
no_title_cnt = 0
no_description_cnt = 0
# The avg_* variables accumulate sums inside the loop and are divided by the
# number of books afterwards.
avg_rough_title_len = 0
avg_rough_description_len = 0
avg_author_per_book = 0
avg_similar_per_book = 0
max_author_per_book = 0

for bid in tqdm(book_dict):
    book = book_dict[bid]

    # authors
    authors = book['authors']
    n_authors = len(authors)
    if n_authors == 0:
        no_author_cnt += 1
    else:
        avg_author_per_book += n_authors
        max_author_per_book = max(max_author_per_book, n_authors)
    for author_dict in authors:
        author_degree_dict[author_dict['author_id']] += 1

    # publisher (an empty string counts as missing)
    publisher = book['publisher']
    if publisher == '':
        no_publisher_cnt += 1
    else:
        publisher_degree_dict[publisher] += 1

    # genres
    if 'genres' in book and len(book['genres']) != 0:
        for g in book['genres']:
            genres_degree_dict[g] += 1
    else:
        no_genres_cnt += 1

    # title: "rough" length = number of space-separated tokens
    if 'title' in book and book['title'] != '':
        avg_rough_title_len += len(book['title'].split(' '))
    else:
        no_title_cnt += 1

    # description: same rough token count as for the title
    if 'description' in book and book['description'] != '':
        avg_rough_description_len += len(book['description'].split(' '))
    else:
        no_description_cnt += 1

    avg_similar_per_book += len(book['similar_books'])

    # popular shelves
    for shelf in book['popular_shelves']:
        shelves_degree_dict[shelf['name']] += 1

    # country code, language code, format, edition information
    country_code_degree_dict[book['country_code']] += 1
    language_code_degree_dict[book['language_code']] += 1
    format_degree_dict[book['format']] += 1
    edition_information_degree_dict[book['edition_information']] += 1


# Turn the accumulated sums into averages over ALL books (books with a missing
# title/description contribute 0 to the corresponding sum).
n_books = len(book_dict)
avg_rough_title_len /= n_books
avg_rough_description_len /= n_books
avg_author_per_book /= n_books
avg_similar_per_book /= n_books

print(f'No author books:{no_author_cnt}, No publisher books:{no_publisher_cnt}, No genre books:{no_genres_cnt}, No title books:{no_title_cnt}, No description books:{no_description_cnt}')
print(f'Books num:{len(book_dict)}, Author num:{len(author_degree_dict)}, Publisher num:{len(publisher_degree_dict)}, Genres num:{len(genres_degree_dict)}')
print(f'Average title len:{avg_rough_title_len}, Average description len:{avg_rough_description_len}, Average author per book:{avg_author_per_book}')
print(f'Average similar books:{avg_similar_per_book}, Max author per book:{max_author_per_book}')
print(f'Num of shelves:{len(shelves_degree_dict)}, Num of country_code:{len(country_code_degree_dict)}, Num of language_code:{len(language_code_degree_dict)}')
print(f'Num of format:{len(format_degree_dict)}, Num of edition information:{len(edition_information_degree_dict)}')

In [None]:
# Display the genre -> number-of-books frequency table built in the statistics cell.
genres_degree_dict

In [None]:
# filter shelves_degree_dict
# Keep a shelf only if its book count lies in [thresholdL, thresholdH], its name
# is at least 3 characters long, does not start with 'read' and does not end
# with 'reads'.

thresholdH = 100000
thresholdL = 1000

shelves_degree_dict_filtered = {
    shelf_name: degree
    for shelf_name, degree in tqdm(shelves_degree_dict.items())
    if thresholdL <= degree <= thresholdH
    and not shelf_name.startswith('read')
    and not shelf_name.endswith('reads')
    and len(shelf_name) >= 3
}

print(len(shelves_degree_dict_filtered))

In [None]:
# Statistics on whether a metadata field is useful for similar book prediction:
# for every (book, similar book) pair, how often do the two share the same value?
## analysis:
## edition information(x): most books (the original note says 90w+/100w) don't have edition information.
## country_code(x): only have USA
## language_code(v)
## format(v)

cnt = 0  # total number of (book, similar book) pairs

# pairs whose two books share the same value
country_code_cnt = 0
language_code_cnt = 0
format_cnt = 0
edition_information_cnt = 0

# pairs where both books have a non-empty value (denominators)
language_code_cnt_total = 0
format_cnt_total = 0
edition_information_cnt_total = 0

for bid in tqdm(book_dict):
    src = book_dict[bid]
    for sbid in src['similar_books']:
        cnt += 1
        dst = book_dict[sbid]

        if src['country_code'] == dst['country_code']:
            country_code_cnt += 1

        # For the remaining fields a pair only counts when the source value is
        # non-empty; the *_total counters additionally need a non-empty target.
        if src['language_code'] != '':
            if src['language_code'] == dst['language_code']:
                language_code_cnt += 1
            if dst['language_code'] != '':
                language_code_cnt_total += 1

        if src['format'] != '':
            if src['format'] == dst['format']:
                format_cnt += 1
            if dst['format'] != '':
                format_cnt_total += 1

        if src['edition_information'] != '':
            if src['edition_information'] == dst['edition_information']:
                edition_information_cnt += 1
            if dst['edition_information'] != '':
                edition_information_cnt_total += 1

print(f'Same country code:{country_code_cnt/cnt}, same language code:{language_code_cnt/language_code_cnt_total}, same format:{format_cnt/format_cnt_total}, same edition information:{edition_information_cnt/edition_information_cnt_total}.')

## Finally, we select shelves, author, publisher, language_code, format

## Separate train/val/test

In [None]:
# Shuffle the book ids with a fixed seed so that the train/val/test split is
# reproducible across kernel restarts. A dedicated Random instance is used so
# the global `random` state is left untouched.
SPLIT_SEED = 42

all_bids = list(book_dict.keys())
random.Random(SPLIT_SEED).shuffle(all_bids)

In [None]:
# separate data
# train gets the first 70% of the shuffled ids, val the next 10%, test the rest.
train_ratio = 0.7
val_ratio = 0.1

# Compute each boundary once. The product is rounded to 6 decimals before
# truncation because train_ratio + val_ratio evaluates to 0.7999999999999999
# in binary floating point; without the rounding, int() would floor the
# val/test boundary one element short whenever len(all_bids) * 0.8 is an
# integer (e.g. 10 ids -> boundary 7 instead of 8, leaving val empty).
n_total = len(all_bids)
train_end = int(round(n_total * train_ratio, 6))
val_end = int(round(n_total * (train_ratio + val_ratio), 6))

train_set = all_bids[:train_end]
val_set = all_bids[train_end:val_end]
test_set = all_bids[val_end:]

In [None]:
# save file
# Each split is written as one JSON-encoded book record per line
# (the .tsv extension of the output files is kept as-is).
# open(..., 'w') does not create missing directories, so make sure raw/ exists.
os.makedirs('raw', exist_ok=True)

for split_path, split_ids in (
    ('raw/train.tsv', train_set),
    ('raw/val.tsv', val_set),
    ('raw/test.tsv', test_set),
):
    with open(split_path, 'w') as fout:
        for k in tqdm(split_ids):
            fout.write(json.dumps(book_dict[k])+'\n')

In [None]:
# save
# Persist the filtered shelf -> book-count table. The context manager makes
# sure the pickle is flushed and closed; raw/ is created if it is missing.
os.makedirs('raw', exist_ok=True)
with open('raw/shelves_degree_dict_1000_100000.pkl', 'wb') as fout:
    pickle.dump(shelves_degree_dict_filtered, fout)