In [None]:
import os
import json
from tqdm import tqdm

import pickle

In [None]:
## node types: books, authors, publishers, series
## book features: country_code, language_code, popular_shelves, is_ebook, description, format, num_pages, publication_year, title, url, genres
## author features: name
## publisher features: name
## series features: title, description

In [None]:
# Input / output locations.
# The defaults are the absolute paths this notebook was written with. Set
# GOODREADS_RAW_DATA_DIR / GOODREADS_SAVE_DIR in the environment to run it on
# another machine without editing the notebook.
raw_data_dir = os.environ.get(
    "GOODREADS_RAW_DATA_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/raw_data/goodreads",
)
save_dir = os.environ.get(
    "GOODREADS_SAVE_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/processed_data/goodreads",
)

In [None]:
## read raw data files

def read_json_lines(file, id_key):
    """Load a JSON-lines file into a dict keyed by one field of each record.

    Args:
        file: Path of a text file holding one JSON object per line.
        id_key: Name of the field whose value is used as the dictionary key.

    Returns:
        dict mapping ``record[id_key]`` to the full decoded record. If several
        records share a key, the one read last replaces the earlier ones.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a decoded record has no ``id_key`` field.
    """
    data = {}
    with open(file, encoding="utf-8") as f:
        # Stream the file line by line instead of readlines(): the raw text of
        # the whole dump is never held in memory next to the parsed records.
        # (tqdm then reports a running count rather than a percentage.)
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines, e.g. an empty line at the end of the file.
                continue
            record = json.loads(line)
            data[record[id_key]] = record
    return data

# Load the four raw dumps, each indexed by the id field named in `id_key`.
book_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, 'goodreads_books.json'),
    id_key='book_id',
)
author_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, 'goodreads_book_authors.json'),
    id_key='author_id',
)
genres_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, 'goodreads_book_genres_initial.json'),
    id_key='book_id',
)
series_raw_data = read_json_lines(
    file=os.path.join(raw_data_dir, 'goodreads_book_series.json'),
    id_key='series_id',
)

In [None]:
# Peek at the first four book ids (dict iteration yields the keys).
list(book_raw_data)[:4]

In [None]:
# Spot-check: display the raw `similar_books` field of the book with id '1333909'.
# NOTE(review): the id is hard-coded, so this raises KeyError if that record is
# not present in the loaded dump.
book_raw_data['1333909']['similar_books']

In [None]:
## construct the node dictionaries of the graph
## every node has the shape {'features': {...}, 'neighbors': {...}}
##
## book features: country_code, language_code, is_ebook, title, description, format,
##                num_pages, publication_year, url, popular_shelves, genres
## book neighbors: author, publisher, series, similar_books
##
## author features: name
## author neighbors: book
##
## publisher features: name
## publisher neighbors: book
##
## series features: title, description
## series neighbors: book

book_nodes = {}
author_nodes = {}
publisher_nodes = {}
series_nodes = {}

# Book records refer to their publisher by name; each distinct non-empty name
# is given a synthetic id of the form 'pub_<n>'.
publisher_name2id = {}

for book_id in tqdm(book_raw_data):
    raw_book = book_raw_data[book_id]

    # publisher node: create it the first time a name is seen, otherwise extend it
    publisher_name = raw_book['publisher']
    publisher_ids = []
    if publisher_name != '':
        if publisher_name in publisher_name2id:
            pub_id = publisher_name2id[publisher_name]
            publisher_nodes[pub_id]['neighbors']['book'].append(book_id)
        else:
            pub_id = f'pub_{len(publisher_nodes)}'
            publisher_name2id[publisher_name] = pub_id
            publisher_nodes[pub_id] = {
                'features': {'name': publisher_name},
                'neighbors': {'book': [book_id]},
            }
        publisher_ids = [pub_id]

    # book node
    book_nodes[book_id] = {
        'features': {
            'country_code': raw_book['country_code'],
            'language_code': raw_book['language_code'],
            'is_ebook': raw_book['is_ebook'],
            'title': raw_book['title'],
            'description': raw_book['description'],
            'format': raw_book['format'],
            'num_pages': raw_book['num_pages'],
            'publication_year': raw_book['publication_year'],
            'url': raw_book['url'],
            # keep only the shelf names, and only the genre labels (dict keys)
            'popular_shelves': [shelf['name'] for shelf in raw_book['popular_shelves']],
            'genres': list(genres_raw_data[book_id]['genres'].keys()),
        },
        'neighbors': {
            'author': [entry['author_id'] for entry in raw_book['authors']],
            'publisher': publisher_ids,
            'series': raw_book['series'],
            'similar_books': raw_book['similar_books'],
        },
    }
    book_neighbors = book_nodes[book_id]['neighbors']

    # author nodes: one per author id, accumulating the books they appear on
    for author_id in book_neighbors['author']:
        author_node = author_nodes.get(author_id)
        if author_node is None:
            author_nodes[author_id] = {
                'features': {'name': author_raw_data[author_id]['name']},
                'neighbors': {'book': [book_id]},
            }
        else:
            author_node['neighbors']['book'].append(book_id)

    # series nodes: one per series id, accumulating the books that list it
    for series_id in book_neighbors['series']:
        series_node = series_nodes.get(series_id)
        if series_node is None:
            raw_series = series_raw_data[series_id]
            series_nodes[series_id] = {
                'features': {
                    'title': raw_series['title'],
                    'description': raw_series['description'],
                },
                'neighbors': {'book': [book_id]},
            }
        else:
            series_node['neighbors']['book'].append(book_id)


In [None]:
## save graph
# Write all four node tables into a single JSON document, <save_dir>/graph.json.

# Create the output directory if it is missing so open() below cannot fail on it.
os.makedirs(save_dir, exist_ok=True)

# `with` guarantees the file is flushed and closed even if serialization fails
# part-way through.
with open(os.path.join(save_dir, 'graph.json'), "w") as graph_file:
    json.dump({
        'book_nodes': book_nodes,
        'author_nodes': author_nodes,
        'publisher_nodes': publisher_nodes,
        'series_nodes': series_nodes
    }, graph_file, indent=4)