In [1]:
import csv
import json
import os
from datetime import datetime, timedelta, timezone

In [2]:
# def save_json_files(json_path, folder):
#     file_name = os.path.splitext(os.path.basename(json_path))[0]
#     # Make it a plain string with only letters and numbers
#     file_name = ''.join(e for e in file_name if e.isalnum())
#     file = json.loads(open(json_path).read())
#     sub_folder = f'{folder}/{file_name}'
#     os.makedirs(sub_folder, exist_ok=True)
# 
#     for key in file:
#         # Create suffix with the matched keyword and the last part of the key
#         suffix = f"{next((word for word in ['book', 'work', 'author', 'series'] if word in key), key)}_{key.split('.')[-1]}"
#         with open(f'{sub_folder}/{file_name}_{suffix}.json', 'w') as f:
#             json.dump(file[key], f, indent=4)
# 

In [3]:
# folder = 'book_details'
# # For each json in the folder call save_json_files
# for file in os.listdir(folder):
#     if file.endswith('.json'):
#         save_json_files(f'{folder}/{file}', folder)
#         

In [2]:
def extract_relevant_fields(json_path):
    """Load a book JSON file and pull out its top-level record groups.

    The file must contain a single JSON object. Its top-level keys are matched
    by case-sensitive substring:

    * first key containing ``'Book'``        -> the book record
    * first key containing ``'Work'``        -> the work record
    * every key containing ``'Contributor'`` -> author records
    * every key containing ``'Series'``      -> series records

    Args:
        json_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        tuple: ``(book, work, authors, series)``. ``book`` and ``work`` are the
        matched values; ``authors`` and ``series`` are lists (possibly empty)
        of matched values in the order the keys appear in the file.

    Raises:
        KeyError: If no top-level key contains ``'Book'`` or ``'Work'``.
    """
    # The context manager closes the handle deterministically; the previous
    # open(...).read() left that to the garbage collector.
    with open(json_path, encoding='utf-8') as f:
        file = json.load(f)

    keys = list(file.keys())

    def first_value(marker):
        # Value stored under the first key that contains `marker`.
        for key in keys:
            if marker in key:
                return file[key]
        raise KeyError(f"no top-level key containing {marker!r} in {json_path}")

    book = first_value('Book')
    work = first_value('Work')
    # Author records live under the 'Contributor' keys.
    authors = [file[key] for key in keys if 'Contributor' in key]
    series = [file[key] for key in keys if 'Series' in key]
    return book, work, authors, series

def split_genres_topics(genres: list, input: list):
    """Partition shelf labels into known genres and free-form topics.

    Every string in ``input`` is lower-cased. A label counts as a genre when
    it equals an entry of ``genres``; all other labels are topics.

    Note:
        Entries of ``genres`` are compared as given (they are not lower-cased),
        so a known genre containing upper-case characters never matches.

    Args:
        genres: Known genre names, expected to be lower-case strings.
        input: Raw labels. ``None`` and other non-string entries are skipped
            instead of raising ``AttributeError`` on ``.lower()``.
            (The parameter name shadows the builtin; it is kept because it is
            part of the function's signature.)

    Returns:
        tuple: ``(genres, topics)``. ``genres`` keeps the order of the
        ``genres`` argument; ``topics`` keeps the order of ``input``.
    """
    labels = [label.lower() for label in (input or []) if isinstance(label, str)]
    # Sets give constant-time membership tests; the lists preserve ordering.
    label_set = set(labels)
    matched = [genre for genre in genres if genre in label_set]
    matched_set = set(matched)
    topics = [label for label in labels if label not in matched_set]
    return matched, topics

def convert_unix_time_to_date(unix_time):
    """Convert a Unix timestamp in milliseconds to a UTC calendar date.

    Args:
        unix_time: Milliseconds since 1970-01-01T00:00:00Z, as an int or
            anything ``int()`` accepts (e.g. a numeric string). ``None`` is
            treated as "unknown".

    Returns:
        datetime.date | None: The UTC date, or ``None`` when ``unix_time`` is
        ``None``.

    Raises:
        ValueError: If ``unix_time`` cannot be converted by ``int()``.
        OverflowError: If the resulting date is outside the supported range.
    """
    if unix_time is None:
        return None
    # Epoch + timedelta is pure integer arithmetic, so it does not depend on
    # the platform C time functions that datetime.fromtimestamp() relies on
    # (those can reject negative, i.e. pre-1970, timestamps on some systems).
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    return (epoch + timedelta(milliseconds=int(unix_time))).date()

# Extract book details
def extract_book_details(book: dict, genres: list):
    """Flatten the book record into a single-level dictionary.

    Optional nested objects (contributor edges, ``details``, ``language``,
    ``work``) may be absent or null; the corresponding output values are then
    ``None`` instead of the function raising ``AttributeError``.

    Args:
        book: The book record.
        genres: Known genre names, forwarded to ``split_genres_topics``.

    Returns:
        dict: Keys, in insertion order: ``legacyId``, ``title``,
        ``primaryContributorEdge_role``, ``primaryContributorEdge_ref``,
        ``secondaryContributorEdge_role`` and ``secondaryContributorEdge_ref``
        (only when the book lists at least one secondary contributor; only the
        first one is used), ``bookSeries``, ``bookGenres``, ``bookTopics``,
        ``numPages``, ``publicationTime``, ``publisher``, ``language``,
        ``work_ref``.
    """
    def strip_ref(holder, prefix):
        # `holder` is expected to look like {'__ref': '<prefix><id>'};
        # returns the reference with `prefix` removed, or None when unavailable.
        ref = (holder or {}).get('__ref')
        return ref.replace(prefix, '') if isinstance(ref, str) else None

    details = {}
    details['legacyId'] = book.get('legacyId')
    details['title'] = book.get('title')

    primary_edge = book.get('primaryContributorEdge') or {}
    details['primaryContributorEdge_role'] = primary_edge.get('role')
    details['primaryContributorEdge_ref'] = strip_ref(primary_edge.get('node'), 'Contributor:')

    # Only the first secondary contributor is kept.
    secondary_edges = book.get('secondaryContributorEdges')
    if secondary_edges:
        secondary_edge = secondary_edges[0] or {}
        details['secondaryContributorEdge_role'] = secondary_edge.get('role')
        details['secondaryContributorEdge_ref'] = strip_ref(secondary_edge.get('node'), 'Contributor:')

    series_refs = (
        strip_ref((entry or {}).get('series'), 'Series:')
        for entry in book.get('bookSeries') or []
    )
    # Entries without a usable reference are dropped.
    details['bookSeries'] = [ref for ref in series_refs if ref is not None]

    genre_names = (
        ((entry or {}).get('genre') or {}).get('name')
        for entry in book.get('bookGenres') or []
    )
    # Unnamed genres are filtered here so the splitter only ever sees strings.
    goodreadsGenres = [name for name in genre_names if name is not None]
    details['bookGenres'], details['bookTopics'] = split_genres_topics(genres, goodreadsGenres)

    edition = book.get('details') or {}
    details['numPages'] = edition.get('numPages')
    published_ms = edition.get('publicationTime')
    # A missing publication time stays None rather than being converted.
    details['publicationTime'] = (
        convert_unix_time_to_date(published_ms) if published_ms is not None else None
    )
    details['publisher'] = edition.get('publisher')
    details['language'] = (edition.get('language') or {}).get('name')
    details['work_ref'] = strip_ref(book.get('work'), 'Work:')
    return details

# Extract work details
def extract_work_details(work: dict):
    """Flatten the work record into a single-level dictionary.

    A missing or null ``details``, ``details.characters`` or ``editions``
    entry yields ``None`` / an empty list instead of raising.

    Args:
        work: The work record.

    Returns:
        dict: ``legacyId``, ``originalTitle``, ``characters`` (list of
        character names) and ``editions_url``, in that order.
    """
    work_info = work.get('details') or {}
    editions = work.get('editions') or {}
    return {
        'legacyId': work.get('legacyId'),
        'originalTitle': work_info.get('originalTitle'),
        'characters': [
            character.get('name') for character in work_info.get('characters') or []
        ],
        'editions_url': editions.get('webUrl'),
    }

# Extract author details
def extract_author_details(authors: list, book_details: dict):
    """Collect details for the authors credited on the book.

    An author is kept when its ``id`` equals the book's
    ``primaryContributorEdge_ref`` or ``secondaryContributorEdge_ref``.

    Args:
        authors: Author (contributor) records.
        book_details: Output of ``extract_book_details``; only the two
            contributor reference keys are read.

    Returns:
        list[dict]: One dict per matching author, in input order, with keys
        ``legacyId``, ``name``, ``description`` and ``webUrl``.
    """
    # The references do not change per author, so look them up once.
    credited_ids = {
        book_details.get('primaryContributorEdge_ref'),
        book_details.get('secondaryContributorEdge_ref'),
    }
    # A missing reference must not match an author that has no 'id'.
    credited_ids.discard(None)

    return [
        {
            'legacyId': author.get('legacyId'),
            'name': author.get('name'),
            'description': author.get('description'),
            'webUrl': author.get('webUrl'),
        }
        for author in authors
        if author.get('id') in credited_ids
    ]

# Extract series details
def extract_series_details(series: list):
    """Reduce each series record to a dict holding only its title.

    Args:
        series: Series records (dict-like objects supporting ``.get``).

    Returns:
        list[dict]: ``{'title': <title or None>}`` for every record, in input
        order.
    """
    return [{'title': record.get('title')} for record in series]

# Extract all the details
def extract_all_details(json_path, genres):
    """Run the full extraction pipeline for one book JSON file.

    Args:
        json_path: Path of the JSON file, passed to ``extract_relevant_fields``.
        genres: Known genre names, passed to ``extract_book_details``.

    Returns:
        tuple: ``(book_details, work_details, author_details, series_details)``
        as produced by the respective ``extract_*_details`` helpers.
    """
    book_record, work_record, author_records, series_records = extract_relevant_fields(json_path)
    # The author step needs the flattened book to know who is credited.
    book_summary = extract_book_details(book_record, genres)
    return (
        book_summary,
        extract_work_details(work_record),
        extract_author_details(author_records, book_summary),
        extract_series_details(series_records),
    )


In [3]:
# Save the merged details in a csv file
def save_details_to_csv(details, csv_file):
    """Write one mapping to ``csv_file`` as a header line plus a single row.

    The file is overwritten if it exists. It is written as UTF-8 so that
    non-ASCII values produce the same bytes on every platform instead of
    depending on the locale's default encoding.

    Args:
        details: Mapping of column name -> value; key order defines the
            column order.
        csv_file: Destination path.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(details))
        writer.writeheader()
        writer.writerow(details)

# Author details is a list of dicts with keys 'legacyId', 'name', 'description', 'webUrl'; each element becomes a row
def save_author_details_to_csv(author_details, csv_file):
    """Write a list of author dicts to ``csv_file``, one row per author.

    The columns are taken from the first element. When the list is empty the
    file still gets a header line with the default author columns, instead of
    the function failing with ``IndexError``. The file is overwritten if it
    exists and is written as UTF-8.

    Args:
        author_details: List of dicts sharing the same keys (may be empty).
        csv_file: Destination path.
    """
    rows = author_details or []
    # Fallback header used when there is no first row to read the keys from.
    default_fields = ['legacyId', 'name', 'description', 'webUrl']
    fieldnames = list(rows[0]) if rows else default_fields
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


In [4]:
genre_file = 'goodreads_genres.csv'
# Load the known genre names from the 'Genre' column.
# The csv module expects its input file to be opened with newline=''.
with open(genre_file, newline='') as f:
    reader = csv.DictReader(f)
    genres = [row['Genre'] for row in reader]

# Single-file run: extract all the details from one book JSON.
book_details, work_details, author_details, series_details = extract_all_details('book_details/The_Unbroken_Line_of_the_Moon_(Sagan_om_Valhalla_#4;_Valhalla_#1).json', genres)

# Save the book details in a csv file
save_details_to_csv(book_details, 'book_details.csv')

# Store the series list under the 'series' key of work_details so it is
# written as one more column of the work CSV.
work_details['series'] = series_details
save_details_to_csv(work_details, 'work_details.csv')

save_author_details_to_csv(author_details, 'author_details.csv')


In [5]:
directory = 'book_details'

genre_file = 'goodreads_genres.csv'
# The csv module expects its input file to be opened with newline=''.
with open(genre_file, newline='') as f:
    reader = csv.DictReader(f)
    genres = [row['Genre'] for row in reader]

# For each json in the directory, extract its details and write them as CSV
# files into a per-book sub-folder.
# sorted() makes the processing order reproducible; os.listdir() returns
# entries in arbitrary order.
for file in sorted(os.listdir(directory)):
    if file.endswith('.json'):
        # Create a folder with the name of the file (with only letters and numbers) without the extension
        # NOTE(review): two file names that differ only in non-alphanumeric
        # characters map to the same folder and would overwrite each other.
        folder_name = ''.join(e for e in os.path.splitext(file)[0] if e.isalnum())
        folder_path = os.path.join(directory, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        # Extract all the details from the json
        book_details, work_details, author_details, series_details = extract_all_details(os.path.join(directory, file), genres)
        # Save the details in a csv file
        save_details_to_csv(book_details, os.path.join(folder_path, 'book_details.csv'))
        # Store the series list under the 'series' key of work_details
        work_details['series'] = series_details
        save_details_to_csv(work_details, os.path.join(folder_path, 'work_details.csv'))
        save_author_details_to_csv(author_details, os.path.join(folder_path, 'author_details.csv'))
