# Introduction, imports, and function defs

Preprocessing steps needed for the Goodreads books and authors datasets. Data retrieved from: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home?authuser=0. For the sake of exploration, this notebook focuses specifically on cleaning the genre dataset.

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gdown
import seaborn as sns
from google.colab import drive
import json
from os.path import join
import re
# !apt install unzip
import csv

In [3]:
# Mount Google Drive at /content/drive so that the shared-drive paths used
# later in this notebook (under /content/drive/Shareddrives/...) resolve.
drive.mount('/content/drive') 

Mounted at /content/drive


In [4]:
# Folder on the shared drive that holds the raw (unzipped) Goodreads files.
# NOTE(review): the name `dir` shadows Python's builtin dir(); it is kept
# because a later cell reads it when building the genres file path.
dir = '/content/drive/Shareddrives/CIS5500-BookReviews/RawFiles/'
# authors_path = 'goodreads_book_authors.json'
# File name (relative to `dir`) of the book-genres data.
genres_path = 'goodreads_book_genres.json'


# One-off extraction of the downloaded archive, kept commented out for reference:
# !unzip "/content/drive/Shareddrives/CIS5500-BookReviews/RawFiles/goodreads_book_genres.json.gz" -d "/content/drive/Shareddrives/CIS5500-BookReviews/RawFiles/"

In [5]:
# Function to load the data from the unzipped json files:
def load_json(file_name, head = 500):
  ''' Load records from a file that holds one JSON document per line.

      file_name: path to the json file to be loaded
      head: the maximum number of records to load; None loads every record

      Returns a list with one parsed object per non-blank line, in file order.
      Raises json.JSONDecodeError if a non-blank line is not valid JSON.
  '''
  data = []

  # Nothing to read when the caller asks for zero (or fewer) records.
  if head is not None and head <= 0:
      return data

  with open(file_name, encoding='utf-8') as fn:
      for l in fn:
          # Skip blank lines (e.g. a trailing newline at the end of the file),
          # which json.loads would otherwise reject.
          if not l.strip():
              continue

          data.append(json.loads(l))

          # Stop once exactly `head` records have been collected. The previous
          # `count > head` check only stopped after reading head + 1 records.
          if head is not None and len(data) >= head:
              break
  return data

# Genres data loading and cleaning

In [6]:
# Resolve the full path of the genres file, then read all of it
# (head=None is passed so that no record limit is requested).
genres_file = join(dir, genres_path)
genres_data = load_json(genres_file, head=None)

In [7]:
# Turn the loaded records into a DataFrame and preview the first five rows.
genres_df = pd.DataFrame(data=genres_data)
genres_df.head(5)

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


In [8]:
# Preview the last rows of the frame as a counterpart to the head() preview.
genres_df.tail()

Unnamed: 0,book_id,genres
2360650,3084038,"{'non-fiction': 5, 'history, historical fictio..."
2360651,26168430,"{'mystery, thriller, crime': 4, 'children': 1,..."
2360652,2342551,"{'poetry': 14, 'children': 7, 'young-adult': 1..."
2360653,22017381,"{'romance': 13, 'mystery, thriller, crime': 2}"
2360654,11419866,"{'romance': 19, 'fiction': 4}"


In [9]:
# Print the index range, column names, dtypes and memory usage of the frame.
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   book_id  object
 1   genres   object
dtypes: object(2)
memory usage: 36.0+ MB


In [10]:
# NOTE(review): `describe` is referenced without being called, so this cell
# displays the bound method's repr (which embeds the frame) rather than summary
# statistics. If statistics are wanted, call it -- but first verify that
# describe() copes with the dict-valued `genres` column.
genres_df.describe

<bound method NDFrame.describe of           book_id                                             genres
0         5333265      {'history, historical fiction, biography': 1}
1         1333909  {'fiction': 219, 'history, historical fiction,...
2         7327624  {'fantasy, paranormal': 31, 'fiction': 8, 'mys...
3         6066819  {'fiction': 555, 'romance': 23, 'mystery, thri...
4          287140                                 {'non-fiction': 3}
...           ...                                                ...
2360650   3084038  {'non-fiction': 5, 'history, historical fictio...
2360651  26168430  {'mystery, thriller, crime': 4, 'children': 1,...
2360652   2342551  {'poetry': 14, 'children': 7, 'young-adult': 1...
2360653  22017381     {'romance': 13, 'mystery, thriller, crime': 2}
2360654  11419866                      {'romance': 19, 'fiction': 4}

[2360655 rows x 2 columns]>

In [11]:
# Sanity check: no row of the table may hold a missing value in any column.
rows_with_missing = genres_df.isna().any(axis=1)
assert not rows_with_missing.any()

In [12]:
# Sanity check: the number of distinct book_id values must equal the row count,
# i.e. every book appears exactly once.
n_rows = len(genres_df)
n_distinct_ids = genres_df['book_id'].nunique()
assert n_distinct_ids == n_rows

In [70]:
# Open the genres lookup CSV for writing and emit its header row.
# newline='' is what the csv module requires of files handed to its writers
# (otherwise newlines may be translated a second time on some platforms), and
# the explicit encoding keeps the output independent of the locale.
# The handle is intentionally left open: rows are written in the next cell and
# the file is closed in the cell after that.
f1 = open('/content/drive/Shareddrives/CIS5500-BookReviews/CleanedFilesForDB/genres.csv', 'w+', newline='', encoding='utf-8')
keys1 = ['genre_id', 'genre_name']
writer1 = csv.DictWriter(f1, fieldnames=keys1)
writer1.writeheader()  

21

In [71]:
# Get all genres
#
# Each key of a book's `genres` dict is a comma-separated group of genre names
# (e.g. 'history, historical fiction, biography'). Every distinct name gets a
# sequential id (starting at 1) and one row in genres.csv.
#
# Splitting on ',' leaves a leading space on every name after the first
# (' historical fiction'), so the name is stripped before it is written to the
# CSV. `genre_names` deliberately keeps the raw, unstripped token: the
# book-genre cell further down looks genres up in `genre_names` using the same
# raw tokens, so its keys/values must stay exactly as they were.

g_id = 1
genre_names = {}          # genre_id -> raw genre token, in order of first appearance
seen_tokens = set()       # O(1) membership test instead of scanning genre_names.values()

# Only the `genres` column is needed, so iterate over it directly rather than
# paying for iterrows() building a Series per row.
for genres_dict in genres_df['genres']:
    for group in genres_dict:
        for gname in group.split(','):
            if gname in seen_tokens:
                continue
            seen_tokens.add(gname)
            genre_names[g_id] = gname
            writer1.writerow({'genre_id': g_id, 'genre_name': gname.strip()})
            g_id += 1


In [72]:
# Close the genres CSV handle so that buffered rows are flushed to the file.
f1.close()

In [73]:
# Open the book-to-genre link CSV for writing and emit its header row.
# newline='' is what the csv module requires of files handed to its writers,
# and the explicit encoding keeps the output independent of the locale.
# The handle stays open for the row-writing cell and is closed afterwards.
f2 = open('/content/drive/Shareddrives/CIS5500-BookReviews/CleanedFilesForDB/books_genres.csv', 'w+', newline='', encoding='utf-8')

keys2 = ['book_id', 'genre_id', 'n_votes']
writer2 = csv.DictWriter(f2, fieldnames=keys2)
writer2.writeheader()


26

In [None]:
# Get books and their genres
#
# Writes one (book_id, genre_id, n_votes) row per genre of each book. Every
# name in a comma-separated genre group receives the vote count of its group.
#
# NOTE(review): the previous version counted rows with `i` and did `break` at
# i == 3, but that only left the innermost loop: the rest of the current group
# was dropped while every later group was still written. This version assumes
# the intent was "at most three genres per book" and enforces that cap --
# confirm; raise MAX_GENRES_PER_BOOK if all genres should be exported.
MAX_GENRES_PER_BOOK = 3

# Build the name -> id lookup once instead of scanning genre_names for every
# row. Names are stripped on both sides of the lookup so that tokens such as
# ' historical fiction' (left over from split(',')) match regardless of whether
# genre_names holds stripped or unstripped names. setdefault keeps the lowest
# id for a name, matching the old "first match wins" behaviour.
genre_id_by_name = {}
for gid, name in genre_names.items():
    genre_id_by_name.setdefault(name.strip(), gid)

# Iterate over the two needed columns directly; iterrows() is much slower.
for book_id, genres_dict in zip(genres_df['book_id'], genres_df['genres']):
    n_written = 0
    for group, n_votes in genres_dict.items():
        if n_written >= MAX_GENRES_PER_BOOK:
            break
        for gname in group.split(','):
            writer2.writerow({
                'book_id': book_id,
                'genre_id': genre_id_by_name[gname.strip()],
                'n_votes': n_votes,
            })
            n_written += 1
            if n_written >= MAX_GENRES_PER_BOOK:
                break

In [None]:
# Close the book-genre CSV handle so that buffered rows are flushed to the file.
f2.close()