<a href="https://colab.research.google.com/github/RenisaPati/CIS5500_Project_Group44/blob/main/CIS5500_Milestone_2_Books_Authors_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction, imports, and function defs

This notebook covers the preprocessing steps needed for the Goodreads books and authors datasets. Data retrieved from: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home?authuser=0. For the sake of exploration, this notebook cleans only the mystery/thriller/crime genre subset of books; the full authors list is used.

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gdown
import seaborn as sns
from google.colab import drive
import json
from os.path import join
import re

In [2]:
# Mount Google Drive (Colab only) so the dataset files under /content/drive can be read
drive.mount('/content/drive') 

Mounted at /content/drive


In [4]:
# Folder on the mounted Drive that holds the unzipped Goodreads JSON files.
# NOTE(review): `dir` shadows the Python builtin of the same name; renaming it
# (e.g. to DATA_DIR) would require updating every cell that joins paths with it.
dir = '/content/drive/MyDrive/Spring2023/CIS5500-Databases/Final_Project/Genre_Mystery-Thriller/'
# File names, relative to `dir`: the authors list and the mystery/thriller/crime books subset
authors_path = 'goodreads_book_authors.json'
books_path = 'goodreads_books_mystery_thriller_crime.json'

In [5]:
# Function to load the data from the unzipped json files:
def load_json(file_name, head = 500):
  ''' Load records from a JSON-lines file (one JSON document per line).

      file_name: path to the json file to be loaded
      head: the maximum number of records to load; None loads the whole file

      Returns a list with one parsed object per non-blank line, containing
      at most `head` elements (an empty list if `head` is zero or negative).
      Raises json.JSONDecodeError if a non-blank line is not valid JSON.
  '''
  data = []
  if head is not None and head <= 0:
      return data

  # JSON text is UTF-8 by specification; do not depend on the platform default
  with open(file_name, encoding='utf-8') as fn:
      for l in fn:
          # tolerate blank lines (e.g. a trailing newline at the end of the file)
          if not l.strip():
              continue
          data.append(json.loads(l))

          # stop once exactly `head` records have been collected
          if (head is not None) and (len(data) >= head):
              break
  return data

# Authors data loading and cleaning

In [12]:
# head=None disables the record limit, so the entire authors file is read
authors_data = load_json(join(dir, authors_path), head=None)

In [13]:
# Build a DataFrame with one row per loaded JSON record and preview the first rows
authors_df = pd.DataFrame(authors_data)
authors_df.head()

Unnamed: 0,average_rating,author_id,text_reviews_count,name,ratings_count
0,3.98,604031,7,Ronald J. Fields,49
1,4.08,626222,28716,Anita Diamant,546796
2,3.92,10333,5075,Barbara Hambly,122118
3,3.68,9212,36262,Jennifer Weiner,888522
4,3.82,149918,96,Nigel Pennick,1740


In [14]:
# Inspect column dtypes and non-null counts
authors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829529 entries, 0 to 829528
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   average_rating      829529 non-null  object
 1   author_id           829529 non-null  object
 2   text_reviews_count  829529 non-null  object
 3   name                829529 non-null  object
 4   ratings_count       829529 non-null  object
dtypes: object(5)
memory usage: 31.6+ MB


In [15]:
# Reorder the columns so that the author_id key comes first; no column is dropped
author_cols = ['author_id', 'average_rating', 'text_reviews_count', 'name', 'ratings_count']
authors_df = authors_df.loc[:, author_cols]
authors_df.head()

Unnamed: 0,author_id,average_rating,text_reviews_count,name,ratings_count
0,604031,3.98,7,Ronald J. Fields,49
1,626222,4.08,28716,Anita Diamant,546796
2,10333,3.92,5075,Barbara Hambly,122118
3,9212,3.68,36262,Jennifer Weiner,888522
4,149918,3.82,96,Nigel Pennick,1740


In [33]:
# Preview the last rows of the authors table
authors_df.tail()

Unnamed: 0,author_id,average_rating,text_reviews_count,name,ratings_count
829524,197551,4.36,4,Patty Furbush,11
829525,3988103,4.33,3,Jim Schlinkman,6
829526,13464507,4.0,2,Rich Jolly,18
829527,7427847,3.31,1,sr@ mwrGn,13
829528,5401342,3.7,11,Barry S. Brown,43


In [20]:
# Verify that no row of the authors table contains a NaN/None value
rows_with_missing = authors_df.isna().any(axis=1)
assert not rows_with_missing.any()

In [21]:
# Every row must carry a distinct author_id
unique_author_ids = authors_df['author_id'].nunique()
assert unique_author_ids == len(authors_df)

In [26]:
# Show the rows whose name contains a comma (matched as a literal character)
has_comma = authors_df['name'].str.contains(',', regex=False)
authors_df[has_comma]

Unnamed: 0,author_id,average_rating,text_reviews_count,name,ratings_count
1329,4599042,4.31,19,"Yudis, Broky, Pak Waw",131
1381,6936990,5.00,1,"Jiao Chuan gemusu, gurasuhotsupamanihuakuchiyua",2
1674,7325512,4.09,55,"Madden, Colleen M.",392
3877,4878973,4.75,3,"Bill Kovach, Tom Rosenstiel",8
3937,2278579,2.11,4,"Meryl Dory, Susan Lindberg, Stephanie Messenger",9
...,...,...,...,...,...
829199,296279,3.90,8,"Esther Allen, trans.",118
829204,8056403,3.40,3,"Charles Corey, C.B. Corey",10
829205,8056404,3.40,3,"Hesketh Prichard, Kate Prichard, E. Heron, H. ...",10
829258,7172301,2.33,1,"Wahib Saray al-Din ,whyb sry ldyn",3


In [37]:
# Hard-coded ratio of two row counts.
# NOTE(review): 3177 is presumably the number of names containing a comma, and the
# denominator is one less than the 829529 rows reported by authors_df.info() — TODO confirm
# and consider computing both values from authors_df instead of typing them in.
3177 / 829528

0.0038298888042356616

In [34]:
# Split every name on commas; each element of the result is a list of name fragments
split_names = authors_df['name'].str.split(',')

In [35]:
def FindMaxLength(lst):
    """Return the longest element of `lst` together with its length.

    `lst` is any iterable of sized items. When several items share the
    maximum length, the first one encountered is returned. An empty
    iterable raises ValueError (from `max`).
    """
    longest = max(lst, key=len)
    return longest, len(longest)

In [36]:
# Find the name that splits into the most comma-separated fragments, and that fragment count
FindMaxLength(split_names)

(['Saroyan',
  ' Cunninghan',
  ' Mazilu',
  ' Pantu',
  ' Sibisteanu',
  ' Vieru',
  ' Grosan',
  ' Ilea',
  ' Popa',
  ' Pricajan',
  ' Romila',
  ' Lungu'],
 12)

In [None]:
# Possible TODO: split multi-author lists into multiple rows

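Apart from the possible multi-author split noted above, the authors data is clean and ready for loading into the Authors table in our database. Note that every column is still stored as a string (`object` dtype), so the numeric columns will need to be cast when loading.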

# Books data loading and cleaning (Mysteries/Thrillers/Crime genre subset)

In [38]:
# Load a capped number of book records (head=100000) rather than the whole file,
# presumably to keep exploration fast
books_data = load_json(join(dir, books_path), 100000)

In [40]:
# Build a DataFrame with one row per loaded book record and preview the first 10 rows
books_df = pd.DataFrame(books_data)
books_df.head(10)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,184737297X,15,[169353],US,,"[{'count': '159', 'name': 'to-read'}, {'count'...",,False,3.93,B007YLTG5I,...,4.0,,2009.0,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,6066814,186,6243149,"Crowner Royal (Crowner John Mystery, #13)","Crowner Royal (Crowner John Mystery, #13)"
1,,60,[1052227],US,eng,"[{'count': '54', 'name': 'currently-reading'},...",B01NCIKAQX,True,4.33,B01NCIKAQX,...,,,,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,33394837,269,54143148,The House of Memory (Pluto's Snitch #2),The House of Memory (Pluto's Snitch #2)
2,,23,[953679],US,eng,"[{'count': '90', 'name': 'to-read'}, {'count':...",B01ALOWJN0,True,3.49,B01ALOWJN0,...,,,,https://www.goodreads.com/book/show/29074697-t...,https://s.gr-assets.com/assets/nophoto/book/11...,29074697,192,49305010,The Slaughtered Virgin of Zenopolis (Inspector...,The Slaughtered Virgin of Zenopolis (Inspector...
3,0854563903,8,[408775],US,,"[{'count': '51', 'name': 'to-read'}, {'count':...",,False,3.3,,...,12.0,Large Print,1975.0,https://www.goodreads.com/book/show/1902202.De...,https://s.gr-assets.com/assets/nophoto/book/11...,1902202,52,1903897,"Dead in the Morning (Patrick Grant, #1)","Dead in the Morning (Patrick Grant, #1)"
4,8838920931,3,[274410],US,ita,"[{'count': '48', 'name': 'to-read'}, {'count':...",,False,3.54,,...,,,2006.0,https://www.goodreads.com/book/show/9671977-ar...,https://images.gr-assets.com/books/1474788304m...,9671977,22,2152906,Aristotele e i misteri di Eleusi,Aristotele e i misteri di Eleusi
5,0062265806,3,[199039],US,eng,"[{'count': '1694', 'name': 'mystery'}, {'count...",,False,3.96,,...,3.0,,2013.0,https://www.goodreads.com/book/show/16158998-a...,https://images.gr-assets.com/books/1360572193m...,16158998,5,2288775,A Murder is Announced,A Murder is Announced
6,,5,[],US,en-GB,"[{'count': '27', 'name': 'to-read'}, {'count':...",B00UQVGQMO,True,3.8,B00UQVGQMO,...,,,,https://www.goodreads.com/book/show/25162836-d...,https://s.gr-assets.com/assets/nophoto/book/11...,25162836,8,44866515,Dark Flames Rising,Dark Flames Rising
7,0752844458,8,[326237],US,,"[{'count': '38', 'name': 'to-read'}, {'count':...",,False,3.61,B00KKFTAL0,...,8.0,,2001.0,https://www.goodreads.com/book/show/2805495-wy...,https://images.gr-assets.com/books/1328819096m...,2805495,58,2831381,Wycliffe and the Cycle of Death,Wycliffe and the Cycle of Death
8,8293326247,6,[],US,eng,"[{'count': '171', 'name': 'to-read'}, {'count'...",,False,4.14,,...,11.0,,2014.0,https://www.goodreads.com/book/show/22722787-t...,https://s.gr-assets.com/assets/nophoto/book/11...,22722787,18,42251489,The Cost of Doing Business,The Cost of Doing Business
9,0062265784,2,[199041],US,eng,"[{'count': '1642', 'name': 'mystery'}, {'count...",,False,3.93,,...,3.0,,2013.0,https://www.goodreads.com/book/show/16158996-4,https://images.gr-assets.com/books/1360566349m...,16158996,5,6490729,4:50 From Paddington,4:50 From Paddington


In [41]:
# Missing values appear to be stored as empty strings rather than NaN
books_df.loc[1, 'isbn']

''

In [42]:
# Names of the books columns to retain (presumably the fields needed for the database tables)
keep_cols = ['isbn', 'text_reviews_count', 'series', 'language_code', 'is_ebook', 'average_rating', 'similar_books',
             'description', 'format', 'authors', 'publisher', 'num_pages', 'publication_year', 'image_url',
             'book_id', 'ratings_count', 'title']

TODOS:


*   Replace missing values: this may need column-specific solutions, e.g. a 'Not Available' string for a missing ISBN, or the URL of a placeholder image for books with no cover image
*   remove brackets from 'series' column numbers
*   Replace value in 'series' for books that aren't part of one with a coded value (-999 maybe?) so that code can be written to check for this value when rendering book page.
*   Extract the 'similar_books' list and keep up to three similar books. Create another separate table of book IDs and retained similar book IDs.
*   Check book IDs are unique
*   Cast 'publication_year' to a proper year type (e.g. integer or datetime year) instead of a string
*   Extract authors list and split into separate table, WrittenBy(book_id, author_id, role)
*   Remove any book with no listed authors
 









