In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# genre_df = pd.DataFrame(load_data("../dataset/goodreads_book_genres_initial.json.gz"))
genres_df = pd.read_json("../dataset/goodreads_book_genres_initial.json", lines=True)

In [3]:
genres_df.head()

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


In [4]:
def load_data(file_name, genres_df, head = 5000):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            if (d["image_url"] != "https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png") \
            and (d["description"] != "") and (float(d["average_rating"]) > 0):
                genre_list_idx = genres_df[genres_df["book_id"] == int(d["book_id"])]["genres"].index[0]
                if len(genres_df[genres_df["book_id"] == int(d["book_id"])]["genres"][genre_list_idx]) > 0:
                    count += 1
                    data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [5]:
book = load_data(os.path.join("../dataset/", 'goodreads_books.json.gz'), genres_df)
len(book)

5001

In [6]:
book_df = pd.DataFrame(book)
print(book_df.shape)
book_df.head()

(5001, 29)


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
1,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,False,4.22,,...,3.0,,2009,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,701117,All's Fairy in Love and War (Avalon: Web of Ma...,All's Fairy in Love and War (Avalon: Web of Ma...
2,,4,[],US,,"[{'count': '4', 'name': 'to-read'}, {'count': ...",,True,3.86,,...,5.0,,2017,https://www.goodreads.com/book/show/34883016-p...,https://images.gr-assets.com/books/1493525974m...,34883016,5,56135087,Playmaker: A Venom Series Novella,Playmaker: A Venom Series Novella
3,0922915113,39,[],US,,"[{'count': '961', 'name': 'to-read'}, {'count'...",,False,3.81,B00AFYVB8Q,...,4.0,,2000,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,287149,986,278586,The Devil's Notebook,The Devil's Notebook
4,184737297X,15,[169353],US,,"[{'count': '159', 'name': 'to-read'}, {'count'...",,False,3.93,B007YLTG5I,...,4.0,,2009,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,6066814,186,6243149,"Crowner Royal (Crowner John Mystery, #13)","Crowner Royal (Crowner John Mystery, #13)"


In [7]:
book_df["image_url"].value_counts()

https://images.gr-assets.com/books/1304100136m/7327624.jpg     1
https://images.gr-assets.com/books/1488914969m/34504470.jpg    1
https://images.gr-assets.com/books/1380855539m/15869592.jpg    1
https://images.gr-assets.com/books/1406380355m/15707437.jpg    1
https://images.gr-assets.com/books/1369817696m/12984135.jpg    1
                                                              ..
https://images.gr-assets.com/books/1395951349m/18505769.jpg    1
https://images.gr-assets.com/books/1396227441m/18505768.jpg    1
https://images.gr-assets.com/books/1325556548m/13040792.jpg    1
https://images.gr-assets.com/books/1323470229m/13180905.jpg    1
https://images.gr-assets.com/books/1467737940m/30848018.jpg    1
Name: image_url, Length: 5001, dtype: int64

In [8]:
book_df.isna().sum()

isbn                    0
text_reviews_count      0
series                  0
country_code            0
language_code           0
popular_shelves         0
asin                    0
is_ebook                0
average_rating          0
kindle_asin             0
similar_books           0
description             0
format                  0
link                    0
authors                 0
publisher               0
num_pages               0
publication_day         0
isbn13                  0
publication_month       0
edition_information     0
publication_year        0
url                     0
image_url               0
book_id                 0
ratings_count           0
work_id                 0
title                   0
title_without_series    0
dtype: int64

In [9]:
! mkdir -p ../dataset/images/

In [10]:
def download_image(row):
    name = row["book_id"]
    ! wget -O ../dataset/images/{name} -q {row["image_url"]}

In [11]:
book_df[["image_url", "book_id"]].apply(download_image, axis = 1) # uncomment to download

0       None
1       None
2       None
3       None
4       None
        ... 
4996    None
4997    None
4998    None
4999    None
5000    None
Length: 5001, dtype: object

In [12]:
genres_df["book_id"] = genres_df["book_id"].astype('str')

In [13]:
pd.merge(book_df, genres_df, on="book_id")

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,genres
0,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,false,4.03,,...,Book Club Edition,1987,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...","{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
1,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,false,4.22,,...,,2009,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,701117,All's Fairy in Love and War (Avalon: Web of Ma...,All's Fairy in Love and War (Avalon: Web of Ma...,"{'fantasy, paranormal': 32, 'young-adult': 8, ..."
2,,4,[],US,,"[{'count': '4', 'name': 'to-read'}, {'count': ...",,true,3.86,,...,,2017,https://www.goodreads.com/book/show/34883016-p...,https://images.gr-assets.com/books/1493525974m...,34883016,5,56135087,Playmaker: A Venom Series Novella,Playmaker: A Venom Series Novella,{'romance': 3}
3,0922915113,39,[],US,,"[{'count': '961', 'name': 'to-read'}, {'count'...",,false,3.81,B00AFYVB8Q,...,,2000,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,287149,986,278586,The Devil's Notebook,The Devil's Notebook,"{'non-fiction': 24, 'history, historical ficti..."
4,184737297X,15,[169353],US,,"[{'count': '159', 'name': 'to-read'}, {'count'...",,false,3.93,B007YLTG5I,...,,2009,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,6066814,186,6243149,"Crowner Royal (Crowner John Mystery, #13)","Crowner Royal (Crowner John Mystery, #13)","{'fiction': 19, 'history, historical fiction, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,,1,[],US,swe,"[{'count': '4904', 'name': 'to-read'}, {'count...",,false,3.58,,...,,2006,https://www.goodreads.com/book/show/24859526-i...,https://images.gr-assets.com/books/1423302307m...,24859526,3,1010571,Ikarosflickan,Ikarosflickan,"{'fiction': 249, 'fantasy, paranormal': 84, 'y..."
4997,0992254574,25,[822198],US,eng,"[{'count': '405', 'name': 'to-read'}, {'count'...",,false,4.08,,...,,2014,https://www.goodreads.com/book/show/18492796-f...,https://images.gr-assets.com/books/1379291628m...,18492796,47,26177684,Fionn: Defence of Ráth Bládhma (Fionn mac Cumh...,Fionn: Defence of Ráth Bládhma (Fionn mac Cumh...,"{'fantasy, paranormal': 25, 'fiction': 13, 'hi..."
4998,,44,[],US,eng,"[{'count': '614', 'name': 'to-read'}, {'count'...",,false,4.05,,...,,2016,https://www.goodreads.com/book/show/30079589-t...,https://images.gr-assets.com/books/1463483062m...,30079589,78,50499639,This is What Goodbye Looks Like,This is What Goodbye Looks Like,"{'young-adult': 13, 'romance': 11}"
4999,,69,[556603],US,eng,"[{'count': '1480', 'name': 'to-read'}, {'count...",,false,4.18,B00VO2ING6,...,,2015,https://www.goodreads.com/book/show/18492793-f...,https://images.gr-assets.com/books/1429246110m...,18492793,481,26177675,"Finding Bliss (Bliss, #4)","Finding Bliss (Bliss, #4)",{'romance': 11}


In [14]:
genres_df.isna().sum()

book_id    0
genres     0
dtype: int64

In [15]:
genre_labels_df = pd.DataFrame(list(genres_df["genres"]))

In [16]:
genre_labels_df["book_id"] = genres_df["book_id"]

In [17]:
genre_labels_df_reduced = pd.merge(book_df, genre_labels_df, on = "book_id")
print(genre_labels_df_reduced.shape)
genre_labels_df_reduced.head()

(5001, 39)


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,"history, historical fiction, biography",fiction,"fantasy, paranormal","mystery, thriller, crime",poetry,romance,non-fiction,children,young-adult,"comics, graphic"
0,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,8.0,31.0,1.0,1.0,,,,,
1,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,False,4.22,,...,,7.0,32.0,,,,,16.0,8.0,
2,,4,[],US,,"[{'count': '4', 'name': 'to-read'}, {'count': ...",,True,3.86,,...,,,,,,3.0,,,,
3,0922915113,39,[],US,,"[{'count': '961', 'name': 'to-read'}, {'count'...",,False,3.81,B00AFYVB8Q,...,1.0,,,,,,24.0,,,
4,184737297X,15,[169353],US,,"[{'count': '159', 'name': 'to-read'}, {'count'...",,False,3.93,B007YLTG5I,...,38.0,19.0,,38.0,,,,,,


In [18]:
def change_label_to_binary(val):
    if str(val) == "nan":
        return 0
    else:
        if float(val) > 0:
            return 1
        else:
            return 0

In [19]:
for col in genre_labels_df.columns:
    genre_labels_df_reduced[col] = genre_labels_df_reduced[col].apply(change_label_to_binary)

In [20]:
genre_labels_df_reduced

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,"history, historical fiction, biography",fiction,"fantasy, paranormal","mystery, thriller, crime",poetry,romance,non-fiction,children,young-adult,"comics, graphic"
0,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,false,4.03,,...,0,1,1,1,1,0,0,0,0,0
1,1934876569,6,[151854],US,,"[{'count': '515', 'name': 'to-read'}, {'count'...",,false,4.22,,...,0,1,1,0,0,0,0,1,1,0
2,,4,[],US,,"[{'count': '4', 'name': 'to-read'}, {'count': ...",,true,3.86,,...,0,0,0,0,0,1,0,0,0,0
3,0922915113,39,[],US,,"[{'count': '961', 'name': 'to-read'}, {'count'...",,false,3.81,B00AFYVB8Q,...,1,0,0,0,0,0,1,0,0,0
4,184737297X,15,[169353],US,,"[{'count': '159', 'name': 'to-read'}, {'count'...",,false,3.93,B007YLTG5I,...,1,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,,1,[],US,swe,"[{'count': '4904', 'name': 'to-read'}, {'count...",,false,3.58,,...,0,1,1,0,0,0,0,0,1,0
4997,0992254574,25,[822198],US,eng,"[{'count': '405', 'name': 'to-read'}, {'count'...",,false,4.08,,...,1,1,1,0,0,1,0,1,0,0
4998,,44,[],US,eng,"[{'count': '614', 'name': 'to-read'}, {'count'...",,false,4.05,,...,0,0,0,0,0,1,0,0,1,0
4999,,69,[556603],US,eng,"[{'count': '1480', 'name': 'to-read'}, {'count...",,false,4.18,B00VO2ING6,...,0,0,0,0,0,1,0,0,0,0


In [22]:
genre_labels_df_reduced.to_csv("../dataset/books_with_genres.csv", index=False)