# Getting Started

In [1]:
!pip install xmltodict

Collecting xmltodict
  Using cached xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import os
import re
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import seaborn as sns
import xmltodict

# Parsing the link

In [3]:
Goodread_profile = "https://www.goodreads.com/user/show/73376016-abid"

# A profile URL has the shape ".../user/show/<numeric id>[-<name>]". Anchoring on
# that path keeps digits inside the name out of the id, and keeps hyphens or
# dots inside the name from truncating it.
profile_match = re.search(r"/user/show/(\d+)(?:[-.]([^/?#]+))?", Goodread_profile)
if profile_match is None:
    raise ValueError(f"Not a Goodreads profile URL: {Goodread_profile!r}")

user_id = profile_match.group(1)
user_name = profile_match.group(2) or ""  # the name suffix is optional
user_id_name = f"{user_id}-{user_name}" if user_name else user_id
print(user_id_name)

'73376016-abid'

# Goodreads Data Extraction

In [4]:
# NOTE(review): an API key is committed here in plain text. Supply your own key
# through the GOODREADS_API_KEY environment variable; the literal is kept only
# as a fallback so the notebook keeps running unchanged.
apiKey = os.environ.get("GOODREADS_API_KEY", "ZRnySx6awjQuExO9tKEJXw")
version = "2"      # sent as the "v" query parameter
shelf = "read"     # sent as the "shelf" query parameter
per_page = "200"   # sent as the "per_page" query parameter

def get_user_data(user_id, apiKey, version, shelf, per_page, timeout=30):
    """Download a user's Goodreads review list as raw XML.

    Parameters
    ----------
    user_id : str
        Identifier placed in the URL path directly before ``.xml``.
    apiKey : str
        Sent as the ``key`` query parameter.
    version : str or int
        Sent as the ``v`` query parameter.
    shelf : str
        Sent as the ``shelf`` query parameter.
    per_page : str or int
        Sent as the ``per_page`` query parameter.
    timeout : float, optional
        Seconds to wait on the connection before giving up (default 30).

    Returns
    -------
    bytes
        The undecoded response body.

    Raises
    ------
    urllib.error.URLError
        If the request cannot be completed.
    """
    api_url_base = "https://www.goodreads.com/review/list/"
    # urlencode escapes each value, so shelf names containing spaces or
    # reserved characters cannot break the query string.
    query = urllib.parse.urlencode(
        [
            ("key", apiKey),
            ("v", version),
            ("shelf", shelf),
            ("per_page", per_page),
        ]
    )
    final_url = f"{api_url_base}{urllib.parse.quote(str(user_id), safe='')}.xml?{query}"
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(final_url, timeout=timeout) as response:
        return response.read()


In [5]:
# Fetch the configured shelf and peek at the first 100 bytes of the payload.
contents = get_user_data(
    user_id=user_id_name,
    apiKey=apiKey,
    version=version,
    shelf=shelf,
    per_page=per_page,
)
print(contents[:100])

b'<?xml version="1.0" encoding="UTF-8"?>\n<GoodreadsResponse>\n  <Request>\n    <authentication>true</aut'


# Converting XML to JSON

In [6]:
# force_list keeps "review" a list even when the shelf holds a single book;
# otherwise xmltodict collapses a lone <review> element into a dict and the
# slice below fails.
contents_json = xmltodict.parse(contents, force_list=("review",))
print(contents_json["GoodreadsResponse"]["reviews"]["review"][:1])

[{'id': '4626706284', 'book': {'id': {'@type': 'integer', '#text': '57771224'}, 'isbn': '1250809606', 'isbn13': '9781250809605', 'text_reviews_count': {'@type': 'integer', '#text': '150'}, 'uri': 'kca://book/amzn1.gr.book.v3.tcNoY0o7ErAhczdQ', 'title': 'Good Intentions', 'title_without_series': 'Good Intentions', 'image_url': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1643679980l/57771224._SX98_.jpg', 'small_image_url': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1643679980l/57771224._SY75_.jpg', 'large_image_url': None, 'link': 'https://www.goodreads.com/book/show/57771224-good-intentions', 'num_pages': '288', 'format': 'Hardcover', 'edition_information': None, 'publisher': 'Henry Holt and Co.', 'publication_day': '8', 'publication_year': '2022', 'publication_month': '3', 'average_rating': '3.53', 'ratings_count': '655', 'description': "<b>The Big Sick meets Nick Hornby in Kasim Ali's debut Good Intentions, a novel about a young m

# Converting JSON to Pandas DataFrame

In [7]:
# Flatten the nested review records into columns, then keep only the rows
# that carry a "date_updated" value.
reviews = contents_json["GoodreadsResponse"]["reviews"]["review"]
df = pd.json_normalize(reviews)
has_update_date = df["date_updated"].notna()
df = df[has_update_date]
df.head()


Unnamed: 0,id,rating,votes,spoiler_flag,spoilers_state,recommended_for,recommended_by,started_at,read_at,date_added,...,book.published,book.work.id,book.work.uri,shelves.shelf,book.isbn.@nil,book.isbn13.@nil,shelves.shelf.@name,shelves.shelf.@exclusive,shelves.shelf.@id,shelves.shelf.@review_shelf_id
0,4626706284,3,0,False,none,,,Wed Mar 23 00:00:00 -0700 2022,Thu Mar 24 00:00:00 -0700 2022,Wed Mar 23 21:56:32 -0700 2022,...,2022,90497833,kca://work/amzn1.gr.work.v3.7xBn-b8AcOz6511T,"[{'@name': 'read', '@exclusive': 'true', '@id'...",,,,,,
1,4617282277,5,0,False,none,,,Sat Mar 19 00:00:00 -0700 2022,Mon Mar 21 13:12:00 -0700 2022,Sat Mar 19 07:18:42 -0700 2022,...,2018,52114608,kca://work/amzn1.gr.work.v1.YyJ0VjF7KwfGDd9fFK...,,True,True,read,True,239330059.0,
2,4611790134,3,0,False,none,,,Mon Mar 28 00:35:23 -0700 2022,Fri Apr 08 12:12:46 -0700 2022,Wed Mar 16 07:14:39 -0700 2022,...,2022,88094938,kca://work/amzn1.gr.work.v3.MP6mM2rlVEYwcGMZ,,,,read,True,239330059.0,
3,4539885289,4,0,False,none,,,Wed Feb 09 00:00:00 -0800 2022,Mon Mar 07 10:50:51 -0800 2022,Wed Feb 09 11:20:31 -0800 2022,...,2021,87563549,kca://work/amzn1.gr.work.v3.HQFBPvzZYkUReW8F,,,,read,True,239330059.0,
4,4386936522,5,0,False,none,,,Mon Dec 13 06:32:37 -0800 2021,Sun Dec 26 09:51:35 -0800 2021,Mon Dec 13 06:32:36 -0800 2021,...,2019,62311508,kca://work/amzn1.gr.work.v1.CkO5oyQPm04FrjKHb1...,,,,read,True,239330059.0,


# Data Cleaning

In [8]:
# (rows, columns) of the normalized frame, before any columns are dropped.
df.shape

(200, 61)

In [9]:
# Discard every column whose values are all missing, then re-check the dimensions.
df = df.dropna(axis="columns", how="all")
df.shape

(200, 58)

In [10]:
# List the remaining column labels to choose which ones to keep.
df.columns

Index(['id', 'rating', 'votes', 'spoiler_flag', 'spoilers_state', 'started_at',
       'read_at', 'date_added', 'date_updated', 'read_count', 'body',
       'comments_count', 'url', 'link', 'owned', 'book.id.@type',
       'book.id.#text', 'book.isbn', 'book.isbn13',
       'book.text_reviews_count.@type', 'book.text_reviews_count.#text',
       'book.uri', 'book.title', 'book.title_without_series', 'book.image_url',
       'book.small_image_url', 'book.link', 'book.num_pages', 'book.format',
       'book.edition_information', 'book.publisher', 'book.publication_day',
       'book.publication_year', 'book.publication_month',
       'book.average_rating', 'book.ratings_count', 'book.description',
       'book.authors.author.id', 'book.authors.author.name',
       'book.authors.author.role', 'book.authors.author.image_url.@nophoto',
       'book.authors.author.image_url.#text',
       'book.authors.author.small_image_url.@nophoto',
       'book.authors.author.small_image_url.#text', 'boo

In [11]:
# Columns carried into the exported dataset: the reader's own rating and
# dates, followed by the book's title, statistics, publication year and author.
selected_columns = [
    "rating",
    "started_at",
    "read_at",
    "date_added",
    "book.title",
    "book.average_rating",
    "book.ratings_count",
    "book.publication_year",
    "book.authors.author.name",
]
final_df = df[selected_columns]
final_df.head()

Unnamed: 0,rating,started_at,read_at,date_added,book.title,book.average_rating,book.ratings_count,book.publication_year,book.authors.author.name
0,3,Wed Mar 23 00:00:00 -0700 2022,Thu Mar 24 00:00:00 -0700 2022,Wed Mar 23 21:56:32 -0700 2022,Good Intentions,3.53,655,2022,Kasim Ali
1,5,Sat Mar 19 00:00:00 -0700 2022,Mon Mar 21 13:12:00 -0700 2022,Sat Mar 19 07:18:42 -0700 2022,The One,4.12,100020,2018,John Marrs
2,3,Mon Mar 28 00:35:23 -0700 2022,Fri Apr 08 12:12:46 -0700 2022,Wed Mar 16 07:14:39 -0700 2022,Nine Lives,3.6,11973,2022,Peter Swanson
3,4,Wed Feb 09 00:00:00 -0800 2022,Mon Mar 07 10:50:51 -0800 2022,Wed Feb 09 11:20:31 -0800 2022,Out of Office: The Big Problem and Bigger Prom...,3.79,1505,2021,Charlie Warzel
4,5,Mon Dec 13 06:32:37 -0800 2021,Sun Dec 26 09:51:35 -0800 2021,Mon Dec 13 06:32:36 -0800 2021,"Supernova (Renegades, #3)",4.42,49612,2019,Marissa Meyer


# Saving Final Data to CSV file

In [12]:
# Name the export after the profile parsed from Goodread_profile instead of a
# fixed user, so switching profiles does not overwrite another user's file.
output_csv = f"{user_name or user_id}_goodreads_clean_data.csv"
final_df.to_csv(output_csv, index=False)