In [1]:
from itertools import product
from matplotlib import dates as mdates
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
SEED = 27432 # from random.org, for reproducibility

# First we load the data from the Google Spreadsheet. Conveniently, pandas can load CSVs from a web link;
# the `export?...&format=csv` form of the sheet URL below requests the sheet as CSV.
GDOC_URI = 'https://docs.google.com/spreadsheets/d/1wNbJv1Zf4Oichj3-dEQXE_lXVCwuYQjaoyU1gGQQqk4/export?gid=0&format=csv'

In [2]:
# Columns of the sheet used by the analysis: the start-date parts, the
# end-date parts, the book title ("Headline") and the "author, N pages"
# string ("Text").
sheet_columns = [
    'Year', 'Month', 'Day',
    'End Year', 'End Month', 'End Day',
    'Headline', 'Text',
]

sheet = pd.read_csv(GDOC_URI, usecols=sheet_columns)

# Discard columns that hold no data at all, then every row that still has
# at least one missing value.
raw_df = sheet.dropna(axis=1, how='all').dropna(axis=0)

raw_df.head()

Unnamed: 0,Year,Month,Day,End Year,End Month,End Day,Headline,Text
0,2014,12,13,2014.0,12.0,23.0,The Bloody Chamber,"Angela Carter, 126 pages"
1,2014,12,23,2015.0,1.0,4.0,The Last Place on Earth,"Roland Huntford, 564 pages"
2,2015,1,24,2015.0,2.0,13.0,Empire Falls,"Richard Russo, 483 pages"
3,2015,2,14,2015.0,2.0,20.0,Wonder Boys,"Michael Chabon, 368 pages"
4,2015,2,25,2015.0,3.0,4.0,"Red State, Blue State, Rich State, Poor State:...","Andrew Gelman, 196 pages"


In [3]:
import os

from goodreads import client

# The Goodreads API credentials are read from the environment so that they
# never end up in the notebook or its version history. Export
# GOODREADS_API_KEY and GOODREADS_API_SECRET before starting the kernel;
# a missing variable raises KeyError naming the variable.
# NOTE(review): the key/secret pair that used to be hardcoded in this cell was
# published with the notebook and should be revoked and regenerated.
key = os.environ['GOODREADS_API_KEY']
secret = os.environ['GOODREADS_API_SECRET']
goodreads = client.GoodreadsClient(key, secret)

In [4]:
def _assemble_dates(frame, year_col, month_col, day_col):
    """Combine three year/month/day columns of `frame` into one datetime Series.

    The parts are cast to integers first (the "End ..." columns are loaded as
    floats such as 2014.0) and relabelled `year`/`month`/`day`, which is the
    layout `pd.to_datetime` assembles dates from. This replaces row-wise
    `pd.datetime(...)` calls: that alias was deprecated in pandas 1.0 and
    removed in pandas 2.0.
    """
    parts = frame[[year_col, month_col, day_col]].astype(np.int64)
    parts.columns = ['year', 'month', 'day']
    return pd.to_datetime(parts)


# "Text" looks like "<author>, <N> pages"; both pieces are pulled out with
# regular expressions. The greedy `(.*),` keeps everything before the last comma.
book_text = raw_df['Text']

df = pd.DataFrame({
    'author': book_text.str.extract(r'(.*),.*', expand=False),
    'title': raw_df['Headline'],
    'start_date': _assemble_dates(raw_df, 'Year', 'Month', 'Day'),
    'end_date': _assemble_dates(raw_df, 'End Year', 'End Month', 'End Day'),
    'pages': (book_text
                  .str.extract(r'.*, (\d+) pages', expand=False)
                  .astype(np.int64)),
})

# Whole days elapsed between the start date and the end date of each book.
df['days'] = (df['end_date'] - df['start_date']).dt.days

# Fix the column order explicitly rather than relying on dict ordering.
df = df[[
    'author', 'title',
    'start_date', 'end_date', 'days',
    'pages'
]]

In [5]:
# Preview the first three rows of the reshaped frame.
df.head(3)

Unnamed: 0,author,title,start_date,end_date,days,pages
0,Angela Carter,The Bloody Chamber,2014-12-13,2014-12-23,10,126
1,Roland Huntford,The Last Place on Earth,2014-12-23,2015-01-04,12,564
2,Richard Russo,Empire Falls,2015-01-24,2015-02-13,20,483


In [70]:
def get_top_shelf(goodreads, book_name,
                  ignored_shelves=('to-read', 'currently-reading')):
    """Return the first meaningful shelf of the best search hit for a book.

    Parameters
    ----------
    goodreads : object
        Client exposing ``search_books(name)``; the first element of its
        result must expose an iterable ``popular_shelves``.
    book_name : str
        Title to search for. Non-string or blank values yield ``None``
        without calling the client.
    ignored_shelves : collection of str, optional
        Shelf names to skip. Defaults to the reading-status shelves
        ``'to-read'`` and ``'currently-reading'``.

    Returns
    -------
    str or None
        ``str()`` of the first entry of ``popular_shelves`` that is not
        ignored, or ``None`` when there is no search hit, no such shelf, or
        the lookup raised an error.
    """
    if not isinstance(book_name, str) or not book_name.strip():
        return None
    try:
        books = goodreads.search_books(book_name)
        if not books or books[0] is None:
            return None
        for shelf in books[0].popular_shelves:
            shelf_name = str(shelf)
            if shelf_name not in ignored_shelves:
                return shelf_name
    except Exception:
        # Best-effort lookup: any client or parsing failure means "no shelf".
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long run of lookups can be stopped.
        return None
    return None

In [54]:
def _shelf_for_title(title):
    """Look up the top shelf for one title; a `None` title maps to `None`."""
    if title is None:
        return None
    return get_top_shelf(goodreads, str(title))


# One Goodreads lookup per row of the frame.
df['top_shelf'] = df['title'].map(_shelf_for_title)

In [None]:
# Map every title to its top shelf. A title that occurs more than once is
# looked up again and simply overwrites its own entry.
name_to_shelf = {
    title: get_top_shelf(goodreads, title)
    for title in df.title.values
}

In [None]:
# Display the title -> top shelf mapping.
name_to_shelf