In [35]:
import requests
import json
import pandas as pd

# read the data from the API
# fetch the specific data that appear in the books-isbns.txt file

def get_book(book):
    url_books = 'https://openlibrary.org/ISBN/{}.json'.format(book)
    res = requests.get(url_books, verify=False)
    return res

with open('books-isbns.txt', 'r') as f:
    books = [line.strip() for line in f]
    
dataset = []
for b in books:
    try:
        res = get_book(b)
        if (res.status_code != 404): # not error 404 - not fund
            dataset.append(res.json())
    except:
        print("An exception occurred")

# write the data to json file
with open('dataset.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

In [521]:
# read the dataset from the json file and convert to dataframe
df = pd.read_json('dataset.json')

In [538]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [539]:
# clean the data

df["identifiers"].fillna(0, inplace = True)
df["authors"].fillna(0, inplace = True)
df["description"].fillna(0, inplace = True)
df["first_sentence"].fillna(0, inplace = True)


df = df.loc[:,['title', 'authors', 'publish_date', 'number_of_pages', 'publishers', 'isbn_10', 'isbn_13', 'full_title', 'created', 'last_modified',
       'description', 'identifiers', 'first_sentence']]


In [540]:
# 1 & 2
title = df["title"]
title.describe().loc[["unique", "top"]]

unique                 475
top       The Night Circus
Name: title, dtype: object

In [541]:
# 3

identify = df["identifiers"]
identify = identify.to_dict()

goodreads = 0
for id in identify.values():
    if (id != 0 and id.get("goodreads", None)):
        goodreads += 1

print(len(identify) - goodreads)

359


In [542]:
# 4

authors = df["authors"]
authors = authors.to_dict()


some_authors = 0
for a in authors.values():
    if (a != 0 and len(a) > 1):
        some_authors += 1
        
some_authors

34

In [543]:
# 5
df['publishe'] = df.publishers.apply(lambda x: str(x))
df['publishe'] = df.publishe.apply(lambda x: str(x[2:len(x)-2]))
num_books_per_publisher = df['publishe'].value_counts().rename_axis('publisher').reset_index(name='counts')
num_books_per_publisher


Unnamed: 0,publisher,counts
0,HarperCollins,77
1,Orbit,40
2,Voyager,31
3,Del Rey,25
4,,24
5,Vintage,22
6,Harper & Row,19
7,Oxford University Press,19
8,HarperCollins Publishers,18
9,Eos,16


In [544]:
# 6
df['number_of_pages'].median(skipna=True)


340.0

In [545]:
# 7
# solve the multiple datetime format, can be more formatt I didnt notice.. can be deviation in the result
df['Dates'] = pd.to_datetime(df['publish_date'], format="%b %d, %Y", errors="coerce")  \
      .fillna(pd.to_datetime(df['publish_date'], format="%B %d, %Y", errors="coerce"))

df['Month'] = pd.DatetimeIndex(df['Dates']).month
df['Month'].value_counts().iloc[:1]


5.0    48
Name: Month, dtype: int64

In [546]:
# 8
longest_word = df[['title', 'description','first_sentence']]
#longest_word["max_in_description"] = longest_word.description.apply(lambda x: max(x["value"].split(), key=len) if x else 0)
#longest_word["max_in_first_sentence"] = longest_word.description.apply(lambda x: max(x["value"].split(), key=len) if x else 0)
longest_word


Unnamed: 0,title,description,first_sentence
0,Deadeye Dick,0,0
1,Bluebeard,0,0
2,Cat's Cradle,0,0
3,"Sun, moon, star","{'type': '/type/text', 'value': 'When the Crea...",0
4,Slaughterhouse Five,0,0
5,Breakfast of Champions,0,0
6,Slaughterhouse-Five (or The Children's Crusade...,0,0
7,Breakfast of Champions CD,0,0
8,Welcome to the Monkey House CD,0,0
9,Cat's Cradle CD,0,0


In [547]:
# 9
df['Dates'] = pd.to_datetime(df['publish_date'], format="%b %d, %Y", errors="coerce")  \
      .fillna(pd.to_datetime(df['publish_date'], format="%B %d, %Y", errors="coerce")) \
      .fillna(pd.to_datetime(df['publish_date'], format="%Y", errors="coerce"))
df['Dates']
new_df = df[['title','Dates']]
new_df = new_df.sort_values("Dates", ascending=False)
new_df = new_df.iloc[:1]

new_df

Unnamed: 0,title,Dates
242,Dearly,2021-11-02


In [548]:
# 10
updated = df[['title','last_modified']]
updated["date"] = updated.last_modified.apply(lambda x: x["value"])
updated = updated.sort_values("date", ascending=False)
updated = updated.iloc[:1]

updated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated["date"] = updated.last_modified.apply(lambda x: x["value"])


Unnamed: 0,title,last_modified,date
600,Temeraire,"{'type': '/type/datetime', 'value': '2021-11-0...",2021-11-02T07:30:57.214830


In [549]:
# 11

max_title = df[['title','authors', 'Dates']]
#pd.pivot_table(max_title,index=["authors"], values=["Dates"], columns=["title"], fill_value=0)
max_title


Unnamed: 0,title,authors,Dates
0,Deadeye Dick,[{'key': '/authors/OL20187A'}],2019-07-11
1,Bluebeard,[{'key': '/authors/OL20187A'}],2019-07-11
2,Cat's Cradle,[{'key': '/authors/OL20187A'}],1963-05-05
3,"Sun, moon, star",[{'key': '/authors/OL20187A'}],1980-01-01
4,Slaughterhouse Five,[{'key': '/authors/OL20187A'}],2003-11-04
5,Breakfast of Champions,[{'key': '/authors/OL20187A'}],2004-03-02
6,Slaughterhouse-Five (or The Children's Crusade...,[{'key': '/authors/OL20187A'}],2003-11-04
7,Breakfast of Champions CD,[{'key': '/authors/OL20187A'}],2004-03-02
8,Welcome to the Monkey House CD,[{'key': '/authors/OL20187A'}],2006-05-30
9,Cat's Cradle CD,[{'key': '/authors/OL20187A'}],2007-11-01


In [550]:
# 12
pair = df["publishe"].to_frame().join(df["authors"].to_frame())
pair['pair_publish_author'] = tuple(zip(pair['publishe'], pair['authors']))
pair = pair['pair_publish_author'].value_counts()

pair.iloc[1:2] # in the first place was: (HarperCollins, 0) when the publish is HarperCollins and not appear which author

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


(Orbit, [{'key': '/authors/OL7468598A'}])    16
Name: pair_publish_author, dtype: int64

In [551]:
f = open("questions.txt","w") 
f.write("q1 & q2 :\n\n" + str(title.describe().loc[["unique", "top"]]))
f.write("\n\nq3:\n\n" + str(len(identify) - goodreads))
f.write("\n\nq4:\n\n" + str(some_authors))
#f.write("\n\nq5:\n\n" + str(num_books_per_publisher))
f.write("\n\nq6:\n\n" + str(df['number_of_pages'].median(skipna=True)))
f.write("\n\nq7:\n\n" + str(df['Month'].value_counts().iloc[:1]))
f.write("\n\nq8:\n\n")
f.write("\n\nq9:\n\n " + str(new_df))
f.write("\n\nq10:\n\n" + str(updated))
f.write("\n\nq11:\n\n")
f.write("\n\nq12:\n\n"  + str(pair.iloc[1:2]))

f.close()