In [5]:
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.goodreads.com/author/on_goodreads?page={page}&skip_cache=true'
author_names = []

#If we were to visit the above mentioned URL, we would realise that each page has the names of only 30 authors.
#Hence we would have to go through 2 such pages.
for page in [1, 2]:
    response = requests.get(BASE_URL.format(page=page))
    #This line sends an HTTP GET request to a URL, with a placeholder for the page number. The {page} is filled in with the value of page from the loop.
    
    # now we check if the request was successful (this is something we are adviced to do in general).
    if response.status_code == 200:
        #status code 200 indicates a successful request.
        soup = BeautifulSoup(response.content, 'html.parser')

        
        authors_tags = soup.select('a.bookAuthorProfile__name')
        #When we inspect the website, we find out that the author names all belong to a class called "bookAuthorProfile__name".
        
        # Then we extract text from each such element and add it to the list.
        for author in authors_tags:
            author_names.append(author.get_text(strip=True))
            
        
        if len(author_names) >= 50:
            break
        # We break out of the loop if we have 50 or more names already.
            
    else:
        print(f"Failed to retrieve webpage for page {page}. Status code: {response.status_code}")

# Even though its unlikely, in the offchance that we entered more than 50 names, the step below will filter only the top 50 and keep them in the variable.
author_names = author_names[:50]
print(author_names)



['Stephen King', 'Colleen Hoover', 'Sarah J. Maas', 'Rick Riordan', 'Cassandra Clare', 'James Patterson', 'John Green', 'Neil Gaiman', 'Taylor Jenkins Reid', 'Dan Brown', 'Nicholas Sparks', 'Brandon Sanderson', 'Veronica Roth', 'Leigh Bardugo', 'John Grisham', 'Jennifer L. Armentrout', 'Jodi Picoult', 'Nora Roberts', 'Gillian Flynn', 'Kristin Hannah', 'Holly Black', 'Khaled Hosseini', 'Emily Henry', 'Margaret Atwood', 'Paulo Coelho', 'Kiera Cass', 'E.L. James', 'Christina Lauren', 'Janet Evanovich', 'Liane Moriarty', 'Jojo Moyes', 'Marissa Meyer', 'Rainbow Rowell', 'Diana Gabaldon', 'Fredrik Backman', 'Paula Hawkins', 'Jenny Han', 'Richelle Mead', 'Dean Koontz', 'Charlaine Harris', 'Madeline Miller', 'Elle Kennedy', 'Lois Lowry', 'Sophie Kinsella', 'Julia Quinn', 'Markus Zusak', 'James Dashner', 'Lee Child', 'Ali Hazelwood', 'Freida McFadden']


In [6]:
#One thing we can observe on the website is that when we click on the author's name, we are taken to a page which displays the author's website.
#Its important to note that this holds true for almost all the authors.
#Whats interesting is that each author's website has the exact same format with the only thing that changes being the author's name.
#For example, if the name of an author is "xyz" then their website's URL would be "https://xyz.com/".

author_websites = [f"https://{author.lower().replace(' ', '')}.com/" for author in author_names]
print(author_websites)


['https://stephenking.com/', 'https://colleenhoover.com/', 'https://sarahj.maas.com/', 'https://rickriordan.com/', 'https://cassandraclare.com/', 'https://jamespatterson.com/', 'https://johngreen.com/', 'https://neilgaiman.com/', 'https://taylorjenkinsreid.com/', 'https://danbrown.com/', 'https://nicholassparks.com/', 'https://brandonsanderson.com/', 'https://veronicaroth.com/', 'https://leighbardugo.com/', 'https://johngrisham.com/', 'https://jenniferl.armentrout.com/', 'https://jodipicoult.com/', 'https://noraroberts.com/', 'https://gillianflynn.com/', 'https://kristinhannah.com/', 'https://hollyblack.com/', 'https://khaledhosseini.com/', 'https://emilyhenry.com/', 'https://margaretatwood.com/', 'https://paulocoelho.com/', 'https://kieracass.com/', 'https://e.l.james.com/', 'https://christinalauren.com/', 'https://janetevanovich.com/', 'https://lianemoriarty.com/', 'https://jojomoyes.com/', 'https://marissameyer.com/', 'https://rainbowrowell.com/', 'https://dianagabaldon.com/', 'http

In [21]:
#In this cell we will use the Open Library API to get the names of 5 books written by each author that we have in our "author_names" list.
#the URL for the docs for this website is here - "https://openlibrary.org/developers/api".




author_data = []
#We have created an empty list. This list would go on to hold 50 dictionaries where each dictionary would represent an author.

for author_name in author_names:
    search_url = f"https://openlibrary.org/search.json?author={requests.utils.quote(author_name)}"
    #We construct a URL using Open Library API for each author name. "requests.utils.quote(author_name)" is used to ensure that author names with spaces or special characters are correctly encoded in the URL.
    response = requests.get(search_url)
    #Once again, making get requests.

    if response.status_code == 200:
        data = response.json()
        #Open Library API returns data in JSON format.
        
        docs = data['docs']

        book_titles_list = []  # List to store book titles for the current author.
        author_details = ''  # Variable to store the author name.
        
        
        #Now we iterate through the 'docs' field in the JSON data. Each 'doc' represents a book.
        for doc in docs:
            if 'title' in doc:
                book_titles_list.append(doc['title'])
                if len(book_titles_list) >= 5:  # We stop after storing 5 book titles.
                    break
            
            # We assume that author name structure is consistent across docs
            if 'author_name' in doc and not author_details:
                author_details = ', '.join(doc['author_name'])
            #When we are processing a list of books, some books may contain an 'author_name' field while others may not. 
            #By using author_details, we can capture the author's name from the first book that contains it without overwriting the variable for subsequent documents that may not have an 'author_name' field.
        
        #In case either or both are not found - 
        if not author_details:
            author_details = 'Author details not found'
            
        if not book_titles_list:
            book_titles_list.append('No books found')
        
        #We append the author name and the 5 books(in dictionary form) into our list of dictionaries.
        author_data.append({
            'Author': author_details,
            'Books': ", ".join(book_titles_list)
        })

    else:
        print(f"Failed to retrieve data for {author_name}. Status code: {response.status_code}")
        #In case we dont find the author using the API
        #This is possible as we are using a different website for getting the book names compared to theone of which we performed web scraping to get the author names.

# Printing the final data.
for xyz in author_data:
    print(xyz)


{'Author': 'Stephen King', 'Books': 'Carrie, The Shining, Misery, It, Skeleton Crew'}
{'Author': 'Colleen Hoover', 'Books': 'It Ends With Us, Verity, It Starts with Us, Point Of Retreat, Ugly Love'}
{'Author': 'Sarah J. Maas', 'Books': 'A Court of Thorns and Roses, A Court of Mist and Fury, Throne of Glass, Heir of Fire, Tower of Dawn'}
{'Author': 'Rick Riordan', 'Books': "The Lightning Thief, The Lost Hero, The Sea of Monsters, The Battle of the Labyrinth, The Titan's Curse"}
{'Author': 'Holly Black, Kate Rudd', 'Books': 'Tithe, The Cruel Prince, Valiant, The Wicked King, Ironside'}
{'Author': 'James Patterson', 'Books': '1st to Die, Along Came a Spider, 2nd Chance, Roses Are Red, Jack & Jill'}
{'Author': 'John Green', 'Books': 'The Fault in Our Stars, Looking for Alaska, Paper towns, A short history of the English people, An Abundance of Katherines'}
{'Author': 'Neil Gaiman', 'Books': 'Coraline, American Gods, Good Omens, Stardust, The Ocean at the End of the Lane'}
{'Author': 'Taylo

In [23]:
#In this cell we will create a function that gets the summary of a particular title from Google's API.
#First we need to anable "Books API" and generate an API key on google cloud console.
#I have substituted my actual API key with "xyz" for security reasons.


import requests

def get_book_summary_from_google(title, api_key):
    endpoint = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{requests.utils.quote(title)}&key={api_key}"
    response = requests.get(endpoint)
    data = response.json()
    #This line parses the JSON response from the Google Books API and stores it in the "data" variable. 
    #This data should contain information about books, including their summaries.

    
    if "items" in data and len(data["items"]) > 0 and "volumeInfo" in data["items"][0] and "description" in data["items"][0]["volumeInfo"]:
        # Now we retrieve the description from the first result, if its available.
        return data["items"][0]["volumeInfo"]["description"]
    else:
        return "Summary not available"

# my google API key which I will need while calling the function defined above.
API_KEY = 'xyz'

#The loop below will add the summary for each author.
for author_entry in author_data:
    updated_books = []
    for book in author_entry['Books'].split(", "):  # We split the books back into a list.
        summary = get_book_summary_from_google(book, API_KEY)
        updated_books.append(f"{book} ({summary})")
        #The "updated_books" list will contain the names of each book followed by their summary inside curved brackets.
        
    author_entry['Books'] = ", ".join(updated_books)
    #Now the books key will have the "updated_books" list as its value.
    #However it wont be in a list form and all the elements will be seperated by a comma. 
    #This will happen for each author.


for something in author_data:
    print(something)

{'Author': 'Stephen King', 'Books': 'Carrie (Make a date with terror -- and live the nightmare that is ...), The Shining (A narrative history of the unlikely Maoist rebellion that terrorized Peru even after the fall of global Communism. On May 17, 1980, on the eve of Peru’s presidential election, five masked men stormed a small town in the Andean heartland. They set election ballots ablaze and vanished into the night, but not before planting a red hammer-and-sickle banner in the town square. The lone man arrested the next morning later swore allegiance to a group called Shining Path. The tale of how this ferocious group of guerrilla insurgents launched a decade-long reign of terror, and how brave police investigators and journalists brought it to justice, may be the most compelling chapter in modern Latin American history, but the full story has never been told. Described by a U.S. State Department cable as “cold-blooded and bestial,” Shining Path orchestrated bombings, assassinations,

In [11]:
import pandas as pd


# We start by converting list of dictionaries into dataframe.
df = pd.DataFrame(author_data)

# We split the 'Books' column by ',' to convert it into a list of books. This is necessary for the next step (exploding).
df['Books'] = df['Books'].str.split(r'\),')


df = df.explode('Books')
#The dataframe is exploded on the 'Books' column, which means that each row is duplicated for each book in the list.

df['Books'] = df['Books'].str.strip() + ')'
#To make sure that every row in the 'Books' column ends with a ')'. 
#This operation is performed to make sure that the data in the 'Books' column is correctly formatted.


df[['Book Name', 'Summary']] = df['Books'].str.extract(r'(.+?)\s*\((.+)\)')
#This line extracts the 'Book Name' and 'Summary' from the 'Books' column using a regular expression.
# It captures two groups within each book entry: the book name and the summary.

# We then drop the original 'Books' column.
df = df.drop(columns=['Books'])

print(df)



             Author                  Book Name  \
0      Stephen King                     Carrie   
0      Stephen King                The Shining   
0      Stephen King                     Misery   
0      Stephen King  "dazzlingly well-written"   
0      Stephen King       and "truly gripping"   
..              ...                        ...   
49  Freida McFadden              The breakdown   
49  Freida McFadden              Bring me back   
49  Freida McFadden        Behind closed doors   
49  Freida McFadden                    Dilemma   
49  Freida McFadden              The Therapist   

                                              Summary  
0   Make a date with terror -- and live the nightm...  
0   A narrative history of the unlikely Maoist reb...  
0   The #1 New York Times bestseller about a famou...  
0                               The Indianapolis Star  
0                                   Publishers Weekly  
..                                                ...  
49  Nat

In [12]:
author_data3 = author_data
#As "author_data" took a lot of time and processing power to create, if something were to happen which would cause me to lose it I would have to go through the entire process again. Hence for safekeep, I created a copy of it.

In [14]:
df['Websites'] = [f"https://{author.lower().replace(' ', '')}.com/" for author in df['Author']]
#To create the "Websites" column.

In [28]:
df.rename(columns={'Websites': 'Website'}, inplace=True)
#Change the name of the column from "Websites" to "Website".

df

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,Stephen King,The Shining,A narrative history of the unlikely Maoist reb...,https://stephenking.com/
0,Stephen King,Misery,The #1 New York Times bestseller about a famou...,https://stephenking.com/
0,Stephen King,"""dazzlingly well-written""",The Indianapolis Star,https://stephenking.com/
0,Stephen King,"and ""truly gripping""",Publishers Weekly,https://stephenking.com/
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,Freida McFadden,Bring me back,The million-copy bestselling author B A Paris ...,https://freidamcfadden.com/
49,Freida McFadden,Behind closed doors,***The four-million-copy International Bestsel...,https://freidamcfadden.com/
49,Freida McFadden,Dilemma,The Himalayas have experienced a population ex...,https://freidamcfadden.com/


In [20]:
print(df['Summary'].iloc[1])

A narrative history of the unlikely Maoist rebellion that terrorized Peru even after the fall of global Communism. On May 17, 1980, on the eve of Peru’s presidential election, five masked men stormed a small town in the Andean heartland. They set election ballots ablaze and vanished into the night, but not before planting a red hammer-and-sickle banner in the town square. The lone man arrested the next morning later swore allegiance to a group called Shining Path. The tale of how this ferocious group of guerrilla insurgents launched a decade-long reign of terror, and how brave police investigators and journalists brought it to justice, may be the most compelling chapter in modern Latin American history, but the full story has never been told. Described by a U.S. State Department cable as “cold-blooded and bestial,” Shining Path orchestrated bombings, assassinations, and massacres across the cities, countryside, and jungles of Peru in a murderous campaign to seize power and impose a Com

In [29]:
pf = df
jf = df
pf
#Safekeep

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,Stephen King,The Shining,A narrative history of the unlikely Maoist reb...,https://stephenking.com/
0,Stephen King,Misery,The #1 New York Times bestseller about a famou...,https://stephenking.com/
0,Stephen King,"""dazzlingly well-written""",The Indianapolis Star,https://stephenking.com/
0,Stephen King,"and ""truly gripping""",Publishers Weekly,https://stephenking.com/
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,Freida McFadden,Bring me back,The million-copy bestselling author B A Paris ...,https://freidamcfadden.com/
49,Freida McFadden,Behind closed doors,***The four-million-copy International Bestsel...,https://freidamcfadden.com/
49,Freida McFadden,Dilemma,The Himalayas have experienced a population ex...,https://freidamcfadden.com/


In [31]:
pf['Author'] = pf['Author'].where(pf['Author'] != pf['Author'].shift(), '')
pf['Website'] = pf['Website'].where(pf['Website'] != pf['Website'].shift(), '')
#We replace values in the 'Author' and 'Website' columns of a DataFrame pf with empty strings ('') where the value in the current row is the same as the value in the previous row.

In [32]:
pf

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,,The Shining,A narrative history of the unlikely Maoist reb...,
0,,Misery,The #1 New York Times bestseller about a famou...,
0,,"""dazzlingly well-written""",The Indianapolis Star,
0,,"and ""truly gripping""",Publishers Weekly,
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,,Bring me back,The million-copy bestselling author B A Paris ...,
49,,Behind closed doors,***The four-million-copy International Bestsel...,
49,,Dilemma,The Himalayas have experienced a population ex...,


In [34]:
df['Author'] = df['Author'].where(df['Author'] != df['Author'].shift(), '')
df['Website'] = df['Website'].where(df['Website'] != df['Website'].shift(), '')

In [35]:
df

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,,The Shining,A narrative history of the unlikely Maoist reb...,
0,,Misery,The #1 New York Times bestseller about a famou...,
0,,"""dazzlingly well-written""",The Indianapolis Star,
0,,"and ""truly gripping""",Publishers Weekly,
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,,Bring me back,The million-copy bestselling author B A Paris ...,
49,,Behind closed doors,***The four-million-copy International Bestsel...,
49,,Dilemma,The Himalayas have experienced a population ex...,


In [36]:
jf

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,,The Shining,A narrative history of the unlikely Maoist reb...,
0,,Misery,The #1 New York Times bestseller about a famou...,
0,,"""dazzlingly well-written""",The Indianapolis Star,
0,,"and ""truly gripping""",Publishers Weekly,
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,,Bring me back,The million-copy bestselling author B A Paris ...,
49,,Behind closed doors,***The four-million-copy International Bestsel...,
49,,Dilemma,The Himalayas have experienced a population ex...,


In [42]:
import numpy as np

pf['Author'].replace('', np.nan, inplace=True)
#We replace the empty author names with Null values.
pf['Author'] = pf['Author'].fillna(method='ffill')
#The missing author names will be filled with the last valid author name found in the 'Author' column.

In [43]:
pf

Unnamed: 0,Author,Book Name,Summary,Website
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/
0,Stephen King,The Shining,A narrative history of the unlikely Maoist reb...,
0,Stephen King,Misery,The #1 New York Times bestseller about a famou...,
0,Stephen King,"""dazzlingly well-written""",The Indianapolis Star,
0,Stephen King,"and ""truly gripping""",Publishers Weekly,
...,...,...,...,...
49,Freida McFadden,The breakdown,National Book Award Finalist: “This man’s idea...,https://freidamcfadden.com/
49,Freida McFadden,Bring me back,The million-copy bestselling author B A Paris ...,
49,Freida McFadden,Behind closed doors,***The four-million-copy International Bestsel...,
49,Freida McFadden,Dilemma,The Himalayas have experienced a population ex...,


In [44]:
#Now we define a function to get the genre of each book via google books API.
def get_genre(book_name, author):
    url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{book_name}+inauthor:{author}&key=xyz"
    #I replced my API key with "xyz" for security purposes.
    response = requests.get(url)
    data = response.json()
    
    # I assume the genre is under 'categories' key in the first item of the 'items' list.
    # This might not always be the case, so error handling is essential.
    try:
        genre = data['items'][0]['volumeInfo']['categories'][0]
        return genre
    except (KeyError, IndexError):
        return 'Genre not found'

# Apply the function to populate the "Genre" column
pf['Genre'] = pf.apply(lambda row: get_genre(row['Book Name'], row['Author']), axis=1)
print(pf)

             Author                  Book Name  \
0      Stephen King                     Carrie   
0      Stephen King                The Shining   
0      Stephen King                     Misery   
0      Stephen King  "dazzlingly well-written"   
0      Stephen King       and "truly gripping"   
..              ...                        ...   
49  Freida McFadden              The breakdown   
49  Freida McFadden              Bring me back   
49  Freida McFadden        Behind closed doors   
49  Freida McFadden                    Dilemma   
49  Freida McFadden              The Therapist   

                                              Summary  \
0   Make a date with terror -- and live the nightm...   
0   A narrative history of the unlikely Maoist reb...   
0   The #1 New York Times bestseller about a famou...   
0                               The Indianapolis Star   
0                                   Publishers Weekly   
..                                                ...   


In [63]:
df = pf
df.head(10)

Unnamed: 0,Author,Book Name,Summary,Website,Genre
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/,Bullying
0,Stephen King,The Shining,A narrative history of the unlikely Maoist reb...,,Fiction
0,Stephen King,Misery,The #1 New York Times bestseller about a famou...,,Fiction
0,Stephen King,It,Summary not available,,Fiction
0,Stephen King,Skeleton Crew,Age-old images of fear fuse with the iconograp...,,Fiction
1,Colleen Hoover,It Ends With Us,In this “brave and heartbreaking novel that di...,https://colleenhoover.com/,Fiction
1,Colleen Hoover,Verity,Whose truth is the lie? Stay up all night read...,,Fiction
1,Colleen Hoover,It Starts with Us,"Before It Ends with Us, it started with Atlas....",,Fiction
1,Colleen Hoover,Point Of Retreat,From the #1 New York Times bestselling author ...,,Fiction
1,Colleen Hoover,Ugly Love,"From Colleen Hoover, the #1 New York Times bes...",,Fiction


In [66]:
df['Author'] = df['Author'].where(df['Author'] != df['Author'].shift(), '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Author'] = df['Author'].where(df['Author'] != df['Author'].shift(), '')


In [69]:
df.head(15)

Unnamed: 0,Author,Book Name,Summary,Website,Genre
0,Stephen King,Carrie,Make a date with terror -- and live the nightm...,https://stephenking.com/,Bullying
0,,The Shining,A narrative history of the unlikely Maoist reb...,,Fiction
0,,Misery,The #1 New York Times bestseller about a famou...,,Fiction
0,,It,Summary not available,,Fiction
0,,Skeleton Crew,Age-old images of fear fuse with the iconograp...,,Fiction
1,Colleen Hoover,It Ends With Us,In this “brave and heartbreaking novel that di...,https://colleenhoover.com/,Fiction
1,,Verity,Whose truth is the lie? Stay up all night read...,,Fiction
1,,It Starts with Us,"Before It Ends with Us, it started with Atlas....",,Fiction
1,,Point Of Retreat,From the #1 New York Times bestselling author ...,,Fiction
1,,Ugly Love,"From Colleen Hoover, the #1 New York Times bes...",,Fiction


In [73]:
# My desired column order - 
column_order = ['Author', 'Website', 'Book Name', 'Genre', 'Summary']

# Reassigning the columns of df based on my desired order.
df = df[column_order]
df.head(10)


Unnamed: 0,Author,Website,Book Name,Genre,Summary
0,Stephen King,https://stephenking.com/,Carrie,Bullying,Make a date with terror -- and live the nightm...
0,,,The Shining,Fiction,A narrative history of the unlikely Maoist reb...
0,,,Misery,Fiction,The #1 New York Times bestseller about a famou...
0,,,It,Fiction,Summary not available
0,,,Skeleton Crew,Fiction,Age-old images of fear fuse with the iconograp...
1,Colleen Hoover,https://colleenhoover.com/,It Ends With Us,Fiction,In this “brave and heartbreaking novel that di...
1,,,Verity,Fiction,Whose truth is the lie? Stay up all night read...
1,,,It Starts with Us,Fiction,"Before It Ends with Us, it started with Atlas...."
1,,,Point Of Retreat,Fiction,From the #1 New York Times bestselling author ...
1,,,Ugly Love,Fiction,"From Colleen Hoover, the #1 New York Times bes..."
