In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests 
import pandas as pd
import time
import re

In [2]:
from bs4 import BeautifulSoup as bs

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

options = Options()
options.add_argument("--headless")

# Genres

In [4]:
# Download the 2021 Choice Awards landing page; the genre headings are read from it below.
url = "https://www.goodreads.com/choiceawards/best-books-2021?ref=nav_brws_gca"
page = requests.get(url)
# Parse the raw response bytes with Python's built-in HTML parser.
soup = bs(markup=page.content, features='html.parser')

In [5]:
def _genre_slug(heading_text):
    """Turn a category heading into the lower-case slug used in the award URLs.

    "&" becomes "-", apostrophes are removed, and the text is lower-cased.
    Headings that contained "&" (now " - ") have all spaces removed; any
    other heading has its spaces replaced by "-".
    """
    slug = heading_text.strip().replace("&", "-").replace("'", '').lower()
    if ' - ' in slug:
        return slug.replace(' ', '')
    # replace() leaves single-word headings unchanged, so no extra branch is needed
    return slug.replace(' ', '-')


# one slug per <h4 class="category__copy"> heading on the landing page
genres = [
    _genre_slug(heading.get_text())
    for heading in soup.find_all('h4', {'class': 'category__copy'})
]

In [6]:
genres

['fiction',
 'mystery-thriller',
 'historical-fiction',
 'fantasy',
 'romance',
 'science-fiction',
 'horror',
 'humor',
 'nonfiction',
 'memoir-autobiography',
 'history-biography',
 'graphicnovels-comics',
 'poetry',
 'debut-novel',
 'young-adult-fiction',
 'young-adult-fantasy',
 'middlegrade-childrens']

# Choice Awards nominees (all genres)

In [92]:
#runs over 3 hours

def choice_awards_fun(genres_list, start, end):
    """Scrape Goodreads Choice Awards nominees for every genre/year combination.

    For each genre slug in ``genres_list`` and each year in
    ``range(start, end)`` the category page is downloaded, and the page of
    every nominee linked from it is downloaded as well to read the book
    description.

    Parameters
    ----------
    genres_list : iterable of str
        Genre slugs as used in the category URL
        (``.../choiceawards/best-<genre>-books-<year>``).
    start, end : int
        Bounds handed to ``range(start, end)``; ``end`` is exclusive.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author, Title, Ranking, Year, Genre, Description``.
        The first row is an all-empty-string placeholder; it is kept because
        the cells that consume this frame remove it with ``[1:]``.
        ``Ranking`` is the 1-based position of the nominee link on the
        category page. ``Description`` is ``'no text'`` when none of the known
        description elements is present on the book page.
    """
    columns = ['Author', 'Title', 'Ranking', 'Year', 'Genre', 'Description']

    # Description containers differ between book-page layouts; tried in order.
    description_classes = ('readable stacked',
                           'TruncatedText__text TruncatedText__text--5')

    def book_description(book_soup):
        # Return the text of the first description container found, else 'no text'.
        for css_class in description_classes:
            node = book_soup.find('div', class_=css_class)
            if node is not None:
                return node.text
        return 'no text'

    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append copied the whole frame per row and no longer exists in pandas 2.
    records = [dict.fromkeys(columns, '')]

    for genre in genres_list:
        for year in range(start, end):
            url = 'https://www.goodreads.com/choiceawards/best-' + genre + '-books-' + str(year)
            category_soup = bs(requests.get(url).content, 'lxml')
            nominees = category_soup.find_all('a', {'class': 'pollAnswer__bookLink'})

            for rank, book in enumerate(nominees, start=1):
                # The cover's title attribute reads "<title> by <author>"; split on the
                # LAST " by " so titles that themselves contain " by " stay intact.
                label = book.img.get('title')
                parts = label.rsplit(' by ', 1)
                if len(parts) == 2:
                    title, author = parts
                else:
                    title, author = label, ''

                # fetch the individual book page for its description
                link = 'https://www.goodreads.com' + book.get('href')
                book_soup = bs(requests.get(link).content, 'lxml')

                records.append({'Author': author, 'Title': title, 'Ranking': rank,
                                'Year': year, 'Genre': genre,
                                'Description': book_description(book_soup)})

    return pd.DataFrame(records, columns=columns)



In [13]:
#get votes
start_time = time.time()

vote_columns = ['Year', 'Genre', 'Ranking', 'Votes']
# Rows are collected in a list and converted once at the end; DataFrame.append
# copied the frame on every call and was removed in pandas 2.0.
# The blank first record is kept on purpose: a later cell strips it off.
vote_records = [dict.fromkeys(vote_columns, '')]

for genre in genres:
    for year in range(2011, 2022):
        url = 'https://www.goodreads.com/choiceawards/best-' + genre + '-books-' + str(year)
        page = requests.get(url)
        soup = bs(page.content, 'lxml')

        # vote counters appear in page order, so the 1-based position is the ranking
        for rank, vote in enumerate(soup.find_all('strong', class_ = 'uitext result'), start=1):
            vote_records.append({'Year': year, 'Genre': genre, 'Ranking': rank,
                                 'Votes': vote.text.strip()})

votes_result = pd.DataFrame(vote_records, columns=vote_columns)
print("--- %s seconds ---" % (time.time() - start_time))

--- 456.05928444862366 seconds ---


In [14]:
# Strip the literal "\nvotes" suffix. regex=False makes the literal match explicit
# instead of relying on the default, which differs between pandas versions.
votes_result['Votes'] = votes_result['Votes'].str.replace('\nvotes', '', regex=False)

In [15]:
# Remove the blank placeholder row by content (its Genre is the empty string) rather
# than by position, so re-running this cell cannot discard a real row.
votes_result = votes_result[votes_result['Genre'] != '']

In [16]:
votes_result.to_csv('votes_result.csv', index = False)

In [107]:
votes_result = pd.read_csv('votes_result.csv')

In [9]:
#runs about 3 hours
start_time = time.time()
choice_awards = choice_awards_fun(genres,start = 2011, end = 2022)
print("--- %s seconds ---" % (time.time() - start_time))

--- 10565.85234618187 seconds ---


In [11]:
choice_awards.to_csv('choice_awards.csv')

In [106]:
# Reload the scraped nominees, discard the saved index column and the blank first row.
choice_awards_full = (
    pd.read_csv('choice_awards.csv')
    .drop(columns='Unnamed: 0')
    .iloc[1:]
)

In [108]:
df = votes_result.merge(choice_awards_full, how = 'left', on = ['Year', 'Ranking', 'Genre'])

In [109]:
#get books where the description, author and title didn't fill up
df[df['Description'].isnull()]

Unnamed: 0,Year,Genre,Ranking,Votes,Author,Title,Description
2200,2011,history-biography,1,5463,,,
2201,2011,history-biography,2,3128,,,
2202,2011,history-biography,3,2097,,,
2203,2011,history-biography,4,1371,,,
2204,2011,history-biography,5,1265,,,
2205,2011,history-biography,6,1129,,,
2206,2011,history-biography,7,943,,,
2207,2011,history-biography,8,843,,,
2208,2011,history-biography,9,767,,,
2209,2011,history-biography,10,626,,,


In [113]:
#rerun the function on the history-biography, year 2011
start_time = time.time()
choice_awards_hist_bio = choice_awards_fun(['history-biography'],start = 2011, end = 2012)
print("--- %s seconds ---" % (time.time() - start_time))

--- 99.70999526977539 seconds ---


In [110]:
# Remove the 2011 history-biography rows from the main dataframe
# (they are re-added from the separate re-scrape of that category).
is_hist_bio_2011 = (df['Genre'] == 'history-biography') & (df['Year'] == 2011)
df = df.drop(index=df.index[is_hist_bio_2011])

In [114]:
#merge history-biography with votes
choice_awards_hist_bio = choice_awards_hist_bio.merge(votes_result, how = 'left', on = ['Year', 'Ranking', 'Genre'])[1:]

In [115]:
choice_awards_hist_bio

Unnamed: 0,Author,Title,Ranking,Year,Genre,Description,Votes
1,Walter Isaacson,Steve Jobs,1,2011,history-biography,"Walter Isaacson's ""enthralling"" (The New Yorke...",5463
2,Erik Larson,In the Garden of Beasts,2,2011,history-biography,"\nThe time is 1933, the place, Berlin, when Wi...",3128
3,Robert K. Massie,Catherine the Great,3,2011,history-biography,\nPulitzer Prize winner Massie offers the tale...,2097
4,Elizabeth Letts,The Eighty-Dollar Champion,4,2011,history-biography,\n#1 NEW YORK TIMES BESTSELLERNovember 1958: t...,1371
5,Annie Jacobsen,Area 51,5,2011,history-biography,\nIt is the most famous military installation ...,1265
6,Gayle Tzemach Lemmon,The Dressmaker of Khair Khana,6,2011,history-biography,\nThe life Kamila Sidiqi had known changed ove...,1129
7,David McCullough,The Greater Journey,7,2011,history-biography,"\nThe Greater Journey is the enthralling, insp...",943
8,Mitchell Zuckoff,Lost in Shangri-la,8,2011,history-biography,"\n“A lost world, man-eating tribesmen, lush an...",843
9,Manning Marable,Malcolm X,9,2011,history-biography,\nSelected by The New York Times Book Review a...,767
10,Paul Collins,The Murder of the Century,10,2011,history-biography,"\nOn Long Island, a farmer finds a duck pond t...",626


In [116]:
#merge main dataframe and history-biography dataframe
df = pd.concat([df, choice_awards_hist_bio], ignore_index=True)

In [118]:
#get rows whose description could not be scraped (still holding the 'no text' placeholder)
df[df['Description'] == 'no text']

Unnamed: 0,Year,Genre,Ranking,Votes,Author,Title,Description
332,2016,mystery-thriller,13,4706,Fiona Barton,The Widow,no text
353,2017,mystery-thriller,14,3463,Kimberly Belle,The Marriage Lie,no text
523,2015,historical-fiction,4,11819,Michelle Moran,Rebel Queen,no text
532,2015,historical-fiction,13,3051,Kate Morton,The Lake House,no text
754,2015,fantasy,15,3182,Peter V. Brett,The Skull Throne,no text
776,2016,fantasy,17,1613,Mark Lawrence,The Wheel of Osheim,no text
1015,2017,romance,16,2524,Beverly Jenkins,Breathless,no text
1647,2016,humor,8,6926,Phil Lester,Dan and Phil Go Outside,no text
1858,2015,nonfiction,19,756,Steven D. Levitt,When to Rob a Bank,no text
2409,2012,poetry,10,610,Emily Pettit,Goat In The Snow,no text


In [119]:
#fill in the missing descriptions by hand instead of re-running the scraper (for some books Goodreads itself shows no description); note that rows are matched by Title only, so every row with that title is overwritten
df.loc[df['Title'] == 'The Widow', 'Description'] = "When the police started asking questions, Jean Taylor turned into a different woman. One who enabled her and her husband to carry on, when more bad things began to happen...But that woman’s husband died last week. And Jean doesn’t have to be her anymore.There’s a lot Jean hasn’t said over the years about the crime her husband was suspected of committing. She was too busy being the perfect wife, standing by her man while living with the accusing glares and the anonymous harassment.Now there’s no reason to stay quiet. There are people who want to hear her story. They want to know what it was like living with that man. She can tell them that there were secrets. There always are in a marriage.The truth—that’s all anyone wants. But the one lesson Jean has learned in the last few years is that she can make people believe anything…"
df.loc[df['Title'] == 'The Marriage Lie', 'Description'] = "Even the perfect marriage has its dark side… Iris and Will's marriage is as close to perfect as it can be: a large house in a nice Atlanta neighborhood, rewarding careers and the excitement of trying for their first baby. But on the morning Will leaves for a business trip to Orlando, Iris's happy world comes to an abrupt halt. Another plane headed for Seattle has crashed into a field, killing everyone on board, and according to the airline, Will was one of the passengers on this plane. Grief-stricken and confused, Iris is convinced it all must be a huge misunderstanding. But as time passes and there is still no sign of Will, she reluctantly accepts that he is gone. Still, Iris needs answers. Why did Will lie about where he was going? What is in Seattle? And what else has he lied about? As Iris sets off on a desperate quest to find out what her husband was keeping from her, the answers she receives will shock her to her very core."
df.loc[df['Title'] == 'Rebel Queen', 'Description'] = "From the internationally best-selling author of Nefertitiand Cleopatra's Daughter comes the breathtaking story of Queen Lakshmi - India's Joan of Arc - who against all odds defied the mighty British invasion to defend her beloved kingdom. When the British Empire sets its sights on India in the mid-nineteenth century, it expects a quick and easy conquest. India is fractured and divided into kingdoms, each independent and wary of one another, seemingly no match for the might of the English. But when they arrive in the Kingdom of Jhansi, the British army is met with a surprising challenge. Instead of surrendering, Queen Lakshmi raises two armies - one male and one female - and rides into battle, determined to protect her country and her people. Although her soldiers may not appear at first to be formidable against superior British weaponry and training, Lakshmi refuses to back down from the empire that is determined to take away the land she loves. Told from the unexpected perspective of Sita - Queen Lakshmi's most favored companion and most trusted soldier in the all-female army - Rebel Queen shines a light on a time and place rarely explored in historical fiction. In the tradition of her best-selling novel, Nefertiti ,and through her strong, independent heroines fighting to make their ways in a male-dominated world, Michelle Moran brings nineteenth-century India to rich, vibrant life."
df.loc[df['Title'] == 'The Lake House', 'Description'] = "An abandoned house...June 1933, and sixteen-year-old Alice Edevane is preparing for her family's Midsummer Eve party at their country home, Loeanneth. But by the time midnight strikes and fireworks light up the night skies, the Edevane family will have suffered a loss so great that they leave Loeanneth forever. A missing child...Seventy years later, after a particularly troubling case, Detective Sadie Sparrow retreats to her beloved grandfather's cottage in Cornwall. Once there, she stumbles upon an abandoned house, and learns the story of a baby boy who disappeared without a trace.An unsolved mystery...Meanwhile, in her elegant Hampstead home, the formidable Alice Edevane, now an old lady, leads a life as neatly plotted as the bestselling detective novels she writes. Until a young police detective starts asking questions about her family's past, seeking to resurrect the complex tangle of secrets Alice has spent her life trying to escape..."
df.loc[df['Title'] == 'The Skull Throne', 'Description'] = "The Skull Throne of Krasia stands empty. Built from the skulls of fallen generals and demon princes, it is a seat of honor and ancient, powerful magic, keeping the demon corelings at bay. From atop the throne, Ahmann Jardir was meant to conquer the known world, forging its isolated peoples into a unified army to rise up and end the demon war once and for all. But Arlen Bales, the Warded Man, stood against this course, challenging Jardir to a duel he could not in honor refuse. Rather than risk defeat, Arlen cast them both from a precipice, leaving the world without a savior, and opening a struggle for succession that threatens to tear the Free Cities of Thesa apart. In the south, Inevera, Jardir’s first wife, must find a way to keep their sons from killing each other and plunging their people into civil war as they strive for glory enough to make a claim on the throne. In the north, Leesha Paper and Rojer Inn struggle to forge an alliance between the duchies of Angiers and Miln against the Krasians before it is too late. Caught in the crossfire is the duchy of Lakton--rich and unprotected, ripe for conquest. All the while, the corelings have been growing stronger, and without Arlen and Jardir there may be none strong enough to stop them. Only Renna Bales may know more about the fate of the missing men, but she, too, has disappeared..."
df.loc[df['Title'] == 'The Wheel of Osheim', 'Description'] = "From the international bestselling author of the Broken Empire Trilogy, the thrilling conclusion to the Red Queen’s War...All the horrors of Hell stand between Snorri ver Snagason and the rescue of his family, if indeed the dead can be rescued. For Jalan Kendeth, getting out alive and with Loki’s key is all that matters. Loki’s creation can open any lock, any door, and it may also be the key to Jalan’s fortune back in the living world. Jalan plans to return to the three w’s that have been the core of his idle and debauched life: wine, women, and wagering. Fate, however, has other plans, larger plans. The Wheel of Osheim is turning ever faster, and it will crack the world unless it’s stopped. When the end of all things looms, and there’s nowhere to run, even the worst coward must find new answers. Jalan and Snorri face many dangers, from the corpse hordes of the Dead King to the many mirrors of the Lady Blue, but in the end, fast or slow, the Wheel of Osheim always pulls you back. In the end, it’s win or die."
df.loc[df['Title'] == 'Breathless', 'Description'] = "A strong-willed beauty finds herself in the arms of the handsome drifter from her past, in this second book in the sizzling series set in the Old West, from USA Today bestselling author Beverly Jenkins As manager of one of the finest hotels in Arizona Territory, Portia Carmichael has respect and stability—qualities sorely missing from her harsh childhood. She refuses to jeopardize that by hitching herself to the wrong man. Suitors are plentiful, but none of them has ever looked quite as tempting as the family friend who just rode into town…and none have looked at her with such intensity and heat. Duchess. That’s the nickname Kent Randolph gave Portia when she was a young girl. Now she’s a stunning, intelligent woman—and Kent has learned his share of hard lessons. After drifting through the West, he’s learned the value of a place to settle down, and in Portia’s arms he’s found that and more. But convincing her to trust him with her heart, not just her passion, will be the greatest challenge he’s known—and one he intends to win…"
df.loc[df['Title'] == 'Dan and Phil Go Outside', 'Description'] = "Dan Howell and Phil Lester, avoiders of human contact and direct sunlight, actually went outside. Travelling around the world on tour, they have collected hundreds of exclusive, intimate and funny photos, as well as revealing and captivating side notes, to show the behind-the-scenes story of their adventure."
df.loc[df['Title'] == 'When to Rob a Bank', 'Description'] = "In celebration of the 10th anniversary of the landmark book Freakonomics comes this curated collection from the most readable economics blog in the universe. It’s the perfect solution for the millions of readers who love all things Freakonomics. Surprising and erudite, eloquent and witty, When to Rob a Bank demonstrates the brilliance that has made the Freakonomics guys an international sensation, with more than 7 million books sold in 40 languages, and 150 million downloads of their Freakonomics Radio podcast. When Freakonomics was first published, the authors started a blog—and they’ve kept it up. The writing is more casual, more personal, even more outlandish than in their books. In When to Rob a Bank, they ask a host of typically off-center questions: Why don’t flight attendants get tipped? If you were a terrorist, how would you attack? And why does KFC always run out of fried chicken? Over the past decade, Steven D. Levitt and Stephen J. Dubner have published more than 8,000 blog posts on the Freakonomics website. Many of them, they freely admit, were rubbish. But now they’ve gone through and picked the best of the best. You’ll discover what people lie about, and why; the best way to cut gun deaths; why it might be time for a sex tax; and, yes, when to rob a bank. (Short answer: never; the ROI is terrible.) You’ll also learn a great deal about Levitt and Dubner’s own quirks and passions, from gambling and golf to backgammon and the abolition of the penny."
df.loc[df['Title'] == 'Goat In The Snow', 'Description'] = "Poetry. 'Emily Pettit has included a number of 'how to' poems in her nimble and dazzling first collection, such as: 'How to Make No Noise,' and the especially useful 'How to Avoid Confronting Most Large Animals.' Her kindness is always ahead of us, anticipating the problems we will or won't run into, and we always end up in a different, precise place than the one we started out from, as she reassuringly tells us: 'You know / you know you know. It's all uncertainty / and your neck. You walk slowly / in a calm voice.' GOAT IN THE SNOW is multicolored, ever-changing, a delight to try to clasp.'—John Ashbery. 'GOAT IN THE SNOW is like a taste test between an etch-a-sketch and a spotlight, a race between a wind-up beetle and an idea. The certainty of Pettit's 'I know,' and 'I think' quickly turns into a quicksand of questions. Perceptive, jumpy and perfectly odd, this book encourages you to 'try to maneuver like a spacecraft / passing sufficiently close to a planet / in order to make some relatively detailed observations / Without landing.'—Matthea Harvey. 'The poems in GOAT IN THE SNOW often ask odd, penetrating questions. 'What do you call a field of black telephones ringing?' 'Where did you find such a stunning embankment?' 'Is this what loving someone is like?' 'Do you remember the basement?' 'In what direction do you look when someone says something true?' These poems are full of mortal awareness, and are sophisticated without being ornate or 'poetic.' When the poet says, 'Once in modest and murky water, I had a very disturbing conversation with a boat,' I don't feel as if she is writing in metaphor. I feel like something real has happened.'—Matthew Zapruder"
df.loc[df['Title'] == 'Almost Invisible', 'Description'] = "From Pulitzer Prize–winner Mark Strand comes an exquisitely witty and poignant series of prose poems. Sometimes appearing as pure prose, sometimes as impure poetry, but always with Strand’s clarity and simplicity of style, they are like riddles, their answers vanishing just as they appear within reach. Fable, domestic satire, meditation, joke, and fantasy all come together in what is arguably the liveliest, most entertaining book that Strand has yet written."
df.loc[df['Title'] == 'Fabric', 'Description'] = "A rich collection of poems that take the reader on a deep tour of the psyche. Charting and moving across politics of language, Bell explores love, pain, failure and redemption from a variety of angles. Most of the poems sit at the fragile threshold of instinct and meaning, using symbol and sensation to get to the shock of denouement. From 'Spandex' to the Greek kafeneion, there are unexpected juxtapositions and discoveries to be found in Jessica Bell's 'Fabric'. This voice is equally inspired by the quotidian, Greek jargon words, and the mythic figures of Echo and Narcissus, Aphrodite, and, of course, Euterpe, the muse of music and the lyric. The interstices of the so-called ordinary with the always larger dramas of feeling and its consequences are among the subjects this young poet explores in her vivid weave of language."
df.loc[df['Title'] == 'You Can Make Anything Sad', 'Description'] = "Thoughts, lists, ideas, memories, not really poems, but not really anything else. Like being alone but surrounded by yourself. Sounds hellish actually."
df.loc[df['Title'] == 'Black Butterfly', 'Description'] = "The Black Butterfly is a symbol of transformation and rebirth after death. Drake wrote this book for those who have lost someone in death and in life. This book is a collection of memories and experiences Drake lived after the death of one of his brothers. He promised he would write him a few words after he failed to complete the task while his brother was alive. This book is everything… this book is for all who are breathing and for all who are no longer here. This book is for you."
df.loc[df['Title'] == 'War of the Foxes', 'Description'] = "In this long-awaited follow-up to Crush, Yale Series of Younger Poets prize-winner Richard Siken turns toward the problems of making and representation, in an unrelenting interrogation of our world of doublings. In this restless, swerving book simple questions—such as, Why paint a bird?—are immediately complicated by concerns of morality, human capacity, and the ways we look to art for meaning and purpose while participating in its—and our own—invention."
df.loc[df['Title'] == 'Pansy', 'Description'] = "In Pansy, Gibson balances themes of love, gender, politics, sexuality, illness, family and forgiveness with stunning imagery and a fierce willingness to delve into the exploration of what it means to truly heal. Each turn of the page represents both that which as been forgotten and that which is yet to be released. While this book is a rally cry for political action, it is also a celebration of wonder and longing and love."
df.loc[df['Title'] == 'Lost in the Never Woods', 'Description'] = "When children go missing in the small coastal town of Astoria, people look to Wendy for answers. It's been five years since Wendy and her two brothers went missing in the woods, but when the town’s children start to disappear, the questions surrounding her brothers’ mysterious circumstances are brought back into light. Attempting to flee her past, Wendy almost runs over an unconscious boy lying in the middle of the road, and gets pulled into the mystery haunting the town. Peter, a boy she thought lived only in her stories, claims that if they don't do something, the missing children will meet the same fate as her brothers. In order to find them and rescue the missing kids, Wendy must confront what's waiting for her in the woods."


In [120]:
#check if all descriptions are already included
df[df['Description'] == 'no text']

Unnamed: 0,Year,Genre,Ranking,Votes,Author,Title,Description


In [122]:
df.to_csv('choice_awards_full_with_votes.csv')

# Most popular books on shelves by year

In [123]:
start_time = time.time()

pop_columns = ['Author', 'Title', 'Description', 'Year', 'Ranking']
# Rows are collected in a list and converted once at the end; DataFrame.append
# copied the frame on every call and was removed in pandas 2.0.
# The blank first record is kept on purpose: the next cell strips it off.
pop_records = [dict.fromkeys(pop_columns, '')]

for year in range(1922, 2022):

    url = "https://www.goodreads.com/book/popular_by_date/" + str(year)
    page = requests.get(url)
    soup = bs(page.content, 'lxml')

    # list items appear in page order, so the 1-based position is the ranking
    for rank, book in enumerate(soup.find_all('div', class_ = 'BookListItem__body'), start=1):
        title = book.find('a', {'data-testid':'bookTitle'}).text
        author = book.find('span', {'data-testid' : 'name'}).text
        desc = book.find('span', class_ = 'Formatted').text

        pop_records.append({'Author': author, 'Title': title, 'Description': desc,
                            'Year': year, 'Ranking': rank})

pop_books = pd.DataFrame(pop_records, columns=pop_columns)

print("--- %s seconds ---" % (time.time() - start_time))

--- 335.6167504787445 seconds ---


In [125]:
# Remove the blank placeholder row by content (its Year is the empty string) rather
# than by position, so re-running this cell cannot discard a real row.
pop_books = pop_books[pop_books['Year'] != '']
pop_books.to_csv('popular_books_in_shelves_by_years.csv')

# Books 2020 - shelf

In [4]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

options = Options()
options.add_argument("--headless")

In [None]:
#not suitable for longer run and multiple pages
start_time = time.time()

CHROMEDRIVER_PATH = 'D:/inštalačky/chromedriver'
MAX_ATTEMPTS = 3  # tries per book before it is skipped

shelf_columns = ['Rank', 'Year', 'Link', 'Description']
# Rows are collected in a list and converted once at the end (DataFrame.append was
# removed in pandas 2.0). The blank first record keeps the frame layout unchanged.
shelf_records = [dict.fromkeys(shelf_columns, '')]

years = ['2019','2020','2021']

# Iterate over the shelf names as strings: the URL is built by string concatenation,
# so integer years would raise TypeError on every attempt.
for year in years:

    # 50 books on the page; their containers are div[4], div[6], ..., div[102]
    for rank, i in enumerate(range(4, 103, 2), start=1):

        # Bounded retries: a failure that never goes away no longer loops forever.
        for attempt in range(1, MAX_ATTEMPTS + 1):
            # A fresh browser per attempt, as before, but it is always closed again.
            driver = webdriver.Chrome(CHROMEDRIVER_PATH, options = options)
            try:
                driver.get('https://www.goodreads.com/shelf/show/' + year)

                # open the i-th book of the shelf listing
                driver.find_element(By.XPATH, '/html/body/div[2]/div[3]/div[1]/div[2]/div[3]/div['+str(i)+']/div[1]/a[2]').click()
                time.sleep(2)

                # NOTE(review): presumably dismisses an overlay on the book page - confirm
                driver.find_element(By.XPATH, '/html/body/div[3]/div/div/div[1]/button').click()
                # TODO: take the link directly from the driver instead of the page <head>

                soup = bs(driver.page_source, 'lxml')
                desc = soup.find('div', class_ = 'readable stacked').text
                link = soup.find('head').find('link').get('href')
            except Exception as exc:
                # Exception (not a bare except) so that Ctrl-C still stops the run
                print('shelf %s, book %d, attempt %d failed: %r' % (year, rank, attempt, exc))
                continue
            finally:
                driver.quit()

            shelf_records.append({'Rank': rank, 'Year': year, 'Link': link, 'Description': desc})
            break

df = pd.DataFrame(shelf_records, columns=shelf_columns)

end_time = time.time()

  driver = webdriver.Chrome('D:/inštalačky/chromedriver', options = options)


In [66]:
#code runs around 25 minutes for one bookshelf, 2 hours for 3
(end_time - start_time)/60/60

1.904349700808525

In [67]:
df

Unnamed: 0,Rank,Year,Link,Description
0,,,,
1,1,2019,https://www.goodreads.com/book/show/36809135-w...,"\nFor years, rumors of the “Marsh Girl” haunte..."
2,2,2019,https://www.goodreads.com/book/show/35133922-e...,\nA newer edition of ISBN 9780399590504 can be...
3,3,2019,https://www.goodreads.com/book/show/38746485-b...,\nIn a life filled with meaning and accomplish...
4,4,2019,https://www.goodreads.com/book/show/40597810-d...,\nA gripping novel about the whirlwind rise of...
...,...,...,...,...
146,46,2021,https://www.goodreads.com/book/show/55361205-a...,"\nA riveting, deeply personal account of histo..."
147,47,2021,https://www.goodreads.com/book/show/15881.Harr...,\nEver since Harry Potter had come home for th...
148,48,2021,https://www.goodreads.com/book/show/16143347-w...,\nA beautiful and distinguished family.A priva...
149,49,2021,https://www.goodreads.com/book/show/33385229-t...,\nAdam Silvera reminds us that there’s no life...


In [68]:
df.to_csv('shelf_2019-2021.csv')

In [41]:
def shelves_df(start, end):
    """Scrape the Goodreads shelf named after each year in ``range(start, end)``.

    For every year the shelf page ``/shelf/show/<year>`` is downloaded, and the
    page of each book listed on it is downloaded as well to read the book
    description.

    Parameters
    ----------
    start, end : int
        Bounds handed to ``range(start, end)``; ``end`` is exclusive.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author, Title, Ranking, Year, Description``. The first row is
        an all-empty-string placeholder, kept so the returned frame has the
        same layout as before. ``Ranking`` is the 1-based position of the book
        on the shelf page. ``Description`` is ``'no text'`` when none of the
        known description elements is present on the book page.
    """
    columns = ['Author', 'Title', 'Ranking', 'Year', 'Description']

    # Description containers differ between book-page layouts; tried in order.
    description_classes = ('readable stacked',
                           'TruncatedText__text TruncatedText__text--5')

    def book_description(book_soup):
        # Return the text of the first description container found, else 'no text'.
        for css_class in description_classes:
            node = book_soup.find('div', class_=css_class)
            if node is not None:
                return node.text
        return 'no text'

    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append copied the whole frame per row and no longer exists in pandas 2.
    records = [dict.fromkeys(columns, '')]

    for year in range(start, end):
        shelf_url = 'https://www.goodreads.com/shelf/show/' + str(year)
        shelf_soup = bs(requests.get(shelf_url).content, 'lxml')
        entries = shelf_soup.find_all('div', {'class': 'elementList', 'style': 'padding-top: 10px;'})

        for rank, book in enumerate(entries, start=1):
            href = book.find('a', class_ = 'leftAlignedImage').get('href')
            # strip the leading slash so the URL does not contain "//" after the host
            link = 'https://www.goodreads.com/' + href.lstrip('/')
            title = book.find('a', class_ = 'bookTitle').text
            author = book.find('span', {'itemprop': 'name'}).text

            # fetch the individual book page for its description
            book_soup = bs(requests.get(link).content, 'lxml')

            records.append({'Author': author, 'Title': title, 'Ranking': rank,
                            'Year': year, 'Description': book_description(book_soup)})

    return pd.DataFrame(records, columns=columns)


In [42]:
#runs about an hour
start_time = time.time()
shelves_all = shelves_df(start = 2011, end = 2022)
end_time = time.time()

In [48]:
#fill-in missing descriptions
shelves_all.loc[shelves_all['Title'] == 'Harry Potter and the Deathly Hallows (Harry Potter, #7)', 'Description'] = "It's no longer safe for Harry at Hogwarts, so he and his best friends, Ron and Hermione, are on the run. Professor Dumbledore has given them clues about what they need to do to defeat the dark wizard, Lord Voldemort, once and for all, but it's up to them to figure out what these hints and suggestions really mean. Their cross-country odyssey has them searching desperately for the answers, while evading capture or death at every turn. At the same time, their friendship, fortitude, and sense of right and wrong are tested in ways they never could have imagined. The ultimate battle between good and evil that closes out this final chapter of the epic series takes place where Harry's Wizarding life began: at Hogwarts. The satisfying conclusion offers shocking last-minute twists, incredible acts of courage, powerful new forms of magic, and the resolution of many mysteries. Above all, this intense, cathartic book serves as a clear statement of the message at the heart of the Harry Potter series: that choice matters much more than destiny, and that love will always triumph over death."
shelves_all.loc[shelves_all['Title'] == 'Beach Read (Paperback)', 'Description'] = "A romance writer who no longer believes in love and a literary writer stuck in a rut engage in a summer-long challenge that may just upend everything they believe about happily ever afters. Augustus Everett is an acclaimed author of literary fiction. January Andrews writes bestselling romance. When she pens a happily ever after, he kills off his entire cast. They’re polar opposites. In fact, the only thing they have in common is that for the next three months, they're living in neighboring beach houses, broke, and bogged down with writer's block. Until, one hazy evening, one thing leads to another and they strike a deal designed to force them out of their creative ruts: Augustus will spend the summer writing something happy, and January will pen the next Great American Novel. She’ll take him on field trips worthy of any rom-com montage, and he’ll take her to interview surviving members of a backwoods death cult (obviously). Everyone will finish a book and no one will fall in love. Really."
shelves_all.loc[shelves_all['Title'] == 'Verity (ebook)', 'Description'] = "Lowen Ashleigh is a struggling writer on the brink of financial ruin when she accepts the job offer of a lifetime. Jeremy Crawford, husband of bestselling author Verity Crawford, has hired Lowen to complete the remaining books in a successful series his injured wife is unable to finish. Lowen arrives at the Crawford home, ready to sort through years of Verity's notes and outlines, hoping to find enough material to get her started. What Lowen doesn't expect to uncover in the chaotic office is an unfinished autobiography Verity never intended for anyone to read. Page after page of bone-chilling admissions, including Verity's recollection of what really happened the day her daughter died. Lowen decides to keep the manuscript hidden from Jeremy, knowing its contents would devastate the already grieving father. But as Lowen's feelings for Jeremy begin to intensify, she recognizes all the ways she could benefit if he were to read his wife's words. After all, no matter how devoted Jeremy is to his injured wife, a truth this horrifying would make it impossible for him to continue to love her."

In [50]:
shelves_all.to_csv('shelf_2011-2021.csv')