In [25]:
from bs4 import BeautifulSoup
import numpy as np
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


               

In [26]:
# load the webdriver
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('disable-gpu')

driver = webdriver.Chrome(options=options)



In [27]:
def scrape_poems():
    """
    This function scrapes poems from the poetry foundation website
    and exports the data to a csv file

    Returns:
    df: a dataframe of the scraped data
    """
    BASE_URL = "https://www.poetryfoundation.org"
    urls = [
        "https://www.poetryfoundation.org/categories/winter",
        "https://www.poetryfoundation.org/categories/love",
        "https://www.poetryfoundation.org/categories/youth",
        "https://www.poetryfoundation.org/categories/relationships",
        "https://www.poetryfoundation.org/categories/travels-journeys",
        "https://www.poetryfoundation.org/categories/history-politics",
    ]

    data = []  # to store the data

    for url in urls:
        # get the page
        driver.get(url)
        time.sleep(2)
        # get the page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        poems = soup.find_all('li', class_="col-span-full pt-6")
        for p in poems:
            title = p.find('h3').text
            author = p.find('div', class_="type-kappa text-gray-600")
            if author:
                author = author.text
            else:
                author = None
            summary = p.find('div', class_="rich-text line-clamp-[var(--line-clamp)]")
            if summary:
                summary = summary.text
            else:
                summary = None

            link = p.find('a')['href']
            # Being the link is a short link, we need to add the base url to it to access the entire poem
            link = BASE_URL + link

            # simulate a click action to click the link to access the entire poem in the next page!
            driver.get(link)
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            poem = soup.find('article', class_="mb-6 flex flex-col gap-12 md:mb-0")  # get the poem
            if poem:
                poem = poem.text
            else:
                poem = None
            

            data.append({"title": title, "author": author, "summary": summary, "poem": poem, "link": link})

    df = pd.DataFrame(data)  # create a dataframe
    df.to_csv('poems.csv', mode='a', header=False, index=False)  # export to csv file
    return df  # returns a dataframe of the scraped data

In [28]:
scrape_poems()

Unnamed: 0,title,author,summary,poem,link
0,"Winter Journal: Threshed Blue, Cardings, Dim T...",By Emily Wilson,stripped batting of cloudglimpsed ligamentsdus...,stripped batting of cloud\n glimpsed ligaments...,https://www.poetryfoundation.org/poems/52227/w...
1,To a Wreath of Snow,By Emily Brontë,O transient voyager of heaven!⁠ ⁠ ⁠ O silent s...,O transient voyager of heaven!\n ⁠ ⁠ ⁠ O silen...,https://www.poetryfoundation.org/poems/161908/...
2,Preludes,By T. S. Eliot,IThe winter evening settles downWith smell of ...,I\n\n The winter evening settles down\n With s...,https://www.poetryfoundation.org/poems/44214/p...
3,Growing Season,By Angela Voras-Hills,"On the bike path, a bunny's body and bloodwher...","On the bike path, a bunny's body and blood\n w...",https://www.poetryfoundation.org/poems/154760/...
4,The Snow Arrives After Long Silence,By Nancy Willard,The snow arrives after long silencefrom its hi...,The snow arrives after long silence\n from its...,https://www.poetryfoundation.org/poems/154170/...
...,...,...,...,...,...
115,Coming Undone,By Jared Marcel Pollen,"On Context Collapse, Ryan Ruby’s vertiginous s...",,https://www.poetryfoundation.org/articles/1625...
116,Shock and Ore,By Alexander Wells,The German poet Lutz Seiler has spent his care...,,https://www.poetryfoundation.org/articles/1623...
117,[ ] noise,By Victoria Adukwei Bulley,& even when we said we were alone there was [ ...,& even when we said we were alone there was [ ...,https://www.poetryfoundation.org/poems/162164/...
118,The Caliban of Old Blighty,By Declan Ryan,W.H. Auden's conflicted love of England is fro...,,https://www.poetryfoundation.org/articles/1621...


In [29]:
df = scrape_poems()

In [30]:
df.shape

(120, 5)

In [31]:
df.head()

Unnamed: 0,title,author,summary,poem,link
0,"Winter Journal: Threshed Blue, Cardings, Dim T...",By Emily Wilson,stripped batting of cloudglimpsed ligamentsdus...,stripped batting of cloud\n glimpsed ligaments...,https://www.poetryfoundation.org/poems/52227/w...
1,To a Wreath of Snow,By Emily Brontë,O transient voyager of heaven!⁠ ⁠ ⁠ O silent s...,O transient voyager of heaven!\n ⁠ ⁠ ⁠ O silen...,https://www.poetryfoundation.org/poems/161908/...
2,Preludes,By T. S. Eliot,IThe winter evening settles downWith smell of ...,I\n\n The winter evening settles down\n With s...,https://www.poetryfoundation.org/poems/44214/p...
3,Growing Season,By Angela Voras-Hills,"On the bike path, a bunny's body and bloodwher...","On the bike path, a bunny's body and blood\n w...",https://www.poetryfoundation.org/poems/154760/...
4,The Snow Arrives After Long Silence,By Nancy Willard,The snow arrives after long silencefrom its hi...,The snow arrives after long silence\n from its...,https://www.poetryfoundation.org/poems/154170/...


In [32]:
row = df.iloc[10]
print(f"Title: {row['title']}\n")
print(f"Author: {row['author']}\n")
print(f"Summary: {row['summary']}\n")
print(f"Poem: {row['poem']}\n")
print(f"Link: {row['link']}\n")

Title: poetry-magazineFebruary & my love is in another state

Author: By José Olivarez

Summary: so when i walk down the street, i hold handswith the wind. there’s a chimney coughingup ahead & a sky so honey, i could almost taste it.a cat struts away from me & two yellow eyesbecome four: just like that,i’m...

Poem: so when i walk down the street, i hold hands
 with the wind. there’s a chimney coughing
 up ahead & a sky so honey, i could almost taste it.
 a cat struts away from me & two yellow eyes

 become four: just like that,
 i’m the loneliest creature on this block.
 soon the streetlights will come alive
 & television sets will light up with blues.

 stay with me. while the sky is still golden,
 hold the ladder so i can climb, & from
 the highest rung, i can scrape away a drizzle
 of light to wear around my neck. alone

 is the star i follow. in love & in solitude:
 alone is the home with the warmest glow.Source: Poetry (December 2019)

Link: https://www.poetryfoundation.org/poetr

In [33]:
df['title'][10]

'poetry-magazineFebruary & my love is in another state'

In [34]:
df['author'][10]

'By José Olivarez'

In [35]:
df['summary'][10]

'so when i walk down the street, i hold handswith the wind. there’s a chimney coughingup ahead & a sky so honey, i could almost taste it.a cat struts away from me & two yellow eyesbecome four: just like that,i’m...'

In [36]:
row = df.iloc[10]
print(f"Title: {row['title']}\n")
print(f"Author: {row['author']}\n")
print(f"Summary: {row['summary']}\n")
print(f"Poem: {row['poem']}\n")
print(f"Link: {row['link']}\n")

Title: poetry-magazineFebruary & my love is in another state

Author: By José Olivarez

Summary: so when i walk down the street, i hold handswith the wind. there’s a chimney coughingup ahead & a sky so honey, i could almost taste it.a cat struts away from me & two yellow eyesbecome four: just like that,i’m...

Poem: so when i walk down the street, i hold hands
 with the wind. there’s a chimney coughing
 up ahead & a sky so honey, i could almost taste it.
 a cat struts away from me & two yellow eyes

 become four: just like that,
 i’m the loneliest creature on this block.
 soon the streetlights will come alive
 & television sets will light up with blues.

 stay with me. while the sky is still golden,
 hold the ladder so i can climb, & from
 the highest rung, i can scrape away a drizzle
 of light to wear around my neck. alone

 is the star i follow. in love & in solitude:
 alone is the home with the warmest glow.Source: Poetry (December 2019)

Link: https://www.poetryfoundation.org/poetr

In [37]:
df['poem'][49]

'My brother holds a snake by its head. The whole\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 length of the snake is the length\n \xa0\n of my brother’s body. The snake’s head\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 is held safely, securely, as if my brother\n \xa0\n is showing him something in the distant high grass.\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 I don’t know why he wants to hold them,\n \xa0\n their strong bodies wrapping themselves around\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 the warmth of his arm. Constricting and made\n \xa0\n of circles and momentum; slippery coolness smooth\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 against the ground. Still, this image of him,\n \xa0\n holding a snake as it snakes as snakes\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 do, both a noun and verb and a story\n \xa0\n that doesn’t end well. Once, we stole an egg\n \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 from the backyard chicken coop\n \xa0\n and cracked

In [38]:
# # load the webdriver
# options = webdriver.ChromeOptions()
# options.add_argument('headless')
# options.add_argument('disable-gpu')

# driver = webdriver.Chrome(options=options)
# # Check if there is a next page by checking the page number element
# driver.get("https://www.poetryfoundation.org/categories/winter")
# soup = BeautifulSoup(driver.page_source, 'html.parser')


# filters_btns = soup.find('div', class_="flex w-auto items-center gap-2")

# current_page = filters_btns.find('button', class_="relative flex w-full cursor-default items-center justify-between gap-x-1.5 rounded-sm border-0 bg-gray-50 px-2.5 py-2.5 text-left text-sm leading-none text-black ring-1 ring-inset ring-gray-350 ring-offset-white placeholder:text-gray-500 focus:bg-white focus:outline-none focus:ring-1 focus:ring-red disabled:cursor-not-allowed disabled:opacity-75")
# current_page = current_page.find_all('span', style="pointer-events:none;")
# if current_page and current_page[0].text.strip().isdigit():
# 	current_page = int(current_page[0].text.strip())
# else:
# 	current_page = None



# # current_page = int(soup.find('span', style="pointer-events:none;").text)
# # next_page_button = driver.find_element(By.XPATH, f"//button[contains(@aria-controls, 'radix-vue-select-content') and contains(., '{current_page + 1}')]")
# # next_page_button.click()
# # time.sleep(2)


# # print(filters_btns)

# # print(sort_by)
# # print(jump_to)
# print(current_page)

In [39]:
df['link'][49]

'https://www.poetryfoundation.org/poems/162176/cyrus-the-snakes'

In [None]:
# def scrape_poems():
#     """
#     This function scrapes poems from the poetry foundation website
#     and exports the data to a csv file

#     Returns:
#     df: a dataframe of the scraped data
#     """
#     BASE_URL = "https://www.poetryfoundation.org"
#     urls = [
#         "https://www.poetryfoundation.org/categories/winter",
#         "https://www.poetryfoundation.org/categories/love",
#         "https://www.poetryfoundation.org/categories/youth",
#         "https://www.poetryfoundation.org/categories/relationships",
#         "https://www.poetryfoundation.org/categories/travels-journeys",
#         "https://www.poetryfoundation.org/categories/history-politics",
#     ]

#     data = []  # to store the data
#     # loop through the urls, then scrape the poems
#     for url in urls:
#         driver.get(url)
#         time.sleep(2)

#         while True:
#             soup = BeautifulSoup(driver.page_source, 'html.parser')
#             poems = soup.find_all('li', class_="col-span-full pt-6")
#             for p in poems:
#                 title = p.find('h3').text
#                 author = p.find('div', class_="type-kappa text-gray-600")
#                 if author:
#                     author = author.text
#                 else:
#                     author = None
#                 summary = p.find('div', class_="rich-text line-clamp-[var(--line-clamp)]")
#                 if summary:
#                     summary = summary.text
#                 else:
#                     summary = None

#                 link = p.find('a')['href']
#                 link = BASE_URL + link
    
#                 # simulate a click action to click the link to access the entire poem in the next page!
#                 driver.get(link)
#                 time.sleep(2)
#                 soup = BeautifulSoup(driver.page_source, 'html.parser')
#                 poem = soup.find('div', class_="poem")
#                 if poem:
#                     poem = poem.text
#                 else:
#                     poem = None

#                 data.append({"title": title, "author": author, "summary": summary, "poem": poem, "link": link})

#             # Check if there is a next page by checking the page number element
#             try:
#                 current_page = int(soup.find('span', style="pointer-events:none;").text)
#                 next_page_button = driver.find_element(By.XPATH, f"//button[contains(@aria-controls, 'radix-vue-select-content') and contains(., '{current_page + 1}')]")
#                 next_page_button.click()
#                 time.sleep(2)
#             except:
#                 break

#     df = pd.DataFrame(data)  # create a dataframe
#     df.to_csv('poems.csv', mode='a', header=False, index=False)  # export to csv file
#     return df  # returns a dataframe of the scraped data

# # Initialize the webdriver
# driver = webdriver.Chrome(options=options)
# scrape_poems()
# driver.quit()
