In [1]:
pip install bs4 requests

Collecting bs4
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl.metadata (35 kB)
Using cached bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Using cached charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl (199 kB)
Installing collected packages: charset-normalizer, requests, bs4
Successfully installed bs4-0.0.2 charset-normalizer-3.4.2 requests-2.32.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import the necessary libraries
from bs4 import BeautifulSoup 
# Define the HTML string
html = """<html><head><title>My Simple HTML Page</title></head><body><div class="my-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div></body></html>"""

In [3]:
# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

In [4]:
print(soup.prettify())

<html>
 <head>
  <title>
   My Simple HTML Page
  </title>
 </head>
 <body>
  <div class="my-class">
   <span>
    This is some text in a span tag
   </span>
   <p>
    This is some text in a paragraph tag
   </p>
   <p>
    And yet again another paragraph
   </p>
  </div>
 </body>
</html>



In [5]:
# Example 1: Find the first occurrence of a tag
first_span = soup.find('span')
print(first_span)

<span>This is some text in a span tag</span>


In [6]:
# Example 2: Find all occurrences of a tag
all_p = soup.find_all('p')
print(all_p)

[<p>This is some text in a paragraph tag</p>, <p>And yet again another paragraph</p>]


In [12]:
len(all_p)

2

In [13]:
# Example 3: Navigate to the parent tag
parent_div = first_span.parent
print(parent_div)



<div class="my-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div>


In [14]:
# Example 4: Navigate to the next sibling tag
next_sibling = first_span.next_sibling
print(next_sibling)

<p>This is some text in a paragraph tag</p>


In [15]:
# Example 5: Navigate to the previous sibling tag
previous_sibling = next_sibling.previous_sibling
print(previous_sibling)



<span>This is some text in a span tag</span>


In [16]:
# Example 6: Get the tag name
tag_name = first_span.name
print(tag_name)



span


In [17]:
# Example 7: Get the tag attributes
class_name = parent_div['class']
print(class_name)



['my-class']


In [18]:
# Example 8: Get the text content of a tag
span_text = first_span.text
print(span_text)



This is some text in a span tag


In [19]:
# Example 9: Get the text content of multiple tags
div_text = parent_div.get_text()
print(div_text)



This is some text in a span tagThis is some text in a paragraph tagAnd yet again another paragraph


In [20]:
# Example 10: Modify the HTML
parent_div['class'] = 'new-class'
print(parent_div)

<div class="new-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div>


In [21]:
# Example 11: Iterate over all <p> elements in <body>
for i, p in enumerate(soup.body.find_all('p')):
  print(f"{i+1}. tag: {p.name}, text:", p.get_text())

1. tag: p, text: This is some text in a paragraph tag
2. tag: p, text: And yet again another paragraph


# Wikipedia

Real-world example!

We will scrape the main page of wikipedia. In doing this, we will:

1. Get the featured article content
2. Get all the `did you know` content along with some additional data
3. Do the same as 2. but for the `in the news` section
4. Repeat this process for the `on this day` section.

In [9]:
import requests as re

In [None]:
# Get the main page content

req = re.get("https://en.wikipedia.org/wiki/Main_Page") 

soup = BeautifulSoup(req.content, "html.parser")

In [25]:
# Get the featured article content 

featured_article_text = soup.find(class_="MainPageBG mp-box").p.get_text()


In [27]:
# Get all "did you know" facts 

did_you_know = soup.find(id="mp-dyk").find_all("li")

# For each, make a tuple, that stores the text and the first link
did_you_know_list = []
for li in did_you_know:
  text = li.get_text()
  link = li.find("a")["href"]
  did_you_know_list.append({"text": text, "link": link})

# for each link, get the first paragraph text 

for dyk in did_you_know_list:
  # URL suffix: '/wiki/foo_bar'
  link = dyk['link']
  url = f"https://en.wikipedia.org/{link}"
  intro_text_soup = BeautifulSoup(re.get(url).content)
  intro = intro_text_soup.find_all("p")[1].get_text()
  dyk['intro'] = intro

In [23]:
did_you_know_list

[{'text': '... that precursors to the killer toy include ventriloquist dummies such as Otto (pictured) in the 1929 film The Great Gabbo?',
  'link': '/wiki/Killer_toy',
  'intro': 'A killer toy is a stock character in horror fiction. They include toys, such as dolls and ventriloquist dummies, that come to life and seek to kill or otherwise carry out violence. The killer toy subverts the associations of childhood with innocence and lack of agency while invoking the uncanny nature of a lifelike toy. Killer toy fiction often invokes ideas of companionship and the corruption of children, sometimes taking place in dysfunctional or single parent homes. They have historically been associated with occultism and spirit possession, though artificial intelligence became more common in later works.\n'},
 {'text': '... that Armenian-Turkish soprano Sibil Pektorosoğlu released her first album after singing in a church choir for almost twenty years?',
  'link': '/wiki/Armenians_in_Turkey',
  'intro':

In [29]:
# let's do it again for the in the news section
in_the_news = soup.find(id="mp-itn").find_all("li")

in_the_news_list = []
for itn in in_the_news:
  text = itn.get_text()
  link = itn.find("a")["href"]

  # notice that this part is different from above
  # which do you think makes the most sense performance-wise? 
  url = f"https://en.wikipedia.org/{link}"
  intro_text_soup = BeautifulSoup(re.get(url).content)
  intro = intro_text_soup.find_all("p")[1].get_text()
  itn['intro'] = intro
  in_the_news_list.append({
      "text": text,
      "link": link,
      "intro": intro
  })


In [30]:
in_the_news_list[0]

{'text': 'In Canada, the Liberal Party, led by Prime Minister Mark Carney (pictured), wins the most seats in the federal election.',
 'link': '/wiki/Liberal_Party_of_Canada',
 'intro': 'The Liberal Party of Canada (LPC; French: Parti libéral du Canada, PLC) is a federal political party in Canada. The party espouses the principles of liberalism,[6][7][8] and generally sits at the centre[6][9][10] to centre-left[10][11] of the Canadian political spectrum, with their main rival, the Conservative Party, positioned to their right and the New Democratic Party positioned to their left.[6][12][13] The party is described as "big tent",[14] practising "brokerage politics",[c] attracting support from a broad spectrum of voters.[20] The Liberal Party is the longest-serving and oldest active federal political party in the country, and has dominated federal politics of Canada for much of its history, holding power for almost 70 years of the 20th century.[21][12] As a result, it has sometimes been re

In [None]:
# Your turn! Do it for the "On this day" section!
# Take into account that the first item might not be in a <li> tag


In [None]:
links = soup.find_all('a', href=lambda x: x and x.startswith('/wiki/'), title=True)
numeric_links = [a for a in links if a.text.strip().isdigit()] 
for a in numeric_links:
    print(f"Text: {a.text.strip()} | Href: {a['href']}")


Text: 1947 | Href: /wiki/1947
Text: 1791 | Href: /wiki/1791
Text: 1481 | Href: /wiki/1481
Text: 1848 | Href: /wiki/1848
Text: 1939 | Href: /wiki/1939
Text: 1999 | Href: /wiki/1999


In [None]:
wiki_links = soup.find_all('a', href=lambda x: x and x.startswith('/wiki/'), title=True) 
numeric_links = [a for a in wiki_links if a.text.strip().isdigit()] 
for a in numeric_links:
    year_text = a.text.strip()
    href = a['href']
    full_url = f"https://en.wikipedia.org{href}"
    print(f"Year: {year_text} | Link: {full_url}")

Year: 1947 | Link: https://en.wikipedia.org/wiki/1947
Year: 1791 | Link: https://en.wikipedia.org/wiki/1791
Year: 1481 | Link: https://en.wikipedia.org/wiki/1481
Year: 1848 | Link: https://en.wikipedia.org/wiki/1848
Year: 1939 | Link: https://en.wikipedia.org/wiki/1939
Year: 1999 | Link: https://en.wikipedia.org/wiki/1999


## NY Times

Exercise:

1. Get all article titles from the main page `https://www.nytimes.com/international/`
2. For each, get:
   - title
   - summary of the article
   - reading time
   - link to it

What would be the follow up to this?


In [45]:
req = re.get("https://www.nytimes.com/international/") 
soup = BeautifulSoup(req.content, "html.parser") 

In [36]:
story_wrappers = soup.find_all("section", class_="story-wrapper")
stories = []
for story in story_wrappers:
  try:
    title = story.find(class_="indicate-hover").get_text()
  except:
    title = ""
  try:
    summary = story.find("p", class_="summary-class").get_text()
  except:
    summary = ""
  try:
    reading_time = story.find("p", class_="css-1esztn").get_text()
  except:
    reading_time = ""
  try:
    link = story.find("a")["href"]
  except:
    link = ""
  stories.append((title, summary, reading_time, link))


In [39]:
print(story_wrappers)

[<section class="story-wrapper"><a aria-hidden="false" class="css-9mylee" data-uri="nyt://article/4dfbb7bb-e100-5799-8259-0313f5404834" href="https://www.nytimes.com/2025/05/03/us/harvard-alan-garber-trump-administration.html"><div class="css-xdandi"><p class="indicate-hover css-1gg6cw2">Harvard’s President Is Fighting Trump. He Also Agrees With Him.</p></div><p class="summary-class css-ofqxyv">Alan Garber became a hero to liberals after Harvard resisted the government. He is also trying to remake campus culture in ways President Trump might appreciate.</p><div class="css-1tic89u"><div><p class="css-1a0ymrn" data-ttr="1">10 min read</p></div></div></a></section>, <section class="story-wrapper"><a aria-hidden="false" class="css-9mylee" data-uri="nyt://article/7140a783-f2fa-5dad-9f64-22f1d7218989" href="https://www.nytimes.com/2025/05/02/us/politics/trump-harvard-tax-exempt-status.html"><div class="css-xdandi"><p class="indicate-hover css-91bpc3">Harvard Signals It Will Resist Trump’s Ef

In [37]:
parsed_sections = []
for section in soup.find_all("section", class_="story-wrapper"):
  try:
    # get the url of the article
    link = section.find("a")["href"]
  except:
    link = ""
  try:
    # get the title of the article
    title = section.find("h3").get_text()
  except:
    try:
      title = section.find("h4").get_text()
    except:
      title = ""
  try:
    # get the description
    description = section.find("p").get_text()
  except:
    description = ""
  # Append to the list my parsed section
  parsed_sections.append((link, title, description))

In [40]:
print(parsed_sections)

[('https://www.nytimes.com/2025/05/03/us/harvard-alan-garber-trump-administration.html', '', 'Harvard’s President Is Fighting Trump. He Also Agrees With Him.'), ('https://www.nytimes.com/2025/05/02/us/politics/trump-harvard-tax-exempt-status.html', '', 'Harvard Signals It Will Resist Trump’s Efforts to Revoke Tax-Exempt Status'), ('https://www.nytimes.com/2025/05/03/business/trump-auto-parts-tariffs.html', '', 'Car Prices Expected to Rise as Tariffs on Parts Kick In'), ('https://www.nytimes.com/2025/05/03/world/asia/us-china-tariffs-fentanyl.html', '', 'As China Looks for Way Out of U.S. Trade Deadlock, Fentanyl Could Be Key'), ('https://www.nytimes.com/2025/05/03/business/china-tariffs-temu-shein.html', '', 'U.S. Tariff on Cheap Chinese Imports Will Cost Big Tech Billions'), ('https://www.nytimes.com/2025/05/03/business/opec-plus-oil-production-trump.html', '', 'Why OPEC Plus Is Increasing Oil Supplies Despite Falling Prices'), ('', '', 'Why OPEC Plus Is Increasing Oil Supplies Despit

In [38]:
urls = []
a_tags = soup.find_all("a")
for a_tag in a_tags:
  link = a_tag["href"]
  text = a_tag.get_text()
  urls.append((link, text))

In [41]:
print(urls)

[('#site-content', 'Skip to content'), ('#site-index', 'Skip to site index'), ('#after-dfp-ad-top', 'SKIP ADVERTISEMENT'), ('#site-content', 'Skip to content'), ('#site-index', 'Skip to site index'), ('/', ''), ('/', 'U.S.'), ('/international/', 'International'), ('/ca/', 'Canada'), ('https://www.nytimes.com/es/', 'Español'), ('https://cn.nytimes.com', '中文'), ('https://www.nytimes.com/section/todayspaper', 'Today’s Paper'), ('/', ''), ('https://www.nytimes.com/international/section/us', 'U.S.'), ('https://www.nytimes.com/international/section/us', 'U.S.'), ('https://www.nytimes.com/international/section/politics', 'Politics'), ('https://www.nytimes.com/international/section/nyregion', 'New York'), ('https://www.nytimes.com/spotlight/california-news', 'California'), ('https://www.nytimes.com/international/section/education', 'Education'), ('https://www.nytimes.com/international/section/health', 'Health'), ('https://www.nytimes.com/international/section/obituaries', 'Obituaries'), ('http

In [None]:
import re

matches = soup.find_all(lambda tag: tag.name and re.search(r'trade', tag.get_text(), re.IGNORECASE))
for tag in matches:
    print(f"<{tag.name}> → {tag.get_text(strip=True)}")


<html> → The New York Times International - Breaking News, US News, World News, VideosSkip to contentSkip to site indexSKIP ADVERTISEMENTSkip to contentSkip to site indexU.S.InternationalCanadaEspañol中文Today’s PaperU.S.SectionsU.S.PoliticsNew YorkCaliforniaEducationHealthObituariesScienceClimateWeatherSportsBusinessTechThe UpshotThe MagazineTop StoriesDonald TrumpSupreme CourtCongressImmigrationAbortionNewslettersThe MorningMake sense of the day’s news and ideas.The UpshotAnalysis that explains politics, policy and everyday life.See all newslettersPodcastsThe DailyThe biggest stories of our time, in 20 minutes a day.See all podcastsWorldSectionsWorldAfricaAmericasAsiaAustraliaCanadaEuropeMiddle EastScienceClimateWeatherHealthObituariesTop StoriesMiddle East CrisisRussia-Ukraine WarChina International RelationsThe Global ProfileLeer en EspañolNewslettersMorning Briefing: EuropeGet what you need to know to start your day.The InterpreterOriginal analysis on the week’s biggest global stori

In [51]:
headlines = [h.get_text(strip=True) for h in soup.find_all(re.compile('^h[1-6]$'))]
print(headlines)

['Sections', '', 'Top Stories', 'Newsletters', 'Podcasts', 'Sections', 'Top Stories', 'Newsletters', '', 'Sections', '', 'Top Stories', 'Newsletters', 'Podcasts', 'Sections', '', 'Recommendations', 'Newsletters', 'Podcasts', 'Sections', '', 'Columns', 'Newsletters', 'Podcasts', 'Sections', 'Topics', 'Columnists', 'Podcasts', 'Audio', 'Listen', 'Featured', 'Newsletters', 'Games', 'Play', '', 'Community', 'Newsletters', 'Cooking', 'Recipes', "Editors' Picks", 'Newsletters', '', 'Wirecutter', 'Reviews', '', 'The Best...', 'Newsletters', 'The Athletic', 'Leagues', 'Top Stories', 'Newsletters', 'Play', 'Sections', '', 'Top Stories', 'Newsletters', 'Podcasts', 'Sections', 'Top Stories', 'Newsletters', '', 'Sections', '', 'Top Stories', 'Newsletters', 'Podcasts', 'Sections', '', 'Recommendations', 'Newsletters', 'Podcasts', 'Sections', '', 'Columns', 'Newsletters', 'Podcasts', 'Sections', 'Topics', 'Columnists', 'Podcasts', 'Audio', 'Listen', 'Featured', 'Newsletters', 'Games', 'Play', '', 'C

In [52]:
keywords = ['election', 'climate', 'trade']
for word in keywords:
    matches = soup.find_all(string=re.compile(word, re.IGNORECASE))
    for match in matches:
        print(f"Found '{word}':", match.strip())
print(keywords)

Found 'election': {"@context":"https://schema.org","@type":"WebPage","image":[{"@context":"https://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/vi-assets/images/share/1200x675_nameplate.png","height":675,"width":1200,"contentUrl":"https://static01.nyt.com/vi-assets/images/share/1200x675_nameplate.png","creditText":"The New York Times"},{"@context":"https://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/vi-assets/images/share/1200x900_t.png","height":900,"width":1200,"contentUrl":"https://static01.nyt.com/vi-assets/images/share/1200x900_t.png","creditText":"The New York Times"},{"@context":"https://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/vi-assets/images/share/1200x1200_t.png","height":1200,"width":1200,"contentUrl":"https://static01.nyt.com/vi-assets/images/share/1200x1200_t.png","creditText":"The New York Times"}],"name":"The New York Times","alternateName":["NYT","new york times","nytimes","ny times"],"mainEntity":{"@c