# Webpage data extraction using Beautiful Soup 4

For a list of references see:

https://blog.hartleybrody.com/web-scraping-cheat-sheet/#using-beautifulsoup


In [1]:
!pip install beautifulsoup4
# pip install requests
import requests
from bs4 import BeautifulSoup



In [2]:
# Request the webpage
url = "https://www.spiegel.de/international"
# Always pass a timeout: without one, requests waits indefinitely if the server stalls.
req = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page further down.
req.raise_for_status()

In [3]:
# Inspect the structure of the page: this displays the raw HTML of the response,
# which can be compared with the DOM shown in Chrome / DevTools
req.text



In [4]:
# List the attributes and methods available on the response object
dir(req)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [5]:
# Show the cookie jar attached to the response (empty in the recorded run)
req.cookies

<RequestsCookieJar[]>

In [6]:
# Create the BS4 object.
# Name the parser explicitly: the generic 'html' feature lets bs4 pick whichever
# HTML parser happens to be installed (and warn about guessing), so the parse
# tree could differ between machines. 'html.parser' ships with Python itself.
soup = BeautifulSoup(req.text, 'html.parser')

In [7]:
# Use an HTML selector: collect every <article> element of the page.
# find_all is the current method name; findAll is a deprecated BS3-era alias.
events = soup.find_all('article')
events

[<article aria-label="The Double Life of Former Wirecard Executive Jan Marsalek" class="lg:p-24 md:py-24 sm:py-16" data-has-linked-author="" data-sara-article-id="7e667c03-6690-41e6-92ad-583d94ba97e0" data-target-teaser="xxl-news">
 <header class="lg:flex lg:justify-between md:flex md:justify-between md:mx-24 sm:mx-16">
 <h2 class="lg:grow md:grow">
 <a class="text-black dark:text-shade-lightest block" href="https://www.spiegel.de/international/business/jan-marsalek-an-agent-for-russia-the-double-life-of-the-former-wirecard-executive-a-7e667c03-6690-41e6-92ad-583d94ba97e0" target="_self" title="The Double Life of Former Wirecard Executive Jan Marsalek">
 <span class="block text-primary-base hover:text-primary-dark focus:text-primary-darker dark:text-shade-lightest dark:hover:opacity-moderate dark:hover:text-shade-lightest dark:focus:opacity-moderate dark:focus:text-shade-lightest font-sansUI font-normal lg:text-base md:text-base sm:text-s leading-normal mb-4" data-target-teaser-el="top

In [8]:
# Number of <article> elements found on the page
len(events)

33

In [15]:
# View the second article (list indices start at 0, so this is index 1)
event=events[1]
event

<article aria-label="The Horrors of Trench Warfare" class="lg:p-24 md:py-24 sm:py-16" data-has-linked-author="" data-sara-article-id="b382bd59-8cbd-443c-9e7b-e1877d7e4a5b" data-target-teaser="xxl-news">
<header class="lg:flex lg:justify-between md:flex md:justify-between md:mx-24 sm:mx-16">
<h2 class="lg:grow md:grow">
<a class="text-black dark:text-shade-lightest block" href="https://www.spiegel.de/international/world/the-russian-invasion-a-visit-to-the-ukrainian-troops-in-the-trenches-on-the-front-a-b382bd59-8cbd-443c-9e7b-e1877d7e4a5b" target="_self" title="The Horrors of Trench Warfare">
<span class="block text-primary-base hover:text-primary-dark focus:text-primary-darker dark:text-shade-lightest dark:hover:opacity-moderate dark:hover:text-shade-lightest dark:focus:opacity-moderate dark:focus:text-shade-lightest font-sansUI font-normal lg:text-base md:text-base sm:text-s leading-normal mb-4" data-target-teaser-el="topMark">
Ukrainian Soldiers at the Front
</span>
<span class="bloc

In [16]:
# Get the headline: the first <h2> element inside the selected article
x=event.find('h2')
x

<h2 class="lg:grow md:grow">
<a class="text-black dark:text-shade-lightest block" href="https://www.spiegel.de/international/world/the-russian-invasion-a-visit-to-the-ukrainian-troops-in-the-trenches-on-the-front-a-b382bd59-8cbd-443c-9e7b-e1877d7e4a5b" target="_self" title="The Horrors of Trench Warfare">
<span class="block text-primary-base hover:text-primary-dark focus:text-primary-darker dark:text-shade-lightest dark:hover:opacity-moderate dark:hover:text-shade-lightest dark:focus:opacity-moderate dark:focus:text-shade-lightest font-sansUI font-normal lg:text-base md:text-base sm:text-s leading-normal mb-4" data-target-teaser-el="topMark">
Ukrainian Soldiers at the Front
</span>
<span class="block lg:mb-24 md:mb-16 sm:mb-16">
<span class="font-brandUI font-extrabold lg:text-5xl md:text-5xl sm:text-3xl leading-tight"><span class="align-middle hover:opacity-moderate focus:opacity-moderate">The Horrors of Trench Warfare</span>
</span>
</span>
</a>
</h2>

In [12]:
# Get the title from the anchor tag within the headline.
# x is expected to be the <h2> element selected in the previous cell.
title=x.find("a")["title"]
title

'The Horrors of Trench Warfare'

In [13]:
# Get the article link from the anchor tag within the headline.
# x is expected to still be the <h2> element selected earlier.
href=x.find("a")["href"]
href

'https://www.spiegel.de/international/world/the-russian-invasion-a-visit-to-the-ukrainian-troops-in-the-trenches-on-the-front-a-b382bd59-8cbd-443c-9e7b-e1877d7e4a5b'

In [14]:
# Get the figure HTML element of the selected article.
# NOTE: this rebinds x, which held the <h2> headline until now.
x=event.find('figure')
x

<figure class="lg:mb-16 relative">
<a class="block" href="https://www.spiegel.de/international/world/the-russian-invasion-a-visit-to-the-ukrainian-troops-in-the-trenches-on-the-front-a-b382bd59-8cbd-443c-9e7b-e1877d7e4a5b" target="_self" title="The Horrors of Trench Warfare">
<div class="relative bg-transparent" data-sara-component='{"id":"b5b7a696-6cb6-4d47-a012-efba963f8e49","name":"image","title":"The Horrors of Trench Warfare - Johanna Maria Fritz / Agentur Ostkreuz / DER SPIEGEL","type":"media"}'>
<picture>
<source data-sizes="(max-width: 519px) 100vw, (min-width: 520px) and (max-width: 719px) 520px, (min-width: 720px) and (max-width: 919px) 100vw, (min-width: 920px) and (max-width: 1019px) 920px, (min-width: 1020px) 960px" data-srcset="https://cdn.prod.www.spiegel.de/images/b5b7a696-6cb6-4d47-a012-efba963f8e49_w960_r2.194_fpx49_fpy48.webp 960w" type="image/webp"/>
<img alt="" class="lazyload rounded md:hidden sm:hidden" data-image-el="img" data-src="https://cdn.prod.www.spiegel.d

In [23]:
# Get the image source using HTML and CSS selection.
# x is expected to be the <figure> element found in the previous cell. It is None
# for an article without a figure, and the <img> lookup can come back empty too,
# so both cases are guarded instead of failing with an AttributeError.
image = x.find("img",{'data-image-el':'img'}) if x is not None else None
print(image)
image_url=""
if image is not None:
    # data-src is checked first and src last, so an https src wins when both
    # qualify; placeholder "data:" URIs in src are skipped by the https:// test.
    for attribute in ("data-src", "src"):
        if image.has_attr(attribute) and image[attribute].startswith("https://"):
            image_url = image[attribute]
print(image_url)

<img alt="" class="lazyload rounded" data-image-el="img" data-src="https://cdn.prod.www.spiegel.de/images/d1af8767-e27e-4e20-9e89-5dd5da114ab8_w288_r1.778_fpx67.63_fpy44.98.jpg" height="274" src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 488 274' width='488' height='274' %3E%3C/svg%3E" title="Arrests Suggest Terrorist Organization Is Operating in Europe - Paul Zinken / dpa" width="488"/>
https://cdn.prod.www.spiegel.de/images/d1af8767-e27e-4e20-9e89-5dd5da114ab8_w288_r1.778_fpx67.63_fpy44.98.jpg


In [18]:
# Look inside the first article for a <span> carrying the given class string.
# find returns None when there is no such element.
event=events[0]
x=event.find('span',{'class':'font-sansUI font-normal text-s text-shade-dark'})

In [19]:
# Show the element's text, but only when the lookup actually found something
if x is not None:
    print(x.text)

In [20]:
# Collect title, link and (when available) image URL of every article teaser
my_events = []
for event in events:
    event_details = dict()

    # Title and link are both attributes of the anchor inside the <h2> headline;
    # articles without headline, anchor or attribute simply lack the key.
    headline = event.find('h2')
    anchor = headline.find("a") if headline is not None else None
    if anchor is not None:
        for key in ("title", "href"):
            if anchor.has_attr(key):
                event_details[key] = anchor[key]

    figure = event.find('figure')
    image = None
    if figure is not None:
        image = figure.find("img", {'data-image-el': 'img'})
    if image is not None:
        # Fixed: the src test used to be made on the <figure> element, which
        # never carries a src attribute, so that branch could not run.
        # data-src is checked first and src last, so an https src takes
        # precedence when both attributes hold an https URL.
        for attribute in ("data-src", "src"):
            if image.has_attr(attribute) and image[attribute].startswith("https://"):
                event_details['image'] = image[attribute]
    my_events.append(event_details)


In [21]:
# Show the collected per-article dictionaries
my_events

[{'title': 'The Double Life of Former Wirecard Executive Jan Marsalek',
  'href': 'https://www.spiegel.de/international/business/jan-marsalek-an-agent-for-russia-the-double-life-of-the-former-wirecard-executive-a-7e667c03-6690-41e6-92ad-583d94ba97e0'},
 {'title': 'The Horrors of Trench Warfare',
  'href': 'https://www.spiegel.de/international/world/the-russian-invasion-a-visit-to-the-ukrainian-troops-in-the-trenches-on-the-front-a-b382bd59-8cbd-443c-9e7b-e1877d7e4a5b',
  'image': 'https://cdn.prod.www.spiegel.de/images/b5b7a696-6cb6-4d47-a012-efba963f8e49_w960_r2.194_fpx49_fpy48.jpg'},
 {'title': 'Macron Attempts to Save a City Rocked by Drug Violence',
  'href': 'https://www.spiegel.de/international/europe/the-marseille-experiment-macron-attempts-to-save-a-city-overtaken-by-drug-violence-a-bade2006-b7d7-433c-abf1-f053d56680aa',
  'image': 'https://cdn.prod.www.spiegel.de/images/a7729fd4-503d-43b6-a854-0bac27c93504_w960_r2.194_fpx30_fpy55.01.jpg'},
 {'title': 'How Vladimir Putin Contro

In [22]:
# Get one link: the article URL stored in the third collected entry
details_url=my_events[2]["href"]
details_url

'https://www.spiegel.de/international/europe/the-marseille-experiment-macron-attempts-to-save-a-city-overtaken-by-drug-violence-a-bade2006-b7d7-433c-abf1-f053d56680aa'

Check the article page manually in Chrome DevTools using the following XPath expression:

//div[contains(@class,"RichText")]/p/text()

In [24]:
# Request the article page.
# The timeout keeps the cell from hanging if the server stalls, and
# raise_for_status stops on a 4xx/5xx answer instead of parsing an error page.
details_req = requests.get(details_url, timeout=30)
details_req.raise_for_status()

In [25]:
# Create a Soup object for the article page.
# The parser is named explicitly ('html.parser' ships with Python) so that bs4
# does not have to guess one from whatever happens to be installed.
details_soup = BeautifulSoup(details_req.text, 'html.parser')

In [26]:
import re
# AND expression with lookaheads: both "RichText" and "word-wrap" have to occur
# in the (single-line) string the pattern is applied to
regex = re.compile('(?=.*RichText.*)(?=.*word-wrap.*)')
# OR expression with lookaheads: one of the two substrings is sufficient
# regex = re.compile('(?=.*RichText.*|.*word-wrap.*)')

In [28]:
# Use an HTML selector: all <div> elements whose class attribute matches the regex.
# find_all is the current method name; findAll is a deprecated BS3-era alias.
details_events = details_soup.find_all('div',{'class':regex})
details_events

[<div class="RichText lg:w-8/12 md:w-10/12 lg:mx-auto md:mx-auto lg:px-24 md:px-24 sm:px-16 break-words word-wrap">
 <p><em>From the police files on the victims of the 2023 drug war: Rayan, 23, found dead and burned in the trunk of a car. Mohamad, 18, shot to death in April in Cité du Mail. Hugo, 22, hit in the head by a bullet in June. Socayana, a 24-year-old university student, killed in September by a stray bullet.</em></p>
 </div>,
 <div class="RichText lg:w-8/12 md:w-10/12 lg:mx-auto md:mx-auto lg:px-24 md:px-24 sm:px-16 break-words word-wrap">
 <p>It’s hard to say how Emmanuel Macron decided that Marseille was his favorite city. Not the more middle-class metropolis of Bordeaux, not mundane Cannes, but the colorful, loud and, in some quarters, extremely dangerous Mediterranean city is his choice. It is something the French president emphasizes every chance he gets.</p>
 </div>,
 <div class="RichText lg:w-8/12 md:w-10/12 lg:mx-auto md:mx-auto lg:px-24 md:px-24 sm:px-16 break-words 

In [29]:
# Iterate over all p tags of the selected containers and print their text
# (find_all replaces the deprecated findAll alias)
for devents in details_events:
    for p in devents.find_all("p"):
        print(p.text)

From the police files on the victims of the 2023 drug war: Rayan, 23, found dead and burned in the trunk of a car. Mohamad, 18, shot to death in April in Cité du Mail. Hugo, 22, hit in the head by a bullet in June. Socayana, a 24-year-old university student, killed in September by a stray bullet.
It’s hard to say how Emmanuel Macron decided that Marseille was his favorite city. Not the more middle-class metropolis of Bordeaux, not mundane Cannes, but the colorful, loud and, in some quarters, extremely dangerous Mediterranean city is his choice. It is something the French president emphasizes every chance he gets.
The chances of being robbed in Marseille are twice as high as the countrywide average. The risk of violent attack is also much higher than elsewhere. In the 3rd Arrondissement of this city of 870,000, every second resident lives below the poverty line.
In the notorious quartiers nord, the poorer sections of town in the northern part of the city, there are schools where the cla

In [30]:
# Create a function for the code above
# Define a function to automatically extract the text
def downloadText(url, timeout=30):
    """Download an article page and return the text of its RichText paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server; handed to requests.get so the
            call cannot hang indefinitely.

    Returns:
        The text of every <p> found inside <div> elements whose class matches
        "RichText", each paragraph preceded by a single space. An empty string
        when nothing matches.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    details_req = requests.get(url, timeout=timeout)
    # Stop on error responses rather than scraping an error page.
    details_req.raise_for_status()
    # Explicit parser: the generic 'html' feature makes bs4 guess one.
    details_soup = BeautifulSoup(details_req.text, 'html.parser')
    # NOTE: only "RichText" is required here; "word-wrap" is not checked.
    regex = re.compile('.*RichText.*')
    details_events = details_soup.find_all('div', {'class': regex})
    # A single join replaces repeated string concatenation; the leading space
    # in front of every paragraph is kept so the result is unchanged.
    return "".join(
        " " + paragraph.text
        for devents in details_events
        for paragraph in devents.find_all("p")
    )

In [31]:
# Test the function on the article URL selected earlier
print(downloadText(details_url))



In [None]:
# Download all images - Helper method
def downloadImage(url, file_name, timeout=30):
    """Download the resource at url and store its bytes in file_name.

    Args:
        url: Address of the image.
        file_name: Path of the file to write; an existing file is overwritten.
        timeout: Seconds to wait for the server; handed to requests.get so the
            call cannot hang indefinitely.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Fetch first: the file is only created once the download has succeeded,
    # so a failed request no longer leaves an empty file behind.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Binary mode, because response.content holds the raw bytes
    with open(file_name, "wb") as file:
        file.write(response.content)

In [None]:
# Download all images, numbering the files consecutively (0.jpg, 1.jpg, ...).
# Entries without an image are filtered out first, so enumerate yields the same
# gap-free numbering the manual counter produced.
image_urls = [e["image"] for e in my_events if e.get("image") is not None]
for i, image_url in enumerate(image_urls):
    downloadImage(image_url, str(i) + ".jpg")