<b>Web scraping, small examples</b>

In [2]:
import json
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# pip install beautifulsoup4
# pip install requests
from bs4 import BeautifulSoup
import requests

In [4]:
# define the URL of the webpage and download the raw HTML
url = "https://en.wikipedia.org/wiki/Rovaniemi"

# requests has no default timeout, so set one explicitly to keep
# the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=10)

# stop right here on an HTTP error (4xx/5xx) instead of
# silently parsing an error page in the cells below
page.raise_for_status()

In [5]:
# parse the downloaded bytes into a navigable "soup object"
# using Python's built-in HTML parser
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [6]:
# Locate the city name on the page.
# Tip: use your browser's developer tools (often F12) to inspect
# the element you are trying to scrape.
# Here it is the <h1> whose id is "firstHeading"; the name is the text inside it.
#
# filtering by CSS class => use the class_ keyword
# filtering by id        => use the id keyword
heading = soup.find("h1", id="firstHeading")
cityname = heading.get_text()
cityname

'Rovaniemi'

In [7]:
# find_all returns every match, not just the first one:
# here all <tr> rows with class "mergedtoprow", i.e. the rows of the
# small summary table on the right side of the page
elements = soup.find_all(name="tr", class_="mergedtoprow")

In [8]:
# print the text content of each matched <tr class="mergedtoprow"> row
for e in elements:
    print(e.get_text())

Rovaniemen kaupunkiRovaniemi stadCity of Rovaniemi
Clockwise from top: the Rovaniemi Church, the Rovaniemi Airport, the Santa Claus Village, downtown Rovaniemi, a view of the city from Ounasvaara, the Arktikum Science Museum, and aurora borealis in Someroharju.
.mw-parser-output .ib-settlement-cols{text-align:center;display:table;width:100%}.mw-parser-output .ib-settlement-cols-row{display:table-row}.mw-parser-output .ib-settlement-cols-cell{display:table-cell;vertical-align:middle}.mw-parser-output .ib-settlement-cols-cellt{display:table-cell;vertical-align:top}

FlagCoat of arms
Nickname(s): Arctic Capital; Hometown of Santa Claus
Location of Rovaniemi in Finland
CountryFinland
Charter1960
Government
Area (2018-01-01)[1]
Population (2023-12-31)[2]
Population by native language[3]
Population by age[4]
Time zoneUTC+02:00 (EET)
Websitewww.rovaniemi.fi


In [9]:
# the municipality name sits in the first <span> whose title
# attribute is "English-language text"
english_name_span = soup.find("span", title="English-language text")
municipality = english_name_span.get_text()
municipality

'City of Rovaniemi'

In [10]:
# the image caption is the text of the first <div> with class "ib-settlement-caption"
caption_div = soup.find("div", class_="ib-settlement-caption")
caption_text = caption_div.get_text()
caption_text

'Clockwise from top: the Rovaniemi Church, the Rovaniemi Airport, the Santa Claus Village, downtown Rovaniemi, a view of the city from Ounasvaara, the Arktikum Science Museum, and aurora borealis in Someroharju.'

<b>Let's try the timezone, which doesn't have a single identifier</b>
<p>Instead we have to loop through a pile of elements to find the correct one</p>

In [11]:
# getting multiple elements, <tr> with class "mergedtoprow"
# this is the small summary table on the right side of the page
elements = soup.find_all("tr", class_="mergedtoprow")

# we have 14 elements (4.4.2024)
# len(elements)

# The time zone row has no id or class of its own, so walk through
# the rows and pick the one whose <th> label reads "Time zone".
for e in elements:
    # look the header cell up once and reuse it
    header = e.find("th")

    # rows without a <th> cannot be the one we want
    if header is None:
        continue

    # first text node inside the <th> (the label may be wrapped in a link)
    if header.find(string=True) == "Time zone":
        # search for the value cell inside this row only, so a row
        # without a <td> can never pick up a cell from a later row
        info = e.find("td")
        if info is not None:
            print(info.text)
        # stop at the first match; `e` keeps pointing at this row
        break

UTC+02:00 (EET)


In [47]:
# inspect the last <tr> visited by the most recent `for e in elements` loop
e

<tr class="mergedtoprow"><th class="infobox-label" scope="row"><a href="/wiki/Time_zone" title="Time zone">Time zone</a></th><td class="infobox-data"><a href="/wiki/UTC%2B02:00" title="UTC+02:00">UTC+02:00</a> (<a href="/wiki/Eastern_European_Time" title="Eastern European Time">EET</a>)</td></tr>

In [46]:
# inspect the full list of matched <tr class="mergedtoprow"> rows
elements

[<tr class="mergedtoprow ib-settlement-official"><td class="infobox-full-data" colspan="2"><span title="Finnish-language text"><i lang="fi">Rovaniemen kaupunki</i></span><br/><span title="Swedish-language text"><i lang="sv">Rovaniemi stad</i></span><br/><span title="English-language text"><span lang="en">City of Rovaniemi</span></span></td></tr>,
 <tr class="mergedtoprow"><td class="infobox-full-data" colspan="2"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Rovaniemi_Montage.jpg" title="Clockwise from top: the Rovaniemi Church, the Rovaniemi Airport, the Santa Claus Village, downtown Rovaniemi, a view of the city from Ounasvaara, the Arktikum Science Museum, and aurora borealis in Someroharju."><img alt="Clockwise from top: the Rovaniemi Church, the Rovaniemi Airport, the Santa Claus Village, downtown Rovaniemi, a view of the city from Ounasvaara, the Arktikum Science Museum, and aurora borealis in Someroharju." class="mw-file-element" data-file-height="2832" 

<b>Let's try scraping an HTML table instead</b>

In [12]:
# a Wikipedia page that contains several HTML tables
url = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"

# let pandas download the page and parse every HTML table on it;
# the result is a list with one DataFrame per table
tables = pd.read_html(io=url)

In [13]:
# in this case, we're interested in the 2nd table (list index 1)
# (the 1st one is the small menu on the top left)
# NOTE(review): the table is picked by position, so this presumably needs
# adjusting if the page layout changes - verify against len(tables)
actual_data = tables[1]

In [14]:
# and this is a DataFrame
# preview only the first rows instead of dumping the whole table into the notebook
actual_data.head(10)

Unnamed: 0,Title,Sales,Series,Platform(s),Initial release date,Developer(s)[b],Publisher(s)[b],Ref.
0,Minecraft,300000000,Minecraft,Multi-platform,"November 18, 2011[c]",Mojang Studios,Mojang Studios,[4]
1,Grand Theft Auto V,195000000,Grand Theft Auto,Multi-platform,"September 17, 2013",Rockstar North,Rockstar Games,[5]
2,Tetris (EA),100000000,Tetris,Multi-platform,"September 12, 2006",EA Mobile,Electronic Arts,[6]
3,Wii Sports,82900000,Wii,Wii,"November 19, 2006",Nintendo EAD,Nintendo,[7]
4,PUBG: Battlegrounds,75000000,PUBG Universe,Multi-platform,"December 20, 2017",PUBG Studios,Krafton,[8]
5,Mario Kart 8 / Deluxe,69040000,Mario Kart,Wii U / Switch,"May 29, 2014",Nintendo EAD / Nintendo EPD (Deluxe),Nintendo,[d]
6,Red Dead Redemption 2,61000000,Red Dead,Multi-platform,"October 26, 2018",Rockstar Games,Rockstar Games,[5]
7,Super Mario Bros.,58000000,Super Mario,Multi-platform,"September 13, 1985",Nintendo R&D4,Nintendo,[e]
8,Overwatch,50000000,Overwatch,Multi-platform,"May 24, 2016",Blizzard Entertainment,Blizzard Entertainment,[14]
9,Human: Fall Flat,50000000,Human: Fall Flat,Multi-platform,"July 22, 2016",No Brakes Games,Curve Digital,[15]


In [45]:
import json
import numpy as np
import pandas as pd
import seaborn as sns

from bs4 import BeautifulSoup
import requests

# define the URL of the webpage and download the raw HTML
url = "https://en.wikipedia.org/wiki/Rovaniemi"

# requests has no default timeout, so set one explicitly to keep
# the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=10)

# fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
page.raise_for_status()

# create a "soup object" of the raw web-page
soup = BeautifulSoup(page.content, "html.parser")

# the coordinates are the text of the first <span class="geo">;
# guard against a missing element the same way as for the nickname below
coordinates_element = soup.find("span", class_="geo")
coordinates = coordinates_element.get_text() if coordinates_element else "Coordinates not found"

# a class_ string containing a space is compared against the exact value
# of the class attribute, so both class names must appear in this order
nickname_element = soup.find("div", class_="ib-settlement-nickname nickname")
nickname = nickname_element.get_text() if nickname_element else "Nickname not found"

print(f"Coordinates of Rovaniemi: {coordinates}")
print(f"Nicknames of Rovaniemi: {nickname}")

Coordinates of Rovaniemi: 66.500; 25.733
Nicknames of Rovaniemi: Arctic Capital; Hometown of Santa Claus
