In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# 1.&nbsp; Population Scraping

## 1.1.&nbsp; Scraping the populations

### Berlin

In [3]:
url = 'https://www.wikipedia.org/wiki/Berlin'
# Give up after 10 seconds instead of waiting indefinitely, and fail loudly on an
# HTTP error so that an error page is never parsed as if it were the article.
response = requests.get(url, timeout=10)
response.raise_for_status()
berlin_soup = BeautifulSoup(response.content, 'html.parser')

> By using Google Chrome and using the developer tool (`View > Developer > Developer Tools`) we can see that the HTML tag surrounding the Population number that we want contains no attributes to use. So let's try searching for `Population` and using the `find_next()` method.

In [17]:
# Search for the string "Population" and take whichever tag comes next after it.
berlin_soup.find(string="Population").find_next()

# To make the code more robust, and to protect against the word "Population"
# appearing on its own somewhere else on the page, we could first select the
# information box on the right-hand side using:

# berlin_soup.find('table', class_='vcard').find(string="Population").find_next()

# The code works without it, so we'll leave it up to you whether this extra
# protection makes sense.

<div class="ib-settlement-fn"><span class="nowrap"> </span>(2024)<sup class="reference" id="cite_ref-Amt_für_Statistik_Berlin-Brandenburg_4-0"><a href="#cite_note-Amt_für_Statistik_Berlin-Brandenburg-4">[4]</a></sup></div>

The output doesn't look right! That's not a population!

Looking at the HTML again, we can see that this `div` is the next tag, but the number we want is in the next `td` tag. Let's update our code to reflect this.

In [8]:
# Ask specifically for the next <td> so that the footnote <div> in between is skipped.
population_label = berlin_soup.find(string="Population")
population_label.find_next("td")

<td class="infobox-data">3,576,873</td>

Got it that time. Now let's extract the text.

In [9]:
# Keep only the text inside the <td> cell, dropping the surrounding tag.
population_cell = berlin_soup.find(string="Population").find_next("td")
berlin_population = population_cell.get_text()
berlin_population

'3,576,873'

And now we need to tidy the text so that it's a number and not a string.

In [10]:
# Remove the thousands separators; the result is still a string at this point.
berlin_population_clean = "".join(berlin_population.split(","))
berlin_population_clean

'3576873'

In [11]:
# Convert to an integer starting from the raw scraped text, so this cell does not
# depend on its own previous output and gives the same result however often it is
# re-run (the same one-step form is used for Hamburg and Munich below).
berlin_population_clean = int(berlin_population.replace(",", ""))
berlin_population_clean

3576873

### Hamburg
Let's see if the lessons we learnt from Berlin apply to Hamburg.

In [12]:
url = 'https://www.wikipedia.org/wiki/Hamburg'
# Give up after 10 seconds instead of waiting indefinitely, and fail loudly on an
# HTTP error so that an error page is never parsed as if it were the article.
response = requests.get(url, timeout=10)
response.raise_for_status()
hamburg_soup = BeautifulSoup(response.content, 'html.parser')

# Same recipe as for Berlin: the <td> after the "Population" label, commas removed, as an int.
hamburg_population = hamburg_soup.find(string="Population").find_next("td").get_text()
hamburg_population_clean = int(hamburg_population.replace(",", ""))
hamburg_population_clean

1945532

Fantastic, the same rules apply. It's nice to find general rules like this, as you can then loop through things and make your life easier. If finding the population for Hamburg had been significantly different, it would have added complexity to our code. Unfortunately, web scraping is very fragile: if someone redesigns the page, our code breaks and we have to start again. This is why APIs are preferred when they're available.

### Munich

In [13]:
url = 'https://www.wikipedia.org/wiki/Munich'
# Give up after 10 seconds instead of waiting indefinitely, and fail loudly on an
# HTTP error so that an error page is never parsed as if it were the article.
response = requests.get(url, timeout=10)
response.raise_for_status()
munich_soup = BeautifulSoup(response.content, 'html.parser')

# Same recipe again: the <td> after the "Population" label, commas removed, as an int.
munich_population = munich_soup.find(string="Population").find_next("td").get_text()
munich_population_clean = int(munich_population.replace(",", ""))
munich_population_clean

1512491

Great, we've found a rule that generalises well. This helps us to automate our code with a loop.

## 1.2.&nbsp; Making a loop

In [15]:
cities = ["Berlin", "Hamburg", "Munich"]

# Start from an empty list each time so that re-running the cell never duplicates entries.
populations = []

for city in cities:
  # Every city page follows the same URL pattern; only the city name changes.
  url = f"https://www.wikipedia.org/wiki/{city}"
  # A timeout stops the loop from hanging on an unresponsive connection, and
  # raise_for_status() turns an HTTP error into an exception instead of letting
  # an error page be parsed as if it were the article.
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  city_soup = BeautifulSoup(response.content, 'html.parser')

  # The population is in the first <td> after the "Population" label.
  city_population = city_soup.find(string="Population").find_next("td").get_text()

  # Remove the thousands separators and convert the text to an integer.
  city_population_clean = int(city_population.replace(",", ""))

  populations.append(city_population_clean)

populations

[3576873, 1945532, 1512491]

Although we now know this rule works well for German cities, it might not work for all cities on Wikipedia. For now we are happy with just Germany, but if you want to test your skills later, and maybe also give yourself a headache, see if you can find a general rule for all cities worldwide. Also, how could you automate the testing of this on a mass scale?

# 2.&nbsp; Data Organisation

In [16]:
# Pair each city name with the population scraped for it, one row per city.
cities_df = pd.DataFrame(dict(City=cities, Population=populations))

cities_df

Unnamed: 0,City,Population
0,Berlin,3576873
1,Hamburg,1945532
2,Munich,1512491


In [None]:
# Print a summary of the columns, their non-null counts and their dtypes.
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City        3 non-null      object
 1   Population  3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


# 3.&nbsp; Further Enhancement

## 3.1.&nbsp; Expand the scope

#### Country
Again, by exploring the HTML using developer tools, we can see that the country is in a tag whose `class` attribute is `infobox-data`.

In [None]:
# List every element with the class "infobox-data" to see where the country sits among them.
berlin_soup.find_all(class_="infobox-data")

[<td class="infobox-data"><a href="/wiki/Germany" title="Germany">Germany</a></td>,
 <td class="infobox-data"><a href="/wiki/Boroughs_and_neighborhoods_of_Berlin" title="Boroughs and neighborhoods of Berlin">Berlin</a></td>,
 <td class="infobox-data agent"><a href="/wiki/Abgeordnetenhaus_of_Berlin" title="Abgeordnetenhaus of Berlin">Abgeordnetenhaus of Berlin</a></td>,
 <td class="infobox-data"><a href="/wiki/Kai_Wegner" title="Kai Wegner">Kai Wegner</a> (<a href="/wiki/Christian_Democratic_Union_of_Germany" title="Christian Democratic Union of Germany">CDU</a>)</td>,
 <td class="infobox-data">4 (of 69)</td>,
 <td class="infobox-data"><a href="/wiki/Results_of_the_2021_German_federal_election#Berlin" title="Results of the 2021 German federal election">29 (of 736)</a></td>,
 <td class="infobox-data">891.3 km<sup>2</sup> (344.1 sq mi)</td>,
 <td class="infobox-data">3,743 km<sup>2</sup> (1,445 sq mi)</td>,
 <td class="infobox-data">30,546 km<sup>2</sup> (11,794 sq mi)</td>,
 <td class="i

As we only want the first of all of these, let's make our lives easy and use `find()` instead of `find_all()`.

In [None]:
# find() returns only the first element with this class; we then read its text.
country_cell = berlin_soup.find(class_="infobox-data")
country_cell.get_text()

'Germany'

Let's test this rule for Hamburg and Munich.

In [None]:
# First element with the class "infobox-data" on the Hamburg page, as text.
country_cell = hamburg_soup.find(class_="infobox-data")
country_cell.get_text()

'Germany'

In [None]:
# First element with the class "infobox-data" on the Munich page, as text.
country_cell = munich_soup.find(class_="infobox-data")
country_cell.get_text()

'Germany'

#### Latitude and Longitude

In [None]:
# Text of the first element with the class "latitude" on the Berlin page.
latitude_tag = berlin_soup.find(class_="latitude")
latitude_tag.get_text()

'52°31′12″N'

In [None]:
# Text of the first element with the class "longitude" on the Berlin page.
longitude_tag = berlin_soup.find(class_="longitude")
longitude_tag.get_text()

'13°24′18″E'

In [None]:
# Same two lookups as for Berlin, shown together as a (latitude, longitude) tuple.
tuple(hamburg_soup.find(class_=axis).get_text() for axis in ("latitude", "longitude"))

('53°33′N', '10°00′E')

In [None]:
# Same two lookups as for Berlin, shown together as a (latitude, longitude) tuple.
tuple(munich_soup.find(class_=axis).get_text() for axis in ("latitude", "longitude"))

('48°08′15″N', '11°34′30″E')

## 3.2.&nbsp; Create a function

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def cities_dataframe(cities, timeout=10):
  """Scrape country, population and coordinates for each city from Wikipedia.

  For every name in ``cities`` the page ``https://www.wikipedia.org/wiki/<city>``
  is downloaded and parsed with BeautifulSoup's ``html.parser``.

  Parameters
  ----------
  cities : iterable of str
      Wikipedia article names, e.g. ``["Berlin", "Hamburg", "Munich"]``.
  timeout : float, optional
      Seconds to wait for each HTTP response before giving up (default 10).

  Returns
  -------
  pandas.DataFrame
      One row per city with the columns ``City``, ``Country``, ``Population``,
      ``Latitude`` and ``Longitude``. ``Country`` is the text of the first element
      with the class ``infobox-data``; ``Population`` is the integer read from the
      first ``<td>`` after the string ``"Population"``; ``Latitude`` and
      ``Longitude`` are the texts of the first elements with those class names.
      An empty ``cities`` gives an empty frame that still has these columns.

  Raises
  ------
  requests.HTTPError
      If the server answers with an error status code.
  ValueError
      If one of the expected elements is missing from a page, or if the
      population text cannot be converted to an integer.
  """
  columns = ["City", "Country", "Population", "Latitude", "Longitude"]
  cities_data = []

  for city in cities:
    # create the soup; fail fast on a hung connection or an HTTP error page
    url = f"https://www.wikipedia.org/wiki/{city}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # locate every element first so that a missing one is reported by name
    # instead of surfacing as an AttributeError on None
    population_label = city_soup.find(string="Population")
    elements = {
      "country": city_soup.find(class_="infobox-data"),
      "population": population_label.find_next("td") if population_label is not None else None,
      "latitude": city_soup.find(class_="latitude"),
      "longitude": city_soup.find(class_="longitude"),
    }
    missing = [name for name, element in elements.items() if element is None]
    if missing:
      raise ValueError(
        f"Could not find {', '.join(missing)} on the Wikipedia page for {city!r} ({url})"
      )

    # population: drop a trailing citation marker such as "[4]" and the
    # thousands separators before converting to an integer
    population_text = elements["population"].get_text()
    try:
      population = int(population_text.split("[", 1)[0].replace(",", ""))
    except ValueError as error:
      raise ValueError(
        f"Population for {city!r} is not a plain number: {population_text!r}"
      ) from error

    # append this city's data to the cities list
    cities_data.append({
      "City": city,
      "Country": elements["country"].get_text(),
      "Population": population,
      "Latitude": elements["latitude"].get_text(),
      "Longitude": elements["longitude"].get_text(),
    })

  # passing the columns explicitly keeps the schema stable even for an empty input
  return pd.DataFrame(cities_data, columns=columns)

In [None]:
# Build the table for the three German cities used throughout this notebook.
cities_dataframe(["Berlin", "Hamburg", "Munich"])

Unnamed: 0,City,Country,Population,Latitude,Longitude
0,Berlin,Germany,3850809,52°31′12″N,13°24′18″E
1,Hamburg,Germany,1945532,53°33′N,10°00′E
2,Munich,Germany,1512491,48°08′15″N,11°34′30″E


# 4.&nbsp; Global Data Scraping
This isn't necessary for this project, even though it can be a fun exercise. We won't spoil it here by giving you the answer. Come back to this at the end of the project and see if you can get a working answer.