In [14]:
# colab has an older version of beautifulsoup by default
# here we upgrade it
# if you are working on your own computer, you can probably comment this step out and skip it
!pip install --upgrade beautifulsoup4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# 1. import libraries
!pip install --upgrade beautifulsoup4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# 2. find url and store it in a variable
url = "https://en.wikipedia.org/wiki/Berlin"

In [4]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [5]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
#soup

In [6]:
# 5. retrieve/extract the desired info (here, you'll paste the "Selector" you copied before to get the element that belongs to the top movie)

# let's first try to get the name of the city
# by copying the selector we can see that it has the id firstHeading (it also has a class by the same name!)
soup.select("#firstHeading")

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Berlin</span></h1>]

In [7]:
soup.select("#firstHeading")[0].get_text()

'Berlin'

In [8]:
# Let's use this class, infobox-data, to target the information country
soup.select(".infobox-data")[0].get_text()

'\xa0Germany'

In [9]:
soup.select(".infobox-data")[0].get_text()

'\xa0Germany'

Now we just carry on exploring the html, finding classes, ids, and selectors to target the information we need. Hopefully these classes and selectors will be universal across all cities on wikipedia, but it is likely that they will change in a few places, and we will have to try to make our code robust to this

In [10]:
!pip install --upgrade beautifulsoup4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

def recreate_wiki(cities):
   list_for_df = []
   for city in cities:
     url = f'https://en.wikipedia.org/wiki/{city}'
     r = requests.get(url)
     soup = BeautifulSoup(r.content, 'html.parser')
     response_dict = {}
     response_dict['city'] = soup.select(".firstHeading")[0].get_text()
     response_dict['country'] = soup.select(".infobox-data")[0].get_text()
     response_dict['latitude'] = soup.select(".latitude")[0].get_text()
     response_dict['longitude'] = soup.select(".longitude")[0].get_text()
     response_dict['population'] = soup.select_one('th.infobox-header:-soup-contains("Population")').parent.find_next_sibling().find(text=re.compile(r'\d+'))
     list_for_df.append(response_dict)
     cities_df = pd.DataFrame(list_for_df)
     cities_df['latitude'] = cities_df['latitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
     cities_df['longitude'] = cities_df['longitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
   return cities_df

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
list_of_cities = ['Berlin','Hamburg','London','Manchester','Munich','Stuttgart']
cities=recreate_wiki(list_of_cities)
cities

Unnamed: 0,city,country,latitude,longitude,population
0,Berlin,Germany,52.3112,13.2418,3769495
1,Hamburg,Germany,53.33,10.0,1845229
2,London,United Kingdom,51.3026,0.739,9002488
3,Manchester,United Kingdom,53.2846,2.1443,552858
4,Munich,Germany,48.0815,11.343,1488202
5,Stuttgart,Germany,48.4639,9.1048,630305


## Alternate to get population:

In [12]:
#df=pd.DataFrame(list(zip(city, country, population, coords)), columns =['city', 'country', 'population', 'coords'])

In [13]:
# here we make our soup for the city
r = requests.get("https://en.wikipedia.org/wiki/Berlin")
soup = BeautifulSoup(r.content, 'html.parser')
soup.select('.infobox-label ')

[<th class="infobox-label" scope="row">Country</th>,
 <th class="infobox-label" scope="row"><a href="/wiki/States_of_Germany" title="States of Germany">State</a></th>,
 <th class="infobox-label" scope="row"> • Body</th>,
 <th class="infobox-label" scope="row"> • <a href="/wiki/Governing_Mayor_of_Berlin" title="Governing Mayor of Berlin">Governing Mayor</a></th>,
 <th class="infobox-label" scope="row"> • <a class="mw-redirect" href="/wiki/Bundesrat_of_Germany" title="Bundesrat of Germany">Bundesrat votes</a></th>,
 <th class="infobox-label" scope="row"> • <a href="/wiki/Bundestag" title="Bundestag">Bundestag seats</a></th>,
 <th class="infobox-label" scope="row"> • City/State</th>,
 <th class="infobox-label" scope="row"> • Urban<div class="ib-settlement-fn"></div></th>,
 <th class="infobox-label" scope="row"> • Metro<div class="ib-settlement-fn"></div></th>,
 <th class="infobox-label" scope="row">Elevation<div class="ib-settlement-fn"></div></th>,
 <th class="infobox-label" scope="row">