In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sqlalchemy

## Berlin

In [4]:
# get the berlin_soup
url = 'https://www.wikipedia.org/wiki/Berlin'
response = requests.get(url)
berlin_soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
berlin_soup.find(class_="infobox-data").get_text()

'\xa0Germany'

In [8]:
berlin_country = berlin_soup.find(class_="infobox-data").get_text()
berlin_country

'\xa0Germany'

In [10]:
berlin_soup.find(class_="latitude")

<span class="latitude">52°31′12″N</span>

In [12]:
berlin_latitude = berlin_soup.find(class_="latitude").get_text()
berlin_latitude

'52°31′12″N'

In [14]:
berlin_longitude = berlin_soup.find(class_="longitude").get_text()
berlin_longitude

'13°24′18″E'

## Hamburg

In [16]:
url = 'https://www.wikipedia.org/wiki/Hamburg'
response = requests.get(url)
hamburg_soup = BeautifulSoup(response.content, 'html.parser')

hamburg_country = hamburg_soup.find(class_="infobox-data").get_text()
hamburg_latitude = hamburg_soup.find(class_="latitude").get_text()
hamburg_longitude = hamburg_soup.find(class_="longitude").get_text()

hamburg_country, hamburg_latitude, hamburg_longitude

('Germany', '53°33′N', '10°00′E')

## Munich

In [18]:
url = 'https://www.wikipedia.org/wiki/Munich'
response = requests.get(url)
munich_soup = BeautifulSoup(response.content, 'html.parser')

munich_country = munich_soup.find(class_="infobox-data").get_text()
munich_latitude = munich_soup.find(class_="latitude").get_text()
munich_longitude = munich_soup.find(class_="longitude").get_text()

munich_country, munich_latitude, munich_longitude


('Germany', '48°08′15″N', '11°34′30″E')

## Making a loop

In [20]:
cities = ["Berlin", "Hamburg", "Munich"]

countries = []
latitudes = []
longitudes = []

for city in cities:
  # get the soup for the city
  url = f"https://www.wikipedia.org/wiki/{city}"
  response = requests.get(url)
  city_soup = BeautifulSoup(response.content, 'html.parser')

  # extract the data
  city_country = city_soup.find(class_="infobox-data").get_text(strip=True) #Removing the unicode from our results (if they exist)
  city_latitude = city_soup.find(class_="latitude").get_text()
  city_longitude = city_soup.find(class_="longitude").get_text()

  # append data to a list
  countries.append(city_country)
  latitudes.append(city_latitude)
  longitudes.append(city_longitude)

In [22]:
print(f"The cities are in the following countries: {countries}")
print(f"The cities have the following latitudes: {latitudes}")
print(f"The cities have the following longitudes: {longitudes}")

The cities are in the following countries: ['Germany', 'Germany', 'Germany']
The cities have the following latitudes: ['52°31′12″N', '53°33′N', '48°08′15″N']
The cities have the following longitudes: ['13°24′18″E', '10°00′E', '11°34′30″E']


### Creating a Dataframe

In [24]:
cities_df = pd.DataFrame({"City": cities,
                          "Country": countries,
                          "Latitude": latitudes,
                          "Longitude": longitudes})

cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


In [26]:
cities = ["Berlin", "Hamburg", "Munich"]

# create one single list to keep track of all values (instead of having separate lists)
city_data = []

for city in cities:
    url = f"https://www.wikipedia.org/wiki/{city}"
    response = requests.get(url)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # extract the relevant information
    country = city_soup.find(class_="infobox-data").get_text(strip=True) #Removing the unicode from our results (if they exist)
    city_latitude = city_soup.find(class_="latitude").get_text()
    city_longitude = city_soup.find(class_="longitude").get_text()

    # for each city we append a dictionary of values to the list
    city_data.append({"City": city,
                     "Country": country,
                     "Latitude": city_latitude,
                     "Longitude": city_longitude
                    })

cities_df = pd.DataFrame(city_data)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


### Changing the latitude and longitude to decimal format

In [28]:
!pip install lat-lon-parser



In [30]:
from lat_lon_parser import parse

parse(berlin_latitude)

52.519999999999996

### Wrapping the code in a function

In [32]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse    # for decimal coordinates


def cities_dataframe(cities):

  city_data = []

  for city in cities:
    url = f"https://www.wikipedia.org/wiki/{city}"
    response = requests.get(url)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # extract the relevant information
    city_latitude = city_soup.find(class_="latitude").get_text()
    city_longitude = city_soup.find(class_="longitude").get_text()
    country = city_soup.find(class_="infobox-data").get_text(strip=True) #Removing the unicode from our results (if they exist)

    # keep track of data per city
    city_data.append({"City": city,
                    "Country": country,
                    "Latitude": parse(city_latitude), # latitude in decimal format
                    "Longitude": parse(city_longitude), # longitude in decimal format
                    })

  return pd.DataFrame(city_data)

In [34]:
# call the function
list_of_cities = ["Berlin", "Hamburg", "Munich"]

cities_df = cities_dataframe(list_of_cities)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575


In [36]:
new_cities = ["Cologne", "Amsterdam", "Paris"]
new_cities_df = cities_dataframe(new_cities)

In [37]:
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575


In [40]:
new_cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Cologne,Germany,50.936389,6.952778
1,Amsterdam,Netherlands,52.372778,4.893611
2,Paris,France,48.856667,2.352222


In [42]:
combined_cities_df = pd.concat([cities_df,new_cities_df],ignore_index = True)

In [44]:
cities_df = combined_cities_df

In [46]:
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575
3,Cologne,Germany,50.936389,6.952778
4,Amsterdam,Netherlands,52.372778,4.893611
5,Paris,France,48.856667,2.352222


## Scraping population

In [63]:
berlin_soup.find(string="Population").find_next()

<div class="ib-settlement-fn"><span class="nowrap"> </span>(2023-12-31)<sup class="reference" id="cite_ref-4"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup></div>

In [65]:
berlin_population = berlin_soup.find(string="Population").find_next("td").get_text()
berlin_population

'3,878,100'

In [67]:
berlin_population_clean = berlin_population.replace(",", "")
berlin_population_clean

'3878100'

In [69]:
berlin_population_clean = int(berlin_population_clean)
berlin_population_clean

3878100

### Making a loop

In [106]:
from datetime import datetime # to get today's date

cities = ["Berlin", "Hamburg", "Munich","Cologne", "Amsterdam", "Paris"]

# create one single list to keep track of all values (instead of having separate lists)
population_data = []

for city in cities:
    url = f"https://www.wikipedia.org/wiki/{city}"
    response = requests.get(url)
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # extract the relevant information
    city_population = city_soup.find(string="Population").find_next("td").get_text()
    city_population_clean = city_population.replace(",", "")
    today = datetime.today().strftime("%Y.%d.%m")

    # for each city we append a dictionary of values to the list
    population_data.append({"City": city,
                     "Population": int(city_population_clean),
                     "Population_Timestamp": today
                    })

population_data = pd.DataFrame(population_data)
population_data

Unnamed: 0,City,Population,Population_Timestamp
0,Berlin,3878100,2024.10.09
1,Hamburg,1964021,2024.10.09
2,Munich,1510378,2024.10.09
3,Cologne,1087353,2024.10.09
4,Amsterdam,921402,2024.10.09
5,Paris,2102650,2024.10.09


## Wrapping into Function

In [108]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime # to get today's date


def populations_dataframe(cities):

    population_data = []

    for city in cities:
        url = f"https://www.wikipedia.org/wiki/{city}"
        response = requests.get(url)
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # extract the relevant information
        city_population = city_soup.find(string="Population").find_next("td").get_text()
        city_population_clean = int(city_population.replace(",", ""))
        today = datetime.today().strftime("%Y-%m-%d")

        # keep track of data per city
        population_data.append({"City": city,
                        "Population": city_population_clean,
                        "Timestamp_Population": today
                        })

    return pd.DataFrame(population_data)

In [110]:
population_data

Unnamed: 0,City,Population,Population_Timestamp
0,Berlin,3878100,2024.10.09
1,Hamburg,1964021,2024.10.09
2,Munich,1510378,2024.10.09
3,Cologne,1087353,2024.10.09
4,Amsterdam,921402,2024.10.09
5,Paris,2102650,2024.10.09


In [115]:
# call the populations function
cities = ["Berlin", "Hamburg", "Munich","Cologne", "Amsterdam", "Paris"]

population_df = populations_dataframe(cities)
population_df

Unnamed: 0,City,Population,Timestamp_Population
0,Berlin,3878100,2024-09-10
1,Hamburg,1964021,2024-09-10
2,Munich,1510378,2024-09-10
3,Cologne,1087353,2024-09-10
4,Amsterdam,921402,2024-09-10
5,Paris,2102650,2024-09-10


### Pushing the cities and population information to SQL 

In [88]:
schema = "sql_workshop"
host = "127.0.0.1"
user = "root"
password = "forgot1234"
port = 3306

connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [48]:
schema = "sql_workshop"
host = "34.77.20.25"
user = "root"
password = "sushma25121998"
port = 3306
connection_string = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [50]:
cities_df.to_sql('cities',
                  if_exists='append',
                  con=connection_string,
                  index=False)

6

In [99]:
population_df.to_sql('population',
                  if_exists='append',
                  con=connection_string,
                  index=False)

6