# DATA SCRAPING
# **Collecting** the data

In [None]:
#importing and updating the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
!pip install --upgrade beautifulsoup4



In [None]:
# safe access to password
# create .env file with your own key! and save it in the same directory as the notebook!
import os
from dotenv import load_dotenv

# Load the variables from the .env file into the process environment, then
# read the two secrets this notebook uses (each is None when the key is absent).
load_dotenv()
PASSWORD = os.getenv('PASSWORD1')
API_key = os.getenv('API_key')

In [None]:
#doing grocery 🧾
# Source page: Wikipedia's list of European cities by population within city limits.
url = "https://en.wikipedia.org/wiki/List_of_European_cities_by_population_within_city_limits"

In [None]:
#requesting data from the webpage
#chopping the vegetables:D 🔪🥔🍅🥕🌽🍅🥕🥬🥔
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on a 4xx/5xx answer instead of
# letting the following cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.status_code

200

In [None]:
#cooking the soup (;]D) 🌿
# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")
# NOTE(review): len() of the soup is a quick sanity check only; it presumably counts
# top-level nodes of the parsed document rather than characters — confirm if relied upon.
len(soup)

3

In [None]:
#retrieving and cleaning the links
#serving the soup 🥣
prefix = "https://en.wikipedia.org/wiki/"
MAX_TABLE_CELLS = 90  # only the first 90 <td> cells of the first table are scanned

# Query the document once instead of re-running both find_all() calls on every iteration.
table_bodies = soup.find_all("tbody")
first_table_cells = table_bodies[0].find_all("td")[:MAX_TABLE_CELLS] if table_bodies else []

all_links = []
for cell in first_table_cells:
    anchor = cell.find("a")
    # Cells without a link, or whose link carries no title attribute, are skipped
    # explicitly (previously a bare `except` hid these and every other error).
    link_title = anchor.get("title") if anchor is not None else None
    if link_title is None:
        continue
    # The link title is the article name; spaces become underscores in the URL.
    all_links.append(prefix + link_title.replace(" ", "_"))
all_links

['https://en.wikipedia.org/wiki/Istanbul',
 'https://en.wikipedia.org/wiki/Turkey',
 'https://en.wikipedia.org/wiki/Moscow',
 'https://en.wikipedia.org/wiki/Russia',
 'https://en.wikipedia.org/wiki/London',
 'https://en.wikipedia.org/wiki/United_Kingdom',
 'https://en.wikipedia.org/wiki/Saint_Petersburg',
 'https://en.wikipedia.org/wiki/Russia',
 'https://en.wikipedia.org/wiki/Berlin',
 'https://en.wikipedia.org/wiki/Germany',
 'https://en.wikipedia.org/wiki/Madrid',
 'https://en.wikipedia.org/wiki/Spain',
 'https://en.wikipedia.org/wiki/Kyiv',
 'https://en.wikipedia.org/wiki/Ukraine',
 'https://en.wikipedia.org/wiki/Rome',
 'https://en.wikipedia.org/wiki/Italy',
 'https://en.wikipedia.org/wiki/Bucharest',
 'https://en.wikipedia.org/wiki/Romania',
 'https://en.wikipedia.org/wiki/Paris',
 'https://en.wikipedia.org/wiki/France']

In [None]:
#getting rid of countries links
# City and country links alternate, cities first: keeping every second entry
# starting at position 0 retains the cities and drops the countries.
all_links_cl = all_links[::2]
all_links_cl

['https://en.wikipedia.org/wiki/Istanbul',
 'https://en.wikipedia.org/wiki/Moscow',
 'https://en.wikipedia.org/wiki/London',
 'https://en.wikipedia.org/wiki/Saint_Petersburg',
 'https://en.wikipedia.org/wiki/Berlin',
 'https://en.wikipedia.org/wiki/Madrid',
 'https://en.wikipedia.org/wiki/Kyiv',
 'https://en.wikipedia.org/wiki/Rome',
 'https://en.wikipedia.org/wiki/Bucharest',
 'https://en.wikipedia.org/wiki/Paris']

In [None]:
#creating lists with required data
# One list per future DataFrame column; every loop iteration appends exactly one
# value to each list so that all six stay the same length.
names = []
countries = []
latitudes = []
longitudes = []
populations = []
wiki_links = []
for city_url in all_links_cl:
    # Timeout + status check: fail loudly on a stalled or failed request instead
    # of parsing an error page.
    response1 = requests.get(city_url, timeout=30)
    response1.raise_for_status()
    soup1 = BeautifulSoup(response1.content, "html.parser")
    #name
    names.append(soup1.select(".mw-page-title-main")[0].get_text())
    #country (first data cell of the infobox)
    countries.append(soup1.select(".infobox-data")[0].get_text())
    #latitude
    latitudes.append(soup1.select(".latitude")[0].get_text())
    #longitude
    longitudes.append(soup1.select(".longitude")[0].get_text())
    #population
    try:
        population_header = soup1.select_one(".infobox-header:-soup-contains('Population')")
        populations.append(population_header.parent.find_next_sibling().td.get_text())
    except AttributeError:
        # A missing header, row or cell shows up as None somewhere in the chain
        # above; record the "NaN" placeholder that the cleaning cells expect.
        populations.append("NaN")
    #wiki_link
    wiki_links.append(city_url)

In [None]:
len(countries), len(latitudes), len(longitudes), len(names), len(populations), len(wiki_links)

(10, 10, 10, 10, 10, 10)

In [None]:
#transforming the data into a data frame
# Map each output column name to the list scraped for it, then build the frame in one go.
scraped_columns = {
    "city_name": names,
    "countries": countries,
    "latitudes": latitudes,
    "longitudes": longitudes,
    "populations": populations,
    "wiki_links": wiki_links,
}
cities_tbl = pd.DataFrame(scraped_columns)
cities_tbl

Unnamed: 0,city_name,countries,latitudes,longitudes,populations,wiki_links
0,Istanbul,Turkey,41°00′49″N,28°57′18″E,15840900,https://en.wikipedia.org/wiki/Istanbul
1,Moscow,Russia,55°45′21″N,37°37′2″E,13010112,https://en.wikipedia.org/wiki/Moscow
2,London,United Kingdom,51°30′26″N,0°7′39″W,"8,799,800[1]",https://en.wikipedia.org/wiki/London
3,Saint Petersburg,Russia,59°56′15″N,30°18′31″E,5351935,https://en.wikipedia.org/wiki/Saint_Petersburg
4,Berlin,Germany,52°31′12″N,13°24′18″E,3677472,https://en.wikipedia.org/wiki/Berlin
5,Madrid,Spain,40°25′00″N,03°42′09″W,3223334,https://en.wikipedia.org/wiki/Madrid
6,Kyiv,Ukraine,50°27′00″N,30°31′24″E,"2,962,180[2]",https://en.wikipedia.org/wiki/Kyiv
7,Rome,Italy[a],41°53′36″N,12°28′58″E,1st in Italy (3rd in the EU),https://en.wikipedia.org/wiki/Rome
8,Bucharest,Romania,44°25′57″N,26°6′14″E,1883425,https://en.wikipedia.org/wiki/Bucharest
9,Paris,France,48°51′24″N,2°21′08″E,,https://en.wikipedia.org/wiki/Paris


In [None]:
cities_tbl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city_name    10 non-null     object
 1   countries    10 non-null     object
 2   latitudes    10 non-null     object
 3   longitudes   10 non-null     object
 4   populations  10 non-null     object
 5   wiki_links   10 non-null     object
dtypes: object(6)
memory usage: 608.0+ bytes


# **Cleaning** the data

In [None]:
#🍨dessert is pending...
#data cleaning

In [None]:
#dealing with extra numerical data in "population" column
# Remove footnote markers such as "[1]", "[12]" or "[a]" and the thousands
# separators. regex=True is passed explicitly: without it newer pandas versions
# treat the pattern as literal text and the markers would silently stay in place.
cities_tbl["populations"] = (
    cities_tbl["populations"]
    .str.replace(r"\[[^\]]*\]", "", regex=True)
    .str.replace(",", "", regex=False)
)
cities_tbl

  cities_tbl["populations"] = cities_tbl.populations.str.replace(r"(\[\d\])", "").str.replace(",", "")


Unnamed: 0,city_name,countries,latitudes,longitudes,populations,wiki_links
0,Istanbul,Turkey,41°00′49″N,28°57′18″E,15840900,https://en.wikipedia.org/wiki/Istanbul
1,Moscow,Russia,55°45′21″N,37°37′2″E,13010112,https://en.wikipedia.org/wiki/Moscow
2,London,United Kingdom,51°30′26″N,0°7′39″W,8799800,https://en.wikipedia.org/wiki/London
3,Saint Petersburg,Russia,59°56′15″N,30°18′31″E,5351935,https://en.wikipedia.org/wiki/Saint_Petersburg
4,Berlin,Germany,52°31′12″N,13°24′18″E,3677472,https://en.wikipedia.org/wiki/Berlin
5,Madrid,Spain,40°25′00″N,03°42′09″W,3223334,https://en.wikipedia.org/wiki/Madrid
6,Kyiv,Ukraine,50°27′00″N,30°31′24″E,2962180,https://en.wikipedia.org/wiki/Kyiv
7,Rome,Italy[a],41°53′36″N,12°28′58″E,1st in Italy (3rd in the EU),https://en.wikipedia.org/wiki/Rome
8,Bucharest,Romania,44°25′57″N,26°6′14″E,1883425,https://en.wikipedia.org/wiki/Bucharest
9,Paris,France,48°51′24″N,2°21′08″E,,https://en.wikipedia.org/wiki/Paris


In [None]:
#dealing with non numerical data in "population" column
# Population figures are short digit strings; anything longer than this is
# descriptive text (e.g. a ranking sentence) and is replaced by the "NaN" placeholder.
MAX_POPULATION_TEXT_LENGTH = 15

# A boolean mask with .loc writes into cities_tbl itself for however many rows
# there are (no chained indexing, no hard-coded row count).
is_descriptive_text = cities_tbl["populations"].str.len() > MAX_POPULATION_TEXT_LENGTH
cities_tbl.loc[is_descriptive_text, "populations"] = "NaN"

cities_tbl


Unnamed: 0,city_name,countries,latitudes,longitudes,populations,wiki_links
0,Istanbul,Turkey,41°00′49″N,28°57′18″E,15840900.0,https://en.wikipedia.org/wiki/Istanbul
1,Moscow,Russia,55°45′21″N,37°37′2″E,13010112.0,https://en.wikipedia.org/wiki/Moscow
2,London,United Kingdom,51°30′26″N,0°7′39″W,8799800.0,https://en.wikipedia.org/wiki/London
3,Saint Petersburg,Russia,59°56′15″N,30°18′31″E,5351935.0,https://en.wikipedia.org/wiki/Saint_Petersburg
4,Berlin,Germany,52°31′12″N,13°24′18″E,3677472.0,https://en.wikipedia.org/wiki/Berlin
5,Madrid,Spain,40°25′00″N,03°42′09″W,3223334.0,https://en.wikipedia.org/wiki/Madrid
6,Kyiv,Ukraine,50°27′00″N,30°31′24″E,2962180.0,https://en.wikipedia.org/wiki/Kyiv
7,Rome,Italy[a],41°53′36″N,12°28′58″E,,https://en.wikipedia.org/wiki/Rome
8,Bucharest,Romania,44°25′57″N,26°6′14″E,1883425.0,https://en.wikipedia.org/wiki/Bucharest
9,Paris,France,48°51′24″N,2°21′08″E,,https://en.wikipedia.org/wiki/Paris


In [None]:
#latitude & longitude for airports API
def _dms_to_api_degrees(dms_text):
    """Convert a DMS coordinate such as '03°42′09″W' to signed decimal-degree text.

    Southern and western hemispheres become negative values. Minutes and seconds
    are optional. Text that does not match the expected layout falls back to the
    part before '°', which is what this cell produced for every value before.
    """
    match = re.match(
        r"\s*(\d+(?:\.\d+)?)°(?:\s*(\d+(?:\.\d+)?)′)?(?:\s*(\d+(?:\.\d+)?)″)?\s*([NSEW])",
        dms_text,
    )
    if match is None:
        return dms_text.split("°")[0]
    degrees, minutes, seconds, hemisphere = match.groups()
    value = float(degrees) + float(minutes or 0) / 60 + float(seconds or 0) / 3600
    if hemisphere in "SW":
        value = -value
    return f"{value:.4f}"


# "lat/lon" path fragment for the airport search URL, one entry per row
# (no hard-coded row count). Keeping the sign and the fractional part matters:
# dropping them moved e.g. a western longitude to the eastern hemisphere.
Airport = [
    f"{_dms_to_api_degrees(lat)}/{_dms_to_api_degrees(lon)}"
    for lat, lon in zip(cities_tbl["latitudes"], cities_tbl["longitudes"])
]
cities_tbl["geo_airports"] = Airport

In [None]:
#adding date and time
# datetime.now() is evaluated once, so every row receives the same timestamp.
# NOTE(review): this is a naive local-time value (no timezone attached) — confirm
# that is what the database side expects.
from datetime import datetime
cities_tbl["time_stamps"] = datetime.now()

In [None]:
# Work on a copy so the cleaning steps that follow cannot alter the already
# existing cities_tbl in case something goes wrong.
city1 = cities_tbl.copy()

In [None]:
# Keep only the text before the first ′ (degrees and minutes), then swap the
# degree sign for a dot.
# NOTE(review): the result is "degrees.minutes" text with the hemisphere letter
# dropped, not decimal degrees — confirm this is the intended representation.
lat_deg_min = city1["latitudes"].str.split("′").str.get(0)
city1["latitudes"] = lat_deg_min.str.replace("°", ".", regex=False)

In [None]:
# Keep only the text before the first ′ (degrees and minutes), then swap the
# degree sign for a dot.
# NOTE(review): the result is "degrees.minutes" text with the hemisphere letter
# dropped, not decimal degrees — confirm this is the intended representation.
lon_deg_min = city1["longitudes"].str.split("′").str.get(0)
city1["longitudes"] = lon_deg_min.str.replace("°", ".", regex=False)

In [None]:
#creating the static table (it will not be regularly updated) out of the general dataframe
# Columns that describe a city itself and do not change between scrapes.
static_columns = ["city_name", "countries", "latitudes", "longitudes", "wiki_links", "geo_airports"]
city3 = city1[static_columns]

NameError: name 'city1' is not defined

In [None]:
#creating the dynamic table (it will be regularly updated) out of the general dataframe
# .copy() makes this an independent DataFrame, so assigning to its columns later
# does not raise SettingWithCopyWarning or write into a temporary slice of city1.
citi_populations1 = city1[["time_stamps", "city_name", "populations"]].copy()

In [None]:
city3

Unnamed: 0,city_name,countries,latitudes,longitudes,wiki_links,geo_airports
0,Istanbul,Turkey,41.0,28.57,https://en.wikipedia.org/wiki/Istanbul,41/28
1,Moscow,Russia,55.45,37.37,https://en.wikipedia.org/wiki/Moscow,55/37
2,London,United Kingdom,51.3,0.7,https://en.wikipedia.org/wiki/London,51/0
3,Saint Petersburg,Russia,59.56,30.18,https://en.wikipedia.org/wiki/Saint_Petersburg,59/30
4,Berlin,Germany,52.31,13.24,https://en.wikipedia.org/wiki/Berlin,52/13
5,Madrid,Spain,40.25,3.42,https://en.wikipedia.org/wiki/Madrid,40/03
6,Kyiv,Ukraine,50.27,30.31,https://en.wikipedia.org/wiki/Kyiv,50/30
7,Rome,Italy[a],41.53,12.28,https://en.wikipedia.org/wiki/Rome,41/12
8,Bucharest,Romania,44.25,26.6,https://en.wikipedia.org/wiki/Bucharest,44/26
9,Paris,France,48.51,2.21,https://en.wikipedia.org/wiki/Paris,48/2


In [None]:
citi_populations1

Unnamed: 0,time_stamps,city_name,populations
0,2022-11-30 16:57:40.096690,Istanbul,15840900.0
1,2022-11-30 16:57:40.096690,Moscow,13010112.0
2,2022-11-30 16:57:40.096690,London,8799800.0
3,2022-11-30 16:57:40.096690,Saint Petersburg,5351935.0
4,2022-11-30 16:57:40.096690,Berlin,3677472.0
5,2022-11-30 16:57:40.096690,Madrid,3223334.0
6,2022-11-30 16:57:40.096690,Kyiv,2962180.0
7,2022-11-30 16:57:40.096690,Rome,
8,2022-11-30 16:57:40.096690,Bucharest,1883425.0
9,2022-11-30 16:57:40.096690,Paris,


In [None]:
import numpy as np
# Turn the "NaN" placeholder strings into real missing values and cast to float.
# .assign() returns a new DataFrame, which avoids assigning through attribute
# access onto a slice (the cause of SettingWithCopyWarning).
citi_populations1 = citi_populations1.assign(
    populations=citi_populations1["populations"].replace("NaN", np.nan).astype("float")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  citi_populations1.populations = citi_populations1.populations.replace('NaN', np.nan).astype("float")


## The PUSHES

In [None]:
import sqlalchemy
!pip install pymysql
import pymysql




In [None]:
from urllib.parse import quote_plus

schema = "P3_Cities"   # name of the database (schema) to write to
host = "city-project3-db.cf7wpiecfnwf.us-east-1.rds.amazonaws.com"        # MySQL server hostname (an AWS RDS endpoint, not a local server)
user = "admin"
password = PASSWORD # loaded from the .env file near the top of the notebook
port = 3306
if password is None:
    # Fail here with a clear message instead of sending the literal text "None" as the password.
    raise RuntimeError("Database password not found: define PASSWORD1 in your .env file.")
# quote_plus() escapes characters such as '@', ':' or '/' that would otherwise
# break the URL structure; passwords without special characters are unchanged.
con = f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'

#### The cities P3.PUSH

In [None]:
# Write the static city table to the SQL table 'cities'.
# NOTE(review): with if_exists='append' every re-run of this cell adds the same
# rows again — confirm duplicates are acceptable or handled on the database side.
city3.to_sql('cities',         # target table name
              if_exists='append', # append rows to the table if it already exists
              con=con,            # con-> connection string;
              index=False)       

10

#### The city populations P3.PUSH

In [None]:
# Write the time-stamped population table to the SQL table "city_populations";
# rows are appended to the table if it already exists, and the DataFrame index is not written.
citi_populations1.to_sql("city_populations",
                         if_exists='append',
                         con=con,
                         index=False)

10

# Creating the airport_df

In [None]:
#creating the list of data from scraping that is needed for API requests
# Each entry is a (city_name, "lat/lon") tuple. Pairing the two columns by
# position works for any DataFrame index, not only the default 0..n-1 labels.
# NOTE: the name `list` shadows Python's built-in list type; it is kept only
# because later cells read this variable under that name.
list = [
    (city_name, geo)
    for city_name, geo in zip(city3["city_name"], city3["geo_airports"])
]

print(list)

[('Istanbul', '41/28'), ('Moscow', '55/37'), ('London', '51/0'), ('Saint Petersburg', '59/30'), ('Berlin', '52/13'), ('Madrid', '40/03'), ('Kyiv', '50/30'), ('Rome', '41/12'), ('Bucharest', '44/26'), ('Paris', '48/2')]


In [None]:
list[0][1]

'41/28'

In [None]:
def get_airport_loop(city_geo, distance=100, limit=10, api_key=None):
    """Collect the airports around each city from the AeroDataBox location search.

    Parameters
    ----------
    city_geo : sequence of (city_name, geo) pairs
        ``geo`` is the "lat/lon" text inserted into the request URL.
    distance : int, default 100
        Search radius; it is placed in the ``/km/{distance}`` part of the URL.
    limit : int, default 10
        Value of the trailing ``{limit}`` URL segment (maximum results per city).
    api_key : str, optional
        RapidAPI key. When omitted it is read from the ``API_key`` environment
        variable, so the secret never has to appear in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per airport returned by the API, with the columns city_name,
        airport_name, icao, municipality_name, latitude, longitude, country_code.

    Raises
    ------
    RuntimeError
        If no API key was passed and the ``API_key`` environment variable is unset.
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    if api_key is None:
        api_key = os.environ.get("API_key")
    if not api_key:
        raise RuntimeError(
            "RapidAPI key not found: pass api_key=... or define API_key in your .env file."
        )

    airport_loc_dict = {'city_name': [],
                        'airport_name': [],
                        'icao': [],
                        'municipality_name': [],
                        'latitude': [],
                        'longitude': [],
                        'country_code': []}

    # These do not change between cities, so they are built once.
    querystring = {"withFlightInfoOnly": "true"}
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    for city_name, geo in city_geo:
        url = (f"https://aerodatabox.p.rapidapi.com/airports/search/location/{geo}/km/{distance}/{limit}")
        response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
        print(response)
        # Stop on an error answer instead of failing later on a missing 'items' key.
        response.raise_for_status()

        for airport in response.json()['items']:
            # the city around which this airport was found
            airport_loc_dict['city_name'].append(city_name)
            airport_loc_dict['airport_name'].append(airport['name'])
            airport_loc_dict['country_code'].append(airport['countryCode'])
            airport_loc_dict['icao'].append(airport['icao'])
            # 'municipalityName' may be absent; 0 is kept as the placeholder
            # so the stored values stay the same as before.
            airport_loc_dict['municipality_name'].append(airport.get('municipalityName', 0))
            airport_loc_dict['latitude'].append(airport['location']['lat'])
            airport_loc_dict['longitude'].append(airport['location']['lon'])

    return pd.DataFrame(airport_loc_dict)

In [None]:
# get_airport_loop() already returns a DataFrame; the outer pd.DataFrame(...) only re-wraps it.
airport_df = pd.DataFrame(get_airport_loop(list))

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [None]:
airport_df

### Making the push

In [None]:
# !pip install sqlalchemy
# !pip install pymysql 

# import sqlalchemy # install if needed
# import pymysql

In [None]:
# # defining the connection variables
# schema="SQL_Cities"   # name of the database you want to use here
# host="127.0.0.1"        # to connect to your local server
# user="root"
# password= PASSWORD # variable Password from first cell
# port= 3306
# con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

### P1.Push to SQL of Airport table

In [None]:
# Write the airport table to the SQL table "city_airports" without the DataFrame index.
# NOTE(review): with "append" every re-run of this cell adds the same rows again —
# confirm duplicates are acceptable or handled on the database side.
airport_df.to_sql("city_airports",
                 if_exists="append", # or "replace"
                 con=con,
                 index=False)

22