In [32]:
import json
import os
from pathlib import Path
from urllib.parse import urlencode

from bs4 import BeautifulSoup
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [33]:
# OpenCage geocoding configuration.
# NOTE(review): credentials should not be committed with the notebook. The key is
# read from the OPENCAGE_API_KEY environment variable when set; the literal below
# is kept only as a backward-compatible fallback and should be rotated/removed.
api_key = os.environ.get("OPENCAGE_API_KEY") or "c07526f63bee4d248b5e3ce31f9d3435"
# Base endpoint of the geocoding API (query parameters are appended per request).
api_path = "https://api.opencagedata.com/geocode/v1/json"

In [34]:
def get_proper_api_path(place, api_key=api_key, api_path=api_path):
    """Build the full geocoding request URL for ``place``.

    Args:
        place: Free-text query sent as the ``q`` parameter.
        api_key: Key sent as the ``key`` parameter (defaults to the module-level key).
        api_path: Endpoint the query string is appended to.

    Returns:
        ``api_path`` followed by ``?q=...&key=...`` with both values URL-encoded.
    """
    # Encode the values so characters such as '&', '#', '+' or '=' inside a
    # place name cannot split or truncate the query string.
    return f"{api_path}?{urlencode({'q': place, 'key': api_key})}"

In [35]:
def fetch_place(place, **params):
    """Query the geocoding API for ``place`` and return the decoded JSON body.

    Args:
        place: Free-text query forwarded to ``get_proper_api_path``.
        **params: Extra keyword arguments forwarded to ``get_proper_api_path``
            (``api_key`` / ``api_path`` overrides).

    Returns:
        The response body parsed from JSON.

    Raises:
        ValueError: If the HTTP status code is not 200.
    """
    path = get_proper_api_path(place, **params)
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(path, timeout=30)

    if res.status_code != 200:
        # Carry the context in the exception itself rather than a separate print.
        raise ValueError(f"Cannot fetch {place} (HTTP {res.status_code})")

    return json.loads(res.content.decode("utf8"))

In [36]:
# Wikipedia "busiest airports" list pages that are scraped further down:
# North America, India and Pakistan respectively.
na_path, hi_path, pak_path = (
    "https://en.wikipedia.org/wiki/List_of_busiest_airports_in_North_America",
    "https://en.wikipedia.org/wiki/List_of_the_busiest_airports_in_India",
    "https://en.wikipedia.org/wiki/List_of_the_busiest_airports_in_Pakistan",
)

In [37]:
def fetch_airport_data(path, **params):
    """Download a page and parse the rows of its first table into airport records.

    Args:
        path: URL of the page to download.
        **params: Keyword arguments forwarded to ``parse_row`` for every row
            (e.g. ``idxs`` and ``is_na``).

    Returns:
        A list with one record per row that ``parse_row`` did not reject
        (``parse_row`` signals a rejected row by returning ``0``).

    Raises:
        ValueError: If the HTTP status code is not 200 or the page has no table.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(path, timeout=30)

    if res.status_code != 200:
        raise ValueError(f"Cannot fetch {path} (HTTP {res.status_code})")

    html = res.content.decode("utf8")

    soup = BeautifulSoup(html, "html")
    table = soup.find("table")
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No table found at {path}")

    # The first <tr> is skipped; only the remaining rows are parsed.
    rows = table.find_all("tr")[1:]

    airports = []
    for row in tqdm(rows):
        # Parse each row exactly once: parse_row performs a geocoding request,
        # so calling it a second time to append would double the API traffic.
        obj = parse_row(row, **params)
        if obj != 0:
            airports.append(obj)

    return airports

In [42]:
# Countries whose rows are dropped when ``is_na`` is true.
_EXCLUDED_COUNTRIES = ("Mexico", "Andaman and Nicobar Islands")
# Cities whose rows are dropped when ``is_na`` is true.
_EXCLUDED_CITIES = ("Honolulu",)
# City name -> replacement geocoder query, tried when the default
# "city,country" query returns no results.
_GEOCODE_FALLBACK_QUERIES = {
    "Mohenjodaro": "Moenjodaro",
    "Dalbadin": "Dalbandin,DBA",
}


def _extract_country(cell, is_na):
    """Read the country from a table cell: link title if ``is_na``, else leading text."""
    if is_na:
        return cell.find_all("a")[0]["title"]
    return cell.contents[0].rstrip()


def _geocode(city, country):
    """Return the ``geometry`` of the first geocoder hit for ``city`` in ``country``."""
    query = f"{city},{country}"
    results = fetch_place(query)["results"]

    if len(results) == 0 and city in _GEOCODE_FALLBACK_QUERIES:
        # Retry with a known alternative spelling for this city.
        query = _GEOCODE_FALLBACK_QUERIES[city]
        results = fetch_place(query)["results"]

    if len(results) == 0:
        # Same exception type the bare ``results[0]`` lookup would raise,
        # but with a message that says which query failed.
        raise IndexError(f"No geocoding results for {query!r}")

    return results[0]["geometry"]


def parse_row(row, idxs=(2, 3, 4, 5), is_na=True):
    """Turn one table row into an airport record with coordinates.

    Args:
        row: Parsed ``<tr>`` element; its ``<td>`` cells are read by position.
        idxs: Positions, within the row's ``<td>`` cells, of the
            (name, city, country, passengers) cells, in that order.
        is_na: When true, the country is taken from the ``title`` of the first
            link in the country cell and rows for the excluded countries/cities
            are rejected. When false, the country is the cell's leading text
            (right-stripped) and no row is rejected.

    Returns:
        A dict with keys ``name``, ``city``, ``country``, ``passengers``,
        ``lng`` and ``lat``, or ``0`` when the row is rejected.

    Raises:
        IndexError: If the geocoder returns no results for the row.
    """
    tds = row.find_all("td")
    name_idx, city_idx, country_idx, passengers_idx = idxs

    country = _extract_country(tds[country_idx], is_na)
    # Check the country before touching the city cell so excluded rows are
    # rejected without requiring a city link to be present.
    if is_na and country in _EXCLUDED_COUNTRIES:
        return 0

    city = tds[city_idx].a.string
    if is_na and city in _EXCLUDED_CITIES:
        return 0

    airport = {
        "name": tds[name_idx].a.string,
        "city": city,
        "country": country,
        # Passenger counts use thousands separators, e.g. "1,234,567".
        "passengers": int(str(tds[passengers_idx].contents[0]).replace(",", "")),
    }

    coords = _geocode(airport["city"], airport["country"])
    airport["lng"] = coords["lng"]
    airport["lat"] = coords["lat"]
    return airport

In [27]:
# Scrape the page at na_path using parse_row's default column positions and
# default is_na=True (country read from link title, exclusion list applied).
na_air = fetch_airport_data(na_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [43]:
# Scrape the page at hi_path with cell positions (1, 2, 3, 5) for
# (name, city, country, passengers). is_na is left at its default (True), so
# parse_row reads the "country" from the link title and applies its exclusions.
# NOTE(review): presumably the "country" cell on this page is a state/territory
# link — confirm against the table layout.
hi_air = fetch_airport_data(hi_path, idxs=(1, 2, 3, 5))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [29]:
# Scrape the page at pak_path with cell positions (1, 2, 3, 5) for
# (name, city, country, passengers). is_na=False makes parse_row take the
# country from the cell's plain text and skip the exclusion filter.
pak_air = fetch_airport_data(pak_path, idxs=(1, 2, 3, 5), is_na=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [44]:
# Build one DataFrame per scraped list of airport records.
na_df, hi_df, pak_df = (
    pd.DataFrame(records) for records in (na_air, hi_air, pak_air)
)

In [45]:
# Output directory: a "data" folder next to the current working directory's parent.
data_path = Path.cwd().parent / "data"
# Create the directory if needed so the export does not fail on a fresh checkout.
data_path.mkdir(parents=True, exist_ok=True)

# Write each frame to its own CSV (to_csv defaults kept, so the index is written too).
for frame, filename in (
    (na_df, "na_airports.csv"),
    (hi_df, "hi_airports.csv"),
    (pak_df, "pak_airports.csv"),
):
    frame.to_csv(data_path / filename)