# Airports

### Import Packages

In [1]:
import pandas as pd
import sqlalchemy
import os
from dotenv import load_dotenv, find_dotenv
from functools import wraps
import datetime as dt

### Load variables from .env file

In [2]:
# Locate the `.env` file and load its key/value pairs into the process
# environment. The DB_* connection settings used further down in this
# notebook are read from os.environ.
load_dotenv(find_dotenv(filename='.env'))

True

### Logging Wrapper

In [3]:
def log_step(func):
    """Decorate a pipeline step so each call prints its name, result shape and duration.

    Args:
        func: The callable to wrap. Its positional and keyword arguments are
            passed through unchanged.

    Returns:
        A wrapper that carries ``func``'s metadata (via ``functools.wraps``),
        calls ``func``, prints one log entry and returns ``func``'s result
        untouched.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # str() of a timedelta renders as H:MM:SS.ffffff.
        time_taken = str(dt.datetime.now() - tic)
        # Logging must never break the step it observes: results without a
        # ``shape`` attribute are reported with a placeholder instead of
        # raising AttributeError after the work has already been done.
        shape = getattr(result, "shape", "n/a")
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result

    return wrapper

### Get Airports from CSV

In [4]:
# Read the raw airport list from CSV (the path is relative to the notebook's
# working directory) and preview the first two rows.
all_airports = pd.read_csv("../data/csv/airports.csv")
all_airports.head(2)

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total Rf Heliport,40.070801,-74.933601,11.0,,US,US-PA,Bensalem,no,00A,,00A,,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,


## Data Cleaning Pipeline

### Init Pipeline

In [5]:
@log_step
def init_pipeline(df):
    """Start the pipeline on a copy of ``df``.

    Args:
        df: The source DataFrame.

    Returns:
        A copy of ``df``, so later steps do not work on the caller's frame.
    """
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [6]:
@log_step
def rename_columns(df):
    """Prefix the raw airport columns with ``airport_``.

    Columns that are not listed in the mapping keep their name. ``keywords``
    is intentionally not renamed: the later steps in this notebook address
    that column as ``keywords``.

    Args:
        df: DataFrame with the raw CSV column names.

    Returns:
        A new DataFrame with the renamed columns; ``df`` is not modified.
    """
    column_names = {
        "id": "airport_id",
        "ident": "airport_ident",
        "name": "airport_name",
        "type": "airport_type",
        "latitude_deg": "airport_latitude",
        "longitude_deg": "airport_longitude",
        "elevation_ft": "airport_elevation_ft",
        "continent": "airport_continent",
        "iso_country": "airport_iso_country",
        "iso_region": "airport_iso_region",
        "municipality": "airport_municipality",
        "gps_code": "airport_gps_code",
        "iata_code": "airport_iata_code",
        "home_link": "airport_home_link",
        "wikipedia_link": "airport_wikipedia_link",
    }
    return df.rename(columns=column_names)

### Drop Columns

In [7]:
@log_step
def drop_columns(df):
    """Remove the ``local_code`` and ``scheduled_service`` columns.

    Args:
        df: DataFrame containing both columns.

    Returns:
        A new DataFrame without the two columns.
    """
    unused_columns = ["local_code", "scheduled_service"]
    return df.drop(columns=unused_columns)

### Add Columns

In [8]:
@log_step
def add_columns(df):
    """Add the ``municipality_country`` key and a ``created_at`` timestamp.

    ``municipality_country`` is ``airport_municipality`` and
    ``airport_iso_country`` joined by a comma; ``created_at`` holds the
    current local time, identical for every row.

    Args:
        df: DataFrame with ``airport_municipality`` and ``airport_iso_country``.

    Returns:
        A new DataFrame with the two extra columns appended.
    """
    combined_key = df["airport_municipality"] + "," + df["airport_iso_country"]
    return df.assign(
        municipality_country=combined_key,
        created_at=dt.datetime.now(),
    )

### Drop Duplicates

In [9]:
@log_step
def drop_duplicates(df):
    """Remove rows that are exact duplicates across all columns.

    Args:
        df: DataFrame to de-duplicate.

    Returns:
        A new DataFrame keeping the first occurrence of each distinct row,
        with the original index labels.
    """
    unique_rows = df.drop_duplicates(subset=None, keep="first")
    return unique_rows

### Get relevant Airports

In [10]:
@log_step
def get_relevant_airports(df):
    """Keep airports that are not closed and lie in a city from the ``cities`` table.

    The ``municipality_country`` values are read from the ``cities`` table of
    the MySQL database described by the ``DB_USER``, ``DB_PASSWORD``,
    ``DB_HOST``, ``DB_PORT`` and ``DB_SCHEMA`` environment variables.

    Args:
        df: DataFrame with ``municipality_country`` and ``airport_type``.

    Returns:
        The rows of ``df`` whose ``municipality_country`` occurs in the
        ``cities`` table and whose ``airport_type`` is not ``"closed"``.

    Raises:
        KeyError: If one of the ``DB_*`` environment variables is missing.
    """
    from urllib.parse import quote

    sql = '''
    SELECT municipality_country 
    FROM cities;
    '''

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot corrupt the connection URL. The environment values are expected
    # to be the raw (unencoded) user name and password.
    user = quote(os.environ["DB_USER"], safe="")
    password = quote(os.environ["DB_PASSWORD"], safe="")
    con = f'mysql+pymysql://{user}:{password}@{os.environ["DB_HOST"]}:{os.environ["DB_PORT"]}/{os.environ["DB_SCHEMA"]}'
    known_cities = pd.read_sql(sql, con)["municipality_country"].to_list()

    # Build both masks on the same frame and apply them in one selection,
    # instead of indexing an already filtered frame with a mask computed on
    # the unfiltered one (which only works through index alignment).
    in_known_city = df["municipality_country"].isin(known_cities)
    not_closed = df["airport_type"] != "closed"
    return df.loc[in_known_city & not_closed]

### Adjust Datatypes

In [11]:
@log_step
def adjust_datatypes(df):
    """Cast the airport columns to their target dtypes.

    Text columns become pandas ``string``, ``airport_type`` and
    ``municipality_country`` become ``category``, the coordinates become
    ``float32`` and the elevation becomes the nullable ``Int32``. Columns that
    are not listed (for example ``created_at``) keep their dtype.

    Args:
        df: DataFrame containing every column named in the mapping below.

    Returns:
        A new DataFrame with the converted columns; ``df`` is left untouched.

    Raises:
        KeyError: If one of the listed columns is missing from ``df``.
    """
    target_dtypes = {
        "airport_id": "string",
        "airport_ident": "string",
        "airport_type": "category",
        "airport_name": "string",
        "airport_latitude": "float32",
        "airport_longitude": "float32",
        "airport_elevation_ft": "Int32",
        "airport_continent": "string",
        "airport_iso_country": "string",
        "airport_iso_region": "string",
        "airport_municipality": "string",
        "airport_gps_code": "string",
        "airport_iata_code": "string",
        "airport_home_link": "string",
        "airport_wikipedia_link": "string",
        "keywords": "string",
        "municipality_country": "category",
    }
    # A single astype call returns a new frame. Assigning column by column
    # would modify the caller's frame, which in this pipeline is a filtered
    # slice and can trigger SettingWithCopyWarning.
    return df.astype(target_dtypes)

### Send to DB

In [12]:
def send_to_DB(df, table_name, if_exists="replace"):
    """Write ``df`` to a MySQL table and add its primary and foreign key.

    The connection is built from the ``DB_USER``, ``DB_PASSWORD``,
    ``DB_HOST``, ``DB_PORT`` and ``DB_SCHEMA`` environment variables. After
    the rows are written, ``airport_id`` is made the primary key of
    ``table_name`` and ``municipality_country`` is linked to
    ``cities(municipality_country)``.

    Args:
        df: DataFrame to persist; the index is not written.
        table_name: Name of the target table. The key constraints are added
            to this table.
        if_exists: Passed to ``DataFrame.to_sql`` (defaults to ``"replace"``).

    Returns:
        ``df`` unchanged, so the call can be chained.

    Raises:
        KeyError: If one of the ``DB_*`` environment variables is missing.

    NOTE(review): the constraints are added on every call. With
    ``if_exists="append"`` on a table that already has them, the ALTER
    statements will presumably fail — confirm before using that mode.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot corrupt the connection URL. The environment values are expected
    # to be the raw (unencoded) user name and password.
    user = quote(os.environ["DB_USER"], safe="")
    password = quote(os.environ["DB_PASSWORD"], safe="")
    con = f'mysql+pymysql://{user}:{password}@{os.environ["DB_HOST"]}:{os.environ["DB_PORT"]}/{os.environ["DB_SCHEMA"]}'

    # One engine serves both the bulk insert and the DDL statements and is
    # disposed afterwards so no pooled connections are left open.
    engine = sqlalchemy.create_engine(con)
    try:
        df.to_sql(
            table_name,
            con=engine,
            if_exists=if_exists,
            index=False,
            dtype={
                "airport_id": sqlalchemy.types.VARCHAR(length=30),
                "airport_ident": sqlalchemy.types.VARCHAR(length=10),
                "airport_type": sqlalchemy.types.VARCHAR(length=30),
                "airport_name": sqlalchemy.types.VARCHAR(length=60),
                "airport_latitude": sqlalchemy.types.Float(precision=8, asdecimal=True),
                "airport_longitude": sqlalchemy.types.Float(precision=8, asdecimal=True),
                "airport_elevation_ft": sqlalchemy.types.Integer(),
                "airport_continent": sqlalchemy.types.VARCHAR(length=30),
                "airport_iso_country": sqlalchemy.types.VARCHAR(length=30),
                "airport_iso_region": sqlalchemy.types.VARCHAR(length=30),
                "airport_municipality": sqlalchemy.types.VARCHAR(length=50),
                "airport_gps_code": sqlalchemy.types.VARCHAR(length=10),
                "airport_iata_code": sqlalchemy.types.VARCHAR(length=5),
                "airport_home_link": sqlalchemy.types.VARCHAR(length=200),
                "airport_wikipedia_link": sqlalchemy.types.VARCHAR(length=200),
                "keywords": sqlalchemy.types.VARCHAR(length=200),
                "municipality_country": sqlalchemy.types.VARCHAR(length=100),
            },
        )

        # Quote the identifier with the dialect's rules so the constraints
        # are applied to the table that was actually written.
        quoted_table = engine.dialect.identifier_preparer.quote(table_name)
        # text() is needed because SQLAlchemy 2.x no longer executes plain
        # strings; begin() commits on success and rolls back on error.
        with engine.begin() as connection:
            # Add primary key
            connection.execute(sqlalchemy.text(
                f"ALTER TABLE {quoted_table} ADD PRIMARY KEY (airport_id);"
            ))
            # Add foreign key
            connection.execute(sqlalchemy.text(
                f"ALTER TABLE {quoted_table} "
                "ADD FOREIGN KEY (municipality_country) "
                "REFERENCES cities(municipality_country);"
            ))
    finally:
        engine.dispose()
    return df


### Fill Null Values

In [13]:
def fill_null_values(df):
    """Replace missing values in selected columns with fixed placeholders.

    Missing elevations become ``-9999``, missing GPS and IATA codes become
    ``"UNKNOWN"``, and missing links and keywords become the empty string.
    All other columns are left as they are.

    Args:
        df: DataFrame containing every column named in the mapping below.

    Returns:
        A new DataFrame with the placeholders filled in; ``df`` is not
        modified.

    Raises:
        KeyError: If one of the listed columns is missing from ``df``.
    """
    fill_values = {
        "airport_elevation_ft": -9999,
        "airport_gps_code": "UNKNOWN",
        "airport_iata_code": "UNKNOWN",
        "airport_home_link": "",
        "airport_wikipedia_link": "",
        "keywords": "",
    }
    # Build the filled columns first and attach them with assign(), which
    # returns a new frame and keeps the existing column order.
    filled = {
        column: df[column].fillna(value)
        for column, value in fill_values.items()
    }
    return df.assign(**filled)

## Run Pipeline

In [14]:
# Apply the cleaning steps in order. Steps decorated with @log_step print
# their result shape and duration; fill_null_values is not decorated, so it
# runs without a log line.
airports = (
    all_airports
        .pipe(init_pipeline)
        .pipe(rename_columns)
        .pipe(drop_columns)
        .pipe(add_columns)
        .pipe(drop_duplicates)
        .pipe(get_relevant_airports)
        .pipe(adjust_datatypes)
        .pipe(fill_null_values)
        # Send to DB in cell below
)

init_pipeline:
 shape=(68333, 18) took 0:00:00.010401s

rename_columns:
 shape=(68333, 18) took 0:00:00.009486s

drop_columns:
 shape=(68333, 16) took 0:00:00.006625s

add_columns:
 shape=(68333, 18) took 0:00:00.071548s

drop_duplicates:
 shape=(68333, 18) took 0:00:00.162005s

get_relevant_airports:
 shape=(419, 18) took 0:00:02.201210s

adjust_datatypes:
 shape=(419, 18) took 0:00:00.009448s



In [15]:
# Persist the cleaned frame to the "airports" table, replacing any existing
# table. send_to_DB returns the frame it was given, so this cell also
# displays it.
send_to_DB(
    df=airports, 
    table_name="airports", 
    if_exists="replace"
    )

Unnamed: 0,airport_id,airport_ident,airport_type,airport_name,airport_latitude,airport_longitude,airport_elevation_ft,airport_continent,airport_iso_country,airport_iso_region,airport_municipality,airport_gps_code,airport_iata_code,airport_home_link,airport_wikipedia_link,keywords,municipality_country,created_at
19215,299694,DE-0003,small_airport,August-Euler Flugplatz,49.853756,8.586243,-9999,EU,DE,DE-HE,Griesheim,UNKNOWN,UNKNOWN,http://www.sla.tu-darmstadt.de/windkanal/einri...,http://de.wikipedia.org/wiki/August-Euler-Flug...,,"Griesheim,DE",2022-04-07 10:33:17.927390
19216,28576,DE-0004,small_airport,Flugplatz Gransee,53.006699,13.205000,164,EU,DE,DE-BR,Gransee,EDUG,UNKNOWN,http://www.gojump.de/,,,"Gransee,DE",2022-04-07 10:33:17.927390
19226,44583,DE-0014,heliport,Rostock University Helipad,54.083302,12.100500,-9999,EU,DE,DE-MV,Rostock,UNKNOWN,UNKNOWN,,,,"Rostock,DE",2022-04-07 10:33:17.927390
19227,44584,DE-0015,heliport,Dresden Friedrichstadt Hospital Helipad,51.058102,13.719000,-9999,EU,DE,DE-SN,Dresden,UNKNOWN,UNKNOWN,,,,"Dresden,DE",2022-04-07 10:33:17.927390
19228,44585,DE-0016,heliport,Erlangen Ebrardstrasse Helipad,49.602001,11.022500,-9999,EU,DE,DE-BY,Erlangen,UNKNOWN,UNKNOWN,,,Erlangen Ebrardstraße Helipad,"Erlangen,DE",2022-04-07 10:33:17.927390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22078,2739,ETNL,medium_airport,Rostock-Laage Airport,53.918201,12.278300,138,EU,DE,DE-MV,Rostock,ETNL,RLG,,https://en.wikipedia.org/wiki/Rostock-Laage_Ai...,,"Rostock,DE",2022-04-07 10:33:17.927390
22087,2747,ETOU,medium_airport,Wiesbaden Army Airfield,50.049801,8.325400,461,EU,DE,DE-HE,Wiesbaden,ETOU,WIE,,https://en.wikipedia.org/wiki/Wiesbaden_Army_A...,,"Wiesbaden,DE",2022-04-07 10:33:17.927390
22093,2753,ETSI,medium_airport,Ingolstadt Manching Airport,48.715698,11.534000,1202,EU,DE,DE-BY,Manching,ETSI,IGS,,https://en.wikipedia.org/wiki/Ingolstadt_Manch...,,"Manching,DE",2022-04-07 10:33:17.927390
22097,309402,ETT1,small_airport,Etting-Adelmannsberg Glider Field,48.810280,11.420960,1253,EU,DE,DE-BY,Ingolstadt,UNKNOWN,UNKNOWN,http://www.aero-club-ingolstadt.de/,,,"Ingolstadt,DE",2022-04-07 10:33:17.927390
