# Prepare more data for sending to database

In [2]:
from IPython.display import Image
from IPython.core.display import HTML 
# Render the header picture from its URL with width 600
Image(
    url="https://images.freecreatives.com/wp-content/uploads/2016/02/Airplane-Landing-Wallpaper.jpg",
    width=600,
)

**In this notebook, we will prepare two other datasets and send them to our database.**  
We now want to expand our flights data by adding more information about the airports and airlines included in our nyflights table.  
Therefore, we will set up two new tables, which we can later on combine with our nyflights table.  
The first table is about **airports** and their locations, the second about **airlines**.



In [3]:
# Import all necessary libraries
# NOTE: the original import order is kept on purpose; the star imports make reordering unsafe.
import pandas as pd
import numpy as np
import requests
from zipfile import *
from configdef import *
from sqlalchemy import exc #SQLAlchemy provides a nice “Pythonic” way of interacting with databases.
from sqlalchemy import event
from sqlalchemy import text

# 1. Set up a connection

Again, we start by connecting to our SQL database.

In [4]:
# Open the database connection

# Load the connection settings of the 'postgres' section (configdef file)
params = config(section="postgres")

# Hand the settings over as keyword arguments; the returned engine object
# holds the SQLAlchemy connection to the database
engine = pg_engine_connection(**params)

# Clean up database connections that are not needed any more
engine.dispose()

Postgres Database connection successful


# 2. Download, prepare and send data on airports to the database

We now read in and prepare data on [airports](https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat).

The following function loads the data, does some data cleaning and sends the data to our database.  
Since we have set ```if_exists='replace'```, an existing table is replaced every time the function runs. As long as the source data stays the same, re-running the function leaves the table contents unchanged.

In [12]:
# Names given, in order, to every column of the source table
airport_columns = [
    'id', 'name', 'city', 'country', 'faa', 'icao', 'lat',
    'lon', 'alt', 'tz', 'dst', 'region', 'airport', 'source',
]
# Subset of columns (in this order) that is kept for the SQL database
airport_columns_to_keep = [
    'faa', 'name', 'lat', 'lon', 'alt',
    'tz', 'dst', 'city', 'country',
]

def run_aiports_table_pipeline(engine):
    '''Create the airports table in SQL based on remote source.

    Downloads the airports file, keeps the columns listed in
    ``airport_columns_to_keep``, drops rows whose ``faa`` value is the
    source's NULL marker, turns the remaining NULL markers into NaN,
    converts ``tz`` to a numeric column and writes the result to the
    ``airports`` table (an existing table is replaced).

    Parameters
    ----------
    engine : SQLAlchemy engine or None
        Connection handed to ``DataFrame.to_sql``. If ``None``, the data is
        downloaded and cleaned but nothing is written.

    Returns
    -------
    None
        SQLAlchemy errors are caught and reported with ``print`` instead of
        being raised.
    '''
    # The source file marks missing values with a literal backslash-N
    null_marker = '\\N'

    df_airports = pd.read_csv(
        'https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat',
        names=airport_columns,
    )

    # Drop rows without an faa code, then turn every other NULL marker into NaN.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
    # replace() without inplace returns a fresh frame, so the column
    # assignment below does not write into a slice of the downloaded data.
    has_faa_code = df_airports['faa'] != null_marker
    df_airports = (
        df_airports.loc[has_faa_code, airport_columns_to_keep]
        .replace(null_marker, np.nan)
    )

    # Convert datatype of timezone; values that cannot be parsed become NaN
    df_airports['tz'] = pd.to_numeric(df_airports['tz'], errors='coerce', downcast='integer')

    # Without an engine there is nowhere to send the data
    if engine is None:
        return

    # Catch database errors so a failed upload is reported instead of
    # stopping the notebook
    try:
        # sending df to sql
        df_airports.to_sql('airports', con=engine, if_exists='replace', index=False,
                           method='multi', chunksize=5000)
        print('airports table is imported successfully.')

    except exc.SQLAlchemyError as e:
        # Report the error class and the underlying driver error when there is
        # one. Rebinding the local ``engine`` to None here (as before) had no
        # effect on the caller's engine, so it is omitted.
        print(type(e))
        print(getattr(e, 'orig', e))

In [13]:
# Run the airports pipeline against our database engine
run_aiports_table_pipeline(engine=engine)

airports table is imported successfully.


In [7]:
# Check the top rows of this table
# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0, so the
# query is wrapped in text() and run on an explicit connection that is closed
# again when the with-block ends.
with engine.connect() as conn:
    airports = conn.execute(text('select * from airports limit 5')).fetchall()
airports

[('GKA', 'Goroka Airport', -6.081689834590001, 145.391998291, 5282, 10.0, 'U', 'Goroka', 'Papua New Guinea'),
 ('MAG', 'Madang Airport', -5.20707988739, 145.789001465, 20, 10.0, 'U', 'Madang', 'Papua New Guinea'),
 ('HGU', 'Mount Hagen Kagamuga Airport', -5.826789855957031, 144.29600524902344, 5388, 10.0, 'U', 'Mount Hagen', 'Papua New Guinea'),
 ('LAE', 'Nadzab Airport', -6.569803, 146.725977, 239, 10.0, 'U', 'Nadzab', 'Papua New Guinea'),
 ('POM', 'Port Moresby Jacksons International Airport', -9.44338035583496, 147.22000122070312, 146, 10.0, 'U', 'Port Moresby', 'Papua New Guinea')]

# 3. Download, prepare and send data on airlines to the database

We now read in and prepare data on [airlines/carriers](https://raw.githubusercontent.com/dannguyen/bts-transstats-t100-domestic-demo/master/data/lookup-tables/L_UNIQUE_CARRIERS.csv).

The following function loads the data and sends the data to our database.  
Since we have set ```if_exists='replace'```, an existing table is replaced every time the function runs. As long as the source data stays the same, re-running the function leaves the table contents unchanged.

In [11]:
# Column names used for the carriers table
carrier_columns = ['carrier', 'name']

def run_carrier_table_pipeline(engine):
    '''Create the carriers table in SQL based on remote source.

    Downloads the carrier lookup CSV, names its columns after
    ``carrier_columns`` and writes the result to the ``carriers`` table
    (an existing table is replaced).

    Parameters
    ----------
    engine : SQLAlchemy engine or None
        Connection handed to ``DataFrame.to_sql``. If ``None``, the data is
        downloaded but nothing is written.

    Returns
    -------
    None
        SQLAlchemy errors are caught and reported with ``print`` instead of
        being raised.
    '''
    # skiprows=1 drops the first line of the file (presumably its own header
    # row) so that the names from carrier_columns are used instead
    df_carriers = pd.read_csv(
        "https://raw.githubusercontent.com/dannguyen/bts-transstats-t100-domestic-demo/master/data/lookup-tables/L_UNIQUE_CARRIERS.csv",
        names=carrier_columns,
        skiprows=1,
    )

    # Without an engine there is nowhere to send the data
    if engine is None:
        return

    # Catch database errors so a failed upload is reported instead of
    # stopping the notebook
    try:
        # sending df to sql
        df_carriers.to_sql('carriers', con=engine, if_exists='replace', index=False,
                           method='multi', chunksize=5000)
        print("airline table is imported successfully.")

    except exc.SQLAlchemyError as e:
        # Report the error class and the underlying driver error when there is
        # one. Rebinding the local ``engine`` to None here (as before) had no
        # effect on the caller's engine, so it is omitted.
        print(type(e))
        print(getattr(e, 'orig', e))

In [15]:
# Run the carriers pipeline against our database engine
run_carrier_table_pipeline(engine=engine)

airline table is imported successfully.


In [10]:
# Check the top rows of this table
# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0, so the
# query is wrapped in text() and run on an explicit connection that is closed
# again when the with-block ends.
with engine.connect() as conn:
    airlines = conn.execute(text('select * from carriers limit 5')).fetchall()
airlines

[('02Q', 'Titan Airways'),
 ('04Q', 'Tradewind Aviation'),
 ('05Q', 'Comlux Aviation, AG'),
 ('06Q', 'Master Top Linhas Aereas Ltd.'),
 ('07Q', 'Flair Airlines Ltd.')]