# Data Engineer Case Study
Model developed based on a case study of data about flights.

In [139]:
__author__ = 'Phil Baltazar'
__email__  = 'phillusnow@gmail.com'
__website__= 'www.github.com/pbswe'

In [266]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sqlalchemy
from sqlalchemy import create_engine
import datetime
import psycopg2

Loading the data and EDA (exploratory data analysis)

In [296]:
url = "../Data_Engineer_Flights/flights.txt"
interDF = pd.read_csv(url, sep="|", encoding="utf-8")

In [297]:
interDF.head()

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,AIRLINECODE,AIRLINENAME,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGAIRPORTNAME,ORIGINCITYNAME,ORIGINSTATE,...,WHEELSON,TAXIIN,CRSARRTIME,ARRTIME,ARRDELAY,CRSELAPSEDTIME,ACTUALELAPSEDTIME,CANCELLED,DIVERTED,DISTANCE
0,54548800,20020101,WN,Southwest Airlines Co.: WN,N103@@,1425,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1648.0,4.0,1655,1652.0,-3.0,90.0,87.0,F,False,580 miles
1,55872300,20020101,CO,Continental Air Lines Inc.: CO,N83872,150,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1419.0,16.0,1426,1435.0,9.0,116.0,119.0,False,F,744 miles
2,54388800,20020101,WN,Southwest Airlines Co.: WN,N334@@,249,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1618.0,2.0,1500,1620.0,80.0,105.0,102.0,F,False,718 miles
3,54486500,20020101,WN,Southwest Airlines Co.: WN,N699@@,902,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1947.0,1.0,1950,1948.0,-2.0,85.0,83.0,0,0,487 miles
4,55878700,20020103,CO,Continental Air Lines Inc.: CO,N58606,234,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1742.0,5.0,1750,1747.0,-3.0,115.0,114.0,F,False,744 miles


In [298]:
interDF.columns
#headerNames = "TRANSACTIONID|FLIGHTDATE|AIRLINECODE|AIRLINENAME|TAILNUM|FLIGHTNUM|ORIGINAIRPORTCODE|ORIGAIRPORTNAME|ORIGINCITYNAME|ORIGINSTATE|ORIGINSTATENAME|DESTAIRPORTCODE|DESTAIRPORTNAME|DESTCITYNAME|DESTSTATE|DESTSTATENAME|CRSDEPTIME|DEPTIME|DEPDELAY|TAXIOUT|WHEELSOFF|WHEELSON|TAXIIN|CRSARRTIME|ARRTIME|ARRDELAY|CRSELAPSEDTIME|ACTUALELAPSEDTIME|CANCELLED|DIVERTED|DISTANCE"

Index(['TRANSACTIONID', 'FLIGHTDATE', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM',
       'FLIGHTNUM', 'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'ORIGINCITYNAME',
       'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
       'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME',
       'DEPDELAY', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME',
       'ARRTIME', 'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME',
       'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [299]:
interDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191805 entries, 0 to 1191804
Data columns (total 31 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   TRANSACTIONID      1191805 non-null  int64  
 1   FLIGHTDATE         1191805 non-null  int64  
 2   AIRLINECODE        1191805 non-null  object 
 3   AIRLINENAME        1191805 non-null  object 
 4   TAILNUM            1034988 non-null  object 
 5   FLIGHTNUM          1191805 non-null  int64  
 6   ORIGINAIRPORTCODE  1191805 non-null  object 
 7   ORIGAIRPORTNAME    1191805 non-null  object 
 8   ORIGINCITYNAME     1191805 non-null  object 
 9   ORIGINSTATE        1180963 non-null  object 
 10  ORIGINSTATENAME    1180963 non-null  object 
 11  DESTAIRPORTCODE    1191805 non-null  object 
 12  DESTAIRPORTNAME    1191805 non-null  object 
 13  DESTCITYNAME       1191805 non-null  object 
 14  DESTSTATE          1180967 non-null  object 
 15  DESTSTATENAME      1180967 non-n

Correcting / fixing data types. 

In [300]:
interDF['CANCELLED'].value_counts()

False    637289
0        347545
F        178357
True      16370
1          8160
T          4084
Name: CANCELLED, dtype: int64

In [301]:
interDF['CANCELLED'].replace('False', 0, inplace=True)
interDF['CANCELLED'].replace('F', 0, inplace=True)
interDF['CANCELLED'].replace('0', 0, inplace=True)
interDF['CANCELLED'].replace('True', 1, inplace=True)
interDF['CANCELLED'].replace('T', 1, inplace=True)
interDF['CANCELLED'].replace('1', 1, inplace=True)


In [302]:
interDF['DIVERTED'].value_counts()

F        426572
False    407681
0        354906
T           966
True        881
1           799
Name: DIVERTED, dtype: int64

In [303]:
interDF['DIVERTED'].replace('False', 0, inplace=True)
interDF['DIVERTED'].replace('F', 0, inplace=True)
interDF['DIVERTED'].replace('0', 0, inplace=True)
interDF['DIVERTED'].replace('True', 1, inplace=True)
interDF['DIVERTED'].replace('T', 1, inplace=True)
interDF['DIVERTED'].replace('1', 1, inplace=True)

In [304]:
interDF['DISTANCE'].value_counts()

337 miles     10113
370 miles      6851
236 miles      5726
328 miles      5721
224 miles      5509
              ...  
3379 miles        1
1401 miles        1
18 miles          1
1993 miles        1
2046 miles        1
Name: DISTANCE, Length: 1706, dtype: int64

In [305]:
# The lines below handles the 'DISTANCE' attribute that contains: \n
# the number of miles (as it should be used as a numerical data for calculation), and \n
# the word "miles" next to the number, which is measurement lable in a string format. 

distCol = interDF.DISTANCE.str.split(expand=True)
distColRename = ['DISTNUM', 'MEASURE']
distCol.columns = distColRename
distCol

Unnamed: 0,DISTNUM,MEASURE
0,580,miles
1,744,miles
2,718,miles
3,487,miles
4,744,miles
...,...,...
1191800,721,miles
1191801,731,miles
1191802,731,miles
1191803,508,miles


In [306]:
airNameCol = interDF.AIRLINENAME.str.split(':', expand=True)

In [307]:
interDF.rename(columns = {'AIRLINENAME':'AIRLINENAMEDROP'}, inplace = True)

In [308]:
airNameCol
airColRename = ['AIRLINENAME', 'AIRLINECODE1']
airNameCol.columns = airColRename
airNameCol

Unnamed: 0,AIRLINENAME,AIRLINECODE1
0,Southwest Airlines Co.,WN
1,Continental Air Lines Inc.,CO
2,Southwest Airlines Co.,WN
3,Southwest Airlines Co.,WN
4,Continental Air Lines Inc.,CO
...,...,...
1191800,ExpressJet Airlines Inc.,EV
1191801,Delta Air Lines Inc.,DL
1191802,Delta Air Lines Inc.,DL
1191803,ExpressJet Airlines Inc.,EV


In [309]:
origAirCol = interDF.ORIGAIRPORTNAME.str.split(':', expand=True)

In [310]:
origAirColRename = ['CITYSTATE', 'AIRPORTNAME']
origAirCol.columns = origAirColRename
origAirCol

Unnamed: 0,CITYSTATE,AIRPORTNAME
0,AlbuquerqueNM,Albuquerque International Sunport
1,AlbuquerqueNM,Albuquerque International Sunport
2,AlbuquerqueNM,Albuquerque International Sunport
3,AlbuquerqueNM,Albuquerque International Sunport
4,AlbuquerqueNM,Albuquerque International Sunport
...,...,...
1191800,AtlantaGA,Hartsfield-Jackson Atlanta International
1191801,AtlantaGA,Hartsfield-Jackson Atlanta International
1191802,AtlantaGA,Hartsfield-Jackson Atlanta International
1191803,AtlantaGA,Hartsfield-Jackson Atlanta International


In [311]:
newDF = pd.concat([interDF, airNameCol, origAirCol, distCol], axis=1)
interDF = newDF

In [312]:
# Column 'MEASURE' would be useful if we ever have an entry showing something else \n
# such as kilometers. For now, since they're all miles, we can drop this column, \n
# as well as 'DISTANCE' since 'DISTNUM' will contain the number of miles. 
# Also dropping 'AIRLINENAME' and 'AIRLINECODE' duplicates when split to atomize columns.

interDF = interDF.drop(['AIRLINENAMEDROP'], axis=1)
interDF = interDF.drop(['AIRLINECODE'], axis=1)
interDF = interDF.drop(['MEASURE'], axis=1)
interDF = interDF.drop(['DISTANCE'], axis=1) 
interDF = interDF.drop(['ORIGAIRPORTNAME'], axis=1)

In [313]:
interDF.rename(columns = {'AIRPORTNAME':'ORIGAIRPORTNAME'}, inplace = True)
interDF.rename(columns = {'AIRLINECODE1':'AIRLINECODE'}, inplace = True)

In [314]:
interDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191805 entries, 0 to 1191804
Data columns (total 32 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   TRANSACTIONID      1191805 non-null  int64  
 1   FLIGHTDATE         1191805 non-null  int64  
 2   TAILNUM            1034988 non-null  object 
 3   FLIGHTNUM          1191805 non-null  int64  
 4   ORIGINAIRPORTCODE  1191805 non-null  object 
 5   ORIGINCITYNAME     1191805 non-null  object 
 6   ORIGINSTATE        1180963 non-null  object 
 7   ORIGINSTATENAME    1180963 non-null  object 
 8   DESTAIRPORTCODE    1191805 non-null  object 
 9   DESTAIRPORTNAME    1191805 non-null  object 
 10  DESTCITYNAME       1191805 non-null  object 
 11  DESTSTATE          1180967 non-null  object 
 12  DESTSTATENAME      1180967 non-null  object 
 13  CRSDEPTIME         1191805 non-null  int64  
 14  DEPTIME            1163470 non-null  float64
 15  DEPDELAY           1163470 non-n

In [315]:
categoricCols = ['TRANSACTIONID', 'FLIGHTDATE', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM',
                 'FLIGHTNUM', 'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'ORIGINCITYNAME',
                 'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
                 'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CITYSTATE']

numericCols = ['CRSDEPTIME', 'DEPTIME', 'DEPDELAY', 'TAXIOUT', 'TAXIIN', 'ARRDELAY', 
               'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'ARRTIME', 'DISTNUM', 
               'ARRTIME', 'CRSARRTIME', 'WHEELSOFF', 'WHEELSON'] #timeCols as int for now.

boolCols = ['CANCELLED', 'DIVERTED']

# timeCols = ['ARRTIME', 'CRSARRTIME', 'WHEELSOFF', 'WHEELSON']

In [316]:
interDF[categoricCols] = interDF[categoricCols].astype('category')
interDF[numericCols] = interDF[numericCols].astype('float')
interDF[boolCols] = interDF[boolCols].astype('bool')
# interDF[timeCols] = interDF[timeCols].astype('datetime64[ns]')

In [317]:
interDF.describe(include=['category'])

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGINCITYNAME,ORIGINSTATE,ORIGINSTATENAME,DESTAIRPORTCODE,DESTAIRPORTNAME,DESTCITYNAME,DESTSTATE,DESTSTATENAME,AIRLINENAME,AIRLINECODE,CITYSTATE,ORIGAIRPORTNAME
count,1191805,1191805,1034988,1191805,1191805,1191805,1180963,1180963,1191805,1191805,1191805,1180967,1180967,1191805,1191805,1191805,1191805
unique,1191805,1902,13869,8034,363,344,51,51,364,364,345,51,51,26,26,353,363
top,147818200,20040924,UNKNOW,505,ATL,Chicago,CA,California,ATL,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Chicago,CA,California,Southwest Airlines Co.,WN,ChicagoIL,Hartsfield-Jackson Atlanta International
freq,1,849,7640,849,64421,76508,141852,141852,64036,64036,76986,141630,141630,189985,189985,76508,64421


In [318]:
interDF['DISTNUM'] = interDF['DISTNUM'].astype(int)

In [319]:
binwidth = int((max(interDF['DISTNUM'])-min(interDF['DISTNUM']))/4)
bins = range(min(interDF['DISTNUM']), max(interDF['DISTNUM']), binwidth)
group_names = ['Short', 'Medium', 'Long']

interDF['DISTANCEGROUP'] = pd.cut(interDF['DISTNUM'], bins, labels=group_names)

In [320]:
# Create a DEPDELAYGT15 column for delays greater than 15 minutes.

interDF['DEPDELAYGT15'] = np.where(interDF['DEPDELAY'] >= 15.0, True, False)


In [321]:
interDF['DEPDELAYGT15'].value_counts()

False    1014794
True      177011
Name: DEPDELAYGT15, dtype: int64

In [322]:
# Create a NEXTDAYARR column for next day arrivals.

interDF['NEXTDAYARR'] = (interDF['ARRTIME'] > 15) 

interDF['DEPTIME'].fillna(0, inplace=True)
interDF['DEPTIME'] = interDF['DEPTIME'].astype(int)

interDF['ACTUALELAPSEDTIME'].fillna(0, inplace=True)
interDF['ACTUALELAPSEDTIME'] = interDF['ACTUALELAPSEDTIME'].astype(int)

interDF['ARRTIME'].fillna(0, inplace=True)
interDF['ARRTIME'] = interDF['ARRTIME'].astype(int)

interDF['NEXTDAYARR'].fillna(0, inplace=True)
interDF['NEXTDAYARR'] = interDF['NEXTDAYARR'].astype(int)

interDF['NEXTDAYARR'] = np.where(interDF['ARRTIME'] >= 15.0, True, False)

In [323]:
interDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191805 entries, 0 to 1191804
Data columns (total 35 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   TRANSACTIONID      1191805 non-null  category
 1   FLIGHTDATE         1191805 non-null  category
 2   TAILNUM            1034988 non-null  category
 3   FLIGHTNUM          1191805 non-null  category
 4   ORIGINAIRPORTCODE  1191805 non-null  category
 5   ORIGINCITYNAME     1191805 non-null  category
 6   ORIGINSTATE        1180963 non-null  category
 7   ORIGINSTATENAME    1180963 non-null  category
 8   DESTAIRPORTCODE    1191805 non-null  category
 9   DESTAIRPORTNAME    1191805 non-null  category
 10  DESTCITYNAME       1191805 non-null  category
 11  DESTSTATE          1180967 non-null  category
 12  DESTSTATENAME      1180967 non-null  category
 13  CRSDEPTIME         1191805 non-null  float64 
 14  DEPTIME            1191805 non-null  int64   
 15  DEPDELAY       

The data is cleaned up, but still not ready to save into a SQL database. The next steps will include slicing this dataframe into smaller, atomic tables for data integrity and space optimization, with 1NF, 2NF and 3NF. 

In [328]:
#interDF.set_index('TRANSACTIONID', inplace=False)
#interDF.reset_index(inplace=True)

In [354]:
# Creating separate tables to remove duplicates and enforce data integrity.

# Tansactions Table
transactions = interDF[['TRANSACTIONID']]
transactions.reset_index(inplace=True)
transactions.rename(columns = {'index':'TRINDEX'}, inplace = True)

# FlightDate Table
flightDate = interDF[['FLIGHTDATE']]
flightDate.reset_index(inplace=True)
flightDate.rename(columns = {'index':'FDINDEX'}, inplace = True)

# CityState Table
cityState = interDF[['ORIGINCITYNAME', 'ORIGINSTATENAME', 'ORIGINSTATE',
                     'DESTCITYNAME', 'DESTSTATENAME', 'DESTSTATE']]
cityState.reset_index(inplace=True)
cityState.rename(columns = {'index':'CSINDEX'}, inplace = True)

# Airport Table
airport = interDF[['ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 
                   'DESTAIRPORTCODE', 'DESTAIRPORTNAME']]
airport.reset_index(inplace=True)
airport.rename(columns = {'index':'APINDEX'}, inplace=True)

In [355]:
# Now add foreign keys from tables created above into others for relation.

In [332]:
interDF.columns

Index(['TRANSACTIONID', 'FLIGHTDATE', 'TAILNUM', 'FLIGHTNUM',
       'ORIGINAIRPORTCODE', 'ORIGINCITYNAME', 'ORIGINSTATE', 'ORIGINSTATENAME',
       'DESTAIRPORTCODE', 'DESTAIRPORTNAME', 'DESTCITYNAME', 'DESTSTATE',
       'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME', 'DEPDELAY', 'TAXIOUT',
       'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME', 'ARRTIME', 'ARRDELAY',
       'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED', 'DIVERTED',
       'AIRLINENAME', 'AIRLINECODE', 'CITYSTATE', 'ORIGAIRPORTNAME', 'DISTNUM',
       'DISTANCEGROUP', 'DEPDELAYGT15', 'NEXTDAYARR'],
      dtype='object')

In [41]:
# Save the formatted, clean data just in case.

interDF.to_csv('cleaned_flights.txt', sep='|', index=False)

The Data has been cleaned up and formatted, and is ready to be saved into PostgreSQL.

Getting SQL engine started and connection established. 

In [42]:
sqlalchemy.create_engine('postgres://INFORMATIONHIDDEN')

Engine(postgres://INFORMATIONHIDDEN)

In [43]:
# Load data into PostgreSQL.
%load_ext sql

In [None]:
%sql postgresql://INFORMATIONHIDDEN:INFORMATIONHIDDEN@INFORMATIONHIDDEN/tests_data_engineering     
'''
POSTGRES_ADDRESS = 'XXX'
POSTGRES_USERNAME = 'XXX'
POSTGRES_PASSWORD = 'XXX'
POSTGRES_DBNAME = 'tests_data_engineering'
'''

In [None]:
# I've been facing some issues while pushing all the DF to PostgreSQL. I've found a \n
# workaround that pushes data in increments so I can be sure data was successfully \n
# saved in the database. Not as pretty as a single push, but it works for now. -PB

In [None]:
engine = create_engine('postgresql://INFORMATIONHIDDEN:INFORMATIONHIDDEN@INFORMATIONHIDDEN/tests_data_engineering') 

In [None]:
interDF.iloc[:100,:].to_sql('flights', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
interDF.iloc[101:500,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
interDF.iloc[501:5000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
interDF.iloc[5001:10000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# If time allows
# interDF.iloc[10001:50000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [44]:
# In SQL, create a FACT_FLIGHTS with flights information.

factDF = interDF[['TRANSACTIONID', 'DISTANCEGROUP', 'DISTNUM', 'MEASURE', 'DEPDELAYGT15', 
                 'NEXTDAYARR', 'AIRLINENAME', 'ORIGAIRPORTNAME', 'DESTAIRPORTNAME']]
factDF

Unnamed: 0,TRANSACTIONID,DISTANCEGROUP,DISTNUM,MEASURE,DEPDELAYGT15,NEXTDAYARR,AIRLINENAME,ORIGAIRPORTNAME,DESTAIRPORTNAME
0,54548800,Short,580,miles,False,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,DallasTX: Dallas Love Field
1,55872300,Short,744,miles,False,True,Continental Air Lines Inc.,AlbuquerqueNM: Albuquerque International Sunport,HoustonTX: George Bush Intercontinental/Houston
2,54388800,Short,718,miles,True,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,Kansas CityMO: Kansas City International
3,54486500,Short,487,miles,False,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,Las VegasNV: McCarran International
4,55878700,Short,744,miles,False,True,Continental Air Lines Inc.,AlbuquerqueNM: Albuquerque International Sunport,HoustonTX: George Bush Intercontinental/Houston
...,...,...,...,...,...,...,...,...,...
1191800,126750200,Short,721,miles,False,True,ExpressJet Airlines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,DallasTX: Dallas Love Field
1191801,127294500,Short,731,miles,False,True,Delta Air Lines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Dallas/Fort WorthTX: Dallas/Fort Worth Interna...
1191802,127294900,Short,731,miles,False,True,Delta Air Lines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Dallas/Fort WorthTX: Dallas/Fort Worth Interna...
1191803,126594900,Short,508,miles,False,True,ExpressJet Airlines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Fort WayneIN: Fort Wayne International


In [None]:
factDF.iloc[:10000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# factDF.iloc[10001:50000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# factDF.iloc[50001:500000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='append', chunksize=500)

In [45]:
# In SQL, create DIM_DATE and DIM_AIRPORT dimension tables. 

dimAirDF = interDF[['TRANSACTIONID', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM', 'FLIGHTNUM',
                     'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
                     'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'ARRDELAY', 'CANCELLED',
                     'DIVERTED', 'NEXTDAYARR', 'DEPDELAYGT15']]

dimDateDF = interDF[['TRANSACTIONID', 'FLIGHTDATE', 'ORIGINAIRPORTCODE', 'ORIGINCITYNAME',
                    'ORIGINSTATE', 'DESTAIRPORTCODE', 'DESTCITYNAME', 'DESTSTATE', 'DEPDELAY',
                    'ARRTIME', 'ARRDELAY', 'DISTANCEGROUP']]

In [None]:
'''
# This is a different approach to create the alternative DFs when the previous
# method did not work, but it has now been resolved - so this one is not used. 

#factDF.columns = interDF.columns
factDF = factDF.drop(['FLIGHTDATE', 'AIRLINECODE', 'TAILNUM', 'FLIGHTNUM', 'ORIGINAIRPORTCODE',
                      'ORIGINCITYNAME', 'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE',
                      'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME', 
                      'DEPDELAY', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME', 
                      'ARRTIME', 'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED',
                      'DIVERTED', 'DISTANCE'], axis=1)

dimAirDF.columns = interDF.columns
dimAirDF = dimAirDF.drop(['FLIGHTDATE', 'ORIGINCITYNAME', 'ORIGINSTATE', 'ORIGINSTATENAME',
                        'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME',
                        'DEPDELAY', 'CRSARRTIME', 'ARRTIME', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME',
                        'DISTANCE', 'DISTNUM', 'MEASURE','DISTANCEGROUP'], axis=1)

dimDateDF = interDF
dimDateDF.columns = interDF.columns
dimDateDF = dimDateDF.drop(['AIRLINECODE', 'TAILNUM', 'FLIGHTNUM', 'ORIGAIRPORTNAME',
                          'ORIGINSTATENAME', 'DESTAIRPORTNAME', 'DESTSTATENAME', 'CRSDEPTIME',
                          'DEPTIME', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME',
                          'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED', 'DIVERTED',
                          'DISTANCE', 'AIRLINENAME', 'DISTNUM', 'MEASURE', 'DEPDELAYGT15', 
                          'NEXTDAYARR'], axis=1)
'''

In [None]:
dimAirDF.iloc[:10000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# dimAirDF.iloc[10001:50000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# dimAirDF.iloc[50001:500000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
dimDateDF.iloc[:10000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# dimDateDF.iloc[10001:50000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# dimDateDF.iloc[50001:500000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# In SQL, created a view named VW_FLIGHTS that joins the fact and dimension tables and \n
# returns columns useful for analysis. 

In [None]:
%%sql

CREATE OR REPLACE VIEW XXX.VW_FLIGHTS AS
SELECT *
FROM flights
;

In [None]:
# VW_FLIGHTS filtered columns:
# TRANSACTIONID, DISTANCEGROUP, DEPDELAYGT15, NEXTDAYARR, 
# AIRLINENAME, ORIGAIRPORTNAME, DESTAIRPORTNAME

In [1]:
# Final considerations in the presentation. 

# The code below represents a second iteration with potential improvements from the first iteration above.¶

In [4]:
betaurl = "../Data_Engineer_Flights/cleaned_flights.txt"
betaDF = pd.read_csv(betaurl, sep="|", encoding="utf-8")

In [5]:
betaDF['FLIGHTNUM'].value_counts().describe

<bound method NDFrame.describe of 505     849
711     821
343     810
407     803
493     797
       ... 
9565      1
7975      1
7945      1
7840      1
7745      1
Name: FLIGHTNUM, Length: 8034, dtype: int64>

In [6]:
betaDF['AIRLINECODE'].value_counts

<bound method IndexOpsMixin.value_counts of 0          WN
1          CO
2          WN
3          WN
4          CO
           ..
1191800    EV
1191801    DL
1191802    DL
1191803    EV
1191804    EV
Name: AIRLINECODE, Length: 1191805, dtype: object>

In [7]:
betaDF.columns

Index(['TRANSACTIONID', 'FLIGHTDATE', 'AIRLINECODE', 'TAILNUM', 'FLIGHTNUM',
       'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'ORIGINCITYNAME', 'ORIGINSTATE',
       'ORIGINSTATENAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME', 'DESTCITYNAME',
       'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME', 'DEPDELAY',
       'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME', 'ARRTIME',
       'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED',
       'DIVERTED', 'DISTANCE', 'AIRLINENAME', 'DISTNUM', 'MEASURE',
       'DISTANCEGROUP', 'DEPDELAYGT15', 'NEXTDAYARR'],
      dtype='object')

In [None]:
# Create surrogate PKs to slice many columns into atomic tables (1NF).
'''
For example:

transaction table with
TRANSACTIONINDEX, TRANSACTION ID
TR001, 54548800
TR002, 55555555

flight_date table with
FLIGHTDATEINDEX, FLIGHTDATE
FD001, 20020101
FD002, 20020103

origairportcode table with 
ORIGINDEX, ORIGINAIRPORTCODE
OAC001, ABQ
OAC002, ATL

measure table with
MEINDEX, MEASURE
ME001, MILES
ME002, KILOMETERS

city_state table with 
CSINDEX, CITY_ST_NAME
GA001, ATLANTA, GA
NM001, ALBUQUERQUE, NM

airport_name table with
ANINDEX, AIRPORTNAME, (FK)CSINDEX
ABQ001, ALBUQ. INTL AIRP, NM001
ATL001, ATLANTA INTL AIRP, GA001

origin_destination
(FK)TRANSACTIONINDEX, (FK)CSINDEX, (FK)CSINDEX
TR009, GA001, NM001
TR008, CO001, NY001

...

'''