# InterWorks - Data Engineer Case Study
Model developed exclusively to InterWorks with InterWorks provided data and instructions. 

In [1]:
__author__ = 'Phil Baltazar'
__email__  = 'phillusnow@gmail.com'
__website__= 'www.github.com/pbswe'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sqlalchemy
from sqlalchemy import create_engine
import datetime as dt
import psycopg2

Loading the data and EDA (exploratory data analysis)

In [44]:
url = "../InterWorks_DE/flights.txt"
interDF = pd.read_csv(url, sep="|", encoding="utf-8")

In [45]:
interDF.head()

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,AIRLINECODE,AIRLINENAME,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGAIRPORTNAME,ORIGINCITYNAME,ORIGINSTATE,...,WHEELSON,TAXIIN,CRSARRTIME,ARRTIME,ARRDELAY,CRSELAPSEDTIME,ACTUALELAPSEDTIME,CANCELLED,DIVERTED,DISTANCE
0,54548800,20020101,WN,Southwest Airlines Co.: WN,N103@@,1425,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1648.0,4.0,1655,1652.0,-3.0,90.0,87.0,F,False,580 miles
1,55872300,20020101,CO,Continental Air Lines Inc.: CO,N83872,150,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1419.0,16.0,1426,1435.0,9.0,116.0,119.0,False,F,744 miles
2,54388800,20020101,WN,Southwest Airlines Co.: WN,N334@@,249,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1618.0,2.0,1500,1620.0,80.0,105.0,102.0,F,False,718 miles
3,54486500,20020101,WN,Southwest Airlines Co.: WN,N699@@,902,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1947.0,1.0,1950,1948.0,-2.0,85.0,83.0,0,0,487 miles
4,55878700,20020103,CO,Continental Air Lines Inc.: CO,N58606,234,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,...,1742.0,5.0,1750,1747.0,-3.0,115.0,114.0,F,False,744 miles


In [46]:
interDF.columns
#headerNames = "TRANSACTIONID|FLIGHTDATE|AIRLINECODE|AIRLINENAME|TAILNUM|FLIGHTNUM|ORIGINAIRPORTCODE|ORIGAIRPORTNAME|ORIGINCITYNAME|ORIGINSTATE|ORIGINSTATENAME|DESTAIRPORTCODE|DESTAIRPORTNAME|DESTCITYNAME|DESTSTATE|DESTSTATENAME|CRSDEPTIME|DEPTIME|DEPDELAY|TAXIOUT|WHEELSOFF|WHEELSON|TAXIIN|CRSARRTIME|ARRTIME|ARRDELAY|CRSELAPSEDTIME|ACTUALELAPSEDTIME|CANCELLED|DIVERTED|DISTANCE"

Index(['TRANSACTIONID', 'FLIGHTDATE', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM',
       'FLIGHTNUM', 'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'ORIGINCITYNAME',
       'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
       'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME',
       'DEPDELAY', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME',
       'ARRTIME', 'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME',
       'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

In [47]:
interDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191805 entries, 0 to 1191804
Data columns (total 31 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   TRANSACTIONID      1191805 non-null  int64  
 1   FLIGHTDATE         1191805 non-null  int64  
 2   AIRLINECODE        1191805 non-null  object 
 3   AIRLINENAME        1191805 non-null  object 
 4   TAILNUM            1034988 non-null  object 
 5   FLIGHTNUM          1191805 non-null  int64  
 6   ORIGINAIRPORTCODE  1191805 non-null  object 
 7   ORIGAIRPORTNAME    1191805 non-null  object 
 8   ORIGINCITYNAME     1191805 non-null  object 
 9   ORIGINSTATE        1180963 non-null  object 
 10  ORIGINSTATENAME    1180963 non-null  object 
 11  DESTAIRPORTCODE    1191805 non-null  object 
 12  DESTAIRPORTNAME    1191805 non-null  object 
 13  DESTCITYNAME       1191805 non-null  object 
 14  DESTSTATE          1180967 non-null  object 
 15  DESTSTATENAME      1180967 non-n

Correcting / fixing data types. 

In [48]:
interDF['CANCELLED'].value_counts()

False    637289
0        347545
F        178357
True      16370
1          8160
T          4084
Name: CANCELLED, dtype: int64

In [49]:
interDF['CANCELLED'].replace('False', 0, inplace=True)
interDF['CANCELLED'].replace('F', 0, inplace=True)
interDF['CANCELLED'].replace('0', 0, inplace=True)
interDF['CANCELLED'].replace('True', 1, inplace=True)
interDF['CANCELLED'].replace('T', 1, inplace=True)
interDF['CANCELLED'].replace('1', 1, inplace=True)


In [50]:
interDF['DIVERTED'].value_counts()

F        426572
False    407681
0        354906
T           966
True        881
1           799
Name: DIVERTED, dtype: int64

In [51]:
interDF['DIVERTED'].replace('False', 0, inplace=True)
interDF['DIVERTED'].replace('F', 0, inplace=True)
interDF['DIVERTED'].replace('0', 0, inplace=True)
interDF['DIVERTED'].replace('True', 1, inplace=True)
interDF['DIVERTED'].replace('T', 1, inplace=True)
interDF['DIVERTED'].replace('1', 1, inplace=True)

In [52]:
interDF['DISTANCE'].value_counts()

337 miles     10113
370 miles      6851
236 miles      5726
328 miles      5721
224 miles      5509
              ...  
1077 miles        1
1401 miles        1
560 miles         1
1882 miles        1
2045 miles        1
Name: DISTANCE, Length: 1706, dtype: int64

In [53]:
# The lines below handles the 'DISTANCE' attribute that contains: \n
# the number of miles (as it should be used as a numerical data for calculation), and \n
# the word "miles" next to the number, which is measurement lable in a string format. 

# I decided to preserve the original "DISTANCE" according to the Case Study document, \n
# but also split it into a numerical column with the numbers only and another with the string.

distCol = interDF.DISTANCE.str.split(expand=True)
distColRename = ['DISTNUM', 'MEASURE']
distCol.columns = distColRename
distCol

Unnamed: 0,DISTNUM,MEASURE
0,580,miles
1,744,miles
2,718,miles
3,487,miles
4,744,miles
...,...,...
1191800,721,miles
1191801,731,miles
1191802,731,miles
1191803,508,miles


In [54]:
airNameCol = interDF.AIRLINENAME.str.split(':', expand=True)
airColRename = ['AIRLINENAME1', 'AIRLINECODE1']
airNameCol.columns = airColRename
airNameCol

Unnamed: 0,AIRLINENAME1,AIRLINECODE1
0,Southwest Airlines Co.,WN
1,Continental Air Lines Inc.,CO
2,Southwest Airlines Co.,WN
3,Southwest Airlines Co.,WN
4,Continental Air Lines Inc.,CO
...,...,...
1191800,ExpressJet Airlines Inc.,EV
1191801,Delta Air Lines Inc.,DL
1191802,Delta Air Lines Inc.,DL
1191803,ExpressJet Airlines Inc.,EV


In [55]:
newDF = pd.concat([interDF, airNameCol, distCol], axis=1)
interDF = newDF
interDF = interDF.drop(['AIRLINENAME'], axis=1)
interDF = interDF.drop(['AIRLINECODE1'], axis=1)

In [60]:
interDF.rename(columns = {'AIRLINENAME1':'AIRLINENAME'}, inplace = True)

In [61]:
interDF.head(5)

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,AIRLINECODE,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGAIRPORTNAME,ORIGINCITYNAME,ORIGINSTATE,ORIGINSTATENAME,...,ARRTIME,ARRDELAY,CRSELAPSEDTIME,ACTUALELAPSEDTIME,CANCELLED,DIVERTED,DISTANCE,AIRLINENAME,DISTNUM,MEASURE
0,54548800,20020101,WN,N103@@,1425,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,1652.0,-3.0,90.0,87.0,0,0,580 miles,Southwest Airlines Co.,580,miles
1,55872300,20020101,CO,N83872,150,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,1435.0,9.0,116.0,119.0,0,0,744 miles,Continental Air Lines Inc.,744,miles
2,54388800,20020101,WN,N334@@,249,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,1620.0,80.0,105.0,102.0,0,0,718 miles,Southwest Airlines Co.,718,miles
3,54486500,20020101,WN,N699@@,902,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,1948.0,-2.0,85.0,83.0,0,0,487 miles,Southwest Airlines Co.,487,miles
4,55878700,20020103,CO,N58606,234,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,1747.0,-3.0,115.0,114.0,0,0,744 miles,Continental Air Lines Inc.,744,miles


In [62]:
interDF['DISTNUM'].value_counts()

337     10113
370      6851
236      5726
328      5721
224      5509
        ...  
1997        1
953         1
1882        1
1573        1
1993        1
Name: DISTNUM, Length: 1706, dtype: int64

In [63]:
categoricCols = ['TRANSACTIONID', 'FLIGHTDATE', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM',
                 'FLIGHTNUM', 'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'ORIGINCITYNAME',
                 'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
                 'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'DISTANCE', 'MEASURE']

numericCols = ['CRSDEPTIME', 'DEPTIME', 'DEPDELAY', 'TAXIOUT', 'TAXIIN', 
               'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'ARRTIME', 
               'CRSARRTIME', 'WHEELSOFF', 'WHEELSON', 'DISTNUM']

boolCols = ['CANCELLED', 'DIVERTED']

#timeCols = ['ARRTIME', 'CRSARRTIME', 'WHEELSOFF', 'WHEELSON']

In [64]:
interDF[categoricCols] = interDF[categoricCols].astype('category')
interDF[numericCols] = interDF[numericCols].astype('float')
interDF[boolCols] = interDF[boolCols].astype('bool')
#interDF[timeCols] = interDF[timeCols].astype('dateutil')

In [65]:
interDF.describe(include=['category'])

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,AIRLINECODE,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGAIRPORTNAME,ORIGINCITYNAME,ORIGINSTATE,ORIGINSTATENAME,DESTAIRPORTCODE,DESTAIRPORTNAME,DESTCITYNAME,DESTSTATE,DESTSTATENAME,DISTANCE,AIRLINENAME,MEASURE
count,1191805,1191805,1191805,1034988,1191805,1191805,1191805,1191805,1180963,1180963,1191805,1191805,1191805,1180967,1180967,1191805,1191805,1191805
unique,1191805,1902,26,13869,8034,363,363,344,51,51,364,364,345,51,51,1706,26,1
top,147818200,20040924,WN,UNKNOW,505,ATL,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Chicago,CA,California,ATL,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Chicago,CA,California,337 miles,Southwest Airlines Co.,miles
freq,1,849,189985,7640,849,64421,64421,76508,141852,141852,64036,64036,76986,141630,141630,10113,189985,1191805


In [66]:
interDF['DISTNUM'] = interDF['DISTNUM'].astype(int)

In [67]:
interDF['DISTNUM'].describe()

count    1.191805e+06
mean     7.336410e+02
std      5.647814e+02
min      1.100000e+01
25%      3.250000e+02
50%      5.810000e+02
75%      9.660000e+02
max      4.983000e+03
Name: DISTNUM, dtype: float64

In [68]:
binwidth = int((max(interDF['DISTNUM'])-min(interDF['DISTNUM']))/4)
bins = range(min(interDF['DISTNUM']), max(interDF['DISTNUM']), binwidth)
group_names = ['Short', 'Medium', 'Long']

interDF['DISTANCEGROUP'] = pd.cut(interDF['DISTNUM'], bins, labels=group_names)

In [69]:
# Create a DEPDELAYGT15 column for delays greater than 15 minutes.

interDF['DEPDELAYGT15'] = np.where(interDF['DEPDELAY'] >= 15.0, True, False)


In [70]:
interDF['DEPDELAYGT15'].value_counts()

False    1014794
True      177011
Name: DEPDELAYGT15, dtype: int64

In [75]:
# Create a NEXTDAYARR column for next day arrivals.

interDF['NEXTDAYARR'] = (interDF['ARRTIME'] > 15) 

interDF['DEPTIME'].fillna(0, inplace=True)
interDF['DEPTIME'] = interDF['DEPTIME'].astype(int)

interDF['ACTUALELAPSEDTIME'].fillna(0, inplace=True)
interDF['ACTUALELAPSEDTIME'] = interDF['ACTUALELAPSEDTIME'].astype(int)

interDF['ARRTIME'].fillna(0, inplace=True)
interDF['ARRTIME'] = interDF['ARRTIME'].astype(int)

interDF['NEXTDAYARR'].fillna(0, inplace=True)
interDF['NEXTDAYARR'] = interDF['NEXTDAYARR'].astype(int)

interDF['NEXTDAYARR'] = np.where(interDF['ARRTIME'] >= 15.0, True, False)

In [76]:
interDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191805 entries, 0 to 1191804
Data columns (total 36 columns):
 #   Column             Non-Null Count    Dtype   
---  ------             --------------    -----   
 0   TRANSACTIONID      1191805 non-null  category
 1   FLIGHTDATE         1191805 non-null  category
 2   AIRLINECODE        1191805 non-null  category
 3   TAILNUM            1034988 non-null  category
 4   FLIGHTNUM          1191805 non-null  category
 5   ORIGINAIRPORTCODE  1191805 non-null  category
 6   ORIGAIRPORTNAME    1191805 non-null  category
 7   ORIGINCITYNAME     1191805 non-null  category
 8   ORIGINSTATE        1180963 non-null  category
 9   ORIGINSTATENAME    1180963 non-null  category
 10  DESTAIRPORTCODE    1191805 non-null  category
 11  DESTAIRPORTNAME    1191805 non-null  category
 12  DESTCITYNAME       1191805 non-null  category
 13  DESTSTATE          1180967 non-null  category
 14  DESTSTATENAME      1180967 non-null  category
 15  CRSDEPTIME     

In [141]:
# Column 'MEASURE' would be useful if we ever have an entry showing something else \n
# such as kilometers. However, if that is unlikely to happen and this column becomes \n
# useless, we can simply remove the hash/comment from the line below and drop it.

#interDF = interDF.drop(['MEASURE'], axis=1)
interDF.head(5)

Unnamed: 0,TRANSACTIONID,FLIGHTDATE,AIRLINECODE,TAILNUM,FLIGHTNUM,ORIGINAIRPORTCODE,ORIGAIRPORTNAME,ORIGINCITYNAME,ORIGINSTATE,ORIGINSTATENAME,...,ACTUALELAPSEDTIME,CANCELLED,DIVERTED,DISTANCE,AIRLINENAME,DISTNUM,MEASURE,DISTANCEGROUP,DEPDELAYGT15,NEXTDAYARR
0,54548800,20020101,WN,N103@@,1425,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,87,False,False,580 miles,Southwest Airlines Co.,580,miles,Short,False,True
1,55872300,20020101,CO,N83872,150,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,119,False,False,744 miles,Continental Air Lines Inc.,744,miles,Short,False,True
2,54388800,20020101,WN,N334@@,249,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,102,False,False,718 miles,Southwest Airlines Co.,718,miles,Short,True,True
3,54486500,20020101,WN,N699@@,902,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,83,False,False,487 miles,Southwest Airlines Co.,487,miles,Short,False,True
4,55878700,20020103,CO,N58606,234,ABQ,AlbuquerqueNM: Albuquerque International Sunport,Albuquerque,NM,New Mexico,...,114,False,False,744 miles,Continental Air Lines Inc.,744,miles,Short,False,True


In [79]:
# Save the formatted, clean data just in case.

interDF.to_csv('cleaned_flights.txt', sep='|', index=False)

The Data has been cleaned up and formatted, and is ready to be saved into PostgreSQL.

Getting SQL engine started and connection established. 

In [80]:
sqlalchemy.create_engine('postgres://INFORMATIONHIDDEN')

Engine(postgres://iw-recruiting-test.cygkjm9anrym.us-west-2.rds.amazonaws.com)

In [81]:
# Load data into PostgreSQL.
%load_ext sql

In [82]:
%sql postgresql://INFORMATIONHIDDEN:INFORMATIONHIDDEN@INFORMATIONHIDDEN/tests_data_engineering     
'''
POSTGRES_ADDRESS = 'XXX'
POSTGRES_USERNAME = 'XXX'
POSTGRES_PASSWORD = 'XXX'
POSTGRES_DBNAME = 'tests_data_engineering'
'''

"\nPOSTGRES_ADDRESS = 'iw-recruiting-test.cygkjm9anrym.us-west-2.rds.amazonaws.com'\nPOSTGRES_USERNAME = 'candidate3195'\nPOSTGRES_PASSWORD = '21QqOx1aev3R9MEq'\nPOSTGRES_DBNAME = 'tests_data_engineering'\n"

In [83]:
# I've been facing some issues while pushing all the DF to PostgreSQL. I've found a \n
# workaround that pushes data in increments so I can be sure data was successfully \n
# saved in the database. Not as pretty as a single push, but it works for now. -PB

In [84]:
engine = create_engine('postgresql://candidate3195:21QqOx1aev3R9MEq@iw-recruiting-test.cygkjm9anrym.us-west-2.rds.amazonaws.com/tests_data_engineering') 

In [85]:
interDF.iloc[:100,:].to_sql('flights', con=engine, index=False, if_exists='replace', chunksize=500)

In [86]:
interDF.iloc[101:500,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [87]:
interDF.iloc[501:5000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [89]:
interDF.iloc[5001:10000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# If time allows
# interDF.iloc[10001:50000,:].to_sql('flights', con=engine, index=False, if_exists='append', chunksize=500)

In [114]:
# In SQL, create a FACT_FLIGHTS with flights information.

factDF = interDF[['TRANSACTIONID', 'DISTANCEGROUP', 'DISTNUM', 'MEASURE', 'DEPDELAYGT15', 
                 'NEXTDAYARR', 'AIRLINENAME', 'ORIGAIRPORTNAME', 'DESTAIRPORTNAME']]
factDF

Unnamed: 0,TRANSACTIONID,DISTANCEGROUP,DISTNUM,MEASURE,DEPDELAYGT15,NEXTDAYARR,AIRLINENAME,ORIGAIRPORTNAME,DESTAIRPORTNAME
0,54548800,Short,580,miles,False,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,DallasTX: Dallas Love Field
1,55872300,Short,744,miles,False,True,Continental Air Lines Inc.,AlbuquerqueNM: Albuquerque International Sunport,HoustonTX: George Bush Intercontinental/Houston
2,54388800,Short,718,miles,True,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,Kansas CityMO: Kansas City International
3,54486500,Short,487,miles,False,True,Southwest Airlines Co.,AlbuquerqueNM: Albuquerque International Sunport,Las VegasNV: McCarran International
4,55878700,Short,744,miles,False,True,Continental Air Lines Inc.,AlbuquerqueNM: Albuquerque International Sunport,HoustonTX: George Bush Intercontinental/Houston
...,...,...,...,...,...,...,...,...,...
1191800,126750200,Short,721,miles,False,True,ExpressJet Airlines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,DallasTX: Dallas Love Field
1191801,127294500,Short,731,miles,False,True,Delta Air Lines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Dallas/Fort WorthTX: Dallas/Fort Worth Interna...
1191802,127294900,Short,731,miles,False,True,Delta Air Lines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Dallas/Fort WorthTX: Dallas/Fort Worth Interna...
1191803,126594900,Short,508,miles,False,True,ExpressJet Airlines Inc.,AtlantaGA: Hartsfield-Jackson Atlanta Internat...,Fort WayneIN: Fort Wayne International


In [97]:
factDF.iloc[:10000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# factDF.iloc[10001:50000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# factDF.iloc[50001:500000,:].to_sql('FACT_FLIGHTS', con=engine, index=False, if_exists='append', chunksize=500)

In [116]:
# In SQL, create DIM_DATE and DIM_AIRPORT dimension tables. 

dimAirDF = interDF[['TRANSACTIONID', 'AIRLINECODE', 'AIRLINENAME', 'TAILNUM', 'FLIGHTNUM',
                     'ORIGINAIRPORTCODE', 'ORIGAIRPORTNAME', 'DESTAIRPORTCODE', 'DESTAIRPORTNAME',
                     'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'ARRDELAY', 'CANCELLED',
                     'DIVERTED', 'NEXTDAYARR', 'DEPDELAYGT15']]

dimDateDF = interDF[['TRANSACTIONID', 'FLIGHTDATE', 'ORIGINAIRPORTCODE', 'ORIGINCITYNAME',
                    'ORIGINSTATE', 'DESTAIRPORTCODE', 'DESTCITYNAME', 'DESTSTATE', 'DEPDELAY',
                    'ARRTIME', 'ARRDELAY', 'DISTANCEGROUP']]

In [None]:
'''
# This is a different approach to create the alternative DFs - not used. 

#factDF.columns = interDF.columns
factDF = factDF.drop(['FLIGHTDATE', 'AIRLINECODE', 'TAILNUM', 'FLIGHTNUM', 'ORIGINAIRPORTCODE',
                      'ORIGINCITYNAME', 'ORIGINSTATE', 'ORIGINSTATENAME', 'DESTAIRPORTCODE',
                      'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME', 
                      'DEPDELAY', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME', 
                      'ARRTIME', 'ARRDELAY', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED',
                      'DIVERTED', 'DISTANCE'], axis=1)

dimAirDF.columns = interDF.columns
dimAirDF = dimAirDF.drop(['FLIGHTDATE', 'ORIGINCITYNAME', 'ORIGINSTATE', 'ORIGINSTATENAME',
                        'DESTCITYNAME', 'DESTSTATE', 'DESTSTATENAME', 'CRSDEPTIME', 'DEPTIME',
                        'DEPDELAY', 'CRSARRTIME', 'ARRTIME', 'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME',
                        'DISTANCE', 'DISTNUM', 'MEASURE','DISTANCEGROUP'], axis=1)

dimDateDF = interDF
dimDateDF.columns = interDF.columns
dimDateDF = dimDateDF.drop(['AIRLINECODE', 'TAILNUM', 'FLIGHTNUM', 'ORIGAIRPORTNAME',
                          'ORIGINSTATENAME', 'DESTAIRPORTNAME', 'DESTSTATENAME', 'CRSDEPTIME',
                          'DEPTIME', 'TAXIOUT', 'WHEELSOFF', 'WHEELSON', 'TAXIIN', 'CRSARRTIME',
                          'CRSELAPSEDTIME', 'ACTUALELAPSEDTIME', 'CANCELLED', 'DIVERTED',
                          'DISTANCE', 'AIRLINENAME', 'DISTNUM', 'MEASURE', 'DEPDELAYGT15', 
                          'NEXTDAYARR'], axis=1)
'''

In [117]:
dimAirDF.iloc[:10000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# dimAirDF.iloc[10001:50000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# dimAirDF.iloc[50001:500000,:].to_sql('DIM_AIRPORT', con=engine, index=False, if_exists='append', chunksize=500)

In [118]:
dimDateDF.iloc[:10000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='replace', chunksize=500)

In [None]:
# If time allows
# dimDateDF.iloc[10001:50000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='append', chunksize=500)

In [None]:
# dimDateDF.iloc[50001:500000,:].to_sql('DIM_DATE', con=engine, index=False, if_exists='append', chunksize=500)

In [190]:
# In SQL, created a view named VW_FLIGHTS that joins the fact and dimension tables and \n
# returns columns useful for analysis. 

In [140]:
%%sql

CREATE OR REPLACE VIEW candidate3195.VW_FLIGHTS AS
SELECT *
FROM flights
;

 * postgresql://candidate3195:***@iw-recruiting-test.cygkjm9anrym.us-west-2.rds.amazonaws.com/tests_data_engineering
Done.


[]

In [None]:
# VW_FLIGHTS filtered columns:
# TRANSACTIONID, DISTANCEGROUP, DEPDELAYGT15, NEXTDAYARR, 
# AIRLINENAME, ORIGAIRPORTNAME, DESTAIRPORTNAME

In [178]:
# Final considerations in the presentation. 