In [1]:
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import psycopg2
import numpy as np

In [2]:
# Read csv file into notebook
transport_emp_df = pd.read_csv("CSVCleanup/Cleaned_Transportation_Employment.csv")
transport_emp_df

Unnamed: 0.1,Unnamed: 0,Index,Date,Pipeline,Water,Rail,Air,Transit_and_Ground_Passenger_Transportation,Truck,Unemployment_Rate,Labor_Force_Participation_Rate,Unemployed
0,696,696,1/1/2005 0:00,38100.0,55500.0,191000.0,505200.0,407900.0,1348500.0,0.053,0.658,7784000.0
1,697,697,2/1/2005 0:00,37700.0,55100.0,191400.0,503400.0,407700.0,1349000.0,0.054,0.659,7980000.0
2,698,698,3/1/2005 0:00,37600.0,56700.0,191900.0,504200.0,409200.0,1359900.0,0.052,0.659,7737000.0
3,699,699,4/1/2005 0:00,37800.0,59400.0,193300.0,507300.0,413700.0,1377300.0,0.052,0.661,7672000.0
4,700,700,5/1/2005 0:00,37500.0,61100.0,193900.0,507800.0,415300.0,1391700.0,0.051,0.661,7651000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
182,879,736,5/1/2008 0:00,41500.0,67200.0,196600.0,498400.0,449100.0,1405100.0,0.054,0.661,8395000.0
183,880,735,4/1/2008 0:00,40400.0,65500.0,195900.0,499700.0,448100.0,1394400.0,0.050,0.659,7637000.0
184,881,734,3/1/2008 0:00,40500.0,62800.0,194600.0,502500.0,440800.0,1389800.0,0.051,0.661,7822000.0
185,882,733,2/1/2008 0:00,40400.0,63200.0,194300.0,501800.0,439900.0,1377300.0,0.049,0.660,7497000.0


In [3]:
# Drop extra index column (unnamed)
transport_emp_df = transport_emp_df.drop(['Unnamed: 0'], axis = 1)

In [4]:
# Check data types
transport_emp_df.dtypes

Index                                            int64
Date                                            object
Pipeline                                       float64
Water                                          float64
Rail                                           float64
Air                                            float64
Transit_and_Ground_Passenger_Transportation    float64
Truck                                          float64
Unemployment_Rate                              float64
Labor_Force_Participation_Rate                 float64
Unemployed                                     float64
dtype: object

In [5]:
# Change data type to integer
transport_emp_df[['Pipeline', 'Water', 'Rail', 'Air', 'Transit_and_Ground_Passenger_Transportation', 'Truck', 'Unemployed']] = transport_emp_df[['Pipeline', 'Water', 'Rail', 'Air', 'Transit_and_Ground_Passenger_Transportation', 'Truck', 'Unemployed']].astype(int)

In [6]:
# Convert Date column data type to datetime
transport_emp_df['Date'] = pd.to_datetime(transport_emp_df['Date'])
transport_emp_df['Date']

0     2005-01-01
1     2005-02-01
2     2005-03-01
3     2005-04-01
4     2005-05-01
         ...    
182   2008-05-01
183   2008-04-01
184   2008-03-01
185   2008-02-01
186   2008-01-01
Name: Date, Length: 187, dtype: datetime64[ns]

In [7]:
# Recheck data types
transport_emp_df.dtypes

Index                                                   int64
Date                                           datetime64[ns]
Pipeline                                                int32
Water                                                   int32
Rail                                                    int32
Air                                                     int32
Transit_and_Ground_Passenger_Transportation             int32
Truck                                                   int32
Unemployment_Rate                                     float64
Labor_Force_Participation_Rate                        float64
Unemployed                                              int32
dtype: object

In [8]:
# Look at data over the past decade
transport_emp_df = transport_emp_df.loc[transport_emp_df['Date']>='2010-1-1']
transport_emp_df

Unnamed: 0,Index,Date,Pipeline,Water,Rail,Air,Transit_and_Ground_Passenger_Transportation,Truck,Unemployment_Rate,Labor_Force_Participation_Rate,Unemployed
36,882,2020-07-01,49700,58100,142000,398600,271400,1457100,0.102,0.614,16338000
37,881,2020-06-01,50000,57200,144600,382200,307000,1449600,0.111,0.615,17750000
38,880,2020-05-01,50400,57100,148000,385300,328900,1430000,0.133,0.608,20985000
39,879,2020-04-01,50500,57900,154900,432700,331300,1414600,0.147,0.602,23078000
40,878,2020-03-01,51300,62000,159200,510700,515300,1499400,0.044,0.627,7140000
...,...,...,...,...,...,...,...,...,...,...,...
158,760,2010-05-01,41700,62500,180300,460500,445700,1238700,0.096,0.649,14849000
159,759,2010-04-01,42400,60500,179700,461300,443400,1220700,0.099,0.652,15325000
160,758,2010-03-01,42400,60300,178300,461700,442800,1208300,0.099,0.649,15202000
161,757,2010-02-01,43500,58100,177300,461500,435800,1200300,0.098,0.649,15113000


In [9]:
# Reset index
transport_emp_df = transport_emp_df.reset_index(drop=True)
transport_emp_df

Unnamed: 0,Index,Date,Pipeline,Water,Rail,Air,Transit_and_Ground_Passenger_Transportation,Truck,Unemployment_Rate,Labor_Force_Participation_Rate,Unemployed
0,882,2020-07-01,49700,58100,142000,398600,271400,1457100,0.102,0.614,16338000
1,881,2020-06-01,50000,57200,144600,382200,307000,1449600,0.111,0.615,17750000
2,880,2020-05-01,50400,57100,148000,385300,328900,1430000,0.133,0.608,20985000
3,879,2020-04-01,50500,57900,154900,432700,331300,1414600,0.147,0.602,23078000
4,878,2020-03-01,51300,62000,159200,510700,515300,1499400,0.044,0.627,7140000
...,...,...,...,...,...,...,...,...,...,...,...
122,760,2010-05-01,41700,62500,180300,460500,445700,1238700,0.096,0.649,14849000
123,759,2010-04-01,42400,60500,179700,461300,443400,1220700,0.099,0.652,15325000
124,758,2010-03-01,42400,60300,178300,461700,442800,1208300,0.099,0.649,15202000
125,757,2010-02-01,43500,58100,177300,461500,435800,1200300,0.098,0.649,15113000


In [10]:
# Read csv file into notebook
highway_df = pd.read_csv("CSVCleanUp/Cleaned_Highway_Vehicle_Miles_Traveled.csv")
highway_df.head()

Unnamed: 0.1,Unnamed: 0,Index,Date,HVMT_All_Systems,HVMT_Total_Rural,HVMT_Other_Rural,HVMT_Rural_Other_Arterial,HVMT_Rural_Interstate
0,740,875,12/1/2019 0:00,274000000000.0,80395000000.0,27483000000.0,31257000000.0,21654000000.0
1,741,874,11/1/2019 0:00,260000000000.0,77563000000.0,26809000000.0,30302000000.0,20452000000.0
2,742,873,10/1/2019 0:00,284000000000.0,86765000000.0,30402000000.0,33957000000.0,22406000000.0
3,743,872,9/1/2019 0:00,272000000000.0,83642000000.0,29248000000.0,32912000000.0,21482000000.0
4,744,871,8/1/2019 0:00,287000000000.0,90787000000.0,31380000000.0,35215000000.0,24192000000.0


In [11]:
# Drop extra index column (unnamed)
highway_df = highway_df.drop(['Unnamed: 0'], axis = 1)

In [12]:
#Check data types
highway_df.dtypes

Index                          int64
Date                          object
HVMT_All_Systems             float64
HVMT_Total_Rural             float64
HVMT_Other_Rural             float64
HVMT_Rural_Other_Arterial    float64
HVMT_Rural_Interstate        float64
dtype: object

In [13]:
# Change data type to integer
highway_df[['HVMT_All_Systems','HVMT_Total_Rural', 'HVMT_Other_Rural', 'HVMT_Rural_Other_Arterial', 'HVMT_Rural_Interstate']] = highway_df[['HVMT_All_Systems','HVMT_Total_Rural', 'HVMT_Other_Rural', 'HVMT_Rural_Other_Arterial', 'HVMT_Rural_Interstate']].astype('int64')

In [14]:
# Convert Date column data type to datetime
highway_df['Date'] = pd.to_datetime(highway_df['Date'])
highway_df['Date']

0    2019-12-01
1    2019-11-01
2    2019-10-01
3    2019-09-01
4    2019-08-01
5    2019-07-01
6    2019-06-01
7    2019-05-01
8    2019-04-01
9    2019-03-01
10   2019-02-01
11   2019-01-01
12   2018-12-01
13   2018-11-01
14   2018-10-01
15   2018-09-01
16   2018-08-01
17   2018-07-01
18   2018-06-01
19   2018-05-01
20   2018-04-01
21   2018-03-01
22   2018-02-01
23   2018-01-01
Name: Date, dtype: datetime64[ns]

In [15]:
# Look at data over the past decade
highway_df = highway_df.loc[highway_df['Date']>='2018-1-1']
highway_df

Unnamed: 0,Index,Date,HVMT_All_Systems,HVMT_Total_Rural,HVMT_Other_Rural,HVMT_Rural_Other_Arterial,HVMT_Rural_Interstate
0,875,2019-12-01,274000000000,80395000000,27483000000,31257000000,21654000000
1,874,2019-11-01,260000000000,77563000000,26809000000,30302000000,20452000000
2,873,2019-10-01,284000000000,86765000000,30402000000,33957000000,22406000000
3,872,2019-09-01,272000000000,83642000000,29248000000,32912000000,21482000000
4,871,2019-08-01,287000000000,90787000000,31380000000,35215000000,24192000000
5,870,2019-07-01,296000000000,94815000000,32732000000,36666000000,25417000000
6,869,2019-06-01,281000000000,87384000000,30525000000,33858000000,23002000000
7,868,2019-05-01,286000000000,87500000000,30687000000,33645000000,23167000000
8,867,2019-04-01,282000000000,84394000000,30050000000,32269000000,22075000000
9,866,2019-03-01,272000000000,79928000000,27977000000,31012000000,20939000000


In [16]:
# Read csv file into notebook
pers_spend_df = pd.read_csv("CSVCleanUp/Cleaned_Personal_Spending_on_Transportation.csv")
pers_spend_df.head()

Unnamed: 0.1,Unnamed: 0,Index,Date,Transportation_Services,Gas_and_Other_Engergy_Goods,Motor_Vehicles_and_Parts
0,660,660,1/1/2002 0:00,337000000000.0,452000000000.0,410000000000.0
1,663,663,4/1/2002 0:00,334000000000.0,457000000000.0,409000000000.0
2,666,666,7/1/2002 0:00,334000000000.0,457000000000.0,435000000000.0
3,669,669,10/1/2002 0:00,334000000000.0,455000000000.0,414000000000.0
4,672,672,1/1/2003 0:00,336000000000.0,450000000000.0,411000000000.0


In [17]:
# Drop extra index column (unnamed)
pers_spend_df = pers_spend_df.drop(['Unnamed: 0'], axis = 1)

In [18]:
# Change data type to integer
pers_spend_df[['Transportation_Services', 'Gas_and_Other_Engergy_Goods', 'Motor_Vehicles_and_Parts']] = pers_spend_df[['Transportation_Services', 'Gas_and_Other_Engergy_Goods', 'Motor_Vehicles_and_Parts']].astype('int64')

In [19]:
# Convert Date column data type to datetime
pers_spend_df['Date'] = pd.to_datetime(pers_spend_df['Date'])
pers_spend_df['Date']

0    2002-01-01
1    2002-04-01
2    2002-07-01
3    2002-10-01
4    2003-01-01
        ...    
69   2009-01-01
70   2008-10-01
71   2008-07-01
72   2008-04-01
73   2008-01-01
Name: Date, Length: 74, dtype: datetime64[ns]

In [20]:
# Look at data over the past decade
pers_spend_df = pers_spend_df.loc[pers_spend_df['Date']>='2010-1-1']
pers_spend_df

Unnamed: 0,Index,Date,Transportation_Services,Gas_and_Other_Engergy_Goods,Motor_Vehicles_and_Parts
24,879,2020-04-01,263000000000,346000000000,503000000000
25,876,2020-01-01,415000000000,421000000000,496000000000
26,873,2019-10-01,448000000000,442000000000,539000000000
27,870,2019-07-01,447000000000,445000000000,535000000000
28,867,2019-04-01,440000000000,447000000000,535000000000
29,864,2019-01-01,435000000000,447000000000,521000000000
30,861,2018-10-01,432000000000,447000000000,537000000000
31,858,2018-07-01,433000000000,445000000000,536000000000
32,855,2018-04-01,437000000000,448000000000,536000000000
33,852,2018-01-01,441000000000,448000000000,531000000000


In [21]:
# Reset index
pers_spend_df = pers_spend_df.reset_index(drop=True)
pers_spend_df

Unnamed: 0,Index,Date,Transportation_Services,Gas_and_Other_Engergy_Goods,Motor_Vehicles_and_Parts
0,879,2020-04-01,263000000000,346000000000,503000000000
1,876,2020-01-01,415000000000,421000000000,496000000000
2,873,2019-10-01,448000000000,442000000000,539000000000
3,870,2019-07-01,447000000000,445000000000,535000000000
4,867,2019-04-01,440000000000,447000000000,535000000000
5,864,2019-01-01,435000000000,447000000000,521000000000
6,861,2018-10-01,432000000000,447000000000,537000000000
7,858,2018-07-01,433000000000,445000000000,536000000000
8,855,2018-04-01,437000000000,448000000000,536000000000
9,852,2018-01-01,441000000000,448000000000,531000000000


In [22]:
# Read csv file into notebook
rail_df = pd.read_csv("CSVCleanUp/Cleaned_Passenger_Rail.csv")
rail_df.head()

Unnamed: 0.1,Unnamed: 0,Index,Date,Passengers,Passenger_Miles,Total_Train_Miles,Employee_Hours_Worked,Yard_Switching_Miles
0,336,336,1/1/1975 0:00,0.0,2624696.0,4134425.0,1501878.0,0.0
1,337,337,2/1/1975 0:00,0.0,2275826.0,3600736.0,1371524.0,0.0
2,338,338,3/1/1975 0:00,0.0,2520943.0,4067192.0,1516787.0,0.0
3,339,339,4/1/1975 0:00,0.0,2478477.0,4000157.0,1583115.0,0.0
4,340,340,5/1/1975 0:00,0.0,2561119.0,4050197.0,1544540.0,0.0


In [23]:
# Drop extra index column (unnamed)
rail_df = rail_df.drop(['Unnamed: 0'], axis = 1)

In [24]:
# Change data type to integer
rail_df[['Passengers', 'Passenger_Miles', 'Total_Train_Miles', 'Employee_Hours_Worked', 'Yard_Switching_Miles']] = rail_df[['Passengers', 'Passenger_Miles', 'Total_Train_Miles', 'Employee_Hours_Worked', 'Yard_Switching_Miles']].astype(int)

In [25]:
# Convert Date column data type to datetime
rail_df['Date'] = pd.to_datetime(rail_df['Date'])
rail_df['Date']

0     1975-01-01
1     1975-02-01
2     1975-03-01
3     1975-04-01
4     1975-05-01
         ...    
540   2008-05-01
541   2008-04-01
542   2008-03-01
543   2008-02-01
544   2008-01-01
Name: Date, Length: 545, dtype: datetime64[ns]

In [26]:
# Look at data over the past decade
rail_df = rail_df.loc[rail_df['Date']>='2010-1-1']
rail_df

Unnamed: 0,Index,Date,Passengers,Passenger_Miles,Total_Train_Miles,Employee_Hours_Worked,Yard_Switching_Miles
396,880,2020-05-01,213983,69098462,2037397,2745323,128937
397,879,2020-04-01,121154,36018048,2070618,2745367,121903
398,878,2020-03-01,1130190,233377315,2997776,3073471,162151
399,877,2020-02-01,2163475,400433113,3124037,2945522,174288
400,876,2020-01-01,2399424,447564878,3505313,3147306,182445
...,...,...,...,...,...,...,...
516,760,2010-05-01,2503651,544408276,3518013,3451013,209347
517,759,2010-04-01,2446113,520013629,3451608,3478591,205893
518,758,2010-03-01,2474908,538675366,3560237,3629837,212974
519,757,2010-02-01,1951901,405905875,3088012,3122066,192744


In [27]:
# Reset index
rail_df = rail_df.reset_index(drop=True)
rail_df

Unnamed: 0,Index,Date,Passengers,Passenger_Miles,Total_Train_Miles,Employee_Hours_Worked,Yard_Switching_Miles
0,880,2020-05-01,213983,69098462,2037397,2745323,128937
1,879,2020-04-01,121154,36018048,2070618,2745367,121903
2,878,2020-03-01,1130190,233377315,2997776,3073471,162151
3,877,2020-02-01,2163475,400433113,3124037,2945522,174288
4,876,2020-01-01,2399424,447564878,3505313,3147306,182445
...,...,...,...,...,...,...,...
120,760,2010-05-01,2503651,544408276,3518013,3451013,209347
121,759,2010-04-01,2446113,520013629,3451608,3478591,205893
122,758,2010-03-01,2474908,538675366,3560237,3629837,212974
123,757,2010-02-01,1951901,405905875,3088012,3122066,192744


In [28]:
# Read csv file into notebook
air_df = pd.read_csv("CSVCleanUp/Cleaned_Airline_Traffic.csv")
air_df.head()

Unnamed: 0.1,Unnamed: 0,Index,Date,Total,International,Domestic
0,735,880,5/1/2020 0:00,8420000.0,190000.0,8240000.0
1,736,879,4/1/2020 0:00,3010000.0,130000.0,2880000.0
2,737,878,3/1/2020 0:00,39060000.0,4650000.0,34410000.0
3,738,877,2/1/2020 0:00,67810000.0,7960000.0,59850000.0
4,739,876,1/1/2020 0:00,70760000.0,9150000.0,61610000.0


In [29]:
# Drop extra index column (unnamed)
air_df = air_df.drop(['Unnamed: 0'], axis = 1)

In [31]:
# Change data type to integer
air_df[['Total', 'International', 'Domestic']] = air_df[['Total', 'International', 'Domestic']].astype(int)

In [32]:
# Convert Date column data type to datetime
air_df['Date'] = pd.to_datetime(air_df['Date'])
air_df['Date']

0    2020-05-01
1    2020-04-01
2    2020-03-01
3    2020-02-01
4    2020-01-01
5    2019-12-01
6    2019-11-01
7    2019-10-01
8    2019-09-01
9    2019-08-01
10   2019-07-01
11   2019-06-01
12   2019-05-01
13   2019-04-01
14   2019-03-01
15   2019-02-01
16   2019-01-01
17   2018-12-01
18   2018-11-01
19   2018-10-01
20   2018-09-01
21   2018-08-01
22   2018-07-01
23   2018-06-01
24   2018-05-01
25   2018-04-01
26   2018-03-01
27   2018-02-01
28   2018-01-01
29   2017-12-01
30   2017-11-01
31   2017-10-01
32   2017-09-01
33   2017-08-01
34   2017-07-01
35   2017-06-01
36   2017-05-01
37   2017-04-01
38   2017-03-01
39   2017-02-01
40   2017-01-01
Name: Date, dtype: datetime64[ns]

In [33]:
# Look at data over the past decade
air_df = air_df.loc[air_df['Date']>='2010-1-1']
air_df

Unnamed: 0,Index,Date,Total,International,Domestic
0,880,2020-05-01,8420000,190000,8240000
1,879,2020-04-01,3010000,130000,2880000
2,878,2020-03-01,39060000,4650000,34410000
3,877,2020-02-01,67810000,7960000,59850000
4,876,2020-01-01,70760000,9150000,61610000
5,875,2019-12-01,79300000,9580000,69720000
6,874,2019-11-01,73070000,8260000,64820000
7,873,2019-10-01,78620000,8690000,69920000
8,872,2019-09-01,72590000,8610000,63980000
9,871,2019-08-01,83780000,11070000,72720000


In [34]:
# Reset index
air_df = air_df.reset_index(drop=True)
air_df

Unnamed: 0,Index,Date,Total,International,Domestic
0,880,2020-05-01,8420000,190000,8240000
1,879,2020-04-01,3010000,130000,2880000
2,878,2020-03-01,39060000,4650000,34410000
3,877,2020-02-01,67810000,7960000,59850000
4,876,2020-01-01,70760000,9150000,61610000
5,875,2019-12-01,79300000,9580000,69720000
6,874,2019-11-01,73070000,8260000,64820000
7,873,2019-10-01,78620000,8690000,69920000
8,872,2019-09-01,72590000,8610000,63980000
9,871,2019-08-01,83780000,11070000,72720000


In [39]:
# Read csv file into notebook
carrier_df = pd.read_csv("CSVCleanUp/Cleaned_Air_Carrier_Cargo_MillionsOfRevenueTonMiles.csv")
carrier_df.head()

Unnamed: 0.1,Unnamed: 0,Index,Date,International,Domestic
0,669,669,10/1/2002 0:00,1887142000.0,1458666000.0
1,670,670,11/1/2002 0:00,1778562000.0,1296241000.0
2,671,671,12/1/2002 0:00,1548068000.0,1262516000.0
3,672,672,1/1/2003 0:00,1395876000.0,1177785000.0
4,673,673,2/1/2003 0:00,1366617000.0,1079561000.0


In [40]:
# Drop extra index column (unnamed)
carrier_df = carrier_df.drop(['Unnamed: 0'], axis = 1)

In [41]:
# Change data type to integer
carrier_df[['International', 'Domestic']] = carrier_df[['International', 'Domestic']].astype('int64')

In [42]:
# Convert Date column data type to datetime
carrier_df['Date'] = pd.to_datetime(carrier_df['Date'])
carrier_df['Date']

0     2002-10-01
1     2002-11-01
2     2002-12-01
3     2003-01-01
4     2003-02-01
         ...    
207   2008-05-01
208   2008-04-01
209   2008-03-01
210   2008-02-01
211   2008-01-01
Name: Date, Length: 212, dtype: datetime64[ns]

In [43]:
# Look at data over the past decade
carrier_df = carrier_df.loc[carrier_df['Date']>='2010-1-1']
carrier_df

Unnamed: 0,Index,Date,International,Domestic
63,880,2020-05-01,2228769680,1583861040
64,879,2020-04-01,2025803327,1447281134
65,878,2020-03-01,2165378135,1442357710
66,877,2020-02-01,1781338361,1173055368
67,876,2020-01-01,2041418389,1322461187
...,...,...,...,...
183,760,2010-05-01,2062540999,1061357921
184,759,2010-04-01,1913703531,1076584532
185,758,2010-03-01,1971931105,1085023426
186,757,2010-02-01,1638814937,926043101


In [44]:
# Reset index
carrier_df = carrier_df.reset_index(drop=True)
carrier_df

Unnamed: 0,Index,Date,International,Domestic
0,880,2020-05-01,2228769680,1583861040
1,879,2020-04-01,2025803327,1447281134
2,878,2020-03-01,2165378135,1442357710
3,877,2020-02-01,1781338361,1173055368
4,876,2020-01-01,2041418389,1322461187
...,...,...,...,...
120,760,2010-05-01,2062540999,1061357921
121,759,2010-04-01,1913703531,1076584532
122,758,2010-03-01,1971931105,1085023426
123,757,2010-02-01,1638814937,926043101
