# Create the keys for the OnTime data to join with the Weather
Building these keys first minimizes how much weather data needs to be pulled.
The keys will be used to make one weather file for the origin airports and one for the destination airports.

In [None]:
import pandas as pd
# Root folder containing the 'OnTime' directory of parquet files; set this before running.
basepath = 'your_path_here'


def _three_decimal_places(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Have pandas display every float with three decimals.
pd.set_option('display.float_format', _three_decimal_places)

In [None]:
# Read the 2010-2019 on-time history and the 2023 flights from parquet.
df_full_original = pd.read_parquet(basepath + '/OnTime/ONT_2010_2019_DFW.snappy.parquet')
df23 = pd.read_parquet(basepath + '/OnTime/ONT_DF_2023_DFW.parquet')

# 2010-2019 is the training and development time frame. 2020-2022 is left out
# because of the disruption COVID caused in the flight industry.
df10_19b = df_full_original[
    df_full_original['Year'].gt(2009) & df_full_original['Year'].lt(2020)
]

# Only flights departing 'DFW' (Dallas, TX) are kept, narrowed down to the minimal
# columns needed as a key. Weather data for all stations over all the years in
# focus turned out to be about 500GB, hence the narrowing.
df10_19 = df10_19b[df10_19b['Origin'].eq('DFW')][
    ['Origin', 'FlightDate', 'CRSDepTime', 'Dest']
]
# The 2023 frame keeps Year/Month/DayofMonth in place of FlightDate.
df23 = df23[df23['Origin'].eq('DFW')][
    ['Origin', 'Year', 'Month', 'DayofMonth', 'CRSDepTime', 'Dest']
]

In [None]:
# Insert 'FlightDate' so the remainder of the code works the same for every year. The FlightDate column had been removed from the 2023 data before I downloaded it.
def createFlightDate(year, month, day):
    """Build a 'YYYY-MM-DD' style date string from separate year, month and day values.

    Each value is converted to text; month and day are left-padded with
    zeros to at least two characters, while the year is used as-is.
    """
    parts = (str(year), str(month).zfill(2), str(day).zfill(2))
    return '-'.join(parts)

def _flight_date_for_row(row):
    """Format one df23 row's Year/Month/DayofMonth values as a FlightDate string."""
    return createFlightDate(row['Year'], row['Month'], row['DayofMonth'])


# Row-wise: derive the FlightDate column from the three separate date columns.
df23['FlightDate'] = df23.apply(_flight_date_for_row, axis=1).copy()

In [None]:
# Stack the 2010-2019 rows and the 2023 rows into a single dataframe.
df = pd.concat((df10_19, df23), axis=0)

# Only the airport code and the scheduled time are needed from here on:
#   - the airport code selects which weather station to match against
#   - the time selects which of that station's records best matches each flight
# One key dataframe is built for the Origin airport and one for the Destination.
origindatetime = df[['Origin', 'FlightDate', 'CRSDepTime']]
print(origindatetime.shape)

# Many flights depart at the same time, so duplicate keys can be dropped.
origindatetime = origindatetime.drop_duplicates(keep='first')
print(origindatetime.shape)

# Same treatment for the destination airport (still keyed on the scheduled departure time).
destdatetime = df[['Dest', 'FlightDate', 'CRSDepTime']]
print(destdatetime.shape)
destdatetime = destdatetime.drop_duplicates(keep='first')
print(destdatetime.shape)

(1039545, 3)
(496672, 3)
(1039545, 3)
(1038602, 3)


Let's do a little work on our dates so that they match the weather records better.

In [None]:
# This function creates a datetime value for the flight's departure time
def createYMD(date, HM):
    """Combine a flight date and an HHMM clock value into one timestamp.

    Parameters
    ----------
    date : str or sequence of single-character strings
        The date laid out as 'YYYY-MM-DD'. Only positions 0-3 (year),
        5-6 (month) and 8-9 (day) are read; the separator characters at
        positions 4 and 7 are ignored.
    HM : int or str
        Clock time as hour and minute digits, e.g. 930 or '0930'. It is
        converted to text and left-padded with zeros to four characters.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp. NaT is returned when `date` has fewer than
        ten characters, or when the assembled digits do not parse with the
        '%Y%m%d%H%M' format.
    """
    # Slicing works for both a plain string and a list of characters.
    date_text = "".join(date[:10])

    # A truncated date must not reach the parser: with missing day digits the
    # time digits would slide into the day slot and could parse "successfully"
    # into a wrong timestamp instead of being rejected.
    if len(date_text) < 10:
        print("date:", date, "HM", HM)
        return pd.NaT

    year = date_text[:4]
    month = date_text[5:7]
    day = date_text[8:10]

    # Pad so that e.g. 5 becomes '0005' (00:05) and 930 becomes '0930' (09:30).
    hourminute = str(HM).zfill(4)

    OriginDtTm1 = year + month + day + hourminute
    # errors='coerce' turns unparseable values into NaT instead of raising.
    return pd.to_datetime(OriginDtTm1, format='%Y%m%d%H%M', errors='coerce')

def _departure_stamp_for_row(row):
    """Turn one key row's character list and CRSDepTime into a departure timestamp."""
    return createYMD(row['date_list'], row['CRSDepTime'])


# Origin data - passing FlightDate straight through did not work well even though
#               it is a string; exploding it into a list of characters first is an
#               odd step, but it worked better.
origindatetime['date_list'] = origindatetime['FlightDate'].apply(lambda text: list(text)).copy()
origindatetime['OriginDtTm2'] = origindatetime.apply(_departure_stamp_for_row, axis=1).copy()
origindatetime.head()  # quick look at the result; only displayed when it is a cell's last expression

# Destination - same two steps; note the timestamp column is named 'OrigDtTm2' here.
destdatetime['date_list'] = destdatetime['FlightDate'].apply(lambda text: list(text)).copy()
destdatetime['OrigDtTm2'] = destdatetime.apply(_departure_stamp_for_row, axis=1).copy()
destdatetime.head()

# Persist both key tables as parquet for later use.
origindatetime.to_parquet(basepath + "/OnTime/origindatetime_dfw_uniq.parquet")
destdatetime.to_parquet(basepath + "/OnTime/destdatetime_dfw_uniq.parquet")