# Create the Weather Only Files to be used with the DFW Origin and Destinations

Work with dups. There were some weather records duplicated based on date and time.
- Create a df with all NON duplicates
- Create a df with all dups
    - 1st - Take out the rows where liq_precip_qty == '99.0' - these happen to be the rows with all records having complete missing data
          - This was based on analysis not included in this notebook.
            Keeping those != '99.0'
    - 2nd - Drop all remaining duplicate records
        This drops BOTH of the dups for each of the WeatherDtTm dups because we don't know which of the records contains good data.
        This should result in a loss of a total of 162 rows, which equals 81 timepoints.

In [None]:
import pandas as pd
basepath = 'your_path_here'

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Get list of airports to filter down to
destdatetime = pd.read_parquet(basepath + "/OnTime/destdatetime_dttm_dfw_uniq.parquet")
airport_codes = destdatetime['Dest'].unique().tolist()
airport_codes.append('DFW')
print("# of airport codes:", len(airport_codes))

# of airport codes: 193


In [None]:
def DFW_Dest_and_No_Weather_Dups(df, year):
    print("YEAR:", year, "Time:", datetime.now())
    print("full shape:", df.shape)
    weather_airports = df[df['airport_code'].isin(airport_codes)]
    print("filtered airport codes shape:", weather_airports.shape)

    NON_Dups = weather_airports.drop_duplicates(subset=['airport_code', 'WeatherDtTm'])
    print("NON_Dups.shape", NON_Dups.shape)

    WeatherDups = weather_airports[weather_airports.duplicated(subset=['airport_code', 'WeatherDtTm'], keep=False)]
    print("WeatherDups.shape", WeatherDups.shape)

    DupsWithGoodData = WeatherDups[WeatherDups['liq_precip_qty'] != '99.0']
    print("DupsWithGoodData.shape", DupsWithGoodData.shape)

    DupsKeeping = DupsWithGoodData[~(DupsWithGoodData.duplicated(subset=['airport_code', 'WeatherDtTm'], keep=False))]
    print("DupsKeeping.shape", DupsKeeping.shape)

    DFW_Weather = pd.concat([NON_Dups, DupsKeeping], axis=0)
    print("DFW_Weather.shape", DFW_Weather.shape)

    CheckAllMissing =  WeatherDups[WeatherDups['liq_precip_qty'] == '99.0']

    return DFW_Weather

In [None]:
weather_2010 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2010.parquet")
fnl_weather_2010 = DFW_Dest_and_No_Weather_Dups(weather_2010, 2010)
fnl_weather_2010.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2010.parquet")

YEAR: 2010 Time: 2023-07-10 08:57:20.926332
full shape: (4876003, 25)
filtered airport codes shape: (2398689, 25)
NON_Dups.shape (2395215, 25)
WeatherDups.shape (6942, 25)
DupsWithGoodData.shape (6942, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2395215, 25)


In [None]:
weather_2011 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2011.parquet")
fnl_weather_2011 = DFW_Dest_and_No_Weather_Dups(weather_2011, 2011)
fnl_weather_2011.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2011.parquet")

YEAR: 2011 Time: 2023-07-10 08:57:23.964139
full shape: (4901066, 25)
filtered airport codes shape: (2383022, 25)
NON_Dups.shape (2379821, 25)
WeatherDups.shape (6397, 25)
DupsWithGoodData.shape (6397, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2379821, 25)


In [None]:
weather_2012 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2012.parquet")
fnl_weather_2012 = DFW_Dest_and_No_Weather_Dups(weather_2012, 2012)
fnl_weather_2012.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2012.parquet")

YEAR: 2012 Time: 2023-07-10 08:57:26.882908
full shape: (4882547, 25)
filtered airport codes shape: (2391115, 25)
NON_Dups.shape (2388122, 25)
WeatherDups.shape (5983, 25)
DupsWithGoodData.shape (5983, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2388122, 25)


In [None]:
weather_2013 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2013.parquet")
fnl_weather_2013 = DFW_Dest_and_No_Weather_Dups(weather_2013, 2013)
fnl_weather_2013.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2013.parquet")

YEAR: 2013 Time: 2023-07-10 08:57:29.862547
full shape: (4940273, 25)
filtered airport codes shape: (2424879, 25)
NON_Dups.shape (2421733, 25)
WeatherDups.shape (6288, 25)
DupsWithGoodData.shape (6288, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2421733, 25)


In [None]:
weather_2014 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2014.parquet")
fnl_weather_2014 = DFW_Dest_and_No_Weather_Dups(weather_2014, 2014)
fnl_weather_2014.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2014.parquet")

YEAR: 2014 Time: 2023-07-10 08:57:33.206971
full shape: (4920282, 25)
filtered airport codes shape: (2432603, 25)
NON_Dups.shape (2428761, 25)
WeatherDups.shape (7672, 25)
DupsWithGoodData.shape (7672, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2428761, 25)


In [None]:
weather_2015 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2015.parquet")
fnl_weather_2015 = DFW_Dest_and_No_Weather_Dups(weather_2015, 2015)
fnl_weather_2015.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2015.parquet")

YEAR: 2015 Time: 2023-07-10 08:57:36.460234
full shape: (5048855, 25)
filtered airport codes shape: (2489880, 25)
NON_Dups.shape (2483313, 25)
WeatherDups.shape (13113, 25)
DupsWithGoodData.shape (13113, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2483313, 25)


In [None]:
weather_2016 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2016.parquet")
fnl_weather_2016 = DFW_Dest_and_No_Weather_Dups(weather_2016, 2016)
fnl_weather_2016.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2016.parquet")

YEAR: 2016 Time: 2023-07-10 08:57:39.790340
full shape: (4980458, 25)
filtered airport codes shape: (2455420, 25)
NON_Dups.shape (2444153, 25)
WeatherDups.shape (22509, 25)
DupsWithGoodData.shape (22509, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2444153, 25)


In [None]:
weather_2017 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2017.parquet")
fnl_weather_2017 = DFW_Dest_and_No_Weather_Dups(weather_2017, 2017)
fnl_weather_2017.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2017.parquet")

YEAR: 2017 Time: 2023-07-10 08:57:42.959176
full shape: (4863705, 25)
filtered airport codes shape: (2417864, 25)
NON_Dups.shape (2406906, 25)
WeatherDups.shape (21889, 25)
DupsWithGoodData.shape (21889, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2406906, 25)


In [None]:
weather_2018 = pd.read_parquet(basepath + "Data/Weather/Hourly/Weather_2018.parquet")
fnl_weather_2018 = DFW_Dest_and_No_Weather_Dups(weather_2018, 2018)
fnl_weather_2018.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2018.parquet")

YEAR: 2018 Time: 2023-07-10 08:57:46.142119
full shape: (4861115, 25)
filtered airport codes shape: (2430696, 25)
NON_Dups.shape (2419022, 25)
WeatherDups.shape (23321, 25)
DupsWithGoodData.shape (23321, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2419022, 25)


In [None]:
weather_2019 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2019.parquet")
fnl_weather_2019 = DFW_Dest_and_No_Weather_Dups(weather_2019, 2019)
fnl_weather_2019.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2019.parquet")

YEAR: 2019 Time: 2023-07-10 08:57:49.344474
full shape: (4864739, 25)
filtered airport codes shape: (2427688, 25)
NON_Dups.shape (2418198, 25)
WeatherDups.shape (18965, 25)
DupsWithGoodData.shape (18965, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (2418198, 25)


In [None]:
weather_2023 = pd.read_parquet(basepath + "Weather/Hourly/Weather_2023.parquet")
fnl_weather_2023 = DFW_Dest_and_No_Weather_Dups(weather_2023, 2023)
fnl_weather_2023.to_parquet(basepath + "Weather/Hourly/Weather_DFW_2023.parquet")

YEAR: 2023 Time: 2023-07-10 08:57:52.293332
full shape: (2201127, 25)
filtered airport codes shape: (1115636, 25)
NON_Dups.shape (1110556, 25)
WeatherDups.shape (10145, 25)
DupsWithGoodData.shape (10145, 25)
DupsKeeping.shape (0, 25)
DFW_Weather.shape (1110556, 25)
