## Work with Weather Columns
<br>
This code works with the weather data: it expands the packed columns, narrows them down to the columns used by our models, and fills missing values.

In [None]:
import pandas as pd
# Root folder for every input and output file; the "/OnTime/..." and
# "/Weather/Hourly/..." paths used in this notebook are appended to it.
basepath = 'your_path_here'

Note: you may need to restart the kernel to use updated packages.


# Creating the weather files with the columns determined to be most useful
This code creates the yearly weather files containing the columns we selected. We chose them after adding the weather data to the OnTime keys with the Weather Delay/No-Delay flag to see which columns look like they offer more data.<br>
<br>
The next step/notebook will narrow these down to just DFW and the DFW destinations, and then work through the duplicate weather records to get down to unique records per Date/Time.

In [None]:
# Airports to filter down to: every distinct value of the 'Dest' column in the
# destination/datetime file, plus DFW itself.
destdatetime = pd.read_parquet(basepath + "/OnTime/destdatetime_dttm_dfw_uniq.parquet")
airport_codes = [*destdatetime['Dest'].unique().tolist(), 'DFW']
print("# of airport codes:", len(airport_codes))

In [None]:
# This is basically the same steps used to create the datetime column for the OnTime keys
def createYMD(date):
    """Build a pandas Timestamp from fixed character positions of *date*.

    The value is read purely by position: characters 0-3 are the year, 5-6 the
    month, 8-9 the day, 11-12 the hour and 14-15 the minute (for example
    ``"2010-01-01T12:34:00"``). The characters at positions 4, 7, 10 and 13
    are skipped without being checked.

    Args:
        date: A string, or a list of single-character strings, holding the
            date/time text.

    Returns:
        The parsed ``Timestamp`` (minute resolution), or ``NaT`` when the
        value is ``None``, shorter than 16 characters, or the extracted
        digits do not form a valid date/time.
    """
    # A value too short to contain the minute field cannot be parsed. Return
    # NaT, consistent with errors='coerce' below, rather than raising or
    # letting a truncated digit string be parsed as a different time.
    if date is None or len(date) < 16:
        return pd.NaT

    # ''.join works for both a str slice and a slice of a list of characters.
    digits = ''.join(
        ''.join(date[start:stop])
        for start, stop in ((0, 4), (5, 7), (8, 10), (11, 13), (14, 16))
    )
    return pd.to_datetime(digits, format='%Y%m%d%H%M', errors='coerce')

# Prepare one year of hourly weather data by adding the parsed WeatherDtTm column
def createYrWeather(df, year):
    """Return the weather frame for *year* with a ``WeatherDtTm`` column added.

    Args:
        df: The already-loaded weather frame for the year; it must have a
            ``DATE`` column. A copy is used, so the caller's frame is left
            untouched. Pass ``None`` to have the yearly pickle read from disk.
        year: The year as a string; it is concatenated into the pickle file
            name ``Weather_<year>.pkl``.

    Returns:
        A DataFrame with every original column plus ``WeatherDtTm``, the
        result of applying ``createYMD`` to each ``DATE`` value.
    """
    if df is None:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_" + year + ".pkl")
    else:
        # The caller has already loaded this file, so do not read it a second
        # time; copy so the caller's raw frame is not modified.
        df = df.copy()

    # createYMD indexes its argument by position, so the DATE string can be
    # passed directly without an intermediate per-row list of characters.
    df['WeatherDtTm'] = df['DATE'].apply(createYMD)
    return df

Many columns in the weather data pack several comma-separated fields into a single value.<br>
These columns need to be split and expanded into their own columns. We don't need every field from each original column, but we have to split the column to get at the fields we do want.

In [None]:
def updateWeatherCols(wind, cig, vis, tmp, dew, slp, aa1, ga1, gd1, gf1, ma1):
    """Split the comma-packed weather values and return the 23 fields we keep.

    Each argument is converted to ``str`` and split on commas. When it has
    exactly the expected number of fields, the wanted fields are returned as
    the split strings; otherwise integer placeholder values are returned for
    that argument's fields instead.

    Returns:
        A 23-tuple, in this order: w_dir_angle, w_type, w_speed_rate,
        sky_c_hgt, sky_c_det, sky_c_cavok, vis_dist, vis_var, tmp_air,
        tmp_dew, sea_lvl_p, liq_precip_qty, liq_precip_dim, liq_precip_cond,
        sky_cov, sky_cov_base_hgt, sky_cov_cld, sky_sum_cov, sky_sum_hgt,
        sky_obs_tot_cov, sky_low_cld_base_hgt, at_pres_altimeter_rate,
        at_pres_stn_rate.
    """
    # One entry per argument:
    # (raw value, expected field count, positions kept, placeholders if malformed)
    layout = (
        # w_dir_angle, w_type, w_speed_rate
        (wind, 5, (0, 2, 3), (999, 9, 9999)),
        # sky_c_hgt, sky_c_det, sky_c_cavok
        (cig, 4, (0, 2, 3), (99999, 9, 9)),
        # vis_dist, vis_var
        (vis, 4, (0, 2), (999999, 9)),
        # tmp_air
        (tmp, 2, (0,), (9999,)),
        # tmp_dew
        (dew, 2, (0,), (9999,)),
        # sea_lvl_p
        (slp, 2, (0,), (99999,)),
        # liq_precip_qty, liq_precip_dim, liq_precip_cond
        (aa1, 4, (0, 1, 2), (99, 9999, 9)),
        # sky_cov, sky_cov_base_hgt, sky_cov_cld
        (ga1, 6, (0, 2, 4), (99, 99999, 99)),
        # sky_sum_cov, sky_sum_hgt
        (gd1, 6, (0, 3), (9, 99999)),
        # sky_obs_tot_cov, sky_low_cld_base_hgt
        (gf1, 13, (0, 7), (99, 99999)),
        # at_pres_altimeter_rate, at_pres_stn_rate
        (ma1, 4, (0, 2), (99999, 99999)),
    )

    collected = []
    for raw, n_fields, wanted, placeholders in layout:
        pieces = str(raw).split(",")
        if len(pieces) == n_fields:
            collected.extend(pieces[pos] for pos in wanted)
        else:
            # Wrong number of fields (e.g. NaN becomes 'nan'): use placeholders.
            collected.extend(placeholders)
    return tuple(collected)

In [None]:
def expandColsAndRunFreqs(df):
    """Expand the packed weather columns, keep the model columns, fix dtypes.

    The frame is modified in place and also returned.

    Steps:
      1. Apply ``updateWeatherCols`` to every row (reading the WND, CIG, VIS,
         TMP, DEW, SLP, AA1, GA1, GD1, GF1 and MA1 columns) and store the 23
         returned values as new columns.
      2. Drop every column other than ``airport_code``, ``WeatherDtTm`` and
         those 23 columns.
      3. Fill missing values with the placeholder for each column group and
         cast each group to ``int`` or ``str``.
      4. Normalise the string placeholders ``'9.0'``/``'99.0'`` to
         ``'9'``/``'99'``.

    Args:
        df: Weather frame containing the eleven packed columns listed above.

    Returns:
        The same ``df`` object, reduced and converted as described.
    """
    expanded_cols = [
        'w_dir_angle', 'w_type', 'w_speed_rate', 'sky_c_hgt', 'sky_c_det', 'sky_c_cavok',
        'vis_dist', 'vis_var', 'tmp_air', 'tmp_dew', 'sea_lvl_p',
        'liq_precip_qty', 'liq_precip_dim', 'liq_precip_cond', 'sky_cov', 'sky_cov_base_hgt', 'sky_cov_cld',
        'sky_sum_cov', 'sky_sum_hgt', 'sky_obs_tot_cov',
        'sky_low_cld_base_hgt', 'at_pres_altimeter_rate', 'at_pres_stn_rate',
    ]

    # updateWeatherCols returns one 23-tuple per row; result_type="expand"
    # spreads it over 23 columns, which are assigned to the names above by position.
    df[expanded_cols] = df.apply(
        lambda x: updateWeatherCols(
            x['WND'], x['CIG'], x['VIS'], x['TMP'], x['DEW'], x['SLP'],
            x['AA1'], x['GA1'], x['GD1'], x['GF1'], x['MA1'],
        ),
        axis=1,
        result_type="expand",
    )

    # Keep only the key columns plus the expanded ones; drop everything else.
    keep_cols = {'airport_code', 'WeatherDtTm', *expanded_cols}
    dropcols = [col for col in df.columns if col not in keep_cols]
    df.drop(dropcols, axis=1, inplace=True)

    # Convert columns holding numeric values as strings to int, filling
    # missing values first. Per the original author's note, the fill values
    # are the missing-value codes used in the NOAA documentation.
    int_groups = (
        (['sky_sum_cov', 'liq_precip_cond'], 9),
        (['w_dir_angle'], 999),
        (['w_speed_rate', 'liq_precip_dim', 'tmp_air', 'tmp_dew'], 9999),
        (['sky_c_hgt', 'sea_lvl_p', 'sky_low_cld_base_hgt', 'at_pres_altimeter_rate',
          'at_pres_stn_rate', 'sky_cov_base_hgt', 'sky_sum_hgt'], 99999),
        (['vis_dist'], 999999),
    )
    for cols, missing in int_groups:
        df[cols] = df[cols].fillna(missing).astype(int)

    # Code-like columns are kept as strings. Fill BEFORE casting: astype(str)
    # would turn NaN into the literal string 'nan', after which fillna has
    # nothing left to fill.
    str_groups = (
        (['sky_c_det', 'sky_c_cavok', 'w_type', 'vis_var'], '9'),
        (['liq_precip_qty', 'sky_cov', 'sky_cov_cld', 'sky_obs_tot_cov'], '99'),
    )
    for cols, missing in str_groups:
        df[cols] = df[cols].fillna(missing).astype(str)
        # Items found later: '9.0' / '99.0' should be the same as '9' / '99'.
        for var in cols:
            df.loc[df[var].isin([missing + '.0']), var] = missing

    return df

In [None]:
# Hourly weather files were moved to subfolder: Weather_year_pkl_FirstTimeThrough, under Hourly.
# 2010: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2010 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2010.pkl")
weather_2010_2 = createYrWeather(weather_2010, '2010')
weather_2010_3 = expandColsAndRunFreqs(weather_2010_2)
weather_2010_3.to_parquet(basepath + "/Weather/Hourly/Weather_2010.parquet")

In [None]:
# 2011: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2011 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2011.pkl")
weather_2011_2 = createYrWeather(weather_2011, '2011')
weather_2011_3 = expandColsAndRunFreqs(weather_2011_2)
weather_2011_3.to_parquet(basepath + "/Weather/Hourly/Weather_2011.parquet")

In [None]:
# 2012: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2012 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2012.pkl")
weather_2012_2 = createYrWeather(weather_2012, '2012')
weather_2012_3 = expandColsAndRunFreqs(weather_2012_2)
weather_2012_3.to_parquet(basepath + "/Weather/Hourly/Weather_2012.parquet")

In [None]:
# 2013: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2013 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2013.pkl")
weather_2013_2 = createYrWeather(weather_2013, '2013')
weather_2013_3 = expandColsAndRunFreqs(weather_2013_2)
weather_2013_3.to_parquet(basepath + "/Weather/Hourly/Weather_2013.parquet")

In [None]:
# 2014: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2014 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2014.pkl")
weather_2014_2 = createYrWeather(weather_2014, '2014')
weather_2014_3 = expandColsAndRunFreqs(weather_2014_2)
weather_2014_3.to_parquet(basepath + "/Weather/Hourly/Weather_2014.parquet")

In [None]:
# 2015: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2015 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2015.pkl")
weather_2015_2 = createYrWeather(weather_2015, '2015')
weather_2015_3 = expandColsAndRunFreqs(weather_2015_2)
weather_2015_3.to_parquet(basepath + "/Weather/Hourly/Weather_2015.parquet")

In [None]:
# 2016: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2016 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2016.pkl")
weather_2016_2 = createYrWeather(weather_2016, '2016')
weather_2016_3 = expandColsAndRunFreqs(weather_2016_2)
weather_2016_3.to_parquet(basepath + "/Weather/Hourly/Weather_2016.parquet")

In [None]:
# 2017: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2017 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2017.pkl")
weather_2017_2 = createYrWeather(weather_2017, '2017')
weather_2017_3 = expandColsAndRunFreqs(weather_2017_2)
weather_2017_3.to_parquet(basepath + "/Weather/Hourly/Weather_2017.parquet")

In [None]:
# 2018: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2018 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2018.pkl")
weather_2018_2 = createYrWeather(weather_2018, '2018')
weather_2018_3 = expandColsAndRunFreqs(weather_2018_2)
weather_2018_3.to_parquet(basepath + "/Weather/Hourly/Weather_2018.parquet")

In [None]:
# 2019: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2019 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2019.pkl")
weather_2019_2 = createYrWeather(weather_2019, '2019')
weather_2019_3 = expandColsAndRunFreqs(weather_2019_2)
weather_2019_3.to_parquet(basepath + "/Weather/Hourly/Weather_2019.parquet")

In [None]:
# 2023: load the yearly pickle, add WeatherDtTm, expand and clean the weather
# columns, then write the result to parquet.
weather_2023 = pd.read_pickle(basepath + "/Weather/Hourly/Weather_year_pkl_FirstTimeThrough/Weather_2023.pkl")
weather_2023_2 = createYrWeather(weather_2023, '2023')
weather_2023_3 = expandColsAndRunFreqs(weather_2023_2)
weather_2023_3.to_parquet(basepath + "/Weather/Hourly/Weather_2023.parquet")