# Preprocess Output
This script combs through the FUSER `runways_data_set.csv` files and produces as output a table of the form:

| AIRPORT | DATE | TIME (15 min intervals) | # arrivals |
| ------- | ---- | ----------------------- | ---------- |

_In the final output, we will need to convert from TIME (15 min intervals) to BASETIME_BUCKET where BASETIME = TIME and BUCKET is 15 minute intervals._

In [29]:
import pandas as pd
import os
import math
from datetime import date, datetime

In [3]:
def get_bucket(time):
    """Map an ``hhmm`` time string to the label of its 15-minute bucket.

    Buckets are labelled by their end time: minutes 00-15 map to ``hh15``,
    16-30 to ``hh30``, 31-45 to ``hh45`` and 46-59 to ``00`` of the following
    hour (wrapping from 23 back to 00). An input exactly on the hour
    therefore lands in ``hh15``.

    Examples: ``"0112"`` -> ``"0115"``, ``"0131"`` -> ``"0145"``,
    ``"2350"`` -> ``"0000"``.
    """
    hour = int(time[:2])
    minute = int(time[2:])

    if minute > 45:
        # Last quarter of the hour rolls over into the next hour.
        return f"{(hour + 1) % 24:02d}00"

    if minute > 30:
        quarter = "45"
    elif minute > 15:
        quarter = "30"
    else:
        quarter = "15"
    return f"{hour % 24:02d}{quarter}"

In [4]:
#given time t (hhmm), returns the 15 minute bucket that follows t's bucket
#ex: 0115 -> 0130, 0112 -> 0130, 0200 -> 0215, 2345 -> 0000
def increment_time(time):
    """Return the 15-minute bucket label that follows the bucket of ``time``.

    ``time`` is an ``hhmm`` string. It is first rounded up to a 15-minute
    boundary (a value already on a boundary, including ``hh00``, is left
    where it is) and then advanced by one bucket. Hours wrap from 23 to 00;
    only the time-of-day label is returned, no date is tracked.

    A label that is exactly on the hour is treated as already aligned, so
    ``"0200"`` advances to ``"0215"`` rather than skipping to ``"0230"``.
    """
    hour = int(time[:2])
    minute = int(time[2:])
    # Ceiling to the next multiple of 15 without importing math.
    aligned_minute = -(-minute // 15) * 15
    total_minutes = hour * 60 + aligned_minute + 15
    return f"{(total_minutes // 60) % 24:02d}{total_minutes % 60:02d}"

In [5]:
def convert_timestamp(file, row):
    """Split a file name and a timestamp string into (airport, date, time).

    file: file name whose text before the first underscore is used as the
        airport identifier.
    row: timestamp string; the code expects a ``YYYY-MM-DD`` date as the
        first space-separated token and an ``hh:mm[...]`` clock as the last.

    Returns a tuple ``(airport, date, time)`` where ``date`` is ``YYMMDD``
    and ``time`` is the 15-minute bucket label produced by ``get_bucket``.

    NOTE: the date is taken verbatim from the timestamp; it is not advanced
    when ``get_bucket`` rolls a late-evening time over to ``"0000"``.
    """
    airport = file.split("_")[0]

    tokens = row.split(" ")
    date_fields = tokens[0].split("-")
    clock_fields = tokens[-1].split(":")

    # Drop the century from the year to get YYMMDD.
    date = date_fields[0][2:] + date_fields[1] + date_fields[2]
    time = get_bucket(clock_fields[0] + clock_fields[1])
    return airport, date, time

In [39]:
# Every date is projected onto this reference year before comparison; it is a
# leap year so that Feb 29 inputs remain valid after the projection.
Y = 2000
# (season name, (first day, last day)) ranges, inclusive, covering all of year Y.
# Winter appears twice because it spans the year boundary.
seasons = [
    ('winter', (date(Y, 1, 1), date(Y, 3, 20))),
    ('spring', (date(Y, 3, 21), date(Y, 6, 20))),
    ('summer', (date(Y, 6, 21), date(Y, 9, 22))),
    ('autumn', (date(Y, 9, 23), date(Y, 12, 20))),
    ('winter', (date(Y, 12, 21), date(Y, 12, 31))),
]


def get_season(now):
    """Return the season name for a ``date`` or ``datetime``.

    The year of ``now`` is ignored: only month and day decide the season,
    by looking up the inclusive ranges in ``seasons``.
    """
    day = now.date() if isinstance(now, datetime) else now
    day = day.replace(year=Y)
    matching = (
        name
        for name, (first_day, last_day) in seasons
        if first_day <= day <= last_day
    )
    return next(matching)

autumn


In [6]:
#returns a dataframe with cols [[airport_date_time][# arrivals]]
def get_target_data():
    """Count actual runway arrivals per airport / date / 15-minute bucket.

    Walks ``data/FUSER_train_1`` for files whose name ends with
    ``runways_data_set.csv``, reads each with pandas and, for every non-null
    ``arrival_runway_actual_time`` value, increments the counter of the key
    ``AIRPORT_YYMMDD_HHMM`` built from ``convert_timestamp``.

    Returns a DataFrame indexed by that key with one column, ``# arrivals``.
    Buckets without any arrival do not appear in the result.
    """
    folder_paths = ['FUSER_train_1']
    extension = 'runways_data_set.csv'
    target_data = {}

    # Search for files in the specified folder
    for folder_path in folder_paths:
        for root, _dirs, files in os.walk("data/" + folder_path):
            for file in files:
                if not file.endswith(extension):
                    continue
                df = pd.read_csv(os.path.join(root, file))
                # Rows without an actual arrival time are not arrivals; skip them.
                for timestamp in df['arrival_runway_actual_time'].dropna():
                    airport, date, time = convert_timestamp(file, timestamp)
                    key = airport + "_" + date + "_" + time
                    # The first arrival seen for a key counts as 1, not 0.
                    target_data[key] = target_data.get(key, 0) + 1
    return pd.DataFrame.from_dict(target_data, orient='index', columns=['# arrivals'])

In [7]:
#returns dataframe indexed by airport_date_time with one column per weather feature
def get_weather_data():
    """Build a table of weather forecast features keyed by AIRPORT_DATE_TIME.

    Walks ``data/FUSER_train_1`` for files whose name ends with
    ``LAMP_data_set.csv``. Each file is sampled every 25th row; for a sampled
    row with a non-null ``forecast_timestamp`` the nine forecast values are
    copied to four consecutive 15-minute bucket keys: the timestamp's own
    bucket and the next three obtained through ``increment_time``.

    Weather data is forecasts from timestamp t to n steps in the future; this
    only uses forecast n=1 steps into the future (presumably the reason for
    the stride of 25 rows - confirm against the LAMP file layout).

    Later rows overwrite earlier ones when they map to the same key.
    """
    folder_paths = ['FUSER_train_1']
    extension = 'LAMP_data_set.csv'
    feature_columns = ['temperature', 'wind_direction', 'wind_speed', 'wind_gust', 'cloud_ceiling', 'visibility', 'cloud', 'lightning_prob', 'precip']
    weather_data = {}

    # Search for files in the specified folder
    for folder_path in folder_paths:
        for root, dirs, files in os.walk("data/"+folder_path):
            for file in files:
                if not file.endswith(extension):
                    continue
                df = pd.read_csv(os.path.join(root, file))
                for ind in range(0, len(df), 25):
                    if pd.isna(df.loc[ind, 'forecast_timestamp']):
                        continue
                    airport, date, time = convert_timestamp(file, df["forecast_timestamp"][ind])
                    # Forecast values attached to this timestamp.
                    forecast = [df[column][ind] for column in feature_columns]
                    # The forecast is applied to four consecutive buckets (15, 30, 45, 00).
                    buckets = [time]
                    for _ in range(3):
                        buckets.append(increment_time(buckets[-1]))
                    keys = [airport+"_"+date+"_"+bucket for bucket in buckets]
                    for key in keys:
                        weather_data[key] = list(forecast)
    return pd.DataFrame.from_dict(weather_data, orient='index', columns=feature_columns)

In [42]:
#for each row in df, add airport, time, day, weekday, month and season columns
#for now these are plain values, but we could change this to one-hot in the future
def extract_time_data(df):
    """Add calendar feature columns derived from an AIRPORT_YYMMDD_HHMM index.

    For every index label the function appends:
      Airport - text before the first underscore
      Time    - the HHMM part as an int
      Weekday - ``datetime.weekday()`` of the date (0 = Monday)
      Day     - day of month as an int
      Month   - month as an int
      Season  - season name from ``get_season``

    The columns are added to ``df`` in place and ``df`` is returned.
    """
    features = {
        'Airport': [],
        'Time': [],
        'Weekday': [],
        'Day': [],
        'Month': [],
        'Season': [],
    }
    for label in df.index:
        parts = label.split("_")
        yymmdd = parts[1]
        # Parse once; both weekday and season come from the same date.
        parsed = datetime.strptime(yymmdd, '%y%m%d')

        features['Airport'].append(parts[0])
        features['Time'].append(int(parts[2]))
        features['Weekday'].append(parsed.weekday())
        features['Day'].append(int(yymmdd[4:]))
        features['Month'].append(int(yymmdd[2:4]))
        features['Season'].append(get_season(parsed))

    for column, values in features.items():
        df[column] = values
    return df

# Final Preprocessing

In [44]:
# Weather features and arrival counts share the AIRPORT_DATE_TIME index, so a
# column-wise concat lines them up on that key.
data = pd.concat([get_weather_data(), get_target_data()], axis=1)
# Derive the calendar columns from the index, then one-hot encode the
# non-numeric columns as floats.
data = pd.get_dummies(extract_time_data(data), dtype=float)
# Reorder columns so that '# arrivals' (the target) comes last.
cols_at_end = ['# arrivals']
data = data[
    [col for col in data.columns if col not in cols_at_end]
    + [col for col in cols_at_end if col in data.columns]
]
# Keep only rows that have every feature and a target value.
data = data.dropna()