In [1]:
import os
import copy
import joblib
import shutil
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm
from traffic.core import Flight
from traffic.core import Traffic
from matplotlib import pyplot as plt
from joblib import Parallel, delayed

# --- Reference tables, read from the current working directory ---
runway_df = pd.read_csv("runways.csv")
airports_df = pd.read_csv("airports.csv")

# --- Process-wide settings ---
# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
pd.set_option("display.max_columns", 100)

# --- Paths ---
data_folder = os.path.join(os.getcwd(), "data")
flights_folder = os.path.join(os.getcwd(), "flightDfs")

# --- Flight lists (challenge / submission / final submission) ---
cha_df = pd.read_csv(os.path.join(data_folder, "challenge_set.csv"))
sub_df = pd.read_csv(os.path.join(data_folder, "submission_set.csv"))
final_sub_df = pd.read_csv(os.path.join(data_folder, "final_submission_set.csv"))

# The notebook used to assign acropole_folder twice; only this (last) value was ever
# observable, so the earlier dead assignment has been removed.
acropole_folder = "../../../Acropole/acropole/data/"

# joblib.load unpickles the file: only load pickles produced by a trusted run.
flight_fuel_df = joblib.load("flight_fuel_df.pkl")

# Every flight id that appears in at least one of the three flight lists.
usable_flight_ids = list(set(cha_df.flight_id.unique()).union(
    set(sub_df.flight_id.unique()).union(
        set(final_sub_df.flight_id.unique())
    )
))

In [2]:
def is_leap_year(year):
    """
    Tells whether a year is a leap year (divisible by 4, except centuries
    that are not divisible by 400).

    Parameters:
    year (int): The year to check.

    Returns:
    bool: True if the year is a leap year, False otherwise.
    """
    # Multiples of 400 are always leap years.
    if year % 400 == 0:
        return True
    # Any other century is not.
    if year % 100 == 0:
        return False
    # Everything else follows the plain "divisible by 4" rule.
    return year % 4 == 0

def day_cyclic_values(timestamps):
    """
    Encodes the time of day as a point on the unit circle (midnight maps to angle 0).

    Parameters:
    timestamps (pd.Series): Datetime Series (accessed through `.dt`).

    Returns:
    tuple: (x, y) — the cosine and sine of the day angle, one value per timestamp.
    """
    clock = timestamps.dt
    seconds_since_midnight = clock.hour * 3600 + clock.minute * 60 + clock.second
    day_fraction = seconds_since_midnight / 86400.0  # 86400 seconds in a day
    angle = 2 * np.pi * day_fraction
    return np.cos(angle), np.sin(angle)

def week_cyclic_values(timestamps):
    """
    Encodes the position within the week as a point on the unit circle.
    Angle 0 corresponds to 00:00 on the day whose `dt.dayofweek` is 0.

    Parameters:
    timestamps (pd.Series): Datetime Series (accessed through `.dt`).

    Returns:
    tuple: (x, y) — the cosine and sine of the week angle, one value per timestamp.
    """
    clock = timestamps.dt
    seconds_since_midnight = clock.hour * 3600 + clock.minute * 60 + clock.second
    whole_days_part = clock.dayofweek / 7.0
    partial_day_part = (seconds_since_midnight / 86400.0) / 7.0  # 86400 seconds in a day
    angle = 2 * np.pi * (whole_days_part + partial_day_part)
    return np.cos(angle), np.sin(angle)

def month_cyclic_values(timestamps):
    """
    Encodes the position within the month as a point on the unit circle.

    The fraction of the month elapsed is (day - 1 + time of day) / days in month, so
    00:00 on the 1st maps to angle 0 and the last second of the month approaches a full
    turn. The encoding is therefore continuous across month boundaries.

    Parameters:
    timestamps (pd.Series): Datetime Series (accessed through `.dt`).

    Returns:
    tuple: (x, y) — the cosine and sine of the month angle, one value per timestamp.
    """
    days_in_month = timestamps.dt.days_in_month
    # dt.day is 1-based; subtract 1 so it counts the days already completed, which is
    # the same convention week_cyclic_values gets from the 0-based dt.dayofweek.
    completed_days = timestamps.dt.day - 1
    second_of_day = timestamps.dt.hour * 3600 + timestamps.dt.minute * 60 + timestamps.dt.second
    second_of_day_normalized = second_of_day / 86400.0  # 86400 seconds in a day
    month_fraction = completed_days / days_in_month + second_of_day_normalized / days_in_month
    x_month = np.cos(2 * np.pi * month_fraction)
    y_month = np.sin(2 * np.pi * month_fraction)
    return x_month, y_month

def year_cyclic_values(timestamps):
    """
    Encodes the position within the year as a point on the unit circle, accounting
    for leap years.

    The fraction of the year elapsed is (day of year - 1 + time of day) / days in year,
    so 00:00 on 1 January maps to angle 0 and the last second of the year approaches a
    full turn.

    Parameters:
    timestamps (pd.Series): Datetime Series (accessed through `.dt`).

    Returns:
    tuple: (x, y) — the cosine and sine of the year angle, one value per timestamp.
    """
    # Vectorised leap-year lookup: 366 days in leap years, 365 otherwise.
    days_in_year = 365 + timestamps.dt.is_leap_year.astype(int)
    # dt.dayofyear is 1-based; subtract 1 so it counts the days already completed.
    completed_days = timestamps.dt.dayofyear - 1
    second_of_day = timestamps.dt.hour * 3600 + timestamps.dt.minute * 60 + timestamps.dt.second
    second_of_day_normalized = second_of_day / 86400.0  # 86400 seconds in a day
    year_fraction = completed_days / days_in_year + second_of_day_normalized / days_in_year
    x_year = np.cos(2 * np.pi * year_fraction)
    y_year = np.sin(2 * np.pi * year_fraction)
    return x_year, y_year

def generate_phase_segments(df):
    """
    Groups the rows of a DataFrame into runs of consecutive identical 'phase' values.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing a 'phase' column.

    Returns:
    dict: Maps each phase value to a list of numpy arrays; every array holds the index
          labels of one uninterrupted run of that phase.
    """
    phase = df["phase"]
    # A new run starts wherever the phase differs from the previous row; the running
    # count of those starts gives every run its own id.
    run_id = phase.ne(phase.shift(1)).cumsum()
    labelled = df[["phase"]].assign(segment=run_id)

    phase_segments = {}
    for (phase_value, _run), rows in labelled.groupby(["phase", "segment"]):
        phase_segments.setdefault(phase_value, []).append(np.array(list(rows.index)))

    return phase_segments

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in meters between two points given in decimal degrees,
    computed with the haversine formula on a sphere of radius 6371 km.
    Inputs may be scalars or NumPy arrays (the NumPy functions used apply element-wise).
    """
    earth_radius_m = 6371000  # 6371 km expressed in meters

    # Work in radians
    lon1_rad, lat1_rad, lon2_rad, lat2_rad = (
        np.radians(lon1), np.radians(lat1), np.radians(lon2), np.radians(lat2)
    )
    delta_lon = lon2_rad - lon1_rad
    delta_lat = lat2_rad - lat1_rad

    # Haversine of the central angle, then the angle itself
    hav = np.sin(delta_lat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon / 2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return central_angle * earth_radius_m

def calculate_distances(df, lat1_col, lon1_col, lat2_col, lon2_col):
    """
    Row-wise great-circle distance, in meters, between two coordinate pairs stored
    in the same DataFrame.

    Parameters:
    df (pd.DataFrame): Frame holding the four coordinate columns (decimal degrees).
    lat1_col, lon1_col (str): Column names of the first point.
    lat2_col, lon2_col (str): Column names of the second point.

    Returns:
    The value returned by haversine_np for the four column arrays (one distance per row).
    The input frame is not modified.
    """
    first_lat, first_lon = df[lat1_col].values, df[lon1_col].values
    second_lat, second_lon = df[lat2_col].values, df[lon2_col].values

    # haversine_np takes longitude before latitude
    return haversine_np(first_lon, first_lat, second_lon, second_lat)

def get_flight_stats_row(flight_df="df", n_quantiles=5, stats_columns="list"):
    """
    Summarises a set of flight samples as one flat Series of statistics.

    For every column in `stats_columns` the result holds the mean, the standard
    deviation (ddof=0), `n_quantiles` evenly spaced quantiles, the standard deviation of
    the samples that fall inside each quantile bin, and that per-bin deviation divided
    by the absolute column mean. The sum of the `distance` column is included as well.

    Parameters:
    flight_df (pd.DataFrame): DataFrame containing flight data. Must hold every column
        listed in `stats_columns` plus a `distance` column.
    n_quantiles (int): Number of evenly spaced quantile levels between 0 and 1.
    stats_columns (list): List of column names to compute statistics on.

    The string defaults ("df", "list") are placeholders only; real values must be passed.

    Returns:
    pd.Series: Entries named `mean_<col>`, `stdv_<col>`, `<level>_<col>`, `total_distance`,
               `<col>_quant_<i>_stdv` and `<col>_quant_<i>_norm_stdv` (i = 1 .. n_quantiles - 1).
    """
    means = flight_df[stats_columns].mean()
    stdvs = flight_df[stats_columns].std(ddof=0)
    # Quantile levels are rounded to 2 decimals here (they end up in the entry names);
    # the bin edges computed further down use the unrounded levels.
    quants = flight_df[stats_columns].quantile(q=np.linspace(0,1,n_quantiles).round(2))
    total_distance = pd.Series(flight_df.distance.sum(), index=["total_distance"])
    # rearranging
    means.index = ["mean_" + ind for ind in means.index]
    stdvs.index = ["stdv_" + ind for ind in stdvs.index]
    # Flatten the (level x column) quantile table into a single row named "<level>_<col>"
    quants = quants.stack().to_frame().T
    quants.columns = [f"{col[0]}_{col[1]}" for col in quants.columns]
    quants = quants.loc[0]
    
    # Step 1: Prepare dictionaries to hold standard deviations and normalized standard deviations
    quantile_stdvs_dict = {}
    normalized_stdvs_dict = {}
    
    # Step 2: Loop over each stat_column
    for column in stats_columns:
        # Get the quantile values for the current column, ensuring duplicates are dropped
        quantile_bins = flight_df[column].quantile(q=np.linspace(0, 1, n_quantiles)).values
        unique_quantile_bins = np.unique(quantile_bins)  # Remove duplicate quantile values
        
        # Digitize the data points into the unique quantile bins
        bin_labels = np.arange(1, len(unique_quantile_bins))  # Adjust labels for the available unique bins
        binned_data = pd.cut(flight_df[column], bins=unique_quantile_bins, labels=bin_labels, include_lowest=True)
        
        # Compute the standard deviation for each unique bin
        if len(unique_quantile_bins)==1:
            # Only one distinct edge: no bin can be formed, so every bin slot
            # receives the standard deviation of the whole column.
            for i in range(1, n_quantiles):
                quantile_stdvs_dict[f'{column}_quant_{i}_stdv'] = stdvs[f'stdv_{column}']
        else:
            for i in range(1, len(unique_quantile_bins)):
                # pandas default .std() (sample standard deviation), unlike the ddof=0 used above
                bin_stdv = flight_df[column][binned_data == i].std()
                quantile_stdvs_dict[f'{column}_quant_{i}_stdv'] = bin_stdv
        
            # If there are fewer quantiles than desired, duplicate the last bin's standard deviation        
            last_stdv = quantile_stdvs_dict[f'{column}_quant_{len(unique_quantile_bins)-1}_stdv']
            for i in range(len(unique_quantile_bins), n_quantiles):
                quantile_stdvs_dict[f'{column}_quant_{i}_stdv'] = last_stdv
        
        # Normalize the standard deviations by dividing by the absolute mean;
        # -1 is the sentinel used when the mean is exactly zero
        mean_value = abs(means[f'mean_{column}'])
        for i in range(1, n_quantiles):
            stdv_col_name = f'{column}_quant_{i}_stdv'
            stdv_value = quantile_stdvs_dict[stdv_col_name]
            normalized_value = stdv_value / mean_value if mean_value != 0 else -1
            normalized_stdvs_dict[f'{column}_quant_{i}_norm_stdv'] = normalized_value
    
    # Step 3: Convert the dictionaries to Series. Building the Series directly (rather than a
    # one-row DataFrame followed by .squeeze()) keeps a Series even when a dictionary has a
    # single entry; .squeeze() would return a bare scalar there and the concat below would fail.
    quantile_stdvs = pd.Series(quantile_stdvs_dict, dtype="float64")
    normalized_stdvs = pd.Series(normalized_stdvs_dict, dtype="float64")
    
    # concatenating everything
    flight_stats_row = pd.concat([
        means, stdvs, quants, total_distance, quantile_stdvs, normalized_stdvs
    ])

    return flight_stats_row

def get_flight_stats_df(flight_df="df", n_quantiles=5, stats_columns="list", possible_phases="list"):
    """
    Computes whole-flight and per-phase statistics for one flight and returns them as a
    single-row DataFrame.

    The row is made of:
      * the statistics of get_flight_stats_row over all samples (unprefixed names);
      * for every phase in `possible_phases`, two inter-segment variability values, the
        number of segments (`<phase>_n_segments`) and the get_flight_stats_row statistics
        of the samples in that phase (names prefixed with `<phase>_`). A phase without
        samples contributes NaN statistics and 0 segments.

    Parameters:
    flight_df (pd.DataFrame): DataFrame containing flight data. Must hold `flight_id`
        (a single distinct value), `phase`, `distance` and every column in `stats_columns`.
    n_quantiles (int): Number of quantiles to compute.
    stats_columns (list): List of column names to compute statistics on.
    possible_phases (list): List of possible flight phases to consider.

    The string defaults ("df", "list") are placeholders only; real values must be passed.

    Returns:
    pd.DataFrame: One row, indexed by the flight id (as int), one column per statistic.

    Raises:
    ValueError: If `flight_df` does not contain exactly one distinct flight_id.
    """
    # Resolve the row label up front. The element is extracted explicitly because calling
    # int() on a 1-D array is deprecated in recent NumPy releases.
    unique_flight_ids = flight_df.flight_id.unique()
    if len(unique_flight_ids) != 1:
        raise ValueError(
            f"flight_df must contain exactly one flight_id, found {len(unique_flight_ids)}"
        )
    flight_id = int(unique_flight_ids[0])

    global_flight_stats = get_flight_stats_row(
        flight_df=flight_df, 
        n_quantiles=n_quantiles,
        stats_columns=stats_columns, 
    )
    
    all_phase_flight_stats = []
    all_phases_dict = generate_phase_segments(flight_df)
    for phase in possible_phases:
        inter_segment_index = [
            f"{phase}_stdv_stdvs_segment_coefficients_of_variation", 
            f"{phase}_stdv_means_segment_coefficients_of_variation", 
        ]
        phase_df = flight_df.loc[flight_df.phase==phase]
        if len(phase_df):
            # global_phase_variability_stats
            phase_flight_stats = get_flight_stats_row(
                flight_df=phase_df,
                n_quantiles=n_quantiles,
                stats_columns=stats_columns,
            )
            n_phase_segments = len(all_phases_dict[phase])
            # inter_segment_phase_variability_stats
            # how much change is there between segments in the same phase
            if n_phase_segments > 1:
                stdvs_segment_coefficients_of_variation = []
                means_segment_coefficients_of_variation = []
                for phase_segment_idxs in all_phases_dict[phase]:
                    segment = phase_df.loc[phase_segment_idxs]
                    # Coefficient of variation (std / |mean|) of every stats column inside
                    # this segment; infinite ratios (zero mean) are treated as missing.
                    segment_coefficients_of_variation = (
                        segment[stats_columns].std(ddof=0) / abs(segment[stats_columns].mean())
                    ).replace({np.inf: np.nan, -np.inf: np.nan})
                    # Collapse the per-column coefficients into one spread and one mean per segment
                    stdvs_segment_coefficients_of_variation.append(
                        segment_coefficients_of_variation.std(ddof=0)
                    )
                    means_segment_coefficients_of_variation.append(
                        segment_coefficients_of_variation.mean()
                    )
                
                # Spread of those per-segment summaries across the segments of the phase
                inter_segment_values = [
                    np.std(stdvs_segment_coefficients_of_variation, ddof=0),
                    np.std(means_segment_coefficients_of_variation, ddof=0),
                ]
            else:
                # A single segment has nothing to be compared against
                inter_segment_values = [0, 0]
        else:
            # Phase absent from this flight: NaN statistics with the same layout as the global row
            phase_flight_stats = pd.Series(
                np.repeat(np.nan, len(global_flight_stats)),
                index=global_flight_stats.index
            )
            n_phase_segments = 0
            inter_segment_values = [np.nan, np.nan]

        phase_flight_stats = pd.concat([
            pd.Series(
                data=[n_phase_segments],
                index=["n_segments"],
            ),
            phase_flight_stats
        ])
        phase_flight_stats.index = [f"{phase}_{index}" for index in phase_flight_stats.index]
        inter_segment_variability_stats = pd.Series(
            data=inter_segment_values,
            index=inter_segment_index,
        )
        all_phase_flight_stats.append(
            pd.concat([
                inter_segment_variability_stats,
                phase_flight_stats
            ])
        )
    all_phase_flight_stats = pd.concat(all_phase_flight_stats)
    flight_stats = pd.concat([global_flight_stats, all_phase_flight_stats])
    flight_stats_df = pd.DataFrame(flight_stats, columns=[flight_id]).T
    return flight_stats_df

def process_flight_stats_batch(flight_ids_batch, flights_folder, n_quantiles, stats_columns, possible_phases):
    """
    Processes a batch of flight IDs and computes statistics for each flight.

    Each flight is read with joblib.load from `<flights_folder>/<flight_id>` and
    summarised with get_flight_stats_df.

    Parameters:
    flight_ids_batch (list): List of flight IDs to process. May be empty.
    flights_folder (str): Path to the folder containing flight data.
    n_quantiles (int): Number of quantiles to compute.
    stats_columns (list): List of column names to compute statistics on.
    possible_phases (list): List of possible flight phases to consider.

    Returns:
    pd.DataFrame: One row per flight in the batch (rows in batch order); an empty
                  DataFrame when the batch is empty.
    """
    flight_stats_df_list = []
    for flight_id in flight_ids_batch:
        # joblib.load unpickles the file: only point flights_folder at trusted data.
        flight_df = joblib.load(os.path.join(flights_folder, str(flight_id)))
        flight_stats_df_list.append(
            get_flight_stats_df(
                flight_df=flight_df, 
                n_quantiles=n_quantiles,
                stats_columns=stats_columns, 
                possible_phases=possible_phases,
            )
        )
    # pd.concat raises on an empty list, which used to crash the worker for an empty batch.
    if not flight_stats_df_list:
        return pd.DataFrame()
    flight_stats_df = pd.concat(flight_stats_df_list, axis=0)
    return flight_stats_df


In [3]:
# aircraft features
# Builds one row of numerical attributes per aircraft type (indexed by ICAO code) from
# three files, then pickles the table as aircraft_df.pkl.
aircraft_df = pd.read_excel("FAA-Aircraft-Char-DB-AC-150-5300-13B-App-2023-09-07.xlsx")
# Wingspan: per-aircraft mean of the "without winglets" and "with winglets" values.
# The two columns are zipped into a 2-column frame, transposed so every aircraft becomes
# a column, and averaged column-wise.
aircraft_df["Wingspan"] = pd.DataFrame(
    list(zip(
        aircraft_df.Wingspan_ft_without_winglets_sharklets, 
        aircraft_df.Wingspan_ft_with_winglets_sharklets
    ))
).T.mean().values
# Wingspan_dif_winglets: "with" minus "without". diff() leaves an all-NaN first row, which
# dropna(how="all") removes; remaining NaN differences become 0 and the first remaining row
# is taken.
# NOTE(review): if every difference were NaN, dropna would remove that row too and
# .values[0] would fail — assumed not to happen with this file.
aircraft_df["Wingspan_dif_winglets"] = pd.DataFrame(
    list(zip(
        aircraft_df.Wingspan_ft_without_winglets_sharklets, 
        aircraft_df.Wingspan_ft_with_winglets_sharklets
    ))
).T.diff().dropna(how="all").fillna(0).values[0]
# Keep only the attributes used as features (ICAO_Code becomes the index just below)
aircraft_df = aircraft_df[
    [
        "ICAO_Code",
        "Wingspan",
        "Wingspan_dif_winglets",
        "Num_Engines",
        "Approach_Speed_knot",
        "Length_ft",
        "Tail_Height_at_OEW_ft",
        "Wheelbase_ft",
        "Cockpit_to_Main_Gear_ft",
        "Main_Gear_Width_ft",
        "MTOW_lb",
        "MALW_lb", 
        "Parking_Area_ft2",
    ]
]
aircraft_df.index = aircraft_df.ICAO_Code
aircraft_df = aircraft_df.drop(columns=["ICAO_Code"])
# Read engine data
engine_df = pd.read_excel("ICAO_engine_data.xlsx")
engine_df.index = engine_df.ICAO_Code
engine_df = engine_df.drop(columns=["ICAO_Code"])
# Add engine data (join on the ICAO code index)
aircraft_df = aircraft_df.join(engine_df) # Adds 80 features
# Read some additional features for the acropole fuel estimation
acropole_features_df = pd.read_csv("aircraft_params_acropole_BB.csv")
acropole_features_df = acropole_features_df.rename(columns={"ACFT_ICAO_TYPE": "ICAO_Code"})
acropole_features_df.index = acropole_features_df.ICAO_Code
acropole_features_df = acropole_features_df.drop(columns=["ICAO_Code"])
# Add acropole features. When a column name exists on both sides, the existing column is
# renamed with "_left" and then dropped, so the acropole version is the one that is kept.
aircraft_df = aircraft_df.join(acropole_features_df, how='left', lsuffix='_left')
aircraft_df = aircraft_df[aircraft_df.columns[~(aircraft_df.columns.str.contains("_left"))]]
# We are only considering numerical features. This protects against cases in which test sets contain unseen ac_types.
# In that scenario, new unseen categorical variables would most likely reduce the performance of our model, whereas
# numerical attributes allow us to make better guesstimations.
# Rows are restricted to the aircraft types present in cha_df or final_sub_df (sub_df is not
# consulted here); columns are restricted to the non-object dtypes.
# NOTE(review): label-based .loc is expected to raise if a listed type is missing from the table — confirm.
aircraft_df = aircraft_df.loc[
    list(
        set(cha_df.aircraft_type.unique()).union(
            set(final_sub_df.aircraft_type.unique())
        )
    ),
    aircraft_df.columns[aircraft_df.dtypes!="object"]
]

# renaming columns for traceability
aircraft_df.columns = ["AIRCRAFT_" + column for column in aircraft_df.columns]
joblib.dump(aircraft_df, "aircraft_df.pkl")

['aircraft_df.pkl']

In [4]:
# generating features for each airport
# One row per airport that appears as departure (adep) or destination (ades) in cha_df.
airport_df = pd.DataFrame(index=list(set(cha_df.adep.unique()).union(set(cha_df.ades.unique()))))

# lat lon and elevation
lat_lon_el = airports_df.loc[
    airports_df.ident.isin(airport_df.index), 
    ["latitude_deg", "longitude_deg", "elevation_ft",]
].values
# Same row filter as above, so airport_order lines up with the rows of lat_lon_el
airport_order = airports_df.loc[
    airports_df.ident.isin(airport_df.index)
].ident.values
airport_df.loc[airport_order, "latitude"] = lat_lon_el[:,0]
airport_df.loc[airport_order, "longitude"] = lat_lon_el[:,1]
airport_df.loc[airport_order, "elevation"] = lat_lon_el[:,2]

# number of runways (rows of runway_df per airport)
n_runways = runway_df.loc[
    runway_df.airport_ident.isin(airport_df.index), 
    "airport_ident"
].value_counts().to_dict()
airport_df.loc[n_runways.keys(), "nRunways"] = list(n_runways.values())

# number of incoming and outgoing flights
# Directed route graph: one edge per (adep, ades) pair, weighted by the number of flights
# on that route; weight_inv is its inverse.
G = nx.DiGraph()
airport_df["nIn"] = 0
airport_df["nOut"] = 0
dep_des_ws = cha_df[["adep", "ades"]].value_counts().to_dict()
for dep_des_w in dep_des_ws:
    (u, v), w = dep_des_w, dep_des_ws[dep_des_w]
    G.add_edge(u, v, weight=w, weight_inv=1/w)
    airport_df.loc[u, "nOut"] += w
    airport_df.loc[v, "nIn"] += w
    
# Centralities are computed on the largest weakly connected component only; airports
# outside it keep NaN in the centrality columns.
components = sorted(
    [component for component in nx.components.weakly_connected_components(G)],
    key = lambda comp: len(comp)
)[::-1]
lwc_component = components[0]
G = G.subgraph(lwc_component)

# centrality measures
# NOTE(review): the first group passes the flight count as `weight`, while harmonic and
# closeness pass its inverse as `distance` — confirm this asymmetry is intended.
centrality_by_node = {
    "inDegree": nx.in_degree_centrality(G),
    "outDegree": nx.out_degree_centrality(G),
    "pageRank": nx.pagerank(G, weight="weight"),
    "load": nx.load_centrality(G, weight="weight"),
    "betweenness": nx.betweenness_centrality(G, weight="weight"),
    "eigenvector": nx.eigenvector_centrality(G, weight="weight"),
    "percolation": nx.percolation_centrality(G, weight="weight"),
    "harmonic": nx.harmonic_centrality(G, distance="weight_inv"),
    "closeness": nx.closeness_centrality(G, distance="weight_inv"),
}
# Every function returns a {node: score} dict. Scores are written using each dict's OWN
# keys rather than list(G.nodes): the order of the returned dict is not guaranteed to match
# the graph's node order, so positional assignment could silently give an airport another
# airport's score.
for centrality_name, node_scores in centrality_by_node.items():
    airport_df.loc[list(node_scores.keys()), centrality_name] = list(node_scores.values())

# renaming columns for traceability
airport_df.columns = ["AIRPORT_" + column for column in airport_df.columns]
joblib.dump(airport_df, "airport_df.pkl")

['airport_df.pkl']

In [11]:
# generate flight features
# For every value in n_quantiles_list, compute the per-flight statistics table in parallel
# and cache it as flight_stats_df_nq=<n>.pkl (an existing cache file is reused).
n_quantiles_list = [2, 3, 5, 9]

# NA not included because == noise
possible_phases = [
    "CLIMB",
    "CRUISE",
    "DESCENT",
    "GROUND",
    "LEVEL", 
]
# stats to consider
stats_columns = [
    "wind_x", "wind_y",
    "track_x", "track_y",
    "heading_x", "heading_y",
    "temperature", "specific_humidity", 
    "latitude", "longitude", "altitude", 
    "groundspeed", "TAS", "TdG_speed", "ToG_speed", "vertical_rate", 
    "wind_mag", "track_wind_dot", "track_heading_dot", "heading_wind_dot",
] # no stats for distance because it is equal to groundspeed (scaled) since all elements are 1 second apart
# Flight files are named after their flight id and carry no extension
flight_ids = [int(file) for file in os.listdir(flights_folder) if "." not in file]

# Split the ids into num_batches contiguous slices of equal size, then hand the leftover
# ids (fewer than num_batches of them) out one by one to the first batches.
num_batches = 10000
flight_ids_batches = [
    flight_ids[i*(len(flight_ids)//num_batches):(i+1)*(len(flight_ids)//num_batches)]
    for i in range(num_batches)
]
if len(flight_ids)%num_batches:
    missing_flight_ids = flight_ids[num_batches*(len(flight_ids)//num_batches):]
    for i in range(len(missing_flight_ids)):
        missing_flight_id = missing_flight_ids[i]
        flight_ids_batches[i] = flight_ids_batches[i] + [missing_flight_id]
# With fewer than num_batches flights some batches stay empty; an empty batch used to make
# the worker fail when concatenating its (empty) list of results, so those are dropped.
flight_ids_batches = [flight_ids_batch for flight_ids_batch in flight_ids_batches if flight_ids_batch]

for n_quantiles in n_quantiles_list:
    if f"flight_stats_df_nq={n_quantiles}.pkl" not in os.listdir(os.getcwd()):
        all_flight_stats_df_list = Parallel(n_jobs=16)(
            delayed(        
                process_flight_stats_batch
            )(
                n_quantiles=n_quantiles, 
                stats_columns=stats_columns,
                flights_folder=flights_folder,
                possible_phases=possible_phases,
                flight_ids_batch=flight_ids_batch,
            )
            for flight_ids_batch in tqdm(flight_ids_batches)
        )
        flight_stats_df = pd.concat(all_flight_stats_df_list, axis=0)
        joblib.dump(flight_stats_df, f"flight_stats_df_nq={n_quantiles}.pkl")

    else:
        # Cached result: loaded so that flight_stats_df is defined after the cell either way
        flight_stats_df = joblib.load(f"flight_stats_df_nq={n_quantiles}.pkl")    

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [13]:
# merge everything together
# Builds the training table (features + `tow` as last column) for every value in
# n_quantiles_list and pickles each one as X_y_df_cha_nq=<n>_v4.pkl.

# ---------------------------------------------------------------------------
# Part 1: features that do not depend on n_quantiles. They used to be rebuilt
# on every loop iteration with identical results, so they are now built once.
# ---------------------------------------------------------------------------
X_y_base_df = cha_df.copy()
X_y_base_df.arrival_time = pd.to_datetime(X_y_base_df.arrival_time)
X_y_base_df.actual_offblock_time = pd.to_datetime(X_y_base_df.actual_offblock_time)

# cyclic encodings of the arrival time
X_y_base_df["arrDayX"], X_y_base_df["arrDayY"] = day_cyclic_values(X_y_base_df.arrival_time)
X_y_base_df["arrWeekX"], X_y_base_df["arrWeekY"] = week_cyclic_values(X_y_base_df.arrival_time)
X_y_base_df["arrMonthX"], X_y_base_df["arrMonthY"] = month_cyclic_values(X_y_base_df.arrival_time)
X_y_base_df["arrYearX"], X_y_base_df["arrYearY"] = year_cyclic_values(X_y_base_df.arrival_time)

# cyclic encodings of the off-block (departure) time
X_y_base_df["depDayX"], X_y_base_df["depDayY"] = day_cyclic_values(X_y_base_df.actual_offblock_time)
X_y_base_df["depWeekX"], X_y_base_df["depWeekY"] = week_cyclic_values(X_y_base_df.actual_offblock_time)
X_y_base_df["depMonthX"], X_y_base_df["depMonthY"] = month_cyclic_values(X_y_base_df.actual_offblock_time)
X_y_base_df["depYearX"], X_y_base_df["depYearY"] = year_cyclic_values(X_y_base_df.actual_offblock_time)

# add aircraft and airport point features
X_y_base_df = X_y_base_df.merge(right=aircraft_df, how="left", left_on="aircraft_type", right_index=True)
# The airport table is merged twice (departure, then destination); the second merge
# disambiguates the duplicated column names with the "_adep" / "_ades" suffixes.
X_y_base_df = X_y_base_df.merge(right=airport_df, how="left", left_on="adep", right_index=True)
X_y_base_df = X_y_base_df.merge(right=airport_df, how="left", left_on="ades", right_index=True, suffixes=("_adep", "_ades"))
X_y_base_df = X_y_base_df.merge(
    right=flight_fuel_df.rename(columns={
        "total_fuel": "FUEL_total_consumption",
        "total_fuel_replace": "FUEL_total_consumption_replace",
    }), 
    how="left", left_on="flight_id", right_on="flight_id"
)

# Consolidate the less frequent airlines into an "other" category, then one-hot encode
# the `airline` column. A category is "less frequent" when its share of the rows is below
# 1 / (number of categories). value_counts sorts by frequency, so the first position below
# that threshold marks where the discarded tail starts.
airline_value_counts = X_y_base_df.airline.value_counts(normalize=True)
airline_is_rare = (airline_value_counts < (1/len(airline_value_counts))).to_numpy()
# np.argmax returns 0 when nothing is below the threshold, which would discard EVERY
# category; in that case nothing is discarded instead.
top_airlines_n = int(np.argmax(airline_is_rare)) if airline_is_rare.any() else len(airline_value_counts)
arlines_to_discard = airline_value_counts.keys()[top_airlines_n:]
X_y_base_df.loc[X_y_base_df.airline.isin(arlines_to_discard).values.astype(bool), "airline"] = "other"
airline_dummie_df = pd.get_dummies(X_y_base_df.airline).astype(float)
airline_dummie_df.columns = ["AIRLINE_airline_" + column for column in airline_dummie_df.columns]
X_y_base_df = pd.concat([X_y_base_df, airline_dummie_df], axis=1)

# Same treatment for the aircraft type
aircraft_type_value_counts = X_y_base_df.aircraft_type.value_counts(normalize=True)
aircraft_type_is_rare = (aircraft_type_value_counts < (1/len(aircraft_type_value_counts))).to_numpy()
top_aircraft_type_n = (
    int(np.argmax(aircraft_type_is_rare)) if aircraft_type_is_rare.any() else len(aircraft_type_value_counts)
)
aircraft_type_to_discard = aircraft_type_value_counts.keys()[top_aircraft_type_n:]
X_y_base_df.loc[X_y_base_df.aircraft_type.isin(aircraft_type_to_discard).values.astype(bool), "aircraft_type"] = "other"
aircraft_type_dummie_df = pd.get_dummies(X_y_base_df.aircraft_type).astype(float)
aircraft_type_dummie_df.columns = ["AIRCRAFT_aircraft_type_" + column for column in aircraft_type_dummie_df.columns]
X_y_base_df = pd.concat([X_y_base_df, aircraft_type_dummie_df], axis=1)

# drop identifiers, raw timestamps and the categorical columns that were just encoded
X_y_base_df = X_y_base_df.drop(columns=[
    # "flight_id", "date", 
    "callsign", "wtc", "airline", 
    "name_adep", "country_code_adep", 
    "name_ades", "country_code_ades",
    "actual_offblock_time", "arrival_time",
    "aircraft_type", "adep", "ades"
])

# generate new airport features based on the difference 
# between attribute values of departure and arrival airports
# (the two lists are paired by position; both come from merging the same airport table)
adep_columns = [column for column in X_y_base_df.columns if "_adep" in column]
ades_columns = [column for column in X_y_base_df.columns if "_ades" in column]
for i in range(len(adep_columns)):
    adep_column = adep_columns[i]
    ades_column = ades_columns[i]
    diff_col_name = f"{adep_column}_diff"
    X_y_base_df[diff_col_name] = X_y_base_df[ades_column] - X_y_base_df[adep_column]    
    
X_y_base_df["AIRPORT_airports_distance"] = calculate_distances(
    X_y_base_df,
    lat1_col="AIRPORT_latitude_adep", 
    lon1_col="AIRPORT_longitude_adep", 
    lat2_col="AIRPORT_latitude_ades", 
    lon2_col="AIRPORT_longitude_ades"
)

# ---------------------------------------------------------------------------
# Part 2: per n_quantiles — add the flight statistics, fill NaNs and save.
# ---------------------------------------------------------------------------
for n_quantiles in tqdm(n_quantiles_list):
    # add flight features (merge returns a new frame, X_y_base_df is left untouched)
    flight_stats_df = joblib.load(f"flight_stats_df_nq={n_quantiles}.pkl")
    X_y_df = X_y_base_df.merge(right=flight_stats_df, how="left", left_on="flight_id", right_index=True)
    
    # move the target column to the end
    column_order = X_y_df.columns.tolist()
    column_order.remove("tow")
    column_order += ["tow"]
    X_y_df = X_y_df[column_order]
    # filling nan values
    # Missing values are replaced by min - (max - min), i.e. a value below the observed range
    # of the column. A column that is entirely NaN stays NaN.
    # The slice skips the first two columns and the trailing target
    # (presumably flight_id and date — see the commented-out entries in the drop list above).
    feature_columns = X_y_df.columns[2:-1]
    for feature_column in feature_columns:
        if X_y_df[feature_column].isna().sum():
            min_value = X_y_df[feature_column].min()
            max_value = X_y_df[feature_column].max()
            feature_range = max_value - min_value
            nan_placeholder = min_value - feature_range
            X_y_df[feature_column] = X_y_df[feature_column].fillna(nan_placeholder)
    
    joblib.dump(X_y_df, f"X_y_df_cha_nq={n_quantiles}_v4.pkl")

  0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# create corr_dfs for column dropping based on correlation threshold
# One correlation matrix of the numerical features per n_quantiles value, cached on disk;
# a matrix whose file already exists in the working directory is not recomputed.
for n_quantiles in tqdm(n_quantiles_list):
    corr_file_name = f"X_y_cha_nq={n_quantiles}_corr_df_v4.pkl"
    if corr_file_name in os.listdir(os.getcwd()):
        continue

    X_y_df = joblib.load(f"X_y_df_cha_nq={n_quantiles}_v4.pkl")
    # skip the first two columns and the last one
    feature_columns = X_y_df.columns[2:-1]

    # one-hot encoded columns are recognised by their name and left out of the correlation
    cat_features = []
    for column in feature_columns:
        if ("airline_" in column) or ("aircraft_type_" in column):
            cat_features.append(column)

    num_features = sorted(set(feature_columns) - set(cat_features))
    corr_df = X_y_df[num_features].corr()
    joblib.dump(corr_df, corr_file_name)

  0%|          | 0/4 [00:00<?, ?it/s]