# Load Packages

In [16]:
import pandas as pd
import numpy as np
import os

# Load the training data and convert the numeric columns

In [17]:
# Columns that must hold numeric values for the later cleaning steps.
num_cols = [
    'ActualFlightTime',
    'ActualTotalFuel',
    'FlownPassengers',
    'FlightBagsWeight',
    'BagsCount',
    'ActualTOW',
]

# Read the tab-separated file, turn the "(null)" placeholder into NaN and
# fix the misspelled passenger column name.
df = (
    pd.read_csv("../../data/training.csv", sep="\t")
    .replace("(null)", np.nan)
    .rename(columns={"FLownPassengers": "FlownPassengers"})
)

# Coerce each numeric column; values that cannot be parsed become NaN.
for numeric_col in num_cols:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Drop rows with missing values in numeric columns

In [18]:
# Keep only the rows in which every numeric column holds a value.
df_clean = df[df[num_cols].notna().all(axis=1)]

# Outlier removal using IQR, applied to each numeric column in sequence

In [19]:
def remove_outliers_iqr(data, columns, factor=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed one after another in the order given: the quartiles
    of each column are computed on the rows that survived the filters of the
    previous columns, so the result can depend on the order of ``columns``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied, never modified.
    columns : iterable of str
        Names of the numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fences
        ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose value in every listed column lies within
        that column's fences (bounds inclusive). The original index is kept.

    Notes
    -----
    A row with NaN in a filtered column is removed as well, because NaN never
    compares as inside the fences. One line per column is printed with the
    number of rows removed at that step.
    """
    df_out = data.copy()
    for col in columns:
        q1, q3 = df_out[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr
        before = len(df_out)
        # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
        df_out = df_out[df_out[col].between(lower, upper)]
        print(f"Column: {col} | Removed: {before - len(df_out)} rows")
    return df_out

# Filter the NaN-free frame on every numeric column, one column at a time.
df_no_outliers = remove_outliers_iqr(data=df_clean, columns=num_cols)

Column: ActualFlightTime | Removed: 496 rows
Column: ActualTotalFuel | Removed: 209 rows
Column: FlownPassengers | Removed: 2049 rows
Column: FlightBagsWeight | Removed: 359 rows
Column: BagsCount | Removed: 151 rows
Column: ActualTOW | Removed: 39 rows


# Save to processed folder with tab separator

In [20]:
# Write the filtered data as a tab-separated file, creating the target
# folder first if it does not exist yet.
output_path = "../../data/processed/training_outlier_removed.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_no_outliers.to_csv(output_path, sep="\t", index=False)
print(f"✅ Final dataset shape after IQR-based outlier removal: {df_no_outliers.shape}")

✅ Final dataset shape after IQR-based outlier removal: (23497, 14)
