# Load Packages

In [13]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
import os

# Load cleaned numeric dataset (tab-separated)

In [14]:
# The cleaned training file is tab-delimited despite its .csv extension,
# so the separator has to be given explicitly.
df = pd.read_csv(
    "../../data/processed/training_numeric_cleaned.csv",
    sep="\t",
)

# Define numeric columns

In [15]:
# Columns that will be winsorized, in processing order.
num_cols = [
    'ActualFlightTime',
    'ActualTotalFuel',
    'FlownPassengers',
    'FlightBagsWeight',
    'BagsCount',
    'ActualTOW',
]

# Function to apply winsorization to a column

In [16]:
def winsorize_column(series, limits=(0.01, 0.01)):
    """Clip the extreme tails of a Series to its boundary values (winsorization).

    Parameters
    ----------
    series : pd.Series
        Numeric column to winsorize. It is not modified.
    limits : tuple of float, default (0.01, 0.01)
        Fractions (between 0 and 1) of observations to clip on the lower and
        upper end, passed through to ``scipy.stats.mstats.winsorize``.

    Returns
    -------
    pd.Series
        A new Series with the same index as ``series``. Missing values stay
        missing and are not counted when locating the tails.
    """
    result = series.copy()
    observed = series.notna()

    # Nothing to clip for an empty or all-missing column; scipy would raise
    # an IndexError on a zero-length array.
    if not observed.any():
        return result

    # Winsorize only the observed values. With scipy's default nan_policy,
    # NaNs are sorted to the upper end and then overwritten with a real
    # value, which both imputes missing data and shields the true maximum
    # from clipping.
    clipped = winsorize(series[observed].to_numpy(), limits=limits)
    result.loc[observed] = np.asarray(clipped)
    return result

# Apply winsorization to all numeric columns

In [17]:
# Work on a copy so the frame loaded from disk (`df`) is left unmodified.
df_winsorized = df.copy()

for col in num_cols:
    before = df[col]
    original_min, original_max = before.min(), before.max()

    df_winsorized[col] = winsorize_column(before)
    after = df_winsorized[col]

    # Report how far each end of the range moved after clipping.
    print(
        f"{col}: min {original_min:.2f} → {after.min():.2f}, "
        f"max {original_max:.2f} → {after.max():.2f}"
    )

ActualFlightTime: min 2.00 → 31.00, max 1440.00 → 249.00
ActualTotalFuel: min 720.00 → 1540.00, max 14090.00 → 10856.00
FlownPassengers: min 13.00 → 84.00, max 189.00 → 187.00
FlightBagsWeight: min 6.00 → 50.00, max 10530.00 → 1600.00
BagsCount: min 1.00 → 4.00, max 1180.00 → 126.00
ActualTOW: min 49322.00 → 55671.00, max 74283.00 → 72614.00


# Save the winsorized dataset

In [18]:
# Make sure the output folder exists, then write the frame as a
# tab-separated file without the index column.
os.makedirs("../../data/processed", exist_ok=True)
df_winsorized.to_csv(
    "../../data/processed/training_winsorized.csv",
    index=False,
    sep="\t",
)

print(f"✅ Winsorized dataset saved. Shape: {df_winsorized.shape}")

✅ Winsorized dataset saved. Shape: (26800, 14)
