# Load packages

In [24]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load raw training data

In [25]:
# Read the raw training set; the file is tab-delimited despite its .csv extension.
df = pd.read_csv("../data/training.csv", delimiter="\t")


# Prepare data: normalise nulls, fix the column name, convert numeric columns

In [26]:
# Normalise the raw frame without `inplace=True`: the chained form returns a new
# frame, so the cell yields the same result however many times it is re-run.
df = (
    df
    # The source file encodes missing values as the literal string "(null)".
    .replace("(null)", np.nan)
    # Fix the mis-capitalised header that ships with the raw file.
    .rename(columns={"FLownPassengers": "FlownPassengers"})
)

# Columns that must be numeric for the missing-value analysis below.
num_cols = ['ActualFlightTime', 'ActualTotalFuel', 'FlownPassengers', 'FlightBagsWeight', 'BagsCount', 'ActualTOW']

# errors='coerce' turns any value that cannot be parsed as a number into NaN,
# so malformed entries are counted (and later dropped) as missing values.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

# Check missing values per column

In [27]:
# Tally the NaN entries in each numeric column (one count per column).
missing_counts = df[num_cols].isna().sum(axis=0)
print("Missing values in numeric columns:\n", missing_counts)

Missing values in numeric columns:
 ActualFlightTime       0
ActualTotalFuel        0
FlownPassengers       95
FlightBagsWeight    2478
BagsCount           2284
ActualTOW            433
dtype: int64


# Option A — Drop rows with any missing value in numeric columns

In [28]:
# Keep only the rows in which every numeric column is populated.
# how="any" (the default, spelled out here) drops a row as soon as one of the
# columns listed in `subset` is NaN; `df` itself is left untouched.
df_cleaned = df.dropna(axis="index", how="any", subset=num_cols)

# Show how much data was removed

In [29]:
# Report the row count of the full frame next to the row count that survived
# the drop, so the number of discarded rows can be read off directly.
print(f"Original size: {df.shape[0]}, After cleaning: {df_cleaned.shape[0]}")

Original size: 29731, After cleaning: 26800


# Save the cleaned dataset (optional step)

In [30]:
# Persist the cleaned frame as a tab-separated file (same delimiter as the input).
# The target directory is created first: `to_csv` raises OSError when the parent
# directory is missing, e.g. on a fresh checkout without ../data/processed.
output_path = Path("../data/processed/training_numeric_cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(output_path, index=False, sep="\t")

print("✅ Cleaned numeric data saved.")