In [37]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
import joblib
import yaml

In [40]:
#load config
with open("../config.yaml","r") as f:
    config = yaml.safe_load(f)

RAW_DATA_PATH = "../"+config['data']['raw_path']
PROCESSED_DATA_PATH = "../"+config['data']['processed_path']

os.makedirs(RAW_DATA_PATH, exist_ok=True)
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [41]:
config

{'data': {'raw_path': 'data/raw/', 'processed_path': 'data/processed/'},
 'preprocessing': {'scale_method': 'standard',
  'fill_missing': 'mean',
  'encoder': 'LabelEncoder'},
 'model': {'type': 'autoencoder',
  'latent_dim': 16,
  'epochs': 50,
  'batch_size': 64,
  'learning_rate': 0.001},
 'deployment': {'host': '0.0.0.0', 'port': 8501}}

In [15]:
raw_file_path = os.path.join(RAW_DATA_PATH, "DASHlink_full_fourclass_raw_meta.csv")

df = pd.read_csv(raw_file_path)
print("Data Shape: ",df.shape)
df.head()

Data Shape:  (99837, 7)


Unnamed: 0,data_instance,flight_record,departure_airport,departure_runway,arrival_airport,arrival_runway,label
0,0,652200101120916,KSGF,32,KMEM,36L,0
1,1,652200101121118,KMEM,36L,KMCI,1L,0
2,2,652200101121341,KMCI,19R,KMEM,36R,0
3,3,652200101130002,KMEM,18C,KPNS,17,0
4,4,652200101130451,KPNS,35,KMEM,36R,0


In [17]:
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
data_instance        0
flight_record        0
departure_airport    0
departure_runway     0
arrival_airport      0
arrival_runway       0
label                0
dtype: int64


In [18]:
df.duplicated().sum()

np.int64(0)

In [19]:
#check categorical and numerical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99837 entries, 0 to 99836
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   data_instance      99837 non-null  int64 
 1   flight_record      99837 non-null  int64 
 2   departure_airport  99837 non-null  object
 3   departure_runway   99837 non-null  object
 4   arrival_airport    99837 non-null  object
 5   arrival_runway     99837 non-null  object
 6   label              99837 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 5.3+ MB


In [23]:
#check number of unique records for each categorical feature
print("Unique values in categorical columns")
print(f"departure_airport = {df['departure_airport'].nunique()}")
print(f"departure_runway = {df['departure_runway'].nunique()}")
print(f"arrival_airport = {df['arrival_runway'].nunique()}")
print(f"arrival_runway = {df['arrival_runway'].nunique()}")

Unique values in categorical columns
departure_airport = 205
departure_runway = 107
arrival_airport = 108
arrival_runway = 108


# As we have many unique values in each categorical column we can go with label encoding as Onehot encoding would increase the feature space drastically in this scenario

In [36]:
#Feature Scaling for numerical columns
scale_method = config['preprocessing']['scale_method']
if scale_method == 'standard':
    scaler = StandardScaler()
elif scale_method == 'minmax':
    scaler = MinMaxScaler()
else:
    raise ValueError(f"Unknown scaling method: {scale_method}")

numeric_cols = df.select_dtypes(exclude='object').columns.tolist()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
#Feature encoding for categorical columns
encoder = config['preprocessing']['encoder']
if encoder == 'LabelEncoder':
    encoder = LabelEncoder()
elif encoder == 'OneHotEncoder':
    encoder = OneHotEncoder()
else:
    raise ValueError(f"Unknown encoding method provided: {encoder}")
encoders = {}
categorical_cols = df.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col].astype(str))
    encoders[col] = encoder

In [None]:
#save scaler and encoder for later use in inference
joblib.dump(scaler, os.path.join(PROCESSED_DATA_PATH, "scaler.pkl"))
joblib.dump(encoders, os.path.join(PROCESSED_DATA_PATH,'encoders.pkl'))
#save processed data
processed_file_path = os.path.join(PROCESSED_DATA_PATH,'dataset.csv')
df.to_csv(processed_file_path, index=False)
print(f"Processed dataset saved to: {processed_file_path}")

print("Data preprocessing is complete. Ready for EDA & modeling!")

Data preprocessing is complete. Ready for EDA & modeling!
