In [86]:
"""
02_feature_engineering.ipynb

Prueba de concepto del pipeline

Objetivo:
- Probar transformaciones de variables
- Validar ideas de features (hora, día, aerolínea, etc.)
- Decidir qué variables usar en el modelo

Resultado:
- Las transformaciones validadas se implementan luego
  en src/preprocessing.py
"""

'\n02_feature_engineering.ipynb\n\nPrueba de concepto del pipeline\n\nObjetivo:\n- Probar transformaciones de variables\n- Validar ideas de features (hora, día, aerolínea, etc.)\n- Decidir qué variables usar en el modelo\n\nResultado:\n- Se implementan luego\n  en src/preprocessing.py\n'

In [87]:
import pandas as pd

In [88]:
# Raw GitHub URL of the Sample_DelayedFlights.csv file.
url_csv = (
    "https://raw.githubusercontent.com/DnRiv/FlightOnTime-"
    "/refs/heads/main/proyecto_hackathon/ds/data/Sample_DelayedFlights.csv"
)

In [89]:
# Read the CSV located at url_csv into a DataFrame, then preview the first
# five rows (five is the default for head()).
df = pd.read_csv(filepath_or_buffer=url_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,4.0,10.0,0,N,0,,,,,


In [90]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
       'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum',
       'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [91]:
# Show the (rows, columns) size of the loaded frame.
df.shape

(60225, 30)

In [92]:
# Drop cancelled flights. Selecting with .loc and taking an explicit .copy()
# yields an independent frame, so adding a column below does not write into a
# slice of the original (which would raise SettingWithCopyWarning).
df = df.loc[df["Cancelled"] == 0].copy()

# Binary target: 1 when ArrDelay is greater than 15, otherwise 0.
# NOTE(review): rows whose ArrDelay is missing compare as False and are
# therefore labelled 0 — confirm this is intended rather than dropping them.
df["is_delayed"] = (df["ArrDelay"] > 15).astype(int)

df.head()


Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,is_delayed
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,8.0,0,N,0,,,,,,0
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,10.0,0,N,0,,,,,,0
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,17.0,0,N,0,,,,,,0
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0,1
4,5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,...,10.0,0,N,0,,,,,,0


In [93]:
# Raw columns kept as model inputs.
features = [
    "UniqueCarrier",
    "Origin",
    "Dest",
    "CRSDepTime",
    "DayOfWeek",
    "Distance",
]

# Keep only the inputs plus the target. The explicit .copy() makes the result
# an independent frame, so columns can be added to it later without pandas
# warning about assignment on a slice of another DataFrame.
df = df[features + ["is_delayed"]].copy()
df.head()


Unnamed: 0,UniqueCarrier,Origin,Dest,CRSDepTime,DayOfWeek,Distance,is_delayed
0,WN,IAD,TPA,1955,4,810,0
1,WN,IAD,TPA,735,4,810,0
2,WN,IND,BWI,620,4,515,0
3,WN,IND,BWI,1755,4,515,1
4,WN,IND,JAX,1915,4,688,0


Transformación

In [94]:
# Derive the departure hour by integer-dividing CRSDepTime by 100, which drops
# its last two digits (e.g. 1955 -> 19). assign() returns a new frame instead
# of writing a column into df in place, which avoids SettingWithCopyWarning
# when df is a selection of another frame.
df = df.assign(dep_hour=df["CRSDepTime"] // 100)

Variables categóricas y numéricas

In [95]:
# Columns treated as categories.
categorical_cols = [
    "UniqueCarrier",
    "Origin",
    "Dest",
    "DayOfWeek",
]

# Columns treated as plain numeric values.
numerical_cols = [
    "dep_hour",
    "Distance",
]

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [97]:
# Column-wise preprocessing:
#   "cat": one-hot encode categorical_cols; handle_unknown="ignore" is set on
#          the encoder for categories not seen during fit.
#   "num": pass numerical_cols through unchanged.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numerical_cols),
    ]
)


In [98]:
# Fit the preprocessor on df and transform df into the feature matrix X.
X = preprocessor.fit_transform(df)

In [99]:
# X was already computed by preprocessor.fit_transform(df) in the previous
# cell; refitting here would only repeat that work for an identical result,
# so this cell just extracts the target column.
y = df["is_delayed"]

In [100]:
# Names of the output columns produced by the preprocessor.
feature_names = preprocessor.get_feature_names_out()


In [101]:
# Wrap the transformed matrix in a DataFrame for inspection.
# - Densify only when X exposes .toarray() (sparse result); a dense array is
#   used as-is instead of failing on a missing method.
# - Reuse df's index so the rows of X_df stay aligned with y (taken from df)
#   even when earlier filtering left a non-contiguous index.
X_df = pd.DataFrame(
    X.toarray() if hasattr(X, "toarray") else X,
    columns=feature_names,
    index=df.index,
)
X_df.head()

Unnamed: 0,cat__UniqueCarrier_OH,cat__UniqueCarrier_OO,cat__UniqueCarrier_WN,cat__UniqueCarrier_XE,cat__UniqueCarrier_YV,cat__Origin_ABE,cat__Origin_ABQ,cat__Origin_ACV,cat__Origin_AEX,cat__Origin_ALB,...,cat__Dest_YUM,cat__DayOfWeek_1,cat__DayOfWeek_2,cat__DayOfWeek_3,cat__DayOfWeek_4,cat__DayOfWeek_5,cat__DayOfWeek_6,cat__DayOfWeek_7,num__dep_hour,num__Distance
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,19.0,810.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,810.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0,515.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.0,515.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,19.0,688.0


In [102]:
# Show the shapes of the feature matrix and the target side by side.
X.shape, y.shape


((60225, 435), (60225,))

In [103]:
# Count how many rows fall into each target class.
y.value_counts()


Unnamed: 0_level_0,count
is_delayed,Unnamed: 1_level_1
1,37277
0,22948


In [104]:
'''
Conclusión

Se implementó un pipeline simple de feature engineering.

Siguientes pasos:
- Entrenar un modelo base de clasificación
- Evaluar métricas básicas
- Exportar el pipeline para FastAPI

'''

'\nConclusión\n\nSe implementó un pipeline simple de feature engineering.\n\nSiguiente paso:\n- Entrenar un modelo base de clasificación\n- Evaluar métricas básicas\n- Exportar el pipeline para FastAPI\n\n'