In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
from collections import Counter

# Métodos de Validación
from sklearn.model_selection import train_test_split

# Modelos de Clasificación
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Métricas para Clasificación
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# OverSampling y UnderSampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [9]:
df = pd.read_pickle(r'C:\Users\Nahuel\Documents\hack_a_boss\mod7-streamlit-main\proyecto_aviones\data\pickle\vuelos_limpio.pkl')

In [10]:
# Creamos la columna target y ya la binarizamos
df["llega_tarde"] = np.where(df['retraso_llegada'] > 15, 'si', 'no')
df['llega_tarde'] = df['llega_tarde'].map({'si': 1, 'no': 0})

In [11]:
# Eliminamos las columnas que no aportan información extra y retraso en la llegada para que no prediga el 100% de las veces y mes que solo tiene 1 valor
columnas_eliminar = [
    "latitude_destino",
    "latitude_origen",
    "longitude_destino",
    "longitude_origen",
    "codigo_aeropuerto_destino",
    "codigo_aeropuerto_origen",
    "direccion_destino",
    "direccion_origen",
    "retraso_llegada",
    "hora_llegada_real",
    "mes",
    'retraso_salida',
    'tiempo_retraso_aerolinea',
    'tiempo_retraso_sistema_aviacion',
    'hora_salida_real', 
    'hora_despegue', 
    'tiempo_pista_salida', 
    'duracion_real',
    'tiempo_retraso_clima',
    'festivos',
    'tiempo_retraso_seguridad']
df.drop(columns=columnas_eliminar, inplace=True)


In [12]:
diccionario_costo = {
    'alto_costo': ["Delta Air Lines", "American Airlines", "United Airlines", "Alaska Airlines", "Hawaiian Airlines"],
    'medio_costo': ["Southwest Airlines", "JetBlue Airways"],
    'bajo_costo': ['Allegiant Air', 'Frontier Airlines', 'Spirit Airlines', 'Envoy Air', 'SkyWest Airlines', 'PSA Airlines', 
                'Endeavor Air', 'Mesa Airlines', 'Republic Airways', 'Horizon Air']}

def asignar_costo(aerolinea):
    for costo, aerolineas in diccionario_costo.items():
        if aerolinea in aerolineas:
            return costo

df['aerolinea'] = df['aerolinea'].apply(asignar_costo)

In [13]:
pesos_categorias = {'alto_costo': 2, 'medio_costo': 1, 'bajo_costo': 0} 

df['aerolinea'] = df['aerolinea'].map(pesos_categorias)

In [14]:
# Cambiamos a minutos las columnas horarias

# df["hora_salida_real"] = df["hora_salida_real"].apply(lambda x: x.hour * 60 + x.minute)
df["hora_salida_programada"] = df["hora_salida_programada"].apply(lambda x: x.hour * 60 + x.minute)

# df['hora_despegue'] = df['hora_despegue'].replace('24:00', '00:00')
# df['hora_despegue'] = pd.to_datetime(df['hora_despegue'], format='%H:%M').dt.hour * 60 + pd.to_datetime(df['hora_despegue'], format='%H:%M').dt.minute

In [15]:
# Como da error el datetime de fecha, extraemos el día del mes, que es el único que nos falta, porque mes como todos tienen el mismo no hace falta
# y sobbreescribimos la columna de fecha, 

df['fecha'] = df['fecha'].dt.day

In [16]:
# Miramos las columnas object que tenemos que cambiar a numéricas para el modelo
columnas_categoricas = df.select_dtypes(include=["object"]).columns.tolist()

In [17]:
def target_encoding(df, columna_categorica, target):

    media_columna_con_target = df.groupby(columna_categorica)[target].mean()
    df[columna_categorica] = df[columna_categorica].map(media_columna_con_target)
    return

# target_encoding(df, 'aerolinea', 'llega_tarde') esta ya no xk la he puesto un label encoding
target_encoding(df, 'numero_cola', 'llega_tarde')
target_encoding(df, 'ciudad_origen', 'llega_tarde')
target_encoding(df, 'estado_origen', 'llega_tarde')
target_encoding(df, 'aeropuerto_origen', 'llega_tarde')
target_encoding(df, 'ciudad_destino', 'llega_tarde')
target_encoding(df, 'estado_destino', 'llega_tarde')
target_encoding(df, 'aeropuerto_destino', 'llega_tarde')

In [18]:
# aplico logaritmo

df["distancia_millas"] = np.log(df["distancia_millas"] + 1) 

In [19]:
# Definimos la target

TARGET = "llega_tarde"
X = df.drop(columns = TARGET)
y  = df[[TARGET]]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (1353980, 16), y_train: (1353980, 1)
X_test: (338496, 16), y_test: (338496, 1)


In [21]:
# Analizamos la distribución de clases en el conjunto de datos. 

contador_clases= Counter([arr[0] for arr in list(y_train.values)])
print(contador_clases)

total = sum(contador_clases.values())
for clase, count in contador_clases.items():
    porcentaje = (count / total) * 100
    print(f'Porcentaje de "{clase}": {porcentaje:.2f}%')

Counter({0: 1248772, 1: 105208})
Porcentaje de "0": 92.23%
Porcentaje de "1": 7.77%


In [22]:
# Definir las estrategias de balanceo
undersampling = RandomUnderSampler(sampling_strategy=0.1)
oversampling = SMOTE(sampling_strategy=0.6)

# Definir la pipeline
balance_pipeline = Pipeline([
    ('undersampling', undersampling),
    ('oversampling', oversampling)
])

# Aplicar la pipeline al conjunto de entrenamiento
X_balanceado, y_balanceado = balance_pipeline.fit_resample(X_train, y_train)

In [23]:
# Comprobamos que se han balanceado más 
contador_clases= Counter([arr[0] for arr in list(y_balanceado.values)])
print(contador_clases)

total = sum(contador_clases.values())
for clase, count in contador_clases.items():
    porcentaje = (count / total) * 100
    print(f'Porcentaje de "{clase}": {porcentaje:.2f}%')

Counter({0: 1052080, 1: 631248})
Porcentaje de "0": 62.50%
Porcentaje de "1": 37.50%


In [24]:
def test_models(X, y):
    # Definir modelos
    models = {
        "Random Forest"       : RandomForestClassifier(),
        "Naive Bayes"         : GaussianNB(),
        "KNN"                 : KNeighborsClassifier(),
        "Gradient Boosting"   : GradientBoostingClassifier(),
        'Nearest Centroid'    : NearestCentroid(),
        'Decision Tree'       : DecisionTreeClassifier(),
        'AdaBoost Classifier' : AdaBoostClassifier()
    }

    # DataFrame para almacenar los resultados de las métricas
    results = list()

    # Entrenar y evaluar cada modelo
    for name, model in models.items():
        model.fit(X, y)
        y_pred = model.predict(X_test)

        # Calcular métricas
        jaccard_index = jaccard_score(y_test, y_pred, average="macro")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        f1_score_value = f1_score(y_test, y_pred, average="macro")
        roc_auc = roc_auc_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = conf_matrix.ravel()
        specificity = tn / (tn + fp)

        # Agregar resultados a la lista
        results.append({
            'Model': name,
            'Jaccard Index': jaccard_index,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1_score_value,
            'ROC AUC': roc_auc,
            'Specificity': specificity
        })

    # Convertir la lista de resultados a un DataFrame
    df_results = pd.DataFrame(results)

    return df_results

df_resultados = test_models(X_balanceado, y_balanceado)

df_resultados

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Model,Jaccard Index,Accuracy,Precision,Recall,F1-score,ROC AUC,Specificity
0,Random Forest,0.52196,0.904102,0.626025,0.583166,0.598243,0.583166,0.963154
1,Naive Bayes,0.417754,0.72122,0.545264,0.627728,0.526918,0.627728,0.738422
2,KNN,0.467992,0.802547,0.559408,0.624484,0.566007,0.624484,0.835311
3,Gradient Boosting,0.489717,0.836704,0.570581,0.620057,0.582659,0.620057,0.876567
4,Nearest Centroid,0.271181,0.482263,0.514989,0.551975,0.392903,0.551975,0.469436
5,Decision Tree,0.480258,0.835641,0.559058,0.596432,0.568252,0.596432,0.879655
6,AdaBoost Classifier,0.466689,0.794618,0.562314,0.638098,0.568032,0.638098,0.823417


In [27]:
df_resultados.sort_values(by = 'Precision', ascending = False)

Unnamed: 0,Model,Jaccard Index,Accuracy,Precision,Recall,F1-score,ROC AUC,Specificity
0,Random Forest,0.52196,0.904102,0.626025,0.583166,0.598243,0.583166,0.963154
3,Gradient Boosting,0.489717,0.836704,0.570581,0.620057,0.582659,0.620057,0.876567
6,AdaBoost Classifier,0.466689,0.794618,0.562314,0.638098,0.568032,0.638098,0.823417
2,KNN,0.467992,0.802547,0.559408,0.624484,0.566007,0.624484,0.835311
5,Decision Tree,0.480258,0.835641,0.559058,0.596432,0.568252,0.596432,0.879655
1,Naive Bayes,0.417754,0.72122,0.545264,0.627728,0.526918,0.627728,0.738422
4,Nearest Centroid,0.271181,0.482263,0.514989,0.551975,0.392903,0.551975,0.469436


In [26]:
fig = px.bar(df_resultados, x="Model", y=["Jaccard Index", "Accuracy", "Precision", "Recall", "F1-score", "ROC AUC", "Specificity"],
             title="Métricas de Evaluación de los Modelos de Aprendizaje Automático",
             labels={"value": "Valor de la Métrica", "variable": "Métrica"},
             barmode="group")

fig.update_layout(title_x=0.5, xaxis = {'categoryorder' : 'total descending'})

fig.show()

  sf: grouped.get_group(s if len(s) > 1 else s[0])


# Observaciones.

- Vemos que el modelo que mejor métrica `ROC AUC` nos da es **AdaBoostClassifier()**, sin embargo en términos generales de precisión y especificidad (`Precision` & `Sprecifity`) además otras es **RandomForestClasifier()**, por lo que usaremos ese modelo para clasificar nuestros datos.