In [100]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostRegressor


In [101]:
new_df = pd.read_csv("new_flights.csv")

In [102]:
# Convertir las ciudades a categorias númericas
new_df['ORIGIN_CITY'] = new_df['ORIGIN_CITY'].astype('category').cat.codes
new_df['DEST_CITY'] = new_df['DEST_CITY'].astype('category').cat.codes

In [103]:
# Reducir el tamaño del dataset
small_df = new_df.head(100000)

In [104]:
# Seleccionar features y target
features = small_df[['MONTH', 'DAY', 'ORIGIN_CITY', 'DEST_CITY', 'AIR_TIME', 'DISTANCE', 'DEP_DELAY', 'TAXI_OUT']]
target = small_df['ARR_DELAY']

In [105]:
# Dividir los datos en conjuntos de entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=42)

In [106]:
# Normalizar los datos
normalizer = MinMaxScaler()
normalizer.fit(X_train)

X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)

In [107]:
# Ejecutar recolector de basura para liberar memoria
import gc
gc.collect()

677

In [108]:
# Modelo con parámetros por defecto
forest = RandomForestRegressor(random_state=42, n_jobs=4, verbose=0)

forest.fit(X_train_norm, y_train)

pred = forest.predict(X_test_norm)
score = forest.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { score: .2f}")

MAE:  7.61
RMSE:  119.07
R2 score:  0.97


GRID SEARCH

In [109]:
# Definir el rango de parámetros para probar
params = {
    'n_estimators': [50, 100, 200],        # Número de árboles
    'max_depth': [10, 20],            # Profundidad máxima de los árboles
    'min_samples_split': [2, 5, 10],         # Número mínimo de muestras para dividir un nodo
}

# Configurar el GridSearchCV
grid_search = GridSearchCV(estimator=forest, param_grid=params, cv=3, n_jobs=1, verbose=10)

# Ajustar el modelo a los datos de entrenamiento
grid_search.fit(X_train_norm, y_train)

# Ver los mejores parámetros encontrados
print(f"Mejores parámetros: {grid_search.best_params_}")

# Ver la mejor puntuación obtenida
print(f"Mejor puntuación: {grid_search.best_score_}")

# Evaluar el mejor modelo
best_model = grid_search.best_estimator_
pred = best_model.predict(X_test_norm)
score = best_model.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { score: .2f}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3; 1/18] START max_depth=10, min_samples_split=2, n_estimators=50.........
[CV 1/3; 1/18] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.961 total time=   8.4s
[CV 2/3; 1/18] START max_depth=10, min_samples_split=2, n_estimators=50.........
[CV 2/3; 1/18] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.959 total time=   6.5s
[CV 3/3; 1/18] START max_depth=10, min_samples_split=2, n_estimators=50.........
[CV 3/3; 1/18] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.957 total time=  10.1s
[CV 1/3; 2/18] START max_depth=10, min_samples_split=2, n_estimators=100........
[CV 1/3; 2/18] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.961 total time=  13.4s
[CV 2/3; 2/18] START max_depth=10, min_samples_split=2, n_estimators=100........
[CV 2/3; 2/18] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.960 total time=  10.0s
[CV 3/3; 2/18] ST

RANDOM SEARCH

In [110]:
# Definir el rango de parámetros para probar
params = {
    "n_estimators": [int(x) for x in np.linspace(start = 50, stop = 500, num = 4)],
    "max_depth":[int(x) for x in np.linspace(5, 50, num = 4)],
    "min_samples_split": [int(x) for x in np.linspace(start = 2, stop = 15, num = 4)]
}

# Configurar el RandomizedSearchCV
randomized_search = RandomizedSearchCV(estimator=forest, param_distributions=params, n_iter=10, cv=3, n_jobs=1, verbose=10)

# Ajustar el modelo a los datos de entrenamiento
randomized_search.fit(X_train_norm, y_train)

# Ver los mejores parámetros encontrados
print(f"Mejores parámetros: {randomized_search.best_params_}")

# Ver la mejor puntuación obtenida
print(f"Mejor puntuación: {randomized_search.best_score_}")

# Evaluar el mejor modelo
best_model = randomized_search.best_estimator_
pred = best_model.predict(X_test_norm)
score = best_model.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { score: .2f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3; 1/10] START max_depth=35, min_samples_split=10, n_estimators=50........
[CV 1/3; 1/10] END max_depth=35, min_samples_split=10, n_estimators=50;, score=0.964 total time=   9.9s
[CV 2/3; 1/10] START max_depth=35, min_samples_split=10, n_estimators=50........
[CV 2/3; 1/10] END max_depth=35, min_samples_split=10, n_estimators=50;, score=0.964 total time=   9.4s
[CV 3/3; 1/10] START max_depth=35, min_samples_split=10, n_estimators=50........
[CV 3/3; 1/10] END max_depth=35, min_samples_split=10, n_estimators=50;, score=0.961 total time=   9.7s
[CV 1/3; 2/10] START max_depth=5, min_samples_split=6, n_estimators=500.........
[CV 1/3; 2/10] END max_depth=5, min_samples_split=6, n_estimators=500;, score=0.955 total time=  21.9s
[CV 2/3; 2/10] START max_depth=5, min_samples_split=6, n_estimators=500.........
[CV 2/3; 2/10] END max_depth=5, min_samples_split=6, n_estimators=500;, score=0.953 total time=  21.4s
[CV 3/3; 2/10] S

In [111]:
print(f"R2 score: { score: .22f}")

R2 score:  0.9688701374805637200893
