In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
new_df = pd.read_csv("new_flights.csv")

In [3]:
# Combine YEAR, MONTH, and DAY into a single DATE column
new_df['DATE'] = pd.to_datetime(new_df[['YEAR', 'MONTH', 'DAY']])

# Add day of the week
new_df['DAY_OF_WEEK'] = new_df['DATE'].dt.day_name()

def get_time_of_day(dep_time):
    if pd.isnull(dep_time):  # Handle missing values
        return np.nan
    hour = int(dep_time // 100)  # Extract the hour by integer division
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# Apply the function to categorize DEP_TIME
new_df['TIME_OF_DAY'] = new_df['DEP_TIME'].apply(get_time_of_day)

In [4]:
# One-hot encode the DAY_OF_WEEK and TIME_OF_DAY columns
new_df= pd.get_dummies(new_df, columns=['DAY_OF_WEEK', 'TIME_OF_DAY'])

# The new_df DataFrame now has the one-hot encoded columns
new_df

Unnamed: 0.1,Unnamed: 0,AIRLINE,DOT_CODE,FL_NUMBER,ORIGIN_CITY,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,...,DAY_OF_WEEK_Monday,DAY_OF_WEEK_Saturday,DAY_OF_WEEK_Sunday,DAY_OF_WEEK_Thursday,DAY_OF_WEEK_Tuesday,DAY_OF_WEEK_Wednesday,TIME_OF_DAY_Afternoon,TIME_OF_DAY_Evening,TIME_OF_DAY_Morning,TIME_OF_DAY_Night
0,1,Delta Air Lines Inc.,19790,1149,"Minneapolis, MN","Seattle, WA",2120,2114.0,-6.0,9.0,...,False,True,False,False,False,False,False,False,False,True
1,3,Delta Air Lines Inc.,19790,2295,"Minneapolis, MN","San Francisco, CA",1609,1608.0,-1.0,27.0,...,True,False,False,False,False,False,True,False,False,False
2,6,American Airlines Inc.,19805,2134,"Washington, DC","Boston, MA",1010,1001.0,-9.0,23.0,...,False,False,True,False,False,False,False,False,True,False
3,8,Spirit Air Lines,20416,590,"Houston, TX","Los Angeles, CA",530,527.0,-3.0,11.0,...,False,False,True,False,False,False,False,False,True,False
4,22,Delta Air Lines Inc.,19790,2706,"Grand Rapids, MI","Minneapolis, MN",730,720.0,-10.0,10.0,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677562,2999989,Republic Airline,20452,4644,"Portland, ME","Washington, DC",550,550.0,0.0,8.0,...,True,False,False,False,False,False,False,False,True,False
677563,2999990,SkyWest Airlines Inc.,20304,4126,"Detroit, MI","Madison, WI",825,824.0,-1.0,32.0,...,False,False,False,False,False,True,False,False,True,False
677564,2999995,American Airlines Inc.,19805,1522,"Jacksonville, FL","Charlotte, NC",1742,1740.0,-2.0,10.0,...,False,False,True,False,False,False,False,True,False,False
677565,2999996,American Airlines Inc.,19805,1535,"Chicago, IL","Austin, TX",1300,1254.0,-6.0,10.0,...,False,False,False,False,False,True,True,False,False,False


In [5]:
new_df['ORIGIN_CITY'] = new_df['ORIGIN_CITY'].astype('category').cat.codes
new_df['DEST_CITY'] = new_df['DEST_CITY'].astype('category').cat.codes

In [6]:
features = new_df[['MONTH','ORIGIN_CITY', 'DEST_CITY', 'AIR_TIME', 'DISTANCE',
        'DEP_DELAY', 'TAXI_OUT'] + [col for col in new_df.columns if 'DAY_OF_WEEK' in col or 'TIME_OF_DAY' in col]]
target = new_df['CANCELLED']

In [7]:
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=42)


In [9]:
normalizer = MinMaxScaler()
normalizer.fit(X_train) 

In [10]:
X_train_norm = normalizer.transform(X_train)

X_test_norm = normalizer.transform(X_test)

In [11]:
bagging_reg = BaggingClassifier(DecisionTreeClassifier(max_depth=20),
                            n_estimators=100,
                            random_state=42)

In [12]:
bagging_reg.fit(X_train_norm, y_train)

In [13]:
pred = bagging_reg.predict(X_test_norm)
bag_acc = bagging_reg.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { bagging_reg.score(X_test_norm, y_test): .8f}")

MAE:  0.00
RMSE:  0.00
R2 score:  1.00000000


In [14]:
features = new_df[['MONTH', 'ORIGIN_CITY', 'DEST_CITY', 'AIR_TIME', 'DISTANCE',
        'ARR_DELAY', 'TAXI_OUT'] + [col for col in new_df.columns if 'DAY_OF_WEEK' in col or 'TIME_OF_DAY' in col]]
target = new_df['DEP_DELAY']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=42)


In [16]:
normalizer = MinMaxScaler()

normalizer.fit(X_train)

In [17]:
X_train_norm = normalizer.transform(X_train)

X_test_norm = normalizer.transform(X_test)

In [18]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100,
                            random_state=42)

In [19]:
bagging_reg.fit(X_train_norm, y_train)

In [20]:
import gc
gc.collect()

22

In [21]:
pred = bagging_reg.predict(X_test_norm)
bag_acc = bagging_reg.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { bagging_reg.score(X_test_norm, y_test): .8f}")

MAE:  5.22
RMSE:  66.98
R2 score:  0.97978520


In [22]:
forest = RandomForestRegressor(n_estimators=100,
                            random_state=42)

In [23]:
forest.fit(X_train_norm, y_train)

In [24]:
pred = forest.predict(X_test_norm)
forest_acc = forest.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { forest.score(X_test_norm, y_test): .8f}")

MAE:  5.18
RMSE:  66.07
R2 score:  0.98006085


In [25]:
gb_reg = GradientBoostingRegressor(n_estimators=100,
                                random_state=42)

In [26]:
gb_reg.fit(X_train_norm, y_train)

In [27]:
pred = gb_reg.predict(X_test_norm)
gb_acc = gb_reg.score(X_test_norm, y_test)

print(f"MAE: { mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: { mean_squared_error(pred, y_test): .2f}")
print(f"R2 score: { gb_reg.score(X_test_norm, y_test): .8f}")

MAE:  5.60
RMSE:  73.44
R2 score:  0.97783583


In [28]:
print(f"Bagging Accuracy: {bag_acc}")
print(f"Random Forest Accuracy: {forest_acc}")
print(f"Gradient Boosting Accuracy: {gb_acc}")

Bagging Accuracy: 0.9797852046322231
Random Forest Accuracy: 0.9800608457806363
Gradient Boosting Accuracy: 0.9778358299803176


In [29]:
mejor_modelo = max((bag_acc, "Bagging"), (forest_acc, "Random Forest"), (gb_acc, "Gradient Boosting"))
print(f"El mejor modelo es: {mejor_modelo[1]} con una precisión de {mejor_modelo[0]}")

El mejor modelo es: Random Forest con una precisión de 0.9800608457806363
