# Install libraries

In [None]:
# install libraries
!pip install catboost




In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
from IPython.display import display

# score metrics and splitting libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# ML algorithms from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

# extreme algorithms
from xgboost import XGBRFRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# warning turn off
import warnings
warnings.filterwarnings('ignore')

In [None]:
#set display max column to see all column
pd.set_option("display.max_columns", None)
pd.get_option("display.max_columns")

#set display max column to see all column
pd.set_option("display.max_rows", 150)
pd.get_option("display.max_rows")

150

# Load Datasets

In [None]:
# load datasets

train = pd.read_csv('/content/drive/MyDrive/Machine Learning 1 - final Project/cleaned_train.csv')
test = pd.read_csv('/content/drive/MyDrive/Machine Learning 1 - final Project/cleaned_test.csv')

train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


We split the train, contains target variable, to train and validation to train and test models performance and accuracy.

In [None]:
# split train to train set and validation sets, use 75 - 25 % approach
split_size = int(train.shape[0] * 0.75)

df_train, df_test = train.iloc[:split_size, :], train.iloc[split_size:,  :]

print(f'Train size : {df_train.shape}')
print(f'Test size : {df_test.shape}')

Train size : (1065015, 22)
Test size : (355006, 22)


# Parameter Tuning

When working with a large dataset like ours (1 million rows), finding the best parameters for regression models while ensuring efficient model training and evaluation cost to much computation and time.

To handle this issue we use **Sampling for Hyperparameter Tuning**. It means that instead of using the entire dataset for hyperparameter tuning, we can use a representative sample of data. This will significantly reduce computation time and allow us to perform thorough hyperparameter tuning.

In [None]:
# use 15% of the data for tuning
sample_size = int(0.15 * len(df_train))
df_sample = df_train.sample(n = sample_size, random_state = 42)

X_sample = df_sample.drop(columns=['Arrival_Delay'])
y_sample = df_sample['Arrival_Delay']

## Parameters for baseline models

In [None]:
# Define the models and parameter grids
param_grids = {
    'LinearRegression': {
        'poly__degree': [1, 2],  # Polynomial features of degree 1, 2, and 3
        'lr__fit_intercept': [True, False]
    },
    'Ridge': {
        'ridge__alpha': [0.01, 0.1, 1.0, 10.0],
        'ridge__fit_intercept': [True, False],
        'ridge__solver': ['auto', 'svd', 'cholesky']
    },
    'Lasso': {
        'alpha': [0.01, 0.1, 1.0, 10.0],
        'fit_intercept': [True, False],
        'max_iter': [2500, 5000, 15000],
        'selection': ['cyclic', 'random']
    },
    'SVR': {
        'svr__C': [0.1, 1.0, 10.0, 15.0],
        'svr__epsilon': [0.01, 0.1, 0.5],
        'svr__kernel': ['linear', 'poly', 'rbf'],
        'svr__degree': [2, 3, 4],  # Polynomial degree for 'poly' kernel
        'svr__gamma': ['scale', 'auto']
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'auto', 'sqrt', 'log2']
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['auto', 'sqrt', 'log2'],
        'bootstrap': [True, False]
    }
}


We use Pipline to optimize working environmont

In [None]:
# Define the models and parameter grids
models = {
    'LinearRegression': Pipeline([('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('lr', LinearRegression())]),
    'Ridge': Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())]),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor()
}


In [None]:
# define scorer metric
from sklearn.metrics import make_scorer, mean_absolute_percentage_error

scorer = make_scorer(mean_absolute_percentage_error, greater_is_better= False)

In [None]:

def hyperparameter_tuning(model, param_grid, X, y, cv=5, verbose = 4, scoring= scorer):
    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1, verbose = verbose, return_train_score=True)
    search.fit(X, y)
    print(f"Best Parameters: {search.best_params_}")
    print(f"Best Score: {search.best_score_}\n")
    return search.best_estimator_

# Tune and Evaluate Models on Sample
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...\n")
    best_model = hyperparameter_tuning(model, param_grids[model_name], X_sample, y_sample)
    results[model_name] = best_model


Training LinearRegression...

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'lr__fit_intercept': False, 'poly__degree': 2}
Best Score: -25.91650346059341

Training Ridge...

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'ridge__alpha': 10.0, 'ridge__fit_intercept': False, 'ridge__solver': 'auto'}
Best Score: -21.383131661796064

Training Lasso...

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'alpha': 10.0, 'fit_intercept': False, 'max_iter': 5000, 'selection': 'random'}
Best Score: -24.45490380149892

Training DecisionTree...

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10}
Best Score: -11.924132227411528

Training RandomForest...

Fitting 5 folds for each of 144 candidates, totalling 720 fits


KeyboardInterrupt: 

Let's see results for 4 models. Running random forest took so many time, that's why I decided to use same parameter as decision tree for it

In [None]:
results = {'LinearRegression': Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                 ('lr', LinearRegression(fit_intercept=False))]),
          'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                          ('ridge', Ridge(alpha=10.0, fit_intercept=False))]),
          'Lasso': Lasso(alpha=10.0, fit_intercept=False, max_iter=5000, selection='random'),
          'DecisionTree': DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                                min_samples_split=10)}

results [ 'RandomForest'] : RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10)

results

{'LinearRegression': Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                 ('lr', LinearRegression(fit_intercept=False))]),
 'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                 ('ridge', Ridge(alpha=10.0, fit_intercept=False))]),
 'Lasso': Lasso(alpha=10.0, fit_intercept=False, max_iter=5000, selection='random'),
 'DecisionTree': DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10)}

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

# Define your train_test_split function and load your data
X_train, X_test, y_train, y_test = df_train.drop(columns = 'Arrival_Delay'), df_test.drop(columns = 'Arrival_Delay'), df_train['Arrival_Delay'], df_test['Arrival_Delay']

# Train and test each model
mape_scores = {}
for model_name, model in models.items():
    print(f"Training and testing {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mape_scores[model_name] = mape
    print(f"MAPE for {model_name}: {mape}\n\n")

# Find the best model based on MAPE score
best_model_name = min(mape_scores, key=mape_scores.get)
best_mape_score = mape_scores[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"MAPE Score for Best Model: {best_mape_score}")


Training and testing LinearRegression...
MAPE for LinearRegression: 18.94046517262583


Training and testing Ridge...
MAPE for Ridge: 20.702301646853755


Training and testing Lasso...
MAPE for Lasso: 20.412186498854584


Training and testing DecisionTree...
MAPE for DecisionTree: 18.126833483470953


Training and testing RandomForest...
MAPE for RandomForest: 14.392364099286581


Best Model: RandomForest
MAPE Score for Best Model: 14.392364099286581


In [None]:
model_params = {}
for model_name, model in models.items():
  print(f'{model_name}\'s the best parameters : {model.get_params()}')
  model_params[model_name] = model.get_params()

model_params

LinearRegression's the best parameters : {'memory': None, 'steps': [('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('lr', LinearRegression())], 'verbose': False, 'scaler': StandardScaler(), 'poly': PolynomialFeatures(), 'lr': LinearRegression(), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'poly__degree': 2, 'poly__include_bias': True, 'poly__interaction_only': False, 'poly__order': 'C', 'lr__copy_X': True, 'lr__fit_intercept': True, 'lr__n_jobs': None, 'lr__positive': False}
Ridge's the best parameters : {'memory': None, 'steps': [('scaler', StandardScaler()), ('ridge', Ridge())], 'verbose': False, 'scaler': StandardScaler(), 'ridge': Ridge(), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'ridge__alpha': 1.0, 'ridge__copy_X': True, 'ridge__fit_intercept': True, 'ridge__max_iter': None, 'ridge__positive': False, 'ridge__random_state': None, 'ridge__solver': 'auto', 'ridge__tol': 0.0001}
Lasso's the best parameter

{'LinearRegression': {'memory': None,
  'steps': [('scaler', StandardScaler()),
   ('poly', PolynomialFeatures()),
   ('lr', LinearRegression())],
  'verbose': False,
  'scaler': StandardScaler(),
  'poly': PolynomialFeatures(),
  'lr': LinearRegression(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'poly__degree': 2,
  'poly__include_bias': True,
  'poly__interaction_only': False,
  'poly__order': 'C',
  'lr__copy_X': True,
  'lr__fit_intercept': True,
  'lr__n_jobs': None,
  'lr__positive': False},
 'Ridge': {'memory': None,
  'steps': [('scaler', StandardScaler()), ('ridge', Ridge())],
  'verbose': False,
  'scaler': StandardScaler(),
  'ridge': Ridge(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'ridge__alpha': 1.0,
  'ridge__copy_X': True,
  'ridge__fit_intercept': True,
  'ridge__max_iter': None,
  'ridge__positive': False,
  'ridge__random_state': None,
  'ridge__solver': 'auto',
  'ridge__tol': 0.000

In [None]:
model_params  = {'LinearRegression': {'memory': None,
  'steps': [('scaler', StandardScaler()),
   ('poly', PolynomialFeatures()),
   ('lr', LinearRegression())],
  'verbose': False,
  'scaler': StandardScaler(),
  'poly': PolynomialFeatures(),
  'lr': LinearRegression(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'poly__degree': 2,
  'poly__include_bias': True,
  'poly__interaction_only': False,
  'poly__order': 'C',
  'lr__copy_X': True,
  'lr__fit_intercept': True,
  'lr__n_jobs': None,
  'lr__positive': False},

 'Ridge': {'memory': None,
  'steps': [('scaler', StandardScaler()), ('ridge', Ridge())],
  'verbose': False,
  'scaler': StandardScaler(),
  'ridge': Ridge(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'ridge__alpha': 1.0,
  'ridge__copy_X': True,
  'ridge__fit_intercept': True,
  'ridge__max_iter': None,
  'ridge__positive': False,
  'ridge__random_state': None,
  'ridge__solver': 'auto',
  'ridge__tol': 0.0001},

 'Lasso': {'alpha': 1.0,
  'copy_X': True,
  'fit_intercept': True,
  'max_iter': 1000,
  'positive': False,
  'precompute': False,
  'random_state': None,
  'selection': 'cyclic',
  'tol': 0.0001,
  'warm_start': False},

 'DecisionTree': {'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'random_state': None,
  'splitter': 'best'},

 'RandomForest': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': None,
  'max_features': 1.0,
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 100,
  'n_jobs': None,
  'oob_score': False,
  'random_state': None,
  'verbose': 0,
  'warm_start': False}}

### Voting regressor

In [None]:
# Create individual regression models
linear_reg = models['LinearRegression']
svr_reg = SVR(kernel = 'rbf',  degree = 3, gamma = "scale", coef0 = 0.01, epsilon = 0.01)
rf_reg = RandomForestRegressor(**model_params['RandomForest'])
ds_reg = DecisionTreeRegressor(**model_params['DecisionTree'])
ridge_reg = models['Ridge']
lasso_reg = models['Lasso']

In [None]:
# Create a Voting Regressor with the individual models
voting_reg = VotingRegressor(estimators=[
    ('linear', linear_reg),
    ('ridge', ridge_reg),
    ('lasso', lasso_reg),
    ('svr', svr_reg),
    ('decision_tree', ds_reg),
    ('random_forest', rf_reg)
], n_jobs=-1)

# Fit the Voting Regressor on the training data
voting_reg.fit(X_train, y_train)


In [None]:
# Evaluate Best Models Using Cross-Validation on Full Dataset
from sklearn.model_selection import cross_val_score

X = df_train.drop(columns=['Arrival_Delay'])
y = df_train['Arrival_Delay']

final_results = {}
for model_name, model in results.items():
    scores = cross_val_score(model, X, y, cv=5, scoring = scorer, n_jobs=-1)
    final_results[model_name] = {
        'mean_mape': -scores.mean(),  # MAPE is inverted because greater_is_better=False
        'std_mape': scores.std()
    }
    print(f"{model_name} - Mean MAPE: {-scores.mean()}, Std MAPE: {scores.std()}")

# Print final results
print("Final Results:")
for model_name, result in final_results.items():
    print(f"{model_name}: Mean MAPE = {result['mean_mape']}, Std MAPE = {result['std_mape']}")
