# Install libraries

In [None]:
# install libraries
!pip install catboost






















In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
from IPython.display import display

# score metrics and splitting libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# ML algorithms from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

# extreme algorithms
from xgboost import XGBRFRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# warning turn off
import warnings
warnings.filterwarnings('ignore')

In [2]:
#set display max column to see all column
pd.set_option("display.max_columns", None)
pd.get_option("display.max_columns")

#set display max column to see all column
pd.set_option("display.max_rows", 150)
pd.get_option("display.max_rows")

150

# Load Datasets

In [3]:
# load datasets

train = pd.read_csv('/kaggle/input/delays-data-cleaned/cleaned_train.csv')
test = pd.read_csv('/kaggle/input/delays-data-cleaned/cleaned_test.csv')

train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


We split the train, contains target variable, to train and validation to train and test models performance and accuracy.

In [20]:
top_features = ['Departure_Delay', 'Actual_Departure_Time_Minutes',
        'Scheduled_Departure_Time_Minutes', 'Taxi_Out_Time', 'Taxi_In_Time',
        'Weekday', 'Scheduled_Arrival_Time_Minutes', 'Flight_Duration',
        'Flight_Distance', 'Departure_State']

target = ['Arrival_Delay']

train = train[top_features + target].copy()
test = test[top_features].copy()

In [21]:
# split train to train set and validation sets, use 75 - 25 % approach
split_size = int(train.shape[0] * 0.75)

df_train, df_test = train.iloc[:split_size, :], train.iloc[split_size:,  :]

print(f'Train size : {df_train.shape}')
print(f'Test size : {df_test.shape}')

Train size : (1065015, 11)
Test size : (355006, 11)


# Parameter Tuning

When working with a large dataset like ours (1 million rows), finding the best parameters for regression models while ensuring efficient model training and evaluation cost to much computation and time.

To handle this issue we use **Sampling for Hyperparameter Tuning**. It means that instead of using the entire dataset for hyperparameter tuning, we can use a representative sample of data. This will significantly reduce computation time and allow us to perform thorough hyperparameter tuning.

In [22]:
# use 15% of the data for tuning
sample_size = int(0.15 * len(df_train))
df_sample = df_train.sample(n = sample_size, random_state = 42)

X_sample = df_sample.drop(columns=['Arrival_Delay'])
y_sample = df_sample['Arrival_Delay']

## Parameters for baseline models

In [23]:
rf_params = {'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 70,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [24]:
# Define the models and parameter grids
models = {
    'LinearRegression': Pipeline([('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('lr', LinearRegression())]),
    'Ridge': Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())]),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(**rf_params)
}


In [25]:
# define scorer metric
from sklearn.metrics import make_scorer, mean_absolute_percentage_error

scorer = make_scorer(mean_absolute_percentage_error, greater_is_better= False)

Let's see results for 4 models. Running random forest took so many time, that's why I decided to use same parameter as decision tree for it

In [26]:
results = {'LinearRegression': Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                 ('lr', LinearRegression(fit_intercept=False))]),
          'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                          ('ridge', Ridge(alpha=10.0, fit_intercept=False))]),
          'Lasso': Lasso(alpha=10.0, fit_intercept=False, max_iter=5000, selection='random'),
          'DecisionTree': DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                                min_samples_split=10)}

results [ 'RandomForest'] : RandomForestRegressor(**rf_params)

results

{'LinearRegression': Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                 ('lr', LinearRegression(fit_intercept=False))]),
 'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                 ('ridge', Ridge(alpha=10.0, fit_intercept=False))]),
 'Lasso': Lasso(alpha=10.0, fit_intercept=False, max_iter=5000, selection='random'),
 'DecisionTree': DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=2,
                       min_samples_split=10)}

In [27]:
from sklearn.metrics import mean_absolute_percentage_error

# Define your train_test_split function and load your data
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size = 0.25, random_state = 42)

# Train and test each model
mape_scores = {}
for model_name, model in models.items():
    print(f"Training and testing {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mape_scores[model_name] = mape
    print(f"MAPE for {model_name}: {mape}\n")

# Find the best model based on MAPE score
best_model_name = min(mape_scores, key=mape_scores.get)
best_mape_score = mape_scores[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"MAPE Score for Best Model: {best_mape_score}")


Training and testing LinearRegression...
MAPE for LinearRegression: 9.344578943515447

Training and testing Ridge...
MAPE for Ridge: 10.25203103605143

Training and testing Lasso...
MAPE for Lasso: 10.201369748612022

Training and testing DecisionTree...
MAPE for DecisionTree: 15.910874860862963

Training and testing RandomForest...
MAPE for RandomForest: 8.888191436503597

Best Model: RandomForest
MAPE Score for Best Model: 8.888191436503597


In [30]:
model_params = {}
for model_name, model in models.items():
  print(f'{model_name}\'s the best parameters : {model.get_params()}')
  model_params[model_name] = model.get_params()

model_params

LinearRegression's the best parameters : {'memory': None, 'steps': [('scaler', StandardScaler()), ('poly', PolynomialFeatures()), ('lr', LinearRegression())], 'verbose': False, 'scaler': StandardScaler(), 'poly': PolynomialFeatures(), 'lr': LinearRegression(), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'poly__degree': 2, 'poly__include_bias': True, 'poly__interaction_only': False, 'poly__order': 'C', 'lr__copy_X': True, 'lr__fit_intercept': True, 'lr__n_jobs': None, 'lr__positive': False}
Ridge's the best parameters : {'memory': None, 'steps': [('scaler', StandardScaler()), ('ridge', Ridge())], 'verbose': False, 'scaler': StandardScaler(), 'ridge': Ridge(), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'ridge__alpha': 1.0, 'ridge__copy_X': True, 'ridge__fit_intercept': True, 'ridge__max_iter': None, 'ridge__positive': False, 'ridge__random_state': None, 'ridge__solver': 'auto', 'ridge__tol': 0.0001}
Lasso's the best parameter

{'LinearRegression': {'memory': None,
  'steps': [('scaler', StandardScaler()),
   ('poly', PolynomialFeatures()),
   ('lr', LinearRegression())],
  'verbose': False,
  'scaler': StandardScaler(),
  'poly': PolynomialFeatures(),
  'lr': LinearRegression(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'poly__degree': 2,
  'poly__include_bias': True,
  'poly__interaction_only': False,
  'poly__order': 'C',
  'lr__copy_X': True,
  'lr__fit_intercept': True,
  'lr__n_jobs': None,
  'lr__positive': False},
 'Ridge': {'memory': None,
  'steps': [('scaler', StandardScaler()), ('ridge', Ridge())],
  'verbose': False,
  'scaler': StandardScaler(),
  'ridge': Ridge(),
  'scaler__copy': True,
  'scaler__with_mean': True,
  'scaler__with_std': True,
  'ridge__alpha': 1.0,
  'ridge__copy_X': True,
  'ridge__fit_intercept': True,
  'ridge__max_iter': None,
  'ridge__positive': False,
  'ridge__random_state': None,
  'ridge__solver': 'auto',
  'ridge__tol': 0.000

### Voting regressor

In [31]:
# Create individual regression models
linear_reg = models['LinearRegression']
svr_reg = SVR(kernel = 'rbf',  degree = 3, gamma = "scale", coef0 = 0.01, epsilon = 0.01)
rf_reg = RandomForestRegressor(**rf_params)
ds_reg = DecisionTreeRegressor(**model_params['DecisionTree'])
ridge_reg = models['Ridge']
lasso_reg = models['Lasso']

In [32]:
# Create a Voting Regressor with the individual models
voting_reg = VotingRegressor(estimators=[
    ('linear', linear_reg),
    ('ridge', ridge_reg),
    ('lasso', lasso_reg),
    ('svr', svr_reg),
    ('decision_tree', ds_reg),
    ('random_forest', rf_reg)
], n_jobs=-1)

# Fit the Voting Regressor on the training data
voting_reg.fit(X_train, y_train)


  warn(


In [33]:
# Make predictions
pred = voting_reg.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, pred)

print('MAPE :', mape)


MAPE : 9.181002365703412


In [None]:

# Define your train_test_split function and load your data
X_train, X_test, y_train, y_test = df_train.drop(columns = 'Arrival_Delay'), df_test.drop(columns = 'Arrival_Delay'), df_train['Arrival_Delay'], df_test['Arrival_Delay']

# Create a Voting Regressor with the individual models
voting_reg1 = VotingRegressor(estimators=[
    ('linear', linear_reg),
    ('ridge', ridge_reg),
    ('lasso', lasso_reg),
    ('svr', svr_reg),
    ('decision_tree', ds_reg),
    ('random_forest', rf_reg)
], n_jobs=-1)

# Fit the Voting Regressor on the training data
voting_reg1.fit(X_train, y_train)

voting_reg1.get_params

  warn(


In [None]:
# Make predictions
pred = voting_reg1.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, pred)

print('MAPE :', mape)
