# Install libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
from IPython.display import display

# score metrics and splitting libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# ML algorithms from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor


# warning turn off
import warnings
warnings.filterwarnings('ignore')

In [3]:
#set display max column to see all column
pd.set_option("display.max_columns", None)
pd.get_option("display.max_columns")

#set display max column to see all column
pd.set_option("display.max_rows", 150)
pd.get_option("display.max_rows")

150

# Load Datasets

In [4]:
# load datasets

train = pd.read_csv('/kaggle/input/raw-delays-data/delays_train.csv')
test = pd.read_csv('/kaggle/input/raw-delays-data/delays_test.csv')

In [5]:
selected_features = ['Weekday', 'Month_of_Year', 'Day_of_Month', 'Scheduled_Departure_Time', 'Scheduled_Arrival_Time', 'Departure_State',
                     'Arrival_State', 'Flight_Cancelled', 'Departure_Delay', 'Arrival_Delay', 'Taxi_Out_Time', 'Taxi_In_Time', 'Flight_Diverted',
                     'Actual_Departure_Time', 'Flight_Duration', 'Flight_Distance', 'Origin_Temperature', 'Destination_Temperature',
                     'Origin_Wind_Speed', 'Destination_Wind_Speed', 'Origin_Precipitation', 'Destination_Precipitation']

df_train = train[selected_features].copy()

# select the features for test data
df_test = test[list(set(selected_features)^set(['Arrival_Delay']))].copy()

df_train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Scheduled_Departure_Time,Scheduled_Arrival_Time,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Actual_Departure_Time,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation
0,6,6,25,1222,1444,GA,NY,False,,,16.0,10.0,False,1224.0,107.0,760.0,25.14022,19.357739,21.019808,15.452723,0.0,0.0
1,2,6,21,1216,1304,AZ,AZ,False,0.088687,-4.178483,16.0,5.0,False,1216.0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0
2,3,1,5,1945,2055,MN,WI,False,,5.042185,34.0,5.0,False,1945.0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1
3,2,3,22,700,924,FL,TX,False,-1.802698,-0.206932,10.0,11.0,False,658.0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0
4,4,7,14,2130,2359,WA,NV,False,,14.006092,23.0,7.0,False,2133.0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0


In [6]:
df_train.dropna(subset = ['Arrival_Delay'], inplace = True)
df_train.reset_index(drop = True, inplace = True)

### Data preprocessing

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# categorical column imputing
def label_encoder(df, column):
  lb = LabelEncoder()

  # fit and transform the column
  df[column] = lb.fit_transform(df[column].astype(str))

  # replace missing values with the most frequent value
  most_frequent = df[column].mode()[0]
  df[column].fillna(most_frequent, inplace=True)

  return df

# numerical column imputing
def numeric_imputer(df, column, strategy = 'mean'):
  # define the imputer
  mean_imputer = SimpleImputer(strategy = strategy)

  # apply imputer to the column
  df[[column]] = mean_imputer.fit_transform(df[[column]])

  return df

# all in one
def data_imputer(df):

  # get all column that contains missng value
  missing_value_columns = df.columns[df.isnull().sum() > 0]

  # numeric columns that contain missing value
  numerical_columns = df[missing_value_columns].select_dtypes(include = 'number').columns

  # apply to each numerical column
  for column in numerical_columns:
    df = numeric_imputer(df, column, strategy = 'mean')

  print(f'Numerical imputing is finished, encoded columns: {list(numerical_columns)}')

  # categorical column that cointans missing values
  categorical_columns = df.select_dtypes(include = 'object').columns

  # apply to each numerical column
  for column in categorical_columns:
    df = label_encoder(df, column)
  print(f'Categorical encoding is finished, encoded columns: {list(categorical_columns)}')

  # boolen columns encoding
  boolen_columns = df.select_dtypes(include = 'bool').columns

  for column in boolen_columns:
    df[column] = df[column].astype(int)

  print(f'Boolen imputing is finished, encoded columns: {list(boolen_columns)}')
  return df


# let's apply imputer to the dataset
cleaned_train = data_imputer(df_train)
cleaned_test = data_imputer(df_test)


# print missing values for each dataset
display(cleaned_train.isnull().sum())

print('\n\n')

display(cleaned_test.isnull().sum())

Numerical imputing is finished, encoded columns: ['Departure_Delay', 'Flight_Duration', 'Flight_Distance', 'Origin_Temperature', 'Destination_Temperature', 'Origin_Wind_Speed', 'Destination_Wind_Speed']
Categorical encoding is finished, encoded columns: ['Departure_State', 'Arrival_State']
Boolen imputing is finished, encoded columns: ['Flight_Cancelled', 'Flight_Diverted']
Numerical imputing is finished, encoded columns: ['Flight_Duration', 'Taxi_In_Time', 'Actual_Departure_Time', 'Destination_Wind_Speed', 'Origin_Wind_Speed', 'Flight_Distance', 'Destination_Temperature', 'Taxi_Out_Time', 'Origin_Temperature', 'Departure_Delay']
Categorical encoding is finished, encoded columns: ['Arrival_State', 'Departure_State']
Boolen imputing is finished, encoded columns: ['Flight_Diverted', 'Flight_Cancelled']


Weekday                      0
Month_of_Year                0
Day_of_Month                 0
Scheduled_Departure_Time     0
Scheduled_Arrival_Time       0
Departure_State              0
Arrival_State                0
Flight_Cancelled             0
Departure_Delay              0
Arrival_Delay                0
Taxi_Out_Time                0
Taxi_In_Time                 0
Flight_Diverted              0
Actual_Departure_Time        0
Flight_Duration              0
Flight_Distance              0
Origin_Temperature           0
Destination_Temperature      0
Origin_Wind_Speed            0
Destination_Wind_Speed       0
Origin_Precipitation         0
Destination_Precipitation    0
dtype: int64






Flight_Duration              0
Month_of_Year                0
Scheduled_Arrival_Time       0
Taxi_In_Time                 0
Actual_Departure_Time        0
Origin_Precipitation         0
Weekday                      0
Flight_Diverted              0
Flight_Cancelled             0
Destination_Precipitation    0
Destination_Wind_Speed       0
Origin_Wind_Speed            0
Flight_Distance              0
Day_of_Month                 0
Scheduled_Departure_Time     0
Arrival_State                0
Destination_Temperature      0
Taxi_Out_Time                0
Origin_Temperature           0
Departure_Delay              0
Departure_State              0
dtype: int64

In [8]:
# changing the format of time to minute (minutes from 00:00):
def hhmm_to_minutes(time):
    hh = time // 100
    mm = time % 100
    return int(hh * 60 + mm)

# apply conversion function to multiple time columns
def convert_time_columns(df, time_columns):
    for column in time_columns:
        df[f'{column}_Minutes'] = df[column].apply(hhmm_to_minutes)
    return df

# list of time columns to convert
time_columns = ['Scheduled_Departure_Time', 'Scheduled_Arrival_Time', 'Actual_Departure_Time', ]

# Convert time columns
cleaned_train = convert_time_columns(cleaned_train, time_columns)
cleaned_test = convert_time_columns(cleaned_test, time_columns)

# drop time columns
cleaned_train.drop(time_columns, axis = 1, inplace = True)
cleaned_test.drop(time_columns, axis = 1, inplace = True)

# save datasets

# path = '...final_Project/'

# cleaned_train.to_csv(path + 'cleaned_train.csv', index = False)
# cleaned_test.to_csv(path + 'cleaned_test.csv', index = False)


cleaned_train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


### Load cleaned data

In [26]:
# load datasets

train = cleaned_train.copy()
test = cleaned_train.copy()

train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


In [10]:
# split train to train set and validation sets, use 75 - 25 % approach
# Shuffle the indices
indices = np.arange(len(train))
np.random.shuffle(indices)

# Define the split ratio
split_ratio = 0.75

# Calculate split indices
split_index = int(len(train) * split_ratio)

# Split the data
df_train = train.iloc[indices[:split_index], :]
df_test = train.iloc[indices[split_index:], :]

df_train.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

print(f'Train size : {df_train.shape}')
print(f'Test size : {df_test.shape}')

Train size : (1065015, 22)
Test size : (355006, 22)


In [11]:
display(df_train.head())

print('*' * 70)
df_test.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,1,5,30,5,45,0,-11.077911,-25.016806,12.0,7.0,0,44.0,283.0,22.546738,16.556182,15.663873,14.963135,0.0,0.1,720,797,709
1,5,7,15,9,30,0,-2.291414,-15.261307,11.0,4.0,0,105.0,746.0,24.020126,18.821658,18.742777,18.288344,0.0,0.0,1357,50,1355
2,7,3,27,13,32,0,-0.135451,-15.176877,9.0,5.0,0,206.0,1521.0,16.015733,23.506895,19.904674,23.530572,0.0,0.0,505,620,505
3,2,6,21,1,46,0,107.471955,109.911613,17.0,5.0,0,94.0,613.0,20.589828,7.462428,22.043225,22.114144,0.1,0.0,860,1033,967
4,1,7,11,8,26,0,77.031023,122.03063,71.0,6.0,0,111.10592,799.232715,16.596518,16.598338,20.077378,20.082186,0.0,0.0,1183,1325,1260


**********************************************************************


Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,5,2,11,37,9,0,-8.873551,5.095986,36.0,6.0,0,107.0,666.0,10.96706,9.710944,19.084149,19.406076,0.0,0.0,430,565,421
1,5,7,15,9,26,0,-3.29995,9.942711,13.0,19.0,0,37.0,140.0,16.965748,12.867547,10.711139,15.425691,0.0,0.0,315,371,312
2,2,2,8,1,44,0,16.740571,14.01251,16.0,21.0,0,100.0,622.0,20.896133,18.037566,18.345945,19.817432,0.1,0.1,1034,1174,1051
3,7,4,24,9,9,0,-5.184916,-12.819327,17.0,5.0,0,23.0,83.0,15.011638,20.134413,17.988768,15.400768,0.5,0.0,990,1043,985
4,7,1,23,8,30,0,2.244583,10.112663,35.0,9.0,0,111.10592,997.0,10.216352,13.896234,20.077378,20.082186,0.0,0.0,766,930,768


In [12]:
# test model with using big data
X_train = df_train.drop(columns=['Arrival_Delay'])
X_test = df_test.drop(columns=['Arrival_Delay'])
y_train = df_train['Arrival_Delay']
y_test = df_test['Arrival_Delay']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1065015, 21) (1065015,)
(355006, 21) (355006,)


### Model Training

In [13]:
lgb_params = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8083100557602823,
 'importance_type': 'split',
 'learning_rate': 0.5590257573381373,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 6.329915255850705,
 'min_split_gain': 0.01593297353431914,
 'n_estimators': 1897,
 'n_jobs': -1,
 'num_leaves': 38,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.7801473610044566,
 'reg_lambda': 0.9168560003582829,
 'subsample': 0.560868988960792,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'nthread': 6.6516088270926605}


xgb_params = {'colsample_bytree': 0.9698294480528181,
 'gamma': 3.7604998424073166,
 'learning_rate': 0.2718006286831415,
 'max_depth': 8,
 'min_child_weight': 1,
 'n_estimators': 983,
 'reg_alpha': 146,
 'subsample': 0.7423469165690778,
 'random_state': 123,
 'n_jobs': -1}


cat_params = {'colsample_bylevel': 0.6312979928865734,
 'depth': 9,
 'l2_leaf_reg': 8.754132470161709,
 'learning_rate': 0.09396165335017637,
 'max_bin': 490.0,
 'random_strength': 0.07403229688797397,
 'subsample': 0.8801936829680834}


In [16]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Create individual regression models
xgb_reg = XGBRegressor(**xgb_params)
lgb_reg = LGBMRegressor(**lgb_params, silent=True)
cat_reg = CatBoostRegressor(**cat_params)

# create voting regressor
voting_reg = VotingRegressor(estimators=[
    ('lgb_reg', lgb_reg),
    ('xgb_reg', xgb_reg),
    ('cat_reg', cat_reg)
], n_jobs=-1)

voting_reg

In [17]:
# Fit the Voting Regressor on the training data
voting_reg.fit(X_train, y_train)

0:	learn: 51.6904560	total: 573ms	remaining: 9m 32s
1:	learn: 47.9734998	total: 1.06s	remaining: 8m 48s
2:	learn: 44.6726444	total: 1.5s	remaining: 8m 17s
3:	learn: 41.7506044	total: 1.89s	remaining: 7m 49s
4:	learn: 39.1665654	total: 2.3s	remaining: 7m 38s
5:	learn: 36.7489730	total: 2.73s	remaining: 7m 32s
6:	learn: 34.7024906	total: 3.19s	remaining: 7m 32s
7:	learn: 32.9062676	total: 3.6s	remaining: 7m 26s
8:	learn: 31.2440359	total: 4.07s	remaining: 7m 27s
9:	learn: 29.7922997	total: 4.5s	remaining: 7m 25s
10:	learn: 28.5579281	total: 4.94s	remaining: 7m 23s
11:	learn: 27.4603121	total: 5.37s	remaining: 7m 22s
12:	learn: 26.3606932	total: 5.84s	remaining: 7m 23s
13:	learn: 25.5179773	total: 6.31s	remaining: 7m 24s
14:	learn: 24.7896446	total: 6.74s	remaining: 7m 22s
15:	learn: 24.0013197	total: 7.19s	remaining: 7m 22s
16:	learn: 23.4303910	total: 7.66s	remaining: 7m 22s
17:	learn: 22.9207290	total: 8.11s	remaining: 7m 22s
18:	learn: 22.4694051	total: 8.58s	remaining: 7m 22s
19:	lea

In [18]:
# Make predictions
pred = voting_reg.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, pred)

print('MAPE :', mape)


MAPE : 6.459748982239579


In [19]:
print(mape)

6.459748982239579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.262959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2954
[LightGBM] [Info] Number of data points in the train set: 1065015, number of used features: 19
[LightGBM] [Info] Start training from score 7.495541


### Train model with all train data

In [27]:
X_train = train.drop(columns=['Arrival_Delay'])
y_train = train['Arrival_Delay']

X_train.shape, y_train.shape

((1420021, 21), (1420021,))

In [29]:
# Fit the Voting Regressor on the training data
voting_reg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.550008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2956
[LightGBM] [Info] Number of data points in the train set: 1420021, number of used features: 19
[LightGBM] [Info] Start training from score 7.504105
0:	learn: 51.8020548	total: 348ms	remaining: 5m 47s
1:	learn: 48.0674625	total: 626ms	remaining: 5m 12s
2:	learn: 44.7573132	total: 884ms	remaining: 4m 53s
3:	learn: 41.8285453	total: 1.13s	remaining: 4m 41s
4:	learn: 39.2425309	total: 1.4s	remaining: 4m 39s
5:	learn: 36.9072542	total: 1.69s	remaining: 4m 39s
6:	learn: 34.8682452	total: 1.98s	remaining: 4m 40s
7:	learn: 33.0031679	total: 2.25s	remaining: 4m 38s
8:	learn: 31.3733525	total: 2.52s	remaining: 4m 37s
9:	learn: 29.9247343	total: 2.78s	remaining: 4m 34s
10:	learn: 28.7031322	total: 3.04s	remaining: 4m 33s
11:	learn: 27.5964442	total: 3.31s	remaining: 4m 32s
12:	learn: 26.4731976	total: 3.

# Submission file

In [30]:
# Ensure that test dataset has the same columns and order as the training dataset
test = test[X_train.columns]
display(test.head())

# Make predictions using the reduced feature set
submission = voting_reg.predict(test)

# Create a submission dataframe
submission_df = pd.DataFrame(submission, index=test.index, columns=['Arrival_Delay'])

# Save the submission dataframe to a CSV file
submission_df.to_csv('submission_voting_reg.csv', index = False)

# Display the first few rows of the submission
submission_df.head()


Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468




Unnamed: 0,Arrival_Delay
0,-5.060428
1,7.310188
2,0.557999
3,17.268008
4,-12.657249
