# Install libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# score metrics and splitting libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# extreme algorithms
from lightgbm import LGBMRegressor

# warning turn off
import warnings
warnings.filterwarnings('ignore')

In [2]:
#set display max column to see all column
pd.set_option("display.max_columns", None)
pd.get_option("display.max_columns")

#set display max column to see all column
pd.set_option("display.max_rows", 150)
pd.get_option("display.max_rows")

150

# Load Datasets

In [9]:
# load datasets

train = pd.read_csv('/kaggle/input/raw-delays-data/delays_train.csv')
test = pd.read_csv('/kaggle/input/raw-delays-data/delays_test.csv')

In [10]:
selected_features = ['Weekday', 'Month_of_Year', 'Day_of_Month', 'Scheduled_Departure_Time', 'Scheduled_Arrival_Time', 'Departure_State',
                     'Arrival_State', 'Flight_Cancelled', 'Departure_Delay', 'Arrival_Delay', 'Taxi_Out_Time', 'Taxi_In_Time', 'Flight_Diverted',
                     'Actual_Departure_Time', 'Flight_Duration', 'Flight_Distance', 'Origin_Temperature', 'Destination_Temperature',
                     'Origin_Wind_Speed', 'Destination_Wind_Speed', 'Origin_Precipitation', 'Destination_Precipitation']

df_train = train[selected_features].copy()

# select the features for test data
df_test = test[list(set(selected_features)^set(['Arrival_Delay']))].copy()

df_train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Scheduled_Departure_Time,Scheduled_Arrival_Time,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Actual_Departure_Time,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation
0,6,6,25,1222,1444,GA,NY,False,,,16.0,10.0,False,1224.0,107.0,760.0,25.14022,19.357739,21.019808,15.452723,0.0,0.0
1,2,6,21,1216,1304,AZ,AZ,False,0.088687,-4.178483,16.0,5.0,False,1216.0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0
2,3,1,5,1945,2055,MN,WI,False,,5.042185,34.0,5.0,False,1945.0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1
3,2,3,22,700,924,FL,TX,False,-1.802698,-0.206932,10.0,11.0,False,658.0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0
4,4,7,14,2130,2359,WA,NV,False,,14.006092,23.0,7.0,False,2133.0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0


In [11]:
df_train.dropna(subset = ['Arrival_Delay'], inplace = True)
df_train.reset_index(drop = True, inplace = True)

### Data preprocessing

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# categorical column imputing
def label_encoder(df, column):
  lb = LabelEncoder()

  # fit and transform the column
  df[column] = lb.fit_transform(df[column].astype(str))

  # replace missing values with the most frequent value
  most_frequent = df[column].mode()[0]
  df[column].fillna(most_frequent, inplace=True)

  return df

# numerical column imputing
def numeric_imputer(df, column, strategy = 'mean'):
  # define the imputer
  mean_imputer = SimpleImputer(strategy = strategy)

  # apply imputer to the column
  df[[column]] = mean_imputer.fit_transform(df[[column]])

  return df

# all in one
def data_imputer(df):

  # get all column that contains missng value
  missing_value_columns = df.columns[df.isnull().sum() > 0]

  # numeric columns that contain missing value
  numerical_columns = df[missing_value_columns].select_dtypes(include = 'number').columns

  # apply to each numerical column
  for column in numerical_columns:
    df = numeric_imputer(df, column, strategy = 'mean')

  print(f'Numerical imputing is finished, encoded columns: {list(numerical_columns)}')

  # categorical column that cointans missing values
  categorical_columns = df.select_dtypes(include = 'object').columns

  # apply to each numerical column
  for column in categorical_columns:
    df = label_encoder(df, column)
  print(f'Categorical encoding is finished, encoded columns: {list(categorical_columns)}')

  # boolen columns encoding
  boolen_columns = df.select_dtypes(include = 'bool').columns

  for column in boolen_columns:
    df[column] = df[column].astype(int)

  print(f'Boolen imputing is finished, encoded columns: {list(boolen_columns)}')
  return df


# let's apply imputer to the dataset
cleaned_train = data_imputer(df_train)
cleaned_test = data_imputer(df_test)


# print missing values for each dataset
display(cleaned_train.isnull().sum())

print('\n\n')

display(cleaned_test.isnull().sum())

Numerical imputing is finished, encoded columns: ['Departure_Delay', 'Flight_Duration', 'Flight_Distance', 'Origin_Temperature', 'Destination_Temperature', 'Origin_Wind_Speed', 'Destination_Wind_Speed']
Categorical encoding is finished, encoded columns: ['Departure_State', 'Arrival_State']
Boolen imputing is finished, encoded columns: ['Flight_Cancelled', 'Flight_Diverted']
Numerical imputing is finished, encoded columns: ['Flight_Duration', 'Destination_Wind_Speed', 'Flight_Distance', 'Actual_Departure_Time', 'Destination_Temperature', 'Origin_Wind_Speed', 'Taxi_In_Time', 'Taxi_Out_Time', 'Origin_Temperature', 'Departure_Delay']
Categorical encoding is finished, encoded columns: ['Departure_State', 'Arrival_State']
Boolen imputing is finished, encoded columns: ['Flight_Diverted', 'Flight_Cancelled']


Weekday                      0
Month_of_Year                0
Day_of_Month                 0
Scheduled_Departure_Time     0
Scheduled_Arrival_Time       0
Departure_State              0
Arrival_State                0
Flight_Cancelled             0
Departure_Delay              0
Arrival_Delay                0
Taxi_Out_Time                0
Taxi_In_Time                 0
Flight_Diverted              0
Actual_Departure_Time        0
Flight_Duration              0
Flight_Distance              0
Origin_Temperature           0
Destination_Temperature      0
Origin_Wind_Speed            0
Destination_Wind_Speed       0
Origin_Precipitation         0
Destination_Precipitation    0
dtype: int64






Flight_Duration              0
Destination_Precipitation    0
Weekday                      0
Destination_Wind_Speed       0
Flight_Distance              0
Actual_Departure_Time        0
Flight_Diverted              0
Departure_State              0
Destination_Temperature      0
Origin_Wind_Speed            0
Origin_Precipitation         0
Taxi_In_Time                 0
Scheduled_Departure_Time     0
Month_of_Year                0
Taxi_Out_Time                0
Origin_Temperature           0
Scheduled_Arrival_Time       0
Departure_Delay              0
Flight_Cancelled             0
Arrival_State                0
Day_of_Month                 0
dtype: int64

In [13]:
# changing the format of time to minute (minutes from 00:00):
def hhmm_to_minutes(time):
    hh = time // 100
    mm = time % 100
    return int(hh * 60 + mm)

# apply conversion function to multiple time columns
def convert_time_columns(df, time_columns):
    for column in time_columns:
        df[f'{column}_Minutes'] = df[column].apply(hhmm_to_minutes)
    return df

# list of time columns to convert
time_columns = ['Scheduled_Departure_Time', 'Scheduled_Arrival_Time', 'Actual_Departure_Time', ]

# Convert time columns
cleaned_train = convert_time_columns(cleaned_train, time_columns)
cleaned_test = convert_time_columns(cleaned_test, time_columns)

# drop time columns
cleaned_train.drop(time_columns, axis = 1, inplace = True)
cleaned_test.drop(time_columns, axis = 1, inplace = True)

# save datasets

# path = '...final_Project/'

# cleaned_train.to_csv(path + 'cleaned_train.csv', index = False)
# cleaned_test.to_csv(path + 'cleaned_test.csv', index = False)


cleaned_train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


### Load cleaned data

In [14]:
# load datasets

train = cleaned_train.copy()
test = cleaned_train.copy()

train.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,-4.178483,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,5.042185,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,-0.206932,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,14.006092,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,-14.067374,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


In [15]:
# split train to train set and validation sets, use 75 - 25 % approach
# Shuffle the indices
indices = np.arange(len(train))
np.random.shuffle(indices)

# Define the split ratio
split_ratio = 0.75

# Calculate split indices
split_index = int(len(train) * split_ratio)

# Split the data
df_train = train.iloc[indices[:split_index], :]
df_test = train.iloc[indices[split_index:], :]

df_train.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

print(f'Train size : {df_train.shape}')
print(f'Test size : {df_test.shape}')

Train size : (1065015, 22)
Test size : (355006, 22)


In [16]:
display(df_train.head())

print('*' * 70)
df_test.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,1,2,28,4,2,0,-2.98733,-8.162826,20.0,8.0,0,158.0,1371.0,21.407384,12.555189,26.225702,23.074144,0.0,0.5,570,881,567
1,6,5,21,5,45,0,68.986457,61.816009,10.0,5.0,0,68.0,799.232715,16.596518,16.598338,15.277769,16.515652,0.0,0.0,975,1065,1044
2,3,3,30,22,44,0,19.777909,52.363682,58.0,12.0,0,132.0,852.0,15.303529,12.877064,18.316268,25.597966,0.0,0.0,390,560,410
3,2,6,21,4,33,0,-4.124476,-14.92002,24.0,23.0,0,287.0,2446.0,18.419125,23.209793,29.895637,30.288414,0.1,2.0,420,945,416
4,7,3,6,14,33,0,13.027852,28.01607,10.0,13.0,0,96.0,665.0,21.561212,30.655706,18.761153,17.78651,0.0,0.0,753,883,792


**********************************************************************


Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Arrival_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,5,3,4,14,9,0,-5.00905,-16.025551,20.0,6.0,0,80.0,566.0,16.779974,8.913649,22.460723,18.944829,1.0,0.0,755,872,750
1,3,4,27,18,37,0,-8.49493,-19.018379,18.0,8.0,0,77.0,496.0,29.281752,21.650015,18.304873,22.187711,0.0,0.0,690,804,682
2,7,5,15,13,4,0,53.208723,24.924919,16.0,5.0,0,226.0,1726.0,24.907481,11.994345,22.024953,24.999159,0.5,0.0,1188,1343,1241
3,2,5,10,46,33,0,-1.834496,8.172292,8.0,38.0,0,53.0,799.232715,16.596518,16.598338,23.060805,19.254444,0.1,0.0,406,495,404
4,3,5,18,33,46,0,25.739172,14.060597,25.0,4.0,0,49.0,229.0,13.311837,16.269236,17.558692,20.642966,0.0,0.0,880,970,906


In [17]:
# test model with using big data
X_train = df_train.drop(columns=['Arrival_Delay'])
X_test = df_test.drop(columns=['Arrival_Delay'])
y_train = df_train['Arrival_Delay']
y_test = df_test['Arrival_Delay']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1065015, 21) (1065015,)
(355006, 21) (355006,)


### Model Training

In [18]:
best_param = {'boosting_type': 'gbdt',
 'colsample_bytree': 0.8083100557602823,
 'importance_type': 'split',
 'learning_rate': 0.5590257573381373,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 6.329915255850705,
 'min_split_gain': 0.01593297353431914,
 'n_estimators': 1897,
 'n_jobs': -1,
 'num_leaves': 38,
 'random_state': 123,
 'reg_alpha': 0.7801473610044566,
 'reg_lambda': 0.9168560003582829,
 'subsample': 0.560868988960792,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'nthread': 6.6516088270926605}



In [19]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Create the XGBRegressor model with the best parameters
lgb_reg = LGBMRegressor(**best_param)

# Train the model
lgb_reg.fit(X_train, y_train)

# Make predictions
pred = lgb_reg.predict(X_test)

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, pred)

print('MAPE :', mape)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.195724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2949
[LightGBM] [Info] Number of data points in the train set: 1065015, number of used features: 19
[LightGBM] [Info] Start training from score 7.557240
MAPE : 6.76218255896612


In [23]:
# train model with all trainset
X_train = train.drop(columns=['Arrival_Delay'])
y_train = train['Arrival_Delay']

X_train.shape, y_train.shape

((1420021, 21), (1420021,))

In [24]:
# Train the model
lgb_reg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.661414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2954
[LightGBM] [Info] Number of data points in the train set: 1420021, number of used features: 19
[LightGBM] [Info] Start training from score 7.504105


# Submission file

In [25]:
# Ensure that test dataset has the same columns and order as the training dataset
test = test[X_train.columns]
display(test.head())

# Make predictions using the reduced feature set
submission = lgb_reg.predict(test)

# Create a submission dataframe
submission_df = pd.DataFrame(submission, index=test.index, columns=['Arrival_Delay'])

# Save the submission dataframe to a CSV file
submission_df.to_csv('submission_lgb.csv', index = False)

# Display the first few rows of the submission
submission_df.head()


Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Departure_State,Arrival_State,Flight_Cancelled,Departure_Delay,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation,Scheduled_Departure_Time_Minutes,Scheduled_Arrival_Time_Minutes,Actual_Departure_Time_Minutes
0,2,6,21,3,3,0,0.088687,16.0,5.0,0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0,736,784,736
1,3,1,5,22,50,0,13.027852,34.0,5.0,0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1,1185,1255,1185
2,2,3,22,8,44,0,-1.802698,10.0,11.0,0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0,420,564,418
3,4,7,14,49,32,0,13.027852,23.0,7.0,0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0,1290,1439,1293
4,1,5,30,4,4,0,-2.181755,10.0,5.0,0,58.0,417.0,9.360464,15.977111,19.001179,16.88496,0.0,0.0,470,555,468


Unnamed: 0,Arrival_Delay
0,-3.259759
1,6.957463
2,-2.180706
3,17.311539
4,-11.863487
