In [None]:
import pandas as pd
import numpy as np
import random
import os
import gc

In [None]:
# Pin every source of randomness this notebook controls so reruns are repeatable
seed = 53


def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export seed-related env vars.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and written
        (as a string) to ``PYTHONHASHSEED``.

    Side effects
    ------------
    Mutates ``os.environ`` (``PYTHONHASHSEED``, ``TF_DETERMINISTIC_OPS``) and
    the global state of the ``random`` and ``numpy.random`` modules.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        # NOTE(review): presumably read by TensorFlow; no TensorFlow usage is
        # visible in this notebook - confirm it is still needed.
        'TF_DETERMINISTIC_OPS': "1",
    })
    random.seed(seed)
    np.random.seed(seed)


seed_everything(seed)

In [None]:
# Convert CSV files to Parquet format for faster data reading and processing
def csv_to_parquet(csv_path, save_name):
    """Read a CSV file and write it back out as ``./<save_name>.parquet``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read with ``pd.read_csv``.
    save_name : str
        Base name (no extension) of the Parquet file written to the current
        working directory.

    Returns
    -------
    None. Prints ``<save_name> Done.`` once the file has been written.
    """
    parquet_path = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(parquet_path)
    # Drop the only reference and force a collection so the frame's memory is
    # released before the caller loads the next file.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [None]:
# Convert the raw CSVs to Parquet once, then load train, test and the sample submission
for _csv_path, _parquet_name in (('open/train.csv', 'train'), ('open/test.csv', 'test')):
    csv_to_parquet(_csv_path, _parquet_name)

train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')
# The first CSV column becomes the index of the submission template
sample_submission = pd.read_csv('open/sample_submission.csv', index_col = 0)

In [None]:
# Drop 'Cancelled' and 'Diverted' columns from train and test datasets
unused_columns = ['Cancelled', 'Diverted']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# import necessary libraries and define numerical and categorical columns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso

num_col = ['Estimated_Departure_Time', 'Estimated_Arrival_Time']
cat_col = ['Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)']

# Iterative imputer with a Lasso estimator for the missing numerical values
imp = IterativeImputer(
    estimator=Lasso(),
    initial_strategy='most_frequent',
    missing_values=np.nan,
    max_iter=1000,
    imputation_order='roman',
    random_state=seed,
)

# Fit on train only and reuse the fitted imputer for test; the imputed values
# are rounded to whole numbers straight away.
# NOTE(review): a rounded imputed value is not guaranteed to be a valid clock
# time (the cells below split these columns with // 100 and % 100) - confirm.
train[num_col] = np.round(imp.fit_transform(train[num_col]))
test[num_col] = np.round(imp.transform(test[num_col]))

# Fill missing values in the categorical columns of train and test datasets using groupby and mode
def _group_mode(df, key, target):
    """Return, per row, the most frequent ``target`` value within the row's ``key`` group.

    The result is aligned with ``df``. Groups that contain no non-null
    ``target`` value yield NaN for their rows.
    """
    def _mode_or_nan(values):
        modes = values.mode()  # computed once per group
        return modes.iloc[0] if not modes.empty else np.nan

    return df.groupby([key])[target].transform(_mode_or_nan)


def _impute_categoricals(df):
    """Fill missing categorical values of ``df`` in place from related columns.

    Each frame is filled from its own group statistics. The order matters:
    'Airline' is filled first because the two carrier columns are then filled
    from the (already completed) 'Airline' groups.

    Returns the same ``df`` object for convenience.
    """
    df['Origin_State'] = df['Origin_State'].fillna(
        _group_mode(df, 'Origin_Airport', 'Origin_State'))
    df['Destination_State'] = df['Destination_State'].fillna(
        _group_mode(df, 'Destination_Airport', 'Destination_State'))

    # Fallback chain for the airline: DOT carrier id, then IATA code, then distance
    airline_guess = (
        _group_mode(df, 'Carrier_ID(DOT)', 'Airline')
        .fillna(_group_mode(df, 'Carrier_Code(IATA)', 'Airline'))
        .fillna(_group_mode(df, 'Distance', 'Airline'))
    )
    df['Airline'] = df['Airline'].fillna(airline_guess)

    df['Carrier_Code(IATA)'] = df['Carrier_Code(IATA)'].fillna(
        _group_mode(df, 'Airline', 'Carrier_Code(IATA)'))
    df['Carrier_ID(DOT)'] = df['Carrier_ID(DOT)'].fillna(
        _group_mode(df, 'Airline', 'Carrier_ID(DOT)'))
    return df


for _frame in (train, test):
    _impute_categoricals(_frame)

# Fill whatever is still missing in the categorical columns with the column mode.
# Assign the result back instead of calling fillna(inplace=True) on train[col]:
# the chained in-place form acts on a temporary object and leaves the frame
# unchanged when pandas Copy-on-Write is active.
for col in cat_col:
    if col in train.columns:
        train[col] = train[col].fillna(train[col].mode()[0])
        test[col] = test[col].fillna(test[col].mode()[0])

In [None]:
# Feature Engineering
from datetime import datetime
# Month and Day_of_Month carry no year, so weekdays are computed against a
# fixed reference year; 1900 is the year a '%m%d' parse falls back to.
# NOTE(review): the weekday features only reflect real weekdays if this
# matches the year the data was collected in - confirm and adjust.
_REFERENCE_YEAR = 1900


def _add_flight_features(df):
    """Add the engineered schedule/route features to ``df`` in place.

    The same transformation is applied to train and test so the two frames
    cannot drift apart. Expects the columns 'Estimated_Departure_Time',
    'Estimated_Arrival_Time' (clock times encoded as HHMM numbers, without
    missing values), 'Month', 'Day_of_Month', 'Distance', 'Origin_Airport',
    'Origin_State', 'Destination_Airport', 'Destination_State' and 'Airline'.

    Returns the same ``df`` object.
    """
    dep_time = df["Estimated_Departure_Time"]
    arr_time = df["Estimated_Arrival_Time"]
    # 1 where the arrival clock time is earlier than the departure clock time,
    # i.e. the flight is treated as arriving on the following day.
    overnight = (arr_time < dep_time).astype(int)

    # Split HHMM into hour and minute parts
    df["Dep_hour"] = (dep_time // 100).astype(int)
    df["Dep_minute"] = (dep_time % 100).astype(int)
    df["Arr_hour"] = (arr_time // 100).astype(int)
    df["Arr_minute"] = (arr_time % 100).astype(int)

    df['Dep_Time_of_Day'] = np.where(df['Dep_hour'] < 12, 'AM', 'PM')
    df['Arr_Time_of_Day'] = np.where(df['Arr_hour'] < 12, 'AM', 'PM')

    # Minutes since midnight
    df['Scheduled_Departure'] = df["Dep_hour"] * 60 + df["Dep_minute"]
    df['Scheduled_Arrival'] = df["Arr_hour"] * 60 + df["Arr_minute"]

    # NOTE(review): this flags days of the month whose remainder modulo 7 is
    # 0 or 6; it is not derived from an actual weekday. Kept unchanged because
    # a real weekend flag needs the true year (see _REFERENCE_YEAR).
    df['is_weekend'] = np.where((df['Day_of_Month'] % 7 == 0) | (df['Day_of_Month'] % 7 == 6), 1, 0)

    # Build the date from separate numeric fields. Concatenating the unpadded
    # month and day into one string (e.g. month 1 + day 11 -> "111") is
    # ambiguous under '%m%d' and gets read as month 11, day 1.
    departure_date = pd.to_datetime(pd.DataFrame({
        'year': _REFERENCE_YEAR,
        'month': df['Month'],
        'day': df['Day_of_Month'],
    }))
    df['Departure_weekday'] = departure_date.dt.weekday
    # Overnight flights land one weekday later than they depart
    df['Arrival_weekday'] = (df['Departure_weekday'] + overnight) % 7

    # NOTE(review): this subtracts HHMM-encoded values, so the result is not
    # a duration in minutes (1030 - 955 gives 75, not 35). Kept unchanged.
    df['Flight_duration'] = arr_time - dep_time

    df['Distance_sub'] = df['Distance'] - df['Flight_duration']

    # 'Date' uses the departure day, so it is built before Day_of_Month is shifted below
    df['Date'] = df['Month'].astype(str) + '-' + df['Day_of_Month'].astype(str)
    df['Origin_Location'] = df['Origin_Airport'] + ', ' + df['Origin_State']
    df['Route'] = df['Origin_Airport'] + '-' + df['Destination_Airport']

    df['Destination_Location'] = df['Destination_Airport'] + ', ' + df['Destination_State']
    # Move Day_of_Month to the arrival day for overnight flights (vectorised;
    # no row-wise apply). The value is not wrapped at month end.
    df['Day_of_Month'] = df['Day_of_Month'] + overnight
    df['Airline_Airport_Departure'] = df['Airline'] + '_' + df['Origin_Airport']
    df['Airline_Airport_Arrival'] = df['Airline'] + '_' + df['Destination_Airport']
    df['Same_State'] = (df['Origin_State'] == df['Destination_State']).astype(int)

    df['Distance_km'] = df['Distance'] * 1.60934

    # Cyclical encoding of the month
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

    df['Prev_month'] = np.where(df['Month'] == 1, 12, df['Month'] - 1)
    return df


train = _add_flight_features(train)
test = _add_flight_features(test)

In [None]:
# Label encoding for categorical variables
from sklearn.preprocessing import LabelEncoder
# Encode every object-typed column except the target ('Delay') and the row id ('ID')
object_cols = train.select_dtypes(include=['object']).columns
object_cols = object_cols.drop(['Delay', 'ID'])

for i in object_cols:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Labels that occur only in test get the codes following the train codes.
    # They are collected with a set lookup and appended in one go (in the
    # sorted order np.unique returns), instead of scanning and re-allocating
    # the classes array once per label.
    known_labels = set(le.classes_)
    unseen_labels = [label for label in np.unique(test[i]) if label not in known_labels]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, np.array(unseen_labels, dtype=object))
    test[i] = le.transform(test[i])

In [None]:
# Keep only training rows without any missing value; this also removes rows
# whose 'Delay' target is missing.
train = train.dropna(axis=0, how='any')

In [None]:
# Map each submission column name to its position in the sample submission
column_number = {column: position for position, column in enumerate(sample_submission.columns)}
    
def to_number(x, dic):
    """Return the value stored under key ``x`` in ``dic``.

    Raises ``KeyError`` when ``x`` is not a key of ``dic``.
    """
    number = dic[x]
    return number

# Numeric target: position of each 'Delay' label among the submission columns
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))

In [None]:
# Separate the numeric target from the features; 'ID' is never a feature
train_y = train['Delay_num']
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
test_x = test.drop(columns=['ID'])

In [None]:
# Feature creation using autofeat
from autofeat import AutoFeatClassifier
# Columns handed to autofeat; the same list is used for train and test
autofeat_inputs = [
    'Month', 'Day_of_Month', 'Estimated_Departure_Time', 'Estimated_Arrival_Time',
    'Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State',
    'Distance', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number',
]
train_xx = train_x[autofeat_inputs]
test_xx = test_x[autofeat_inputs]

# Fit the feature generator on train (with the target) and reuse it for test
auto = AutoFeatClassifier(n_jobs=-1)
X_train_feature_creation = auto.fit_transform(train_xx, train_y)
X_test_feature_creation = auto.transform(test_xx)

In [None]:
# Put the autofeat output next to the original columns that were not fed to autofeat
excluded_columns = [
    'Month', 'Day_of_Month', 'Estimated_Departure_Time', 'Estimated_Arrival_Time',
    'Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State',
    'Distance', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number',
]
included_columns = [name for name in train_x.columns if name not in excluded_columns]

# Reset the index of the remaining columns before the column-wise concat.
# NOTE(review): this assumes the autofeat frames carry a fresh 0..n-1 index - confirm.
tr = train_x[included_columns].reset_index(drop=True)
te = test_x[included_columns].reset_index(drop=True)

train_x = pd.concat([X_train_feature_creation, tr], axis=1)
test_x = pd.concat([X_test_feature_creation, te], axis=1)

In [None]:
# Final feature set, in the column order handed to the scaler and the model.
# The names written as expressions (e.g. 'Day_of_Month**3/Month') are
# presumably the columns generated by the autofeat step - TODO confirm they
# still match its output; any name missing from the frames makes the
# selections below raise a KeyError.
# NOTE(review): 'Origin_State' and 'Departure_weekday' are not part of this
# list - confirm that is intentional.
select_col = ['Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Origin_Airport', 'Destination_Airport',
       'Destination_State', 'Distance', 'Airline', 'Carrier_Code(IATA)',
       'Tail_Number', 'Day_of_Month**3/Month', 'Month**2*sqrt(Tail_Number)',
       'Estimated_Departure_Time**3*Month**3',
       'sqrt(Airline)*Estimated_Arrival_Time**2',
       'sqrt(Estimated_Departure_Time)*log(Distance)',
       'Carrier_CodeIATA**3*Estimated_Departure_Time**2',
       'log(Estimated_Arrival_Time)/Estimated_Arrival_Time',
       'Estimated_Arrival_Time**3/Estimated_Departure_Time',
       'Origin_Airport**2*Origin_State**3',
       'sqrt(Carrier_CodeIATA)*Origin_State**3',
       'Estimated_Departure_Time*sqrt(Origin_State)',
       'sqrt(Destination_State)*Estimated_Departure_Time',
       'log(Estimated_Departure_Time)/Estimated_Departure_Time',
       'Airline**3*Origin_State**3', 'Airline/Estimated_Departure_Time',
       'Estimated_Arrival_Time**2*sqrt(Origin_State)',
       '1/(Day_of_Month*Month)', 'log(Day_of_Month)/Day_of_Month',
       'sqrt(Distance)*Estimated_Arrival_Time**2',
       'sqrt(Destination_State)*Estimated_Arrival_Time**2',
       'Estimated_Arrival_Time**3*Estimated_Departure_Time**3',
       'Distance**3*Tail_Number**3', 'Estimated_Departure_Time**3/Month',
       'Origin_Airport_ID', 'Destination_Airport_ID', 'Carrier_ID(DOT)',
       'Dep_hour', 'Dep_minute', 'Arr_hour', 'Arr_minute', 'Dep_Time_of_Day',
       'Arr_Time_of_Day', 'Scheduled_Departure', 'Scheduled_Arrival',
       'is_weekend', 'Arrival_weekday', 'Flight_duration', 'Distance_sub',
       'Date', 'Origin_Location', 'Route', 'Destination_Location',
       'Airline_Airport_Departure', 'Airline_Airport_Arrival', 'Same_State',
       'Distance_km', 'Month_sin', 'Month_cos', 'Prev_month']
# Restrict both frames to exactly these columns, in this order
train_x = train_x[select_col]
test_x = test_x[select_col]

In [None]:
# Split data into train and validation sets with stratification
from sklearn.model_selection import train_test_split, StratifiedKFold
# 80/20 hold-out split that keeps the class proportions of the target
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    stratify=train_y,
    shuffle=True,
    random_state=seed,
)
# 5-fold stratified splitter built with the same seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=seed)

In [None]:
# Standardize the data using StandardScaler
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training split only, then apply the
# same fitted scaler to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
test_x = scaler.transform(test_x)

In [None]:
# Train a CatBoost classifier with hyperparameters and class weights
from catboost import CatBoostClassifier
# NOTE(review): the weights cover classes 0 and 1 only, which assumes a
# two-class target - confirm against the sample submission columns.
cat = CatBoostClassifier(
    n_estimators=8397,
    random_state=seed,
    class_weights={0: 0.14, 1: 0.86},
)

# The hold-out split is passed as the evaluation set, with an early-stopping
# patience of 10 rounds.
validation_sets = [(X_val, y_val)]
cat.fit(X_train, y_train, eval_set=validation_sets, early_stopping_rounds=10)

# Class probabilities for the test rows
y_pred = cat.predict_proba(test_x)

In [None]:
# Write the predicted probabilities using the sample submission's index and columns
sample_submission = pd.read_csv('open/sample_submission.csv', index_col = 0)
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('baseline_submission.csv', index=True)