In [1]:
!pip install catboost==1.2.7
# Install missing xgboost package
!pip install xgboost
!pip install lightgbm


Collecting catboost==1.2.7
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hColl

In [2]:

# Import LightGBM and XGBoost
import lightgbm as lgb
import xgboost as xgb
import catboost

# Restore the trained LightGBM booster from its text dump.
lgb_model = lgb.Booster(model_file='lightgbm/lightgbm_model.txt')

# Restore the trained XGBoost booster from its JSON dump.
xgb_model = xgb.Booster(model_file='xg_boost/xgboost_model.json')

# Restore the CatBoost classifier; load_model fills the instance it is called on.
catboost_model = catboost.CatBoostClassifier()
catboost_model.load_model('catboost_info/best_model.cbm')

In [3]:
import pandas as pd

# Read the prepared dataset from Parquet and preview its first five rows.
df = pd.read_parquet(path='data.parquet')
df.head(5)

Unnamed: 0,Airline Code,Aircraft Registration,Operator,Type Code,Mode S,Serial Number,Age(years),FROM,TO,Arrival_Delayed,...,STD_UTC_day_of_year,STD_UTC_week_of_year,STD_UTC_weekday,STD_UTC_hour_of_day,STA_UTC_time_of_day_cosine,STA_UTC_time_of_year_cosine,STA_UTC_day_of_year,STA_UTC_week_of_year,STA_UTC_weekday,STA_UTC_hour_of_day
0,3u-csc,b-30cr,Sichuan Airlines,A20N,781848,8873.0,5.0,YIN,TFU,True,...,244,35,5,3,0.021815,-0.492533,244.0,35.0,5.0,5.0
1,3u-csc,b-30cr,Sichuan Airlines,A20N,781848,8873.0,5.0,TFU,YIN,False,...,243,35,4,20,0.707107,-0.492533,244.0,35.0,5.0,3.0
2,3u-csc,b-30cr,Sichuan Airlines,A20N,781848,8873.0,5.0,URC,TFU,True,...,243,35,4,17,0.362438,-0.50743,243.0,35.0,4.0,19.0
3,3u-csc,b-30cr,Sichuan Airlines,A20N,781848,8873.0,5.0,TFU,URC,False,...,243,35,4,3,-0.642788,-0.50743,243.0,35.0,4.0,8.0
4,3u-csc,b-30cr,Sichuan Airlines,A20N,781848,8873.0,5.0,ZHA,TFU,False,...,242,35,3,23,0.854912,-0.50743,243.0,35.0,4.0,2.0


In [4]:
# Remove 'Arrival_Delayed' and every column whose name begins with ATA or ATD.
# NOTE(review): presumably these hold actual arrival/departure information that
# would not be known at prediction time -- confirm against the data dictionary.
columns_to_drop = [
    col
    for col in df.columns
    if col == 'Arrival_Delayed' or col.startswith(('ATA', 'ATD'))
]
df.drop(columns=columns_to_drop, inplace=True)

In [5]:
# Lift pandas' row/column display limits so the dtype listing below is not truncated.
# These options are global: they stay in effect for every later cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df.dtypes

Airline Code                   category
Aircraft Registration          category
Operator                       category
Type Code                      category
Mode S                         category
Serial Number                  category
Age(years)                      float32
FROM                           category
TO                             category
Departure_Status               category
STD_temp_scaled                 float64
STD_dwpt_scaled                 float64
STD_rhum_scaled                 float64
STD_prcp_scaled                 float64
STD_snow_scaled                 float64
STD_wdir_scaled                 float64
STD_wspd_scaled                 float64
STD_wpgt_scaled                 float64
STD_pres_scaled                 float64
STD_tsun_scaled                 float64
STD_coco_scaled                 float64
STA_temp_scaled                 float64
STA_dwpt_scaled                 float64
STA_rhum_scaled                 float64
STA_prcp_scaled                 float64


In [6]:
# Halve the numeric footprint: float64 -> float32 and int64 -> int32.
float_cols = df.select_dtypes(include='float64').columns
int_cols = df.select_dtypes(include='int64').columns
for narrow_dtype, wide_cols in (('float32', float_cols), ('int32', int_cols)):
    # Whole-column assignment replaces the columns with their downcast versions.
    df[wide_cols] = df[wide_cols].astype(narrow_dtype)

In [7]:
# Target: the integer category codes of 'Departure_Status'.
y = df["Departure_Status"].astype('category').cat.codes
# Predictors: every remaining column.
X = df.drop(columns=["Departure_Status"])
# Names of the categorical predictor columns, in column order.
catFeat = list(X.select_dtypes(include='category').columns)
catFeat

['Airline Code',
 'Aircraft Registration',
 'Operator',
 'Type Code',
 'Mode S',
 'Serial Number',
 'FROM',
 'TO']

In [8]:
from sklearn.model_selection import train_test_split

# First split: hold out 40% of the rows (xTemp); the remaining 60% become xTrain.
# X.index is passed as a third array so the row labels of each part are returned too.
xTrain, xTemp, yTrain, yTemp, idxTrain, idxTemp = train_test_split(
    X, y, X.index, test_size=0.4, random_state=42
)
# Second split: divide the held-out 40% in half. Note the naming: the first returned
# part is called xTest and the second (the test_size=0.5 portion) is called xVal.
# NOTE(review): assumes the base models were trained with this same split and seed,
# so that xVal/xTest rows were unseen by them -- TODO confirm.
xTest, xVal, yTest, yVal, idxTest, idxVal = train_test_split(
    xTemp, yTemp, idxTemp, test_size=0.5, random_state=42
)

# Display the shapes of the three parts as the cell output.
xTrain.shape, xTest.shape, xVal.shape


((7001872, 43), (2333957, 43), (2333958, 43))

In [9]:
from catboost import Pool
import numpy as np
## Prediction on everything
# LightGBM: predict on all of X, then keep, per row, the column index of the
# largest value in the returned 2-D prediction array.
lgb_preds_all = np.argmax(lgb_model.predict(X), axis=1)
print("LightGBM predictions on all data completed.")
print(lgb_preds_all.shape)




LightGBM predictions on all data completed.
(11669787,)


In [10]:
# Keep an independent copy of X (categorical dtypes intact) for the XGBoost step.
X_encoded = X.copy()

# CatBoost receives the categorical features as strings. Build that view on a
# shallow copy instead of overwriting X: the previous version converted X's own
# columns, so any earlier cell that relies on X's categorical dtypes could not be
# re-run afterwards. Assigning a whole column replaces it in the copy only; the
# untouched numeric columns are shared with X, so no extra memory is spent on them.
X_catboost = X.copy(deep=False)
for col in catFeat:
    X_catboost[col] = X_catboost[col].astype(str).fillna("nan")

# Wrap the frame in a Pool that declares the categorical columns by name, then predict.
pool_all = Pool(X_catboost, cat_features=catFeat)
catboost_preds_all = catboost_model.predict(pool_all)
del X_catboost  # only needed to build the Pool

print("CatBoost predictions on all data completed.")
print(catboost_preds_all.shape)


CatBoost predictions on all data completed.
(11669787, 1)


In [11]:
# For XGBoost, replace every categorical column with its integer category codes.
# Columns are selected by dtype rather than from a hand-written list, so the set
# cannot drift from the data, and re-running the cell is a no-op (columns that are
# already encoded are no longer categorical) instead of an AttributeError on `.cat`.
# NOTE(review): assumes the XGBoost model was trained on codes produced from the
# same category ordering -- TODO confirm.
for col in X_encoded.select_dtypes(include='category').columns:
    X_encoded[col] = X_encoded[col].cat.codes

# Create an xgb.DMatrix from the encoded data and predict
xgb_preds_all = xgb_model.predict(xgb.DMatrix(X_encoded))

print("XGBoost predictions on all data completed.")
print(xgb_preds_all.shape)


XGBoost predictions on all data completed.
(11669787,)


In [12]:
import os

# Stack the three base-model predictions column-wise to form the meta-features.
meta_all = np.column_stack([lgb_preds_all, xgb_preds_all, catboost_preds_all])

# One row per row of X (same index), one column per base model.
meta_df = pd.DataFrame(meta_all, index=X.index, columns=['lgb_pred', 'xgb_pred', 'catboost_pred'])

# Attach the true target; y.values is positional, so y must be in the same row order as X.
meta_df['y'] = y.values

# Save to Parquet, creating the output directory first so a fresh checkout does not fail.
meta_path = "3base_model/3stacked_base_model_prediction.parquet"
os.makedirs(os.path.dirname(meta_path), exist_ok=True)
meta_df.to_parquet(meta_path, index=True)

# Report the path that was actually written (the old message named a different file).
print(f"Meta predictions with target variable saved to '{meta_path}'")

Meta predictions with target variable saved to 'meta_predictions.parquet'


In [13]:
## free up memory
# Unbind the intermediates that are not referenced again in this cell.
# xVal and xTest are deliberately not listed here.
del (
    xTemp, yTemp,
    X, y,
    xTrain, yTrain,
    yTest, yVal,
    lgb_preds_all, xgb_preds_all, catboost_preds_all,
    meta_all,
)

In [14]:

# Select the validation and test rows of meta_df by the row labels of xVal / xTest;
# the two frames are only needed for their indices and are released right after.
meta_val = meta_df.loc[xVal.index]
meta_test = meta_df.loc[xTest.index]
del xVal, xTest

# Separate the base-model predictions (features) from the true label column 'y'.
meta_val_y = meta_val["y"]
meta_val_features = meta_val.drop(columns="y")
meta_test_y = meta_test["y"]
meta_test_features = meta_test.drop(columns="y")

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings
import numpy as np


# Hyperparameter grid for the logistic-regression meta-model.
# Only 'liblinear' is searched, which supports both the 'l1' and 'l2' penalties,
# so every penalty/solver combination in the grid is valid.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [300],
    'tol': [1e-4, 1e-3],
    'class_weight': [None, 'balanced'],
    'fit_intercept': [True, False]
}

# 5-fold grid search on the validation-set meta-features.
# random_state is fixed because liblinear shuffles the data; without a seed the
# selected parameters and scores can differ from run to run. 42 matches the seed
# used for the train/validation/test split.
# NOTE(review): candidates are ranked by plain accuracy; if the classes are
# imbalanced a class-aware scorer may be more appropriate -- confirm intent.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=4,
)

grid_search.fit(meta_val_features, meta_val_y)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Keep the best estimator (refit on all of the validation meta-features) as the meta-model.
meta_model = grid_search.best_estimator_

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best Parameters: {'C': 0.01, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 300, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.0001}
Best Accuracy Score: 0.9239519306419993


In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Score the meta-model on the held-out test meta-features.
y_pred_meta = meta_model.predict(meta_test_features)
accuracy = accuracy_score(y_true=meta_test_y, y_pred=y_pred_meta)
cm = confusion_matrix(y_true=meta_test_y, y_pred=y_pred_meta)
cr = classification_report(y_true=meta_test_y, y_pred=y_pred_meta)

In [17]:
# Print accuracy, confusion matrix and classification report, one after another.
print(accuracy, cm, cr, sep='\n')

0.9240131673377016
[[   8237    2002     116]
 [     31 2101550   20866]
 [     75  154260   46820]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.88     10355
           1       0.93      0.99      0.96   2122447
           2       0.69      0.23      0.35    201155

    accuracy                           0.92   2333957
   macro avg       0.87      0.67      0.73   2333957
weighted avg       0.91      0.92      0.91   2333957



In [18]:
import joblib

# Persist the fitted meta-model; joblib.dump's return value is shown as the cell output.
joblib.dump(value=meta_model, filename='3base_model/3stacked_meta_model.pkl')


['logistic_regression_meta_model.pkl']

In [3]:
import joblib
import pandas as pd
import numpy as np
# Reload the persisted meta-model and the stacked base-model predictions.
# joblib.load unpickles the file, so only load artifacts this notebook produced.
meta_model = joblib.load('3base_model/3stacked_meta_model.pkl')
meta_df = pd.read_parquet('3base_model/3stacked_base_model_prediction.parquet')

# Split off the true label, predict from the remaining columns for every row,
# and store the result next to the inputs.
meta_y = meta_df['y']
meta_X = meta_df.drop(columns='y')
all_preds = meta_model.predict(meta_X)
meta_df['final_pred'] = all_preds

meta_df.to_parquet('3base_model/3stacked_final_meta_predictions.parquet', index=True)

In [5]:
# Read back the final predictions from the location the previous cell wrote them to.
# The old path omitted the '3base_model/' directory, so it only worked if a stale
# copy of the file happened to exist in the working directory.
new_df = pd.read_parquet('3base_model/3stacked_final_meta_predictions.parquet')
new_df.head()

Unnamed: 0,lgb_pred,xgb_pred,catboost_pred,y,final_pred
0,1.0,1.0,1.0,1,1
1,1.0,1.0,1.0,1,1
2,1.0,1.0,1.0,1,1
3,1.0,1.0,1.0,1,1
4,1.0,1.0,1.0,1,1


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b1bc7aa3-8640-4158-99bf-4ecfc85b064d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>