In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
import lightgbm as lgb
import pickle
import os

In [2]:
flight_data = (
    pd.read_csv('final_dataset_cleaned_v1.csv')
)

In [3]:
flight_data.shape

(12658696, 29)

In [4]:
flight_data.head(n = 3)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,...,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,target
0,2020,1,1,1,3,2020-01-01,N951WN,ONT,"Ontario, CA",SFO,...,2020-01-01 18:00:00,12.2,-7.3,25.0,0.0,330.0,18.4,1017.3,3.0,1
1,2020,1,1,1,3,2020-01-01,N467WN,ONT,"Ontario, CA",SFO,...,2020-01-01 11:00:00,3.9,-3.4,59.0,0.0,210.0,5.4,1017.6,2.0,0
2,2020,1,1,1,3,2020-01-01,N7885A,ONT,"Ontario, CA",SJC,...,2020-01-01 20:00:00,12.8,-7.3,24.0,0.0,290.0,11.2,1015.8,2.0,0


In [5]:
flight_data['aircraft_type_new'] = [i.split('-')[0] for i in flight_data['Aircraft type']]

In [6]:
# list of numerical and categorical features to select 
l_numeric_cols = [ 'DEP_HOUR', 'temp', 'dwpt', 'rhum', 'prcp', 'wspd', 'pres', 'coco']
l_categorical_cols = ['QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'aircraft_type_new']
target = "target"

In [7]:
# converting the categorical columns to categorical
for col in l_categorical_cols:
    flight_data[col] = flight_data[col].astype('category')

In [8]:
flight_data = flight_data[l_numeric_cols + l_categorical_cols + [target] + ['YEAR']]

In [9]:
train = flight_data[flight_data['YEAR'] == 2019]
test = flight_data[flight_data['YEAR'] == 2020]

In [10]:
del flight_data

In [11]:
parameters = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':4,
            "random_seed":42,
}

classifier = lgb.LGBMClassifier(**parameters)
classifier.fit(
    train.drop(["YEAR","target"], axis=1),
     train[target],
    categorical_feature=l_categorical_cols,
    verbose=1
     )



LGBMClassifier(num_class=4, objective='multiclass', random_seed=42)

In [12]:
path = "./models"
filename = "2019train_LGBM.mdl"

In [13]:
with open(os.path.join(path, filename), "wb") as f:
    pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
classifier = pickle.load(open("./models/2019train_LGBM.mdl",'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [3]:
importances_v1 = pd.DataFrame(
  pd.Series(dict(zip(classifier.feature_name_, classifier.feature_importances_)))
).assign(
  normalized=lambda df: df / df.sum()
).sort_values(
  by="normalized", ascending=False
)

In [5]:
importances_v1[["normalized"]]

Unnamed: 0,normalized
DAY_OF_MONTH,0.285917
aircraft_type_new,0.22025
MONTH,0.143
DEP_HOUR,0.114167
DAY_OF_WEEK,0.071167
pres,0.046917
dwpt,0.034667
temp,0.026083
rhum,0.018167
coco,0.0165


In [14]:
# predicting the test results 
y_pred = classifier.predict(test.drop(["YEAR","target"], axis=1))

In [15]:
print(confusion_matrix(test[target],y_pred))

[[4303520      49       6     808]
 [ 276425      30      54     549]
 [  84659      16     130     661]
 [  52782      22     112    1410]]


In [16]:
print(classification_report(test[target],y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95   4304383
           1       0.26      0.00      0.00    277058
           2       0.43      0.00      0.00     85466
           3       0.41      0.03      0.05     54326

    accuracy                           0.91   4721233
   macro avg       0.50      0.26      0.25   4721233
weighted avg       0.86      0.91      0.87   4721233



In [17]:
print("Testing  set score for model: %f" % classifier.score(test.drop(["YEAR","target"], axis=1)  , test[target] ))

Testing  set score for model: 0.911857


In [18]:
##### balanced class weight #####
parameters_b = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':4,
            "random_seed":42,
            "class_weight":"balanced"
}

classifier_b = lgb.LGBMClassifier(**parameters_b)
classifier_b.fit(
    train.drop(["YEAR","target"], axis=1),
     train[target],
    categorical_feature=l_categorical_cols,
    verbose=1
     )



LGBMClassifier(class_weight='balanced', num_class=4, objective='multiclass',
               random_seed=42)

In [19]:
path = "./models"
filename = "2019train_LGBM_B.mdl"

In [20]:
with open(os.path.join(path, filename), "wb") as f:
    pickle.dump(classifier_b, f, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
classifier_b = pickle.load(open("./models/2019train_LGBM_B.mdl",'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [7]:
importances_v1_B = pd.DataFrame(
  pd.Series(dict(zip(classifier.feature_name_, classifier.feature_importances_)))
).assign(
  normalized=lambda df: df / df.sum()
).sort_values(
  by="normalized", ascending=False
)

In [8]:
importances_v1_B[["normalized"]]

Unnamed: 0,normalized
DAY_OF_MONTH,0.285917
aircraft_type_new,0.22025
MONTH,0.143
DEP_HOUR,0.114167
DAY_OF_WEEK,0.071167
pres,0.046917
dwpt,0.034667
temp,0.026083
rhum,0.018167
coco,0.0165


In [21]:
# predicting the test results 
y_pred = classifier_b.predict(test.drop(["YEAR","target"], axis=1))

In [22]:
print(confusion_matrix(test[target],y_pred))

[[2272683 1057476  529353  444871]
 [  95216   90657   46033   45152]
 [  25130   21897   18302   20137]
 [  14562   11556   11086   17122]]


In [23]:
print(classification_report(test[target],y_pred))

              precision    recall  f1-score   support

           0       0.94      0.53      0.68   4304383
           1       0.08      0.33      0.12    277058
           2       0.03      0.21      0.05     85466
           3       0.03      0.32      0.06     54326

    accuracy                           0.51   4721233
   macro avg       0.27      0.35      0.23   4721233
weighted avg       0.87      0.51      0.63   4721233



In [24]:
print("Testing  set score for model: %f" % classifier_b.score(test.drop(["YEAR","target"], axis=1)  , test[target] ))

Testing  set score for model: 0.508080
