In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
import pickle
import os

In [2]:
flight_data = (
    pd.read_csv('final_dataset_cleaned_v1.csv')
)

In [3]:
flight_data.shape

(12658696, 29)

In [4]:
flight_data.head(n = 3)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,...,time,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,target
0,2020,1,1,1,3,2020-01-01,N951WN,ONT,"Ontario, CA",SFO,...,2020-01-01 18:00:00,12.2,-7.3,25.0,0.0,330.0,18.4,1017.3,3.0,1
1,2020,1,1,1,3,2020-01-01,N467WN,ONT,"Ontario, CA",SFO,...,2020-01-01 11:00:00,3.9,-3.4,59.0,0.0,210.0,5.4,1017.6,2.0,0
2,2020,1,1,1,3,2020-01-01,N7885A,ONT,"Ontario, CA",SJC,...,2020-01-01 20:00:00,12.8,-7.3,24.0,0.0,290.0,11.2,1015.8,2.0,0


In [5]:
flight_data['aircraft_type_new'] = [i.split('-')[0] for i in flight_data['Aircraft type']]

In [6]:
# list of numerical and categorical features to select 
l_numeric_cols = [ 'DEP_HOUR', 'temp', 'dwpt', 'rhum', 'prcp', 'wspd', 'pres', 'coco']
l_categorical_cols = ['QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'aircraft_type_new']
target = "target"

In [7]:
# converting the categorical columns to categorical
for col in l_categorical_cols:
    flight_data[col] = flight_data[col].astype('category')

In [8]:
flight_data = flight_data[l_numeric_cols + l_categorical_cols + [target] + ['YEAR','FL_DATE']]

In [9]:
# getting dummu values for categorical columns
QUARTER_dummies = pd.get_dummies(flight_data['QUARTER'], prefix='QUARTER', drop_first=True)
MONTH_dummies = pd.get_dummies(flight_data['MONTH'], prefix='MONTH', drop_first=True)
DAY_OF_MONTH_dummies = pd.get_dummies(flight_data['DAY_OF_MONTH'], prefix='DAY_OF_MONTH', drop_first=True)
DAY_OF_WEEK_dummies = pd.get_dummies(flight_data['DAY_OF_WEEK'], prefix='DAY_OF_WEEK', drop_first=True)
Aircraft_type_dummies = pd.get_dummies(flight_data['aircraft_type_new'], prefix='Aircraft_type', drop_first=True)

In [10]:
# Aircraft_type_dummies.columns = ['Aircraft_{}'.format(i) for i in range(len(Aircraft_type_dummies.columns))]

In [11]:
flight_data = flight_data.drop(l_categorical_cols, axis=1)

In [12]:
dummy_flight_data = pd.concat([flight_data, QUARTER_dummies, MONTH_dummies,
                               DAY_OF_MONTH_dummies, DAY_OF_WEEK_dummies, Aircraft_type_dummies ], axis=1)

In [13]:
del Aircraft_type_dummies,QUARTER_dummies,MONTH_dummies,DAY_OF_MONTH_dummies,DAY_OF_WEEK_dummies

In [14]:
train = dummy_flight_data[dummy_flight_data['FL_DATE'] < "2020-08-01"]
test = dummy_flight_data[dummy_flight_data['FL_DATE'] >= "2020-08-01"]

In [15]:
del flight_data, dummy_flight_data

In [16]:
del train

In [2]:
train = pd.read_csv('undersampled_training_data_combined.csv')

In [19]:
logistic_regression = LogisticRegression(multi_class="ovr")
logistic_regression.fit(
    train.drop(["target"], axis=1).values,
     train[target]
     )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(multi_class='ovr')

In [20]:
path = "./models"
filename = "combined_LR_ovr_undersample.mdl"

In [21]:
with open(os.path.join(path, filename), "wb") as f:
    pickle.dump(logistic_regression, f, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
logistic_regression = pickle.load(open("./models/combined_LR_ovr_undersample.mdl",'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [3]:
feature_names = train.columns.drop('target')

In [6]:
feature_importance = abs(logistic_regression.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())

In [7]:
importances = pd.DataFrame(
  pd.Series(dict(zip(feature_names, feature_importance)))
).assign(
  normalized=lambda df: df / df.sum()
).sort_values(
  by="normalized", ascending=False
)

In [8]:
importances[["normalized"]].head(10)

Unnamed: 0,normalized
Aircraft_type_Boeing 737,0.079943
MONTH_3,0.050729
Aircraft_type_Mitsubishi CRJ,0.049738
Aircraft_type_Airbus A320,0.04146
MONTH_12,0.040819
Aircraft_type_Embraer E175LR,0.037494
MONTH_2,0.037467
Aircraft_type_Embraer ERJ,0.032684
DAY_OF_WEEK_2,0.029222
Aircraft_type_Boeing 717,0.028834


In [22]:
# predicting the test results 
y_pred = logistic_regression.predict(test.drop(["YEAR","target","FL_DATE"], axis=1).values)

In [23]:
print(confusion_matrix(test[target],y_pred))

[[981236 308931 178994 290674]
 [ 36491  22376  12421  18068]
 [  9133   4945   4390   6720]
 [  5668   2726   3082   4633]]


In [24]:
print(classification_report(test[target],y_pred))

              precision    recall  f1-score   support

           0       0.95      0.56      0.70   1759835
           1       0.07      0.25      0.10     89356
           2       0.02      0.17      0.04     25188
           3       0.01      0.29      0.03     16109

    accuracy                           0.54   1890488
   macro avg       0.26      0.32      0.22   1890488
weighted avg       0.89      0.54      0.66   1890488



In [25]:
print("Testing  set score for model: %f" % logistic_regression.score(test.drop(["YEAR","target","FL_DATE"], axis=1).values  , test[target] ))

Testing  set score for model: 0.535647


In [26]:
###### logistic regression multinomial ####
logistic_regression_mul = LogisticRegression(multi_class="multinomial")
logistic_regression_mul.fit(
    train.drop(["target"], axis=1).values,
     train[target]
     )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(multi_class='multinomial')

In [27]:
path = "./models"
filename = "combined_LR_mul_undersample.mdl"

In [28]:
with open(os.path.join(path, filename), "wb") as f:
    pickle.dump(logistic_regression_mul, f, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
logistic_regression_mul = pickle.load(open("./models/combined_LR_mul_undersample.mdl",'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [5]:
feature_importance_mul = abs(logistic_regression_mul.coef_[0])
feature_importance_mul = 100.0 * (feature_importance_mul / feature_importance_mul.max())

In [6]:
importances_mul = pd.DataFrame(
  pd.Series(dict(zip(feature_names, feature_importance_mul)))
).assign(
  normalized=lambda df: df / df.sum()
).sort_values(
  by="normalized", ascending=False
)

In [7]:
importances_mul[["normalized"]].head(10)

Unnamed: 0,normalized
DEP_HOUR,0.455629
coco,0.116787
wspd,0.077586
temp,0.035219
Aircraft_type_Boeing 737,0.02743
prcp,0.024117
Aircraft_type_Mitsubishi CRJ,0.017831
MONTH_2,0.014202
MONTH_3,0.013841
MONTH_12,0.010982


In [29]:
# predicting the test results 
y_pred = logistic_regression_mul.predict(test.drop(["YEAR","target","FL_DATE"], axis=1).values)

In [30]:
print(confusion_matrix(test[target],y_pred))

[[1032288  173649  280791  273107]
 [  40374   11784   20598   16600]
 [  10201    3060    6744    5183]
 [   6478    1978    4574    3079]]


In [31]:
print(classification_report(test[target],y_pred))

              precision    recall  f1-score   support

           0       0.95      0.59      0.72   1759835
           1       0.06      0.13      0.08     89356
           2       0.02      0.27      0.04     25188
           3       0.01      0.19      0.02     16109

    accuracy                           0.56   1890488
   macro avg       0.26      0.29      0.22   1890488
weighted avg       0.89      0.56      0.68   1890488



In [32]:
print("Testing  set score for model: %f" % logistic_regression_mul.score(test.drop(["YEAR","target","FL_DATE"], axis=1).values  , test[target] ))

Testing  set score for model: 0.557472
