In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle


# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [2]:
ak = pd.read_csv('../datasets/alaska_single_engine_clean.csv', low_memory=False)

In [3]:
ak.shape

(5732, 32)

In [4]:
ak['has_injury'] = ak.highest_injury_level.apply(lambda x: 0 if x == 'None Reported' else 1)

In [5]:
ak = ak.drop(['ntsb_no', 'mkey', 'n', 'probable_cause', 'far', 'operator', 'event_time', 'fatal_injury_count', 'serious_injury_count', 'minor_injury_count', 'air_craft_damage', 'event_type', 'report_type', 'has_safety_rec', 'highest_injury_level', 'event_year'], axis=1)

In [6]:
ak.has_injury.value_counts()

has_injury
0    4082
1    1650
Name: count, dtype: int64

In [7]:
categorical_cols = ak.select_dtypes(exclude=['number']).columns.tolist()

In [8]:
categorical_cols

['city',
 'make',
 'model',
 'air_craft_category',
 'airport_id',
 'airport_name',
 'scheduled',
 'purpose_of_flight',
 'weather_condition',
 'event_season']

In [9]:
ak.columns

Index(['city', 'latitude', 'longitude', 'make', 'model', 'air_craft_category',
       'airport_id', 'airport_name', 'amateur_built', 'scheduled',
       'purpose_of_flight', 'weather_condition', 'event_month', 'event_day',
       'event_hour', 'event_season', 'has_injury'],
      dtype='object')

In [10]:
ak_dummies = pd.get_dummies(ak, columns = categorical_cols, drop_first=True)

In [11]:
X = ak_dummies.drop('has_injury', axis=1)
y = ak_dummies['has_injury']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.075, random_state=21)

### Baseline

In [13]:
1 - y.mean()

0.712142358688067

### Benchmark Logistic Regression Model

In [14]:
pipe_log_reg_bench = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_bench', LogisticRegression(max_iter=1000))
])

In [15]:
pipe_log_reg_bench.fit(X_train, y_train)

In [16]:
pipe_log_reg_bench.score(X_train, y_train)

0.9072048283666541

In [17]:
pipe_log_reg_bench.score(X_test, y_test)

0.6813953488372093

### Tuned Logistic Regression Model

In [18]:
pipe_log_reg_tuned = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_tuned', LogisticRegression())
])

In [19]:
pipe_log_reg_tuned.get_params()

{'memory': None,
 'steps': [('sc', StandardScaler()), ('log_reg_tuned', LogisticRegression())],
 'verbose': False,
 'sc': StandardScaler(),
 'log_reg_tuned': LogisticRegression(),
 'sc__copy': True,
 'sc__with_mean': True,
 'sc__with_std': True,
 'log_reg_tuned__C': 1.0,
 'log_reg_tuned__class_weight': None,
 'log_reg_tuned__dual': False,
 'log_reg_tuned__fit_intercept': True,
 'log_reg_tuned__intercept_scaling': 1,
 'log_reg_tuned__l1_ratio': None,
 'log_reg_tuned__max_iter': 100,
 'log_reg_tuned__multi_class': 'auto',
 'log_reg_tuned__n_jobs': None,
 'log_reg_tuned__penalty': 'l2',
 'log_reg_tuned__random_state': None,
 'log_reg_tuned__solver': 'lbfgs',
 'log_reg_tuned__tol': 0.0001,
 'log_reg_tuned__verbose': 0,
 'log_reg_tuned__warm_start': False}

In [20]:
pipe_log_reg_tuned_params = {
    'log_reg_tuned__max_iter': [1_000],
    'log_reg_tuned__C': [.001, .01],
    'log_reg_tuned__penalty': ['l1', 'l2']
}

In [21]:
gs_log_reg_tuned = GridSearchCV(pipe_log_reg_tuned,
                                param_grid = pipe_log_reg_tuned_params,
                                cv = 5,
                               n_jobs=6)

In [22]:
%%time
gs_log_reg_tuned.fit(X_train, y_train)

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nolan/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nolan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/nolan/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/nolan/anaconda3/lib/python3.11/s

CPU times: user 2.58 s, sys: 793 ms, total: 3.37 s
Wall time: 15.8 s


In [23]:
gs_log_reg_tuned.best_params_

{'log_reg_tuned__C': 0.001,
 'log_reg_tuned__max_iter': 1000,
 'log_reg_tuned__penalty': 'l2'}

In [24]:
gs_log_reg_tuned.score(X_train, y_train)

0.8655224443606186

In [25]:
gs_log_reg_tuned.score(X_test, y_test)

0.7162790697674418

In [26]:
y_pred = gs_log_reg_tuned.predict(X_test)

In [27]:
confusion_matrix(y_test, y_pred)

array([[285,  27],
       [ 95,  23]])

In [28]:
print(classification_report(y_test, y_pred, digits = 5))

              precision    recall  f1-score   support

           0    0.75000   0.91346   0.82370       312
           1    0.46000   0.19492   0.27381       118

    accuracy                        0.71628       430
   macro avg    0.60500   0.55419   0.54875       430
weighted avg    0.67042   0.71628   0.67280       430



### Coefficients

In [29]:
# coefficients = np.exp(gs_log_reg_tuned.best_estimator_.named_steps['log_reg_tuned'].coef_[0]*.1)

In [30]:
coefficients = gs_log_reg_tuned.best_estimator_.named_steps['log_reg_tuned'].coef_[0]

In [31]:
feature_names = list(X_train.columns)

In [32]:
coefficients_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

In [33]:
pd.set_option('display.max_rows', None)
coefficients_df.head(25)

Unnamed: 0,feature,coefficient
0,latitude,-0.022113
1,longitude,0.019608
2,amateur_built,0.023353
3,event_month,0.000653
4,event_day,0.012156
5,event_hour,-0.016971
6,city_100MI S.KNG SLM,-0.013336
7,city_11NM EAST OF SI,0.022133
8,city_18NM ESE KETCHI,-0.005252
9,city_20 ESE NORTH PO,-0.009885


In [34]:
coefficients_df.sort_values(by='coefficient', ascending=False).head(30)

Unnamed: 0,feature,coefficient
4134,airport_name_Unknown,0.073708
925,city_SLEETMUTE,0.058176
480,city_KENAI,0.053674
3055,airport_id_Unknown,0.053124
240,city_Chugiak,0.046728
1002,city_TATITLEK,0.046485
3832,airport_name_O'MALLEY,0.044755
4237,purpose_of_flight_UNK,0.043734
2340,model_PA-32-260,0.041478
1241,make_De Havilland,0.04085


In [35]:
coefficients_df.sort_values(by='coefficient', ascending=False).tail(30)

Unnamed: 0,feature,coefficient
759,city_Nenana,-0.024927
3522,airport_name_JUNEAU INTL,-0.024958
403,city_HOLY CROSS,-0.025034
655,city_McCarthy,-0.025061
832,city_Palmer,-0.025123
3639,airport_name_LAKE HOOD STRIP,-0.025149
1825,model_AS-350B2,-0.025225
535,city_Kenai,-0.02537
3878,airport_name_PRIVATE STRIP,-0.026553
724,city_NENANA,-0.026678


In [36]:
coefficients_df.to_csv('../datasets/nolan_log_reg_coef.csv', index=False)

### Pickle

In [37]:
with open ('../pickles/nolan_log_reg.pkl', 'wb') as f:
    pickle.dump('gs_log_reg_tuned.pkl', f)