# Logistic Regression: amateur_built - has_fatal_injury

In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [2]:
ak = pd.read_csv('../../datasets/alaska_single_engine_clean.csv', low_memory=False)

In [3]:
ak.shape

(5590, 39)

In [4]:
predictor = 'amateur_built'

In [5]:
target = 'has_fatal_injury'

In [6]:
ak = ak[[predictor, target]]

In [7]:
ak.columns

Index(['amateur_built', 'has_fatal_injury'], dtype='object')

## Dummify Columns

In [8]:
def dummies(df, col_inference, category_to_drop):
    """One-hot encode ``df`` with an explicit reference level for one column.

    Every object/category column other than ``col_inference`` is dummified
    with ``drop_first=True``.  ``col_inference`` itself is dummified with all
    of its levels, after which the dummy for ``category_to_drop`` is removed
    so that level acts as the reference category.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_inference : str
        Column whose reference level is chosen explicitly.  It is encoded
        whatever its dtype.
    category_to_drop : object
        Level of ``col_inference`` to use as the reference.  It is matched
        against the dummy column name ``f"{col_inference}_{category_to_drop}"``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the untouched columns plus the dummy columns.

    Warns
    -----
    UserWarning
        If no dummy column matches ``category_to_drop``.  All levels are then
        kept, which makes the dummies perfectly collinear with an intercept.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    other_categoricals = [col for col in categorical_columns if col != col_inference]

    encoded = pd.get_dummies(df, columns=other_categoricals, drop_first=True)
    # Keep every level here so the reference level can be chosen by name below.
    encoded = pd.get_dummies(encoded, columns=[col_inference], drop_first=False)

    dummy_to_drop = f"{col_inference}_{category_to_drop}"
    if dummy_to_drop not in encoded.columns:
        # Previously a silent no-op; surface it so a mistyped level is noticed.
        warnings.warn(
            f"reference dummy {dummy_to_drop!r} not found; "
            f"all levels of {col_inference!r} were kept",
            UserWarning,
            stacklevel=2,
        )
        return encoded

    return encoded.drop(columns=[dummy_to_drop])

In [9]:
# Encode the predictor with level '0' as the reference category.
# NOTE(review): the "_weather" suffix looks like a leftover name -- the frame
# holds the dummies for `predictor`; confirm before renaming across cells.
ak_dummies_weather = dummies(df=ak, col_inference=predictor, category_to_drop='0')

In [10]:
ak_dummies_weather.columns

Index(['has_fatal_injury', 'amateur_built_1'], dtype='object')

In [11]:
ak_dummies_weather.head()

Unnamed: 0,has_fatal_injury,amateur_built_1
0,0,False
1,0,False
2,0,False
3,0,False
4,0,True


## Train/Test Split

In [12]:
# Separate the outcome column from the predictor columns.
y = ak_dummies_weather[target]
X = ak_dummies_weather.drop(columns=[target])

In [13]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=21,
)

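## Baseline

Majority-class accuracy: the share of rows with no fatal injury, i.e. the accuracy of always predicting `0`.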

In [14]:
1 - y.mean()

0.8967799642218247

## Benchmark Logistic Regression Model

In [15]:
# Benchmark model: standardise the features, then fit a logistic regression
# with max_iter raised to 1000 and every other setting left at its default.
pipe_log_reg_bench = Pipeline(
    steps=[
        ('sc', StandardScaler()),
        ('log_reg_bench', LogisticRegression(max_iter=1000)),
    ]
)

In [16]:
pipe_log_reg_bench.fit(X_train, y_train)

In [17]:
pipe_log_reg_bench.score(X_train, y_train)

0.8962432915921288

In [18]:
pipe_log_reg_bench.score(X_test, y_test)

0.9016100178890877

## Tuned Logistic Regression Model

In [19]:
# Pipeline to be tuned: same scaler + logistic regression layout as the
# benchmark, with the model's hyper-parameters left for the grid search to set.
pipe_log_reg_tuned = Pipeline(
    steps=[
        ('sc', StandardScaler()),
        ('log_reg_tuned', LogisticRegression()),
    ]
)

In [20]:
pipe_log_reg_tuned.get_params()

{'memory': None,
 'steps': [('sc', StandardScaler()), ('log_reg_tuned', LogisticRegression())],
 'verbose': False,
 'sc': StandardScaler(),
 'log_reg_tuned': LogisticRegression(),
 'sc__copy': True,
 'sc__with_mean': True,
 'sc__with_std': True,
 'log_reg_tuned__C': 1.0,
 'log_reg_tuned__class_weight': None,
 'log_reg_tuned__dual': False,
 'log_reg_tuned__fit_intercept': True,
 'log_reg_tuned__intercept_scaling': 1,
 'log_reg_tuned__l1_ratio': None,
 'log_reg_tuned__max_iter': 100,
 'log_reg_tuned__multi_class': 'auto',
 'log_reg_tuned__n_jobs': None,
 'log_reg_tuned__penalty': 'l2',
 'log_reg_tuned__random_state': None,
 'log_reg_tuned__solver': 'lbfgs',
 'log_reg_tuned__tol': 0.0001,
 'log_reg_tuned__verbose': 0,
 'log_reg_tuned__warm_start': False}

In [21]:
# Search grid for the tuned pipeline: 2 max_iter values x 5 evenly spaced C
# values in [0.001, 1] x a single (l2) penalty.  Keys use the
# "<step name>__<parameter>" form so they address the 'log_reg_tuned' step.
pipe_log_reg_tuned_params = dict(
    log_reg_tuned__max_iter=[100, 1_000],
    log_reg_tuned__C=np.linspace(0.001, 1, 5),
    log_reg_tuned__penalty=['l2'],
)

In [22]:
# 5-fold cross-validated grid search over the tuned pipeline, with up to 6
# jobs run in parallel.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default scorer -- confirm that is the intended selection metric
# given how imbalanced the target is.
gs = GridSearchCV(
    estimator=pipe_log_reg_tuned,
    param_grid=pipe_log_reg_tuned_params,
    cv=5,
    n_jobs=6,
)

In [23]:
%%time
gs.fit(X_train, y_train)

CPU times: user 111 ms, sys: 105 ms, total: 216 ms
Wall time: 1.82 s


In [24]:
gs.best_params_

{'log_reg_tuned__C': 0.001,
 'log_reg_tuned__max_iter': 100,
 'log_reg_tuned__penalty': 'l2'}

In [25]:
gs.score(X_train, y_train)

0.8962432915921288

In [26]:
gs.score(X_test, y_test)

0.9016100178890877

In [27]:
y_pred = gs.predict(X_test)

In [28]:
confusion_matrix(y_test, y_pred)

array([[504,   0],
       [ 55,   0]])

In [29]:
print(classification_report(y_test, y_pred, digits = 5))

              precision    recall  f1-score   support

           0    0.90161   1.00000   0.94826       504
           1    0.00000   0.00000   0.00000        55

    accuracy                        0.90161       559
   macro avg    0.45081   0.50000   0.47413       559
weighted avg    0.81290   0.90161   0.85496       559



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Coefficients

In [30]:
feature_names = list(X_train.columns)

In [31]:
coef_log_odds = gs.best_estimator_.named_steps['log_reg_tuned'].coef_[0]

In [32]:
# Exponentiate the log-odds coefficients to get odds ratios.  The model sits
# behind a StandardScaler in the pipeline, so each ratio is per one standard
# deviation of the (scaled) feature rather than per raw unit.
coef_odds = np.exp(coef_log_odds)

In [33]:
coef_log_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_log_odds})

In [34]:
coef_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_odds})

In [35]:
# Lifts pandas' row-display cap for the rest of the session (global option).
# .head() returns at most five rows, so the cap is not what limits this output.
pd.set_option('display.max_rows', None)
coef_log_odds_df.head()

Unnamed: 0,feature,coefficient
0,amateur_built_1,0.016686


In [36]:
# Sets the global row-display cap to "unlimited"; .head() returns at most
# five rows, so the option does not change what is displayed here.
pd.set_option('display.max_rows', None)
coef_odds_df.head()

Unnamed: 0,feature,coefficient
0,amateur_built_1,1.016826


In [37]:
# Write the log-odds coefficients (not the odds ratios) to a CSV named after
# the predictor/target pair; the index is left out of the file.
file_name = f"../../datasets/nolan_logreg_{predictor}-{target}_coef.csv"
coef_log_odds_df.to_csv(path_or_buf=file_name, index=False)

## Pickle

In [38]:
# Serialise the whole fitted GridSearchCV object (not only the best estimator)
# to a pickle named after the predictor/target pair.
file_name = f"../../pickles/nolan_logreg_{predictor}-{target}.pkl"
with open(file_name, mode='wb') as f:
    pickle.dump(gs, f)