# Logistic Regression: event_time_of_day - has_fatal_injury

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle


# Show every row when a DataFrame/Series is displayed (no truncation).
pd.options.display.max_rows = None
# Optional toggles for interactive use; left disabled on purpose:
# pd.options.display.max_columns = None
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [None]:
# Load the cleaned CSV. low_memory=False makes pandas parse the file in one
# pass, which avoids mixed-dtype inference across chunks.
ak = pd.read_csv('../datasets/data_cleaned/alaska_single_engine_clean.csv', low_memory=False)

In [None]:
# (rows, columns) of the full dataset, before any column selection.
ak.shape

In [None]:
# Name of the single column used as the predictor in this notebook.
predictor = 'event_time_of_day'

In [None]:
# Name of the column the model is trained to predict.
target = 'has_fatal_injury'

In [None]:
# Keep only the predictor and the target; every other column is discarded.
ak = ak[[predictor, target]]

In [None]:
# Sanity check: only the two selected columns should remain.
ak.columns

## Dummify Columns

In [None]:
def dummies(df, col_inference, category_to_drop):
    """One-hot encode the categorical columns of ``df``.

    Every ``object``/``category`` column other than ``col_inference`` is
    encoded with ``drop_first=True``. ``col_inference`` is encoded in full and
    the dummy for ``category_to_drop`` is then removed, which makes that level
    the reference category.

    Args:
        df: Input DataFrame; it is not modified.
        col_inference: Name of the column whose reference level is chosen
            explicitly.
        category_to_drop: Level of ``col_inference`` whose dummy column is
            removed.

    Returns:
        A new DataFrame in which the encoded columns are replaced by their
        dummy columns.

    Warns:
        UserWarning: If ``{col_inference}_{category_to_drop}`` is not among
            the generated dummy columns. In that case all levels are kept and
            the frame is returned unchanged, as before.
    """
    import warnings  # local import keeps this notebook cell self-contained

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    other_categoricals = [col for col in categorical_columns if col != col_inference]

    # Columns that are not under inference: pandas picks the reference level.
    df_dummies = pd.get_dummies(df, columns=other_categoricals, drop_first=True)

    # Column under inference: keep every level for now so the reference level
    # can be chosen by name below.
    df_dummies = pd.get_dummies(df_dummies, columns=[col_inference], drop_first=False)

    dummy_to_drop = f"{col_inference}_{category_to_drop}"
    if dummy_to_drop not in df_dummies.columns:
        # Previously a silent no-op: a misspelled or absent level left every
        # dummy in place without telling the caller.
        warnings.warn(
            f"No dummy column {dummy_to_drop!r} was generated; all levels of "
            f"{col_inference!r} were kept and no reference category was dropped.",
            UserWarning,
            stacklevel=2,
        )
        return df_dummies

    return df_dummies.drop(columns=[dummy_to_drop])

In [None]:
# One-hot encode the predictor, using 'Morning' as the reference level.
# NOTE(review): the `_weather` suffix does not match the predictor used here
# (event_time_of_day); it looks like a leftover name — confirm before renaming,
# since later cells reference this variable.
ak_dummies_weather = dummies(ak, predictor, 'Morning')

In [None]:
# Inspect the generated dummy column names.
ak_dummies_weather.columns

In [None]:
# Preview the first rows of the encoded frame.
ak_dummies_weather.head()

## Train/Test Split

In [None]:
X = ak_dummies_weather.drop(columns = target)
y = ak_dummies_weather[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=21)

## Baseline

In [None]:
# Baseline accuracy: the share of the most frequent target class, i.e. the
# accuracy of always predicting that class. Unlike `1 - y.mean()`, this does
# not assume that class 0 is the majority.
y.value_counts(normalize=True).max()

## Benchmark Logistic Regression Model

In [None]:
pipe_log_reg_bench = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_bench', LogisticRegression(max_iter=1000))
])

In [None]:
pipe_log_reg_bench.fit(X_train, y_train)

In [None]:
pipe_log_reg_bench.score(X_train, y_train)

In [None]:
pipe_log_reg_bench.score(X_test, y_test)

## Tuned Logistic Regression Model

In [None]:
pipe_log_reg_tuned = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_tuned', LogisticRegression())
])

In [None]:
pipe_log_reg_tuned.get_params()

In [None]:
pipe_log_reg_tuned_params = {
    'log_reg_tuned__max_iter': [100, 1_000],
    'log_reg_tuned__C': np.linspace(0.001, 1, 5),
    'log_reg_tuned__penalty': ['l2']
}

In [None]:
gs= GridSearchCV(pipe_log_reg_tuned,
                                param_grid = pipe_log_reg_tuned_params,
                                cv = 5,
                               n_jobs=6)

In [None]:
%%time
# Run the cross-validated grid search on the training split only.
# (The %%time cell magic must stay on the first line of this cell.)
gs.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Score of the refit best estimator on the training split.
gs.score(X_train, y_train)

In [None]:
# Score of the refit best estimator on the held-out test split.
gs.score(X_test, y_test)

In [None]:
# Class predictions for the test split, reused by the reports below.
y_pred = gs.predict(X_test)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision / recall / F1, printed with 5 decimal places.
print(classification_report(y_test, y_pred, digits = 5))

## Coefficients

In [None]:
feature_names = list(X_train.columns)

In [None]:
coef_log_odds = gs.best_estimator_.named_steps['log_reg_tuned'].coef_[0]

In [None]:
coef_odds = np.exp(gs.best_estimator_.named_steps['log_reg_tuned'].coef_[0])

In [None]:
coef_log_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_log_odds})

In [None]:
coef_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_odds})

In [None]:
pd.set_option('display.max_rows', None)
coef_log_odds_df.head()

In [None]:
pd.set_option('display.max_rows', None)
coef_odds_df.sort_values(by = 'coefficient', ascending=False).head(1000)

In [None]:
file_name = f"../results/logreg_{predictor}-{target}_coef.csv"
coef_log_odds_df.to_csv(file_name, index=False)

## Pickle

In [None]:
# Destination for the serialised model; the name encodes predictor and target.
file_name = f"../pickles/logreg_{predictor}-{target}.pkl"

# Serialise the whole fitted GridSearchCV object (search results plus the
# refit best estimator), not just the final classifier.
with open(file_name, mode='wb') as pickle_file:
    pickle.dump(gs, pickle_file)