# Logistic Regression: city - has_fatal_injury

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle


pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [2]:
ak = pd.read_csv('../../datasets/alaska_single_engine_clean.csv', low_memory=False)

In [3]:
ak.shape

(5590, 39)

In [4]:
predictor = 'city'

In [5]:
target = 'has_fatal_injury'

In [6]:
ak = ak[[predictor, target]]

In [7]:
ak.columns

Index(['city', 'has_fatal_injury'], dtype='object')

## Dummify Columns

In [8]:
def dummies(df, col_inference, category_to_drop):
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    columns_to_dummify_drop_first = [col for col in categorical_columns if col != col_inference]
    
    df_dummies = pd.get_dummies(df, columns=columns_to_dummify_drop_first, drop_first=True)
    
    df_dummies = pd.get_dummies(df_dummies, columns=[col_inference], drop_first=False)
    
    dummy_to_drop = f"{col_inference}_{category_to_drop}"
    if dummy_to_drop in df_dummies.columns:
        df_dummies.drop(columns=[dummy_to_drop], inplace=True)
    
    return df_dummies

In [9]:
ak_dummies_weather = dummies(ak, predictor, 'ANCHORAGE')

In [10]:
ak_dummies_weather.columns

Index(['has_fatal_injury', 'city_(N) SKWENTNA', 'city_100MI S.KNG SLM',
       'city_11NM EAST OF SI', 'city_18NM ESE KETCHI', 'city_20 ESE NORTH PO',
       'city_27 E SAGWON', 'city_35 W. OF KETCHI', 'city_38 SE BARROW',
       'city_40 SE BETTLES',
       ...
       'city_YAKUTAT', 'city_YANTARNI BAY', 'city_YELLOW JACKET',
       'city_YENTA GLACIER', 'city_YENTNA', 'city_YENTNA RIVER',
       'city_YUKI RIVER', 'city_YUKON CHARLIE RIVER', 'city_YUKON RIVER',
       'city_ZACHER BAY'],
      dtype='object', length=878)

In [11]:
ak_dummies_weather.head()

Unnamed: 0,has_fatal_injury,city_(N) SKWENTNA,city_100MI S.KNG SLM,city_11NM EAST OF SI,city_18NM ESE KETCHI,city_20 ESE NORTH PO,city_27 E SAGWON,city_35 W. OF KETCHI,city_38 SE BARROW,city_40 SE BETTLES,...,city_YAKUTAT,city_YANTARNI BAY,city_YELLOW JACKET,city_YENTA GLACIER,city_YENTNA,city_YENTNA RIVER,city_YUKI RIVER,city_YUKON CHARLIE RIVER,city_YUKON RIVER,city_ZACHER BAY
0,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Train, Test, Split

In [12]:
X = ak_dummies_weather.drop(columns = target)
y = ak_dummies_weather[target]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=21)

## Baseline

In [14]:
1 - y.mean()

0.8967799642218247

## Benchmark Logistic Regression Model

In [15]:
pipe_log_reg_bench = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_bench', LogisticRegression(max_iter=1000))
])

In [16]:
pipe_log_reg_bench.fit(X_train, y_train)

In [17]:
pipe_log_reg_bench.score(X_train, y_train)

0.9087656529516994

In [18]:
pipe_log_reg_bench.score(X_test, y_test)

0.8980322003577818

## Tuned Logistic Regression Model

In [19]:
pipe_log_reg_tuned = Pipeline([
    ('sc', StandardScaler()),
    ('log_reg_tuned', LogisticRegression())
])

In [20]:
pipe_log_reg_tuned.get_params()

{'memory': None,
 'steps': [('sc', StandardScaler()), ('log_reg_tuned', LogisticRegression())],
 'verbose': False,
 'sc': StandardScaler(),
 'log_reg_tuned': LogisticRegression(),
 'sc__copy': True,
 'sc__with_mean': True,
 'sc__with_std': True,
 'log_reg_tuned__C': 1.0,
 'log_reg_tuned__class_weight': None,
 'log_reg_tuned__dual': False,
 'log_reg_tuned__fit_intercept': True,
 'log_reg_tuned__intercept_scaling': 1,
 'log_reg_tuned__l1_ratio': None,
 'log_reg_tuned__max_iter': 100,
 'log_reg_tuned__multi_class': 'auto',
 'log_reg_tuned__n_jobs': None,
 'log_reg_tuned__penalty': 'l2',
 'log_reg_tuned__random_state': None,
 'log_reg_tuned__solver': 'lbfgs',
 'log_reg_tuned__tol': 0.0001,
 'log_reg_tuned__verbose': 0,
 'log_reg_tuned__warm_start': False}

In [21]:
pipe_log_reg_tuned_params = {
    'log_reg_tuned__max_iter': [100, 1_000],
    'log_reg_tuned__C': np.linspace(0.001, 1, 5),
    'log_reg_tuned__penalty': ['l2']
}

In [22]:
gs= GridSearchCV(pipe_log_reg_tuned,
                                param_grid = pipe_log_reg_tuned_params,
                                cv = 5,
                               n_jobs=6)

In [23]:
%%time
gs.fit(X_train, y_train)

CPU times: user 582 ms, sys: 247 ms, total: 829 ms
Wall time: 8.59 s


In [24]:
gs.best_params_

{'log_reg_tuned__C': 0.001,
 'log_reg_tuned__max_iter': 100,
 'log_reg_tuned__penalty': 'l2'}

In [25]:
gs.score(X_train, y_train)

0.9083681176704432

In [26]:
gs.score(X_test, y_test)

0.8980322003577818

In [27]:
y_pred = gs.predict(X_test)

In [28]:
confusion_matrix(y_test, y_pred)

array([[501,   3],
       [ 54,   1]])

In [29]:
print(classification_report(y_test, y_pred, digits = 5))

              precision    recall  f1-score   support

           0    0.90270   0.99405   0.94618       504
           1    0.25000   0.01818   0.03390        55

    accuracy                        0.89803       559
   macro avg    0.57635   0.50611   0.49004       559
weighted avg    0.83848   0.89803   0.85642       559



## Coefficients

In [30]:
feature_names = list(X_train.columns)

In [31]:
coef_log_odds = gs.best_estimator_.named_steps['log_reg_tuned'].coef_[0]

In [32]:
coef_odds = np.exp(gs.best_estimator_.named_steps['log_reg_tuned'].coef_[0])

In [33]:
coef_log_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_log_odds})

In [34]:
coef_odds_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef_odds})

In [35]:
pd.set_option('display.max_rows', None)
coef_log_odds_df.head()

Unnamed: 0,feature,coefficient
0,city_(N) SKWENTNA,-0.004915
1,city_100MI S.KNG SLM,-0.004915
2,city_11NM EAST OF SI,0.033372
3,city_18NM ESE KETCHI,-0.004915
4,city_20 ESE NORTH PO,-0.004915


In [36]:
pd.set_option('display.max_rows', None)
coef_odds_df.sort_values(by = 'coefficient', ascending=False).head(100)

Unnamed: 0,feature,coefficient
385,city_KETCHIKAN,1.084942
720,city_SITKA,1.072408
850,city_WHITTIER,1.051815
461,city_MARSHALL,1.040168
688,city_SAINT MARYS,1.040168
722,city_SKAGWAY,1.040102
345,city_JUNEAU,1.038547
30,city_ALEKNAGIK,1.037559
602,city_NUIQSUT,1.037559
163,city_CHUGIAK,1.035109


In [37]:
file_name = f"../../datasets/nolan_logreg_{predictor}-{target}_coef.csv"

coef_log_odds_df.to_csv(file_name, index=False)

## Pickle

In [38]:
file_name = f"../../pickles/nolan_logreg_{predictor}-{target}.pkl"

with open(file_name, 'wb') as f:
    pickle.dump(gs, f) 