In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
import statsmodels.api as sm

import pickle

In [6]:
df = pd.read_csv('../datasets/mike_atb2.csv')

In [7]:
df.head()

Unnamed: 0,Latitude,Longitude,Make,Model,AmateurBuilt,PurposeOfFlight,WeatherCondition,EventMonth,InjuryBool,OccurredNearAirport,EventHour
0,61.77516,-152.15263,CESSNA,UNCOMMON MODEL,0,PERS,Unknown,9,0,1,8
1,60.805019,-161.78648,PIPER,PA-18-150,0,PERS,Unknown,9,0,0,12
2,59.646929,-151.49323,CESSNA,A185F,0,BUS,VMC,9,0,1,13
3,64.267579,-147.68704,HELIO,H-295,0,BUS,Unknown,9,0,0,16
4,64.650753,-149.83639,UNCOMMON MAKE,UNCOMMON MODEL,1,PERS,VMC,9,0,0,15


In [8]:
ct = ColumnTransformer([
    ('oh', OneHotEncoder(drop='first', sparse_output=False,handle_unknown='ignore'), ['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour' ]
    )],
    remainder ='passthrough')

In [9]:
df.shape

(5659, 11)

In [10]:
X = df.drop('InjuryBool', axis = 1)

In [11]:
y = df['InjuryBool']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25 )

In [13]:
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [14]:
logreg = LogisticRegression()

In [15]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [16]:
pgrid = {
    'C' : np.linspace(0.01, 10, 20),
    'max_iter' : [100, 500, 1000],
    'penalty' : ['l2', None]
}

In [17]:
gs = GridSearchCV(logreg, pgrid, n_jobs=10)

In [18]:
%%time
gs.fit(X_train_ct, y_train)

CPU times: total: 219 ms
Wall time: 1min 28s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
gs.best_score_

0.7417519501300086

In [20]:
gs.score(X_train_ct, y_train)

0.7448162111215834

In [22]:
gs.score(X_test_ct, y_test)

0.7342756183745583

In [23]:
gs.best_params_

{'C': 2.113157894736842, 'max_iter': 100, 'penalty': 'l2'}

In [24]:
logreg = LogisticRegression(C=2.1131, max_iter=100, penalty='l2')

In [25]:
logreg.fit(X_train_ct, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# with open ('mike_logreg.pkl', 'wb') as f:
#     pickle.dump(logreg, f)

In [26]:
pd.DataFrame(np.exp(logreg.coef_), columns=ct.get_feature_names_out()).T.sort_values(by = 0, ascending = False).head(20)

Unnamed: 0,0
oh__Make_BELL,1.794329
oh__Model_PA-18A,1.72858
oh__Model_208B,1.696947
oh__EventHour_1,1.642315
remainder__AmateurBuilt,1.573014
oh__EventMonth_12,1.539364
oh__Make_UNCOMMON MAKE,1.536203
oh__Model_206,1.525681
oh__PurposeOfFlight_Unknown,1.517312
oh__Model_207A,1.51509


In [28]:
X_dummy = pd.get_dummies(columns=['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour'], data = X, drop_first=True)

In [4]:
df2 = pd.read_csv('../datasets/alaska_single_engine_clean.csv')

In [9]:
df2['highest_injury_level'].value_counts()

None Reported    4080
Minor             640
Fatal             621
Serious           387
Name: highest_injury_level, dtype: int64

In [11]:
df2['aircraft_damage'].value_counts()

Substantial      4982
Destroyed         537
Minor             134
None Reported      52
None               18
Unknown             5
Name: aircraft_damage, dtype: int64

In [15]:
df2['model'] = df2['model'].str.upper()

In [16]:
df2['make'] = df2['make'].str.upper()

In [19]:
df2['occurred_near_airpor'] = 1 - df2['airport_name'].str.contains('Unknown').astype(int)

In [21]:
df2['purpose_of_flight'] = df2['purpose_of_flight'].map(lambda x: 'UNK' if x=='Unknown' else x)

In [22]:
def is_top_model(model, top_x):
    top_list = list(df2['model'].value_counts()[:top_x + 1].index.str.upper())
    return True if model.upper() in top_list else False

In [23]:
df2['model'] = [x.upper() if is_top_model(x,200) else 'UNCOMMON MODEL' for x in df2['model']]

In [24]:
def is_top_make(make, top_x):
    top_list = list(df2['make'].value_counts()[:top_x + 1].index.str.upper())
    return True if make.upper() in top_list else False

In [25]:
df2['make'] = [x.upper() if is_top_make(x,50) else 'UNCOMMON MAKE' for x in df2['make']]


In [32]:
df2['scheduled'] = df2['scheduled'].map(lambda x: 'UNK' if x == 'Unknown' else x)

In [36]:
X = df2.drop(['ntsb_no', 'probable_cause', 'airport_name', 'event_type', 'mkey', 'city', 'n', 'has_safety_rec', 'report_type', 'highest_injury_level', 'fatal_injury_count', 'serious_injury_count', 'minor_injury_count', 'airport_id', 'far', 'aircraft_damage', 'operator', 'event_year', 'event_season', 'event_day', 'aircraft_category', 'has_injury', 'event_time','has_aircraft_damage'], axis=1)

In [37]:
X.head()

Unnamed: 0,latitude,longitude,make,model,amateur_built,scheduled,purpose_of_flight,weather_condition,event_month,event_hour,is_accident,occurred_near_airpor
0,61.77516,-152.15263,CESSNA,UNCOMMON MODEL,0,UNK,PERS,Unknown,9,8,1,1
1,60.805019,-161.78648,PIPER,PA-18-150,0,UNK,PERS,Unknown,9,12,1,0
2,59.646929,-151.49323,CESSNA,A185F,0,UNK,BUS,VMC,9,13,1,1
3,64.267579,-147.68704,HELIO,H-295,0,NSCH,BUS,Unknown,9,16,1,0
4,64.650753,-149.83639,UNCOMMON MAKE,UNCOMMON MODEL,1,UNK,PERS,VMC,9,15,1,0


In [35]:
y = df2['has_injury']

In [38]:
X_dummy = pd.get_dummies(data=X,
                         columns=['make', 'model', 'scheduled', 'purpose_of_flight', 'weather_condition', 'event_month', 'event_hour'],
                         drop_first=True
        )

In [102]:
baseline = 1 - y.mean()
baseline

0.7122905027932961

In [49]:
X_con = sm.add_constant(X_dummy[list(X_dummy.columns)])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X_con, y, random_state=2023)

In [69]:
glm_bin = sm.GLM(
    y_train,
    X_train,
    family=sm.families.Binomial(link=sm.families.links.logit())).fit()

AttributeError: module 'statsmodels.genmod.families.links' has no attribute 'Binomial'

In [66]:
glm_bin_results = glm_bin.summary()

In [71]:
test_pred = test_pred.map(lambda x: 1 if x >=0.50 else 0)

In [74]:
type(test_pred)

pandas.core.series.Series

In [75]:
type(y_test)

pandas.core.series.Series

In [89]:
def glm_accuracy(model, x, y):
    test_pred = model.predict(x)
    test_pred = test_pred.map(lambda x: 1 if x >=0.50 else 0)
    acc = pd.concat([test_pred, y], axis = 1)
    return acc[acc.iloc[:, 0] == acc.iloc[:, 1]].shape[0] / y.shape[0]

In [76]:
acc = pd.concat([test_pred, y_test], axis = 1)

In [81]:
acc.rename({0 :'predicted', 'has_injury' : 'actual'}, axis =1 , inplace = True)

In [87]:
accuracy = acc[(acc['predicted'] == acc['actual'])].shape[0] / y_test.shape[0]

In [88]:
accuracy

0.7178770949720671

In [90]:
glm_accuracy(glm_bin, X_test, y_test)

0.7178770949720671

In [91]:
glm_accuracy(glm_poi, X_test, y_test)

0.7248603351955307

In [103]:
glm_poi_coef_df = glm_poi.params.to_frame().sort_values(by = 0, ascending = False)

In [104]:
glm_poi_coef_df.rename({0: 'coef'}, axis = 1, inplace=True)

In [105]:
glm_poi_coef_df.to_csv('../results/GLM_POI_COEF_ACC-724.csv')