In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

import pickle

In [11]:
# Read the modelling dataset from disk into `df`.
DATA_PATH = '../datasets/mike_atb.csv'
df = pd.read_csv(DATA_PATH)

In [13]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Latitude,Longitude,Make,Model,AmateurBuilt,PurposeOfFlight,WeatherCondition,EventMonth,InjuryBool,OccurredNearAirport,EventHour
0,61.77516,-152.15263,CESSNA,OTHER,0,PERS,Unknown,9,0,1,8
1,60.805019,-161.78648,PIPER,PA-18-150,0,PERS,Unknown,9,0,0,12
2,59.646929,-151.49323,CESSNA,A185F,0,BUS,VMC,9,0,1,13
3,64.267579,-147.68704,HELIO,H-295,0,BUS,Unknown,9,0,0,16
4,64.650753,-149.83639,OTHER,OTHER,1,PERS,VMC,9,0,0,15


In [15]:
# Preprocessor: one-hot encode the categorical columns and let every other
# column pass through untouched.
#   - drop='first' removes one level per feature.
#   - handle_unknown='ignore' encodes categories unseen during fit as all zeros
#     instead of raising at transform time.
#   - sparse_output=False returns a dense array.
categorical_cols = ['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour']
onehot = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
ct = ColumnTransformer(
    transformers=[('oh', onehot, categorical_cols)],
    remainder='passthrough',
)

In [19]:
# (rows, columns) of the loaded frame.
df.shape

(5659, 11)

In [16]:
# Feature matrix: every column except the target `InjuryBool`.
X = df.drop(columns='InjuryBool')

In [17]:
# Target vector: the `InjuryBool` column.
y = df.loc[:, 'InjuryBool']

In [92]:
# Hold out 25% of the rows as a test set.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible across kernel restarts; stratify=y keeps the InjuryBool class
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [93]:
# Learn the encoding from the training rows only, then apply that same fitted
# transformer to both partitions so no test information reaches the fit.
ct.fit(X_train)
X_train_ct = ct.transform(X_train)
X_test_ct = ct.transform(X_test)

In [26]:
# Logistic-regression estimator with default hyperparameters; it is the base
# estimator handed to the grid search below.
logreg = LogisticRegression()

In [28]:
# Show the estimator's current hyperparameters and their values.
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [37]:
# Hyperparameter search space for LogisticRegression.
# It is a list of two sub-grids because `C` (inverse regularisation strength)
# only matters when a penalty is applied. Crossing every `C` value with
# penalty=None would refit the same unpenalised model 20 times per `max_iter`
# and trigger "C is ignored" warnings, so the unpenalised case gets its own
# sub-grid without `C`.
pgrid = [
    {
        'penalty': ['l2'],
        'C': np.linspace(0.01, 10, 20),
        'max_iter': [100, 500, 1000],
    },
    {
        'penalty': [None],
        'max_iter': [100, 500, 1000],
    },
]

In [38]:
# Exhaustive cross-validated search over `pgrid` for the logistic regression,
# run on 10 parallel workers. No `scoring` is given, so the estimator's default
# scorer is used.
gs = GridSearchCV(estimator=logreg, param_grid=pgrid, n_jobs=10)

In [39]:
%%time
# Run the grid search on the encoded training data. The %%time cell magic must
# stay on the first line of the cell; it reports how long the search takes.
gs.fit(X_train_ct, y_train)

CPU times: total: 93.8 ms
Wall time: 1min 3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
# Mean cross-validated score of the best parameter combination found.
gs.best_score_

0.741512489721537

In [42]:
# Score of the refit best estimator on the training partition.
gs.score(X_train_ct, y_train)

0.7441093308199811

In [43]:
# Score of the refit best estimator on the held-out test partition.
# The features must be the encoded *test* rows (X_test_ct) so they line up with
# y_test; pairing the training features with the test labels is invalid.
gs.score(X_test_ct, y_test)

0.7371024734982332

In [45]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

{'C': 0.5357894736842106, 'max_iter': 100, 'penalty': 'l2'}

In [94]:
# Rebuild the logistic regression with hard-coded hyperparameters. They mirror
# the grid-search best_params_ shown above, with C shortened to five decimals.
best_logreg_params = {'C': 0.53578, 'max_iter': 100, 'penalty': 'l2'}
logreg = LogisticRegression(**best_logreg_params)

In [95]:
# Fit the tuned logistic regression on the encoded training data.
logreg.fit(X_train_ct, y_train)

In [91]:
# Serialise the logistic-regression model to disk.
# NOTE(review): only the estimator is written; the ColumnTransformer `ct` is not
# saved here, so whoever loads this file needs the same preprocessing - confirm.
with open('mike_logreg.pkl', mode='wb') as model_file:
    pickle.dump(logreg, model_file)

In [96]:
# Exponentiate the fitted coefficients (odds ratios), label each with its
# transformed feature name, and show the 20 largest.
odds_ratios = pd.DataFrame(
    np.exp(logreg.coef_),
    columns=ct.get_feature_names_out(),
).T
odds_ratios.sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
oh__EventHour_1,2.946737
oh__Model_PA-28-140,2.69468
oh__Model_150L,2.550394
oh__Model_PA-32-260,2.081159
oh__Model_AS350,1.854866
oh__Model_PA-18A,1.853451
oh__Make_BELL,1.842588
oh__Model_M-5-235C,1.841321
oh__Model_182B,1.839382
oh__Model_U206E,1.832306


In [71]:
# Redefine the preprocessor (this replaces the earlier `ct`): same one-hot
# encoding of the categorical columns, plus standardisation of the two
# coordinate columns. Any remaining columns pass through unchanged.
onehot_cols = ['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour']
scaled_cols = ['Latitude', 'Longitude']
ct = ColumnTransformer(
    transformers=[
        ('oh', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), onehot_cols),
        ('sc', StandardScaler(), scaled_cols),
    ],
    remainder='passthrough',
)

In [72]:
# Re-fit the redefined preprocessor on the training rows only, then overwrite
# the encoded train/test matrices with its output.
ct.fit(X_train)
X_train_ct = ct.transform(X_train)
X_test_ct = ct.transform(X_test)



In [73]:
# Linear support-vector classifier; dual='auto' lets scikit-learn choose
# between the primal and dual formulation.
lsvc = LinearSVC(dual='auto')

In [74]:
# Search space for LinearSVC: 20 evenly spaced values of the regularisation
# parameter C between 0.001 and 1 (inclusive).
pgrid = {'C': np.linspace(start=0.001, stop=1, num=20)}

In [75]:
# Cross-validated grid search over `pgrid` for the linear SVC on 10 workers
# (this rebinds `gs`, replacing the previous search object).
gs = GridSearchCV(estimator=lsvc, param_grid=pgrid, n_jobs=10)

In [76]:
# Run the LinearSVC grid search on the encoded and scaled training data.
gs.fit(X_train_ct, y_train)

In [77]:
# Mean cross-validated score of the best LinearSVC configuration.
gs.best_score_

0.740570204680312

In [80]:
# Kernel support-vector classifier with default hyperparameters; it is the base
# estimator for the next grid search.
svc = SVC()

In [84]:
# Hyperparameter search space for SVC.
# `degree` is only used by the polynomial kernel, so it is tuned in a separate
# sub-grid. Crossing it with 'rbf' and 'sigmoid' would fit every one of those
# candidates twice with identical results.
pgrid = [
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': np.linspace(0.001, 1, 20),
    },
    {
        'kernel': ['poly'],
        'C': np.linspace(0.001, 1, 20),
        'degree': [2, 3],
    },
]

In [85]:
# Cross-validated grid search over `pgrid` for the kernel SVC on 10 workers
# (this rebinds `gs`, replacing the previous search object).
gs = GridSearchCV(estimator=svc, param_grid=pgrid, n_jobs=10)

In [86]:
# Run the SVC grid search on the encoded and scaled training data.
gs.fit(X_train_ct, y_train)

In [87]:
# Mean cross-validated score of the best SVC configuration.
gs.best_score_

0.7433987265817721

In [88]:
# Score of the refit best SVC on the training partition.
gs.score(X_train_ct, y_train)

0.751885014137606

In [89]:
# Score of the refit best SVC on the held-out test partition.
gs.score(X_test_ct, y_test)

0.7385159010600707