In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
import statsmodels.api as sm
import seaborn as sns

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import pickle

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index saved into the CSV — confirm it is not meant to be a feature.
df = pd.read_csv('../datasets/mike_atb2.csv')

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,Latitude,Longitude,Make,Model,AmateurBuilt,PurposeOfFlight,WeatherCondition,EventMonth,InjuryBool,OccurredNearAirport,EventHour
0,61.77516,-152.15263,CESSNA,UNCOMMON MODEL,0,PERS,Unknown,9,0,1,8
1,60.805019,-161.78648,PIPER,PA-18-150,0,PERS,Unknown,9,0,0,12
2,59.646929,-151.49323,CESSNA,A185F,0,BUS,VMC,9,0,1,13
3,64.267579,-147.68704,HELIO,H-295,0,BUS,Unknown,9,0,0,16
4,64.650753,-149.83639,UNCOMMON MAKE,UNCOMMON MODEL,1,PERS,VMC,9,0,0,15


In [4]:
# Columns to one-hot encode. EventMonth and EventHour hold integers in the data
# preview but are encoded as categories here rather than passed through as numbers.
categorical_cols = ['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour']

# NOTE(review): with drop='first' together with handle_unknown='ignore', a category
# unseen during fit is presumably encoded as all zeros, i.e. indistinguishable from
# the dropped baseline level — confirm this is acceptable for rare Make/Model values.
one_hot = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Every column not listed above is forwarded unchanged (remainder='passthrough').
ct = ColumnTransformer(
    transformers=[('oh', one_hot, categorical_cols)],
    remainder='passthrough',
)

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5659, 11)

In [6]:
# Feature matrix: everything except the target.
# The 'Unnamed: 0' column seen in df.head() is the row index that was saved into
# the CSV; left in place it would reach the model through the ColumnTransformer's
# remainder='passthrough' as a spurious numeric predictor, so it is removed here.
# errors='ignore' keeps this cell working if the index column is already absent,
# while a missing target column still raises.
X = (
    df
    .drop(columns='InjuryBool')
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [7]:
# Target vector: the InjuryBool column (0/1 values in the data preview).
y = df['InjuryBool']

In [8]:
# Hold out 25% of the rows for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same scores and fitted coefficients).
# stratify=y keeps the InjuryBool class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [9]:
# Learn the encoder's categories from the training rows only, then apply the
# already-fitted transformer to the test rows so no test information is used in fitting.
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)



In [10]:
# Base estimator with default settings; it is handed to the grid search below.
logreg = LogisticRegression()

In [11]:
# List the estimator's parameters and their current (default) values.
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [12]:
# Search space for the grid search, written as a list of sub-grids so that each
# penalty is only combined with the parameters that affect it.
#
# The earlier single-dict grid produced 20 * 3 * 2 = 120 candidates: it crossed
# penalty=None with every C value (C has no effect without a penalty) and searched
# max_iter, which is a solver iteration cap rather than a model hyperparameter.
# Here max_iter is fixed at the largest value previously tried, giving 21 candidates.
pgrid = [
    # L2-penalised models: sweep the inverse regularisation strength C.
    {
        'penalty': ['l2'],
        'C': np.linspace(0.01, 10, 20),
        'max_iter': [1000],
    },
    # Unpenalised model: one candidate is enough because C is ignored.
    {
        'penalty': [None],
        'max_iter': [1000],
    },
]

In [13]:
# Exhaustive search over pgrid using the base logistic regression.
# Scoring and cross-validation strategy are left at GridSearchCV's defaults;
# n_jobs=10 requests ten parallel workers.
gs = GridSearchCV(
    estimator=logreg,
    param_grid=pgrid,
    n_jobs=10,
)

In [14]:
%%time
# Run the grid search on the encoded training data; %%time reports how long it took.
# The later gs.* cells only work once this cell has finished without interruption.
gs.fit(X_train_ct, y_train)

KeyboardInterrupt: 

In [15]:
# Best cross-validated score found by the search.
# This attribute exists only after gs.fit has completed.
gs.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [None]:
# Score of the search's refit model on the training data.
gs.score(X_train_ct, y_train)

In [None]:
# Score on the held-out test data; compare with the training score to gauge overfitting.
gs.score(X_test_ct, y_test)

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Rebuild the model with hand-entered hyperparameters; this rebinds the name
# `logreg` that was previously passed to the grid search.
# NOTE(review): C=2.1131 is presumably copied from an earlier grid-search result
# (it is close to the fifth point of np.linspace(0.01, 10, 20)) — confirm it
# still matches gs.best_params_ before relying on it.
logreg = LogisticRegression(C=2.1131, max_iter=100, penalty='l2')

In [None]:
# Fit the final model on the encoded training data.
logreg.fit(X_train_ct, y_train)

In [None]:
# with open ('mike_logreg.pkl', 'wb') as f:
#     pickle.dump(logreg, f)

In [None]:
# Exponentiate the fitted coefficients and label them with the transformer's
# output feature names, so each row is one feature's multiplicative effect on the odds.
feature_names = ct.get_feature_names_out()
odds_ratios = pd.DataFrame(np.exp(logreg.coef_), columns=feature_names).T

# Show the 20 features with the largest values (column 0 holds the single coefficient row).
odds_ratios.sort_values(by=0, ascending=False).head(20)

In [None]:
# Pandas-side dummy encoding of the same categorical columns, dropping the
# first level of each to mirror the encoder's drop='first'.
dummy_cols = ['Make', 'Model', 'PurposeOfFlight', 'WeatherCondition', 'EventMonth', 'EventHour']
X_dummy = pd.get_dummies(X, columns=dummy_cols, drop_first=True)