In [26]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


import statsmodels.api as sm

In [27]:
#using Mike's glm class from mike_model_maker.ipynb
class aviation_data_model:
    """Fit a binomial (logit-link) statsmodels GLM on optionally one-hot-encoded features.

    Pipeline run by ``fit_binomial``: one-hot encode the requested columns
    (dropping one reference level per column), add a constant column, split
    into train/test, and fit ``sm.GLM`` on the training portion.

    Parameters
    ----------
    x : pandas DataFrame or Series
        Feature data. It is copied, so the caller's object is never modified.
    y : array-like
        Target passed to the train/test split and then to ``sm.GLM``.
    drop_categories : list of (column name, category) tuples, or None
        Columns to one-hot encode. For each column the named category becomes
        the dropped reference level; use ``'first'`` to drop whichever level
        sorts first. ``None`` (or an empty list) skips encoding entirely.
    test_size : float
        Forwarded to ``train_test_split``.
    random_state : int
        Forwarded to ``train_test_split``.

    Attributes set by ``fit_binomial``
    ----------------------------------
    x_oh    : encoded features (stays ``None`` when encoding is skipped)
    x_con   : features with the constant column added
    model   : fitted GLM results object
    results : ``model.summary()``
    pvalues : one-column frame of p-values rounded to 4 places, sorted ascending
    """

    def __init__(self, x, y, drop_categories = None, test_size=0.3, random_state = 2023):
        self.x = x.copy()
        self.drop_categories = drop_categories
        self.y = y
        self.random_state = random_state
        self.test_size = test_size
        self.model = None
        self.x_oh = None
        self.x_con = None
        self.results = None
        self.pvalues = None

    def one_hot(self):
        """One-hot encode the columns named in ``drop_categories``.

        Returns the encoded frame and stores it on ``self.x_oh``.

        Raises
        ------
        TypeError
            If ``drop_categories`` is not a non-empty list/tuple of
            ``(column name, category)`` pairs.
        """
        specs = self.drop_categories

        # Validate every entry (not only the first) so a malformed spec fails
        # with the documented TypeError rather than an IndexError or unpacking
        # error further down.
        well_formed = (
            isinstance(specs, (list, tuple))
            and len(specs) > 0
            and all(isinstance(spec, tuple) and len(spec) == 2 for spec in specs)
        )
        if not well_formed:
            raise TypeError('Invalid OneHotEncoder values. Use None or a list of tuples of (column name, category name)')

        categories = []
        for column, drop_value in specs:
            # Collect subset of columns to dummify
            categories.append(column)

            # get_dummies(drop_first=True) drops the level that sorts first, so a
            # specific reference level is renamed with a '0-' prefix to push it to
            # the front. NOTE(review): a level starting with a character that sorts
            # before '0' (e.g. a leading space) would still come first - confirm
            # the data has none.
            if drop_value != 'first':
                self.x[column] = np.where(
                    self.x[column] == drop_value, f'0-{drop_value}', self.x[column]
                )

        # dtype=int keeps the dummies numeric on every pandas version; pandas >= 2
        # defaults to bool, which turns the design matrix into object dtype once
        # the float constant column is added.
        self.x_oh = pd.get_dummies(data=self.x, columns=categories, drop_first=True, dtype=int)
        # Log line - uncomment below to check shapes after important steps
        # print(f"OneHot Resulting Shape: {self.x_oh.shape}")
        return self.x_oh

    def train_test(self, x, y):
        """Split ``x`` and ``y`` using the instance's ``test_size`` and ``random_state``.

        Returns ``(X_train, X_test, y_train, y_test)``.
        """
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=self.test_size, random_state=self.random_state)
        return (X_train, X_test, y_train, y_test)

    def _design_features(self):
        """Return the feature frame to model: encoded if requested, otherwise raw."""
        if self.drop_categories:
            return self.one_hot()
        # No encoding requested: use the features as given. A Series is promoted
        # to a one-column frame so the constant can be added alongside it.
        if isinstance(self.x, pd.Series):
            return self.x.to_frame()
        return self.x

    def fit_binomial(self):
        """Encode, add a constant, split, and fit a binomial GLM with a logit link.

        Stores the outcome on ``self.x_con``, ``self.model``, ``self.results``
        and ``self.pvalues``; returns nothing.
        """
        # Start with OneHotEncoding if given; otherwise fall back to the raw
        # features instead of failing on an undefined variable.
        features = self._design_features()

        # Add column for constant per statsmodels GLM requirements
        x_con = sm.add_constant(features)
        self.x_con = x_con

        # Log line - uncomment below to check shapes after important steps
        # print(f"After constant Resulting Shape: {self.x_con.shape}")

        # Train Test Split
        X_train, X_test, y_train, y_test = self.train_test(x_con, self.y)

        # Log lines - uncomment below to check shapes after important steps
        # print(f"After tts X_train Resulting Shape: {X_train.shape}")
        # print(f"After tts X_test Resulting Shape: {X_test.shape}")
        # print(f"After tts y_train Resulting Shape: {y_train.shape}")
        # print(f"After tts y_test Resulting Shape: {y_test.shape}")

        # Make and fit a model
        glm_bin = sm.GLM(
            y_train,
            X_train,
            family=sm.families.Binomial(link=sm.families.links.Logit())).fit()

        # Stash results as accessible attributes
        self.results = glm_bin.summary()
        self.model = glm_bin
        self.pvalues = np.round(glm_bin.pvalues, 4).to_frame().sort_values(by = 0)

In [28]:
# Read the cleaned Alaska single-engine CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv('../datasets/alaska_single_engine_clean.csv')

In [29]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,ntsb_no,event_type,mkey,city,n,has_safety_rec,report_type,highest_injury_level,fatal_injury_count,serious_injury_count,...,event_day,event_time,event_hour,event_season,has_injury,has_aircraft_damage,is_accident,occurred_near_airport,make_model,aircraft_family
0,ANC23LA086,ACC,193153,Trimble River / Skwentna,N2586R,0,DirectorBrief,None Reported,0,0,...,24,08:30:00,8,Fall,0,1,1,0,CESSNA 182K,CESSNA 182 SKYLANE
1,ANC23LA084,ACC,193128,Bethel,N8192D,0,DirectorBrief,None Reported,0,0,...,20,12:00:00,12,Fall,0,1,1,0,PIPER PA-18-150,PIPER PA-18 SUPER CUB
2,ANC23LA080,ACC,193097,Homer,N7558H,0,DirectorBrief,None Reported,0,0,...,18,13:00:00,13,Fall,0,1,1,1,CESSNA A185F,CESSNA 185 SKYWAGON
3,ANC23LA082,ACC,193105,Beaver Creek,N713C,0,DirectorBrief,None Reported,0,0,...,16,16:50:00,16,Fall,0,1,1,0,HELIO H-295,HELIO COURIER
4,ANC23LA078,ACC,193088,NENANA,N907W,0,DirectorBrief,None Reported,0,0,...,16,15:00:00,15,Fall,0,1,1,0,RHODES STEVEN D SR3500,UNCOMMON FAMILY


In [30]:
# Single predictor (aircraft family, kept as a Series) and the injury indicator target.
X, y = data['aircraft_family'], data['has_injury']

In [31]:
# List each column's dtype to see which fields are categorical (object) vs numeric.
data.dtypes

ntsb_no                   object
event_type                object
mkey                       int64
city                      object
n                         object
has_safety_rec             int64
report_type               object
highest_injury_level      object
fatal_injury_count         int64
serious_injury_count       int64
minor_injury_count         int64
probable_cause            object
latitude                 float64
longitude                float64
make                      object
model                     object
aircraft_category         object
airport_id                object
airport_name              object
amateur_built              int64
scheduled                 object
purpose_of_flight         object
far                       object
aircraft_damage           object
weather_condition         object
operator                  object
event_year                 int64
event_month                int64
event_day                  int64
event_time                object
event_hour

In [32]:
# X is a Series, so get_dummies ignores any `columns=` argument and names each
# dummy after the category value itself (no 'aircraft_family_' prefix).
# drop_first=True removes the level that sorts first as the reference category.
# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2 defaults to bool), so they can be passed straight to statsmodels.
X_dummies = pd.get_dummies(X, drop_first=True, dtype=int)
X_dummies

Unnamed: 0,AERONCA 7AC CHAMPION,BEECHCRAFT 1900,BELL 206,CESSNA 140,CESSNA 150,CESSNA 152,CESSNA 170,CESSNA 172 SKYHAWK,CESSNA 175 SKYLARK,CESSNA 177 CARDINAL,...,PIPER PA-20 PACER,PIPER PA-22 TRI-PACER,PIPER PA-28 CHEROKEE,PIPER PA-31 NAVAJO,PIPER PA-32 CHEROKEE SIX,ROBINSON HELICOPTER COMPANY R44 II,STINSON 108,TAYLORCRAFT B,TAYLORCRAFT F-19 SPORTSMAN,UNCOMMON FAMILY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5585,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# SAMPLE!
# Build the wrapper, then fit it: the result is a fitted binomial model.
# drop_categories takes (column name, value to drop) tuples; use 'first'
# when no particular reference value matters.
binomial_model_2 = aviation_data_model(
    X,
    y,
    drop_categories=[('aircraft_family', 'first')],
)

# fit_binomial() populates the attributes read below
binomial_model_2.fit_binomial()

# summary table and rounded, sorted p-values
results2, pvalues2 = binomial_model_2.results, binomial_model_2.pvalues

In [35]:
# 70/30 split of the dummy-encoded features.
# Note: this uses seed 42, whereas aviation_data_model defaults to 2023, so the
# two models are trained on different partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies,
    y,
    random_state=42,
    test_size=0.3,
)

In [36]:
# Build (but do not yet fit) a GLM with a Gaussian family on the training split.
# NOTE(review): no constant column is added here, unlike fit_binomial in the
# class above, and X_dummies was built with drop_first=True. Without an
# intercept each coefficient is tested against zero rather than against the
# dropped reference family, which would make most p-values look tiny - confirm
# this is intended.
# NOTE(review): the target is has_injury; a Gaussian family on what appears to
# be a 0/1 indicator differs from the Binomial/logit model used by the class -
# confirm the choice.
model = sm.GLM(y_train, X_train, family=sm.families.Gaussian())

In [37]:
# Fit the GLM defined above and keep the fitted results object.
results = model.fit()

In [39]:
# Per-coefficient p-values from the fitted model, indexed by dummy column name.
p_values = results.pvalues

In [46]:
# Display the raw p-values (shown in scientific notation).
p_values

AERONCA 7AC CHAMPION                              4.574145e-01
BEECHCRAFT 1900                                   9.194665e-02
BELL 206                                          1.289959e-09
CESSNA 140                                        3.994473e-01
CESSNA 150                                        3.428205e-08
CESSNA 152                                        3.186696e-03
CESSNA 170                                        2.266565e-05
CESSNA 172 SKYHAWK                                3.802044e-13
CESSNA 175 SKYLARK                                1.372204e-01
CESSNA 177 CARDINAL                               1.004687e-02
CESSNA 180 SKYWAGON                               4.750845e-11
CESSNA 182 SKYLANE                                2.803431e-08
CESSNA 185 SKYWAGON                               5.262164e-23
CESSNA 206 STATIONAIR                             5.337353e-27
CESSNA 207                                        1.198410e-32
CESSNA 208 CARAVAN                                4.649

In [47]:
# Same p-values rounded to 4 decimals for readability; very small values display as 0.0000.
p_values.round(4)

AERONCA 7AC CHAMPION                              0.4574
BEECHCRAFT 1900                                   0.0919
BELL 206                                          0.0000
CESSNA 140                                        0.3994
CESSNA 150                                        0.0000
CESSNA 152                                        0.0032
CESSNA 170                                        0.0000
CESSNA 172 SKYHAWK                                0.0000
CESSNA 175 SKYLARK                                0.1372
CESSNA 177 CARDINAL                               0.0100
CESSNA 180 SKYWAGON                               0.0000
CESSNA 182 SKYLANE                                0.0000
CESSNA 185 SKYWAGON                               0.0000
CESSNA 206 STATIONAIR                             0.0000
CESSNA 207                                        0.0000
CESSNA 208 CARAVAN                                0.0000
CESSNA 402                                        0.0006
CHAMPION/ BELLANCA/ AMERICAN CH