In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


import statsmodels.api as sm


In [2]:
class aviation_data_model:
    def __init__(self, x, y, drop_categories = None, test_size = 0.3, random_state = 2023):
        self.x = x.copy()
        self.drop_categories = drop_categories
        self.y = y
        self.random_state = random_state
        self.test_size = test_size
        self.model = None
        self.x_oh = None
        self.x_con = None
        self.results = None
        self.pvalues = None

        
    def one_hot(self):
        #Check to see if dummy parameter passed correctly
        if type(self.drop_categories[0]) == tuple:
            
            categories = []
            for i, j in self.drop_categories:
                # Collect subset of columns to dummify
                categories.append(i)
                
                # If a specific drop value has been given for a column, add a 0- to it so it is 'first' and drops accordingly
                if j != 'first':
                    self.x[i] = np.where(self.x[i] == j, '0-' + j, self.x[i])
        else:
            raise TypeError('Invalid OneHotEncoder values. Use None or a list of tuples of (column name, category name')
            
        self.x_oh = pd.get_dummies(data= self.x, columns = categories, drop_first=True)
        # Log line - uncomment below to check shapes after important steps
        # print(f"OneHot Resulting Shape: {self.x_oh.shape}")
        return self.x_oh
    
    def train_test(self, x, y):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=self.test_size, random_state=self.random_state)
        return (X_train, X_test, y_train, y_test)
        
    
    def fit_binomial(self, link = 'logit'):
        # Start with OneHotEncoding if given
        if self.drop_categories:
            oh = self.one_hot()
        else:
            oh = self.x
            
        # Add column for constant per statsmodels GLM requirements
        x_con = sm.add_constant(oh[list(oh.columns)])
        self.x_con = x_con
        
        # Log line - uncomment below to check shapes after important steps
        # print(f"After constant Resulting Shape: {self.x_con.shape}")
        
        #Train Test Split
        X_train, X_test, y_train, y_test = self.train_test(x_con, self.y)

        
        # Log lines - uncomment below to check shapes after important steps
        # print(f"After tts X_train Resulting Shape: {X_train.shape}")
        # print(f"After tts X_test Resulting Shape: {X_test.shape}")
        # print(f"After tts y_train Resulting Shape: {y_train.shape}")
        # print(f"After tts y_test Resulting Shape: {y_test.shape}")
        
        if link == 'logit':
            link_function = sm.families.links.Logit()
        elif link == 'probit':
            link_function = sm.families.links.Probit()
        
        #Make and fit a model
        glm_bin = sm.GLM(
            y_train,
            X_train,
            family=sm.families.Binomial(link=link_function)).fit()
        
        #Stash results as accessible attributes
        self.results = glm_bin.summary()
        self.model = glm_bin
        self.pvalues = np.round(glm_bin.pvalues, 4).to_frame().sort_values(by = 0)

In [3]:
df2 = pd.read_csv('../datasets/alaska_single_engine_clean.csv')

In [4]:
df2['model'] = df2['model'].str.upper()

In [5]:
df2['make'] = df2['make'].str.upper()

In [6]:
df2['occurred_near_airport'] = 1 - df2['airport_name'].str.contains('Unknown').astype(int)

In [7]:
df2['purpose_of_flight'] = df2['purpose_of_flight'].map(lambda x: 'UNK' if x=='Unknown' else x)

In [8]:
def is_top_model(model, top_x):
    top_list = list(df2['model'].value_counts()[:top_x + 1].index.str.upper())
    return True if model.upper() in top_list else False

In [9]:
df2['model'] = [x.upper() if is_top_model(x,200) else 'UNCOMMON MODEL' for x in df2['model']]

In [10]:
def is_top_make(make, top_x):
    top_list = list(df2['make'].value_counts()[:top_x + 1].index.str.upper())
    return True if make.upper() in top_list else False

In [11]:
df2['make'] = [x.upper() if is_top_make(x,50) else 'UNCOMMON MAKE' for x in df2['make']]


In [12]:
X = df2.drop(['ntsb_no', 'probable_cause', 'airport_name', 'event_type', 'mkey', 'city', 'n', 'has_safety_rec', 'report_type', 'highest_injury_level', 'fatal_injury_count', 'serious_injury_count', 'minor_injury_count', 'airport_id', 'far', 'aircraft_damage', 'operator', 'event_year', 'event_season', 'event_day', 'aircraft_category', 'has_injury', 'event_time','has_aircraft_damage'], axis=1)

In [13]:
tst_X = pd.get_dummies(data= X, columns = ['make', 'model', 'scheduled', 'purpose_of_flight', 'weather_condition', 'event_month', 'event_hour'], drop_first=True)

In [14]:
y = df2['has_injury']

In [15]:
# SAMPLE!
# create a new object that is ultimately a fitted binomial model
binomial_model_1 = aviation_data_model( X, y, [
    # Include tuples of (column name, specific value you want dropped)
    # Just put 'first' if you don't care about dropping a specific value
    ('make', 'CESSNA'),
    ('model', 'PA-18'),
    ('scheduled', 'first'),
    ('purpose_of_flight', 'first'),
    ('weather_condition', 'first'),
    ('event_month', 'first'),
    ('event_hour', 'first')
])

# call the fit_binomial() model to get it!
binomial_model_1.fit_binomial()

# access the results
res = binomial_model_1.results
# or the p values
pvalues = binomial_model_1.pvalues

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
X_tr, X_test, y_tr, y_test = glm1.train_test(glm2.x_con, y)

In [None]:
X_tr

In [None]:
        glm_bin = sm.GLM(
            y_tr,
            X_tr,
            family=sm.families.Binomial(link=sm.families.links.Logit())).fit()

In [None]:
glm_bin.pvalues