## Feature selection

This notebook uses Random Forest models to narrow down features for predicting injury proportions and damage.

__Current Approach__

For predicting both injury proportion and damage:
1. Fit a Random Forest with default parameters
2. Drop features below a certain importance threshold
3. Repeat steps 1 and 2 as needed

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Show the column names of `data` (defined outside this section) for reference
# when picking the candidate features below.
data.columns

Index(['ntsb_no', 'ev_type', 'ev_country', 'ev_year', 'ev_month', 'latitude',
       'longitude', 'apt_dist', 'gust_kts', 'altimeter', 'ev_highest_injury',
       'inj_f_grnd', 'inj_m_grnd', 'inj_s_grnd', 'inj_tot_f', 'inj_tot_m',
       'inj_tot_n', 'inj_tot_s', 'inj_tot_t', 'aircraft_count', 'Aircraft_ID',
       'event_key', 'damage', 'acft_model', 'total_seats', 'num_eng',
       'date_last_insp', 'Fatal_count', 'Minor_count', 'None_count',
       'Serious_count', 'total_person_count', 'injured_person_count',
       'ground_injury_total', 'light_cond_DAYL', 'light_cond_DUSK',
       'light_cond_NDRK', 'light_cond_NITE', 'light_cond_other/unknown',
       'BroadPhaseofFlight_Air', 'BroadPhaseofFlight_Ground',
       'BroadPhaseofFlight_Landing', 'BroadPhaseofFlight_Takeoff',
       'BroadPhaseofFlight_other/unknown', 'eng_type_REC', 'eng_type_TF',
       'eng_type_TP', 'eng_type_TS', 'eng_type_other/unknown', 'far_part_091',
       'far_part_121', 'far_part_135', 'far_part_137', 'fa

In [None]:
# Candidate predictors handed to the Random Forest, grouped by the variable
# each column name is prefixed with. Order matters: importances are matched
# back to names by position.
features = [
    'num_people_onboard',
    # lighting condition
    'light_cond_DAYL', 'light_cond_DUSK', 'light_cond_NDRK', 'light_cond_NITE',
    'light_cond_other/unknown',
    # broad phase of flight
    'BroadPhaseofFlight_Air', 'BroadPhaseofFlight_Ground',
    'BroadPhaseofFlight_Landing', 'BroadPhaseofFlight_Takeoff',
    'BroadPhaseofFlight_other/unknown',
    # engine type
    'eng_type_REC', 'eng_type_TF', 'eng_type_TP', 'eng_type_TS',
    'eng_type_other/unknown',
    # FAR part
    'far_part_091', 'far_part_121', 'far_part_135', 'far_part_137',
    'far_part_PUBU', 'far_part_other/unknown',
    # aircraft make
    'acft_make_beech', 'acft_make_bell', 'acft_make_boeing',
    'acft_make_cessna', 'acft_make_mooney', 'acft_make_other/unknown',
    'acft_make_piper', 'acft_make_robinson helicopter',
    # aircraft category
    'acft_category_AIR', 'acft_category_HELI', 'acft_category_other/unknown',
    # homebuilt flag
    'homebuilt_N', 'homebuilt_Y', 'homebuilt_other/unknown',
    # fixed vs. retractable
    'fixed_retractable_FIXD', 'fixed_retractable_RETR',
    'fixed_retractable_other/unknown',
    # second pilot flag
    'second_pilot_N', 'second_pilot_Y', 'second_pilot_other/unknown',
]

In [None]:
target = 'damage'

# Seed the forest so the importance ranking (and therefore which features clear
# the importance threshold in the cells below) is identical on every
# Restart & Run All. All other hyperparameters stay at their defaults.
rf = RandomForestClassifier(random_state=42)

# Keep only the rows where the target is known.
# NOTE: despite its name, this frame is filtered on `target` ('damage' here),
# not on injury proportion; the name is kept because a later cell reads it.
data_inj_prop_known = data.loc[~data[target].isna()]

# Design matrix (candidate features, in `features` order) and label vector.
X = data_inj_prop_known[features]
y = data_inj_prop_known[target]

In [None]:
# Fit the forest on every candidate feature.
rf.fit(X, y)

# Map each feature name to its importance score; the i-th score is paired
# with the i-th name in `features`.
feature_importances = dict(zip(features, rf.feature_importances_))

In [None]:
# Exploratory sweep (currently disabled): count how many features exceed each candidate importance threshold
# alpha = np.arange(15)/100

# for a in alpha:
#    important_dummies = {item for item in feature_importances.items() if item[1] > a}
#    print(f'Importance threshold {a} -- {len(important_dummies)} variables')

In [None]:
# important_features = {item for item in feature_importances.items() if item[1] > 0.02}
# important_features

In [None]:
# Keep the names of the features whose importance is strictly above 0.02,
# preserving their original order.
important_features = [
    name
    for name, score in feature_importances.items()
    if score > 0.02
]

In [None]:
# Second pass: refit the same estimator using only the features that cleared
# the importance threshold.
X_new = data_inj_prop_known[important_features]
rf.fit(X_new, y)

# Importance of each retained feature in the refitted model, paired by position.
new_importances = dict(zip(important_features, rf.feature_importances_))

new_importances

{'num_people_onboard': np.float64(0.238190143955991),
 'BroadPhaseofFlight_Air': np.float64(0.05961742533296121),
 'BroadPhaseofFlight_Landing': np.float64(0.03017209361354894),
 'eng_type_REC': np.float64(0.03703450897600033),
 'eng_type_other/unknown': np.float64(0.04531399126858635),
 'far_part_091': np.float64(0.04125019361945325),
 'far_part_other/unknown': np.float64(0.07981784024221317),
 'acft_make_other/unknown': np.float64(0.025517785583144416),
 'acft_category_other/unknown': np.float64(0.03696840746069401),
 'homebuilt_other/unknown': np.float64(0.12584339568559513),
 'fixed_retractable_FIXD': np.float64(0.02802089848689157),
 'fixed_retractable_RETR': np.float64(0.01912664684697265),
 'fixed_retractable_other/unknown': np.float64(0.1679666199795361),
 'second_pilot_N': np.float64(0.02750698711781947),
 'second_pilot_other/unknown': np.float64(0.037653061830592464)}