In [1]:
import pandas as pd

# "Engines" Data

In [34]:
# Read only the engine-table columns needed to build engine-type indicators.
engine_columns = ['ev_id', 'Aircraft_Key', 'eng_type', 'eng_no']
data = pd.read_csv('../data/ntsb/ntsb_engines.csv', usecols=engine_columns)

# Composite join key in the form '{ev_id}_{Aircraft_Key}'.
data['event_key'] = data['ev_id'].astype(str) + '_' + data['Aircraft_Key'].astype(str)

# Keep only the rows where eng_no equals 1; eng_no is dropped afterwards
# because it is constant after the filter.
is_first_engine = data['eng_no'] == 1
data = data.loc[is_first_engine].drop(columns='eng_no')

# One 0/1 indicator column per distinct eng_type value, named 'engine_<value>'.
engine_dummies = pd.get_dummies(data['eng_type'], dtype=int, prefix='engine')

# Identifier columns side by side with the indicator columns (aligned on index).
key_columns = ['event_key', 'ev_id', 'Aircraft_Key']
engines = pd.concat([data[key_columns], engine_dummies], axis=1)

# Export is left disabled. NOTE(review): `export` is not defined in this cell;
# presumably `engines` was meant - confirm before re-enabling.
#export.to_csv('../data/ntsb/cleaned/engine_data.csv')

# "Aircraft" Data

In [36]:
# Columns of the NTSB aircraft table that are kept for the analysis.
aircraft_columns = [
    'ev_id',
    'Aircraft_Key',
    'far_part',
    'damage',
    'acft_fire',
    'acft_expl',
    'acft_make',
    'acft_category',
    'homebuilt',
    'total_seats',
    'num_eng',
    'fixed_retractable',
    'date_last_insp',
    'owner_acft',
    'certs_held',
    'oprtng_cert',
    'oper_cert',
    'type_fly',
    'second_pilot',
    'evacuation',
    'rwy_len',
    'rwy_width',
    'acft_year',
    'fuel_on_board',
    'unmanned',
]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the saved output of this cell echoed
# the read_csv line as a warning - presumably a DtypeWarning about mixed-type
# columns; confirm, and consider passing explicit dtypes for those columns.
data = pd.read_csv('../data/ntsb/ntsb_aircraft.csv',
                   usecols=aircraft_columns,
                   low_memory=False)

# Composite join key in the form '{ev_id}_{Aircraft_Key}', built the same way
# as in the engines table so the two can be merged on it.
data['event_key'] = data['ev_id'].astype(str) + '_' + data['Aircraft_Key'].astype(str)

# Independent copy, so later reassignments of `data` do not affect `aircraft`.
aircraft = data.copy()

  data = pd.read_csv('../data/ntsb/ntsb_aircraft.csv',usecols=['ev_id',


# "Findings" Data

# "Event" Data

In [40]:
# Columns of the NTSB events table that are kept for the analysis.
# ('inj_tot_m' was previously listed twice; each column is now named once.)
event_columns = [
    'ev_id',
    'ev_type',
    'ev_highest_injury',
    'inj_f_grnd',
    'inj_m_grnd',
    'inj_s_grnd',
    'inj_tot_f',
    'inj_tot_m',
    'inj_tot_n',
    'inj_tot_s',
    'inj_tot_t',
    'ev_time',
    'ev_year',
    'ev_month',
    'on_ground_collision',
    'latitude',
    'longitude',
    'apt_dist',
    'light_cond',
    'wx_dew_pt',
    'wind_vel_kts',
    'gust_kts',
    'altimeter',
]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the saved output of this cell echoed
# the read_csv line as a warning - presumably a DtypeWarning about mixed-type
# columns; confirm, and consider passing explicit dtypes for those columns.
# The result stays in `data` because the join cell further down reads it.
data = pd.read_csv('../data/ntsb/ntsb_events.csv',
                   usecols=event_columns,
                   low_memory=False)

  data = pd.read_csv('../data/ntsb/ntsb_events.csv',usecols=['ev_id',


# Joining Event Data with Aircraft-Specific Data
To join the primary "event" dataset, which is event-specific, with the other tables, which are aircraft-specific, we need to take a careful approach.  The general idea is as follows:
1. Create the "event_key" variable in the aircraft-specific datasets, which takes the format '{ev_id}_{Aircraft_Key}'.
2. Join the lower-level datasets together to maximize the number of observations.  Some will likely be in one and not another, but what is important is that we collect a list of all individual aircraft-level observations.
3. Once all lower-level datasets have been joined together and we have a list of all events with multiple aircraft, we can export an "aircraft_count" variable, which expresses the number of "Aircraft_Key" values for every "ev_id."  
4. Join this "aircraft_count" column into the "Event" dataset - now we have a count of how many planes were involved in each event.
5. Create a function which duplicates every row in "Events" (aircraft_count - 1) times.  Thus, if there are 3 planes, we'll get 2 new rows for the event.
6. Re-create the "Aircraft_Key" variable with groupby() and cumcount(), so that the rows for each ev_id are numbered one after another until there are no more observations left (this will be clearer in the code).
7. Now that we have the dataset formatted to resemble the individual-aircraft-level data from other tables, we can create the "event_key" - our master joining variable - in the events data.
8. Join all datasets on the Events data by "event_key", "Aircraft_Key", and "ev_id" to ensure we are joining the right aircraft/event combos onto the event data.

In [91]:
# Combine the aircraft-level tables on the full composite key. An outer join
# keeps aircraft that appear in only one of the two tables, so every
# aircraft-level observation is retained (a left join from `engines` would
# drop aircraft that have no engine record).
tables = pd.merge(engines,
                  aircraft,
                  on=['event_key', 'ev_id', 'Aircraft_Key'],
                  how='outer')

# Number of distinct Aircraft_Key values per event. nunique() is used rather
# than count() so repeated rows for the same aircraft are not counted twice.
# Result: one row per ev_id with columns ['ev_id', 'aircraft_count'].
aircraft_counts = (
    tables.groupby('ev_id')['Aircraft_Key']
    .nunique()
    .rename('aircraft_count')
    .reset_index()
)

In [None]:
# Attach the per-event aircraft count to the event-level table. The left join
# keeps every row of `data`; rows whose ev_id is absent from `aircraft_counts`
# get a missing aircraft_count.
df = data.merge(aircraft_counts, how='left', on='ev_id')

### NOTE: Work in progress. 