# Read & Prepare 'Findings' Dataset for Joining

In [347]:
import pandas as pd

In [348]:
## Load only the columns needed to build the per-accident findings table.
## ev_id mixes all-digit ids (e.g. 20220516105084) with alphanumeric ones
## (e.g. 20130814X15751), so it is pinned to str; otherwise pandas infers the
## type chunk by chunk and the column can end up holding both ints and strings
## (presumably the cause of the warning this cell used to emit -- TODO confirm).
df = pd.read_csv(
    '../data/ntsb/ntsb_findings.csv',
    usecols=['ev_id', 'Aircraft_Key', 'finding_description', 'Cause_Factor'],
    dtype={'ev_id': str},
)




  df = pd.read_csv('../data/ntsb/ntsb_findings.csv',usecols=['ev_id',


In [349]:
## Peek at 30 random raw rows; the fixed seed keeps the displayed sample stable across re-runs
df.sample(30, random_state=42)

Unnamed: 0,ev_id,Aircraft_Key,finding_description,Cause_Factor
25327,20130814X15751,1,Aircraft-Aircraft oper/perf/capability-Perform...,C
45749,20180507X85538,1,Personnel issues-Task performance-Use of equip...,C
15899,20110630X22315,1,Aircraft-Aircraft oper/perf/capability-Perform...,C
12884,20100923X91107,1,Aircraft-Aircraft systems-Landing gear system-...,C
1565,20080523X00719,1,Aircraft-Aircraft power plant-Ignition system-...,C
60811,20220516105084,1,Personnel issues-Action/decision-Action-Unnece...,
330,20080306X00278,1,Aircraft-Aircraft oper/perf/capability-Perform...,C
66506,20240417194104,1,Personnel issues-Task performance-Planning/pre...,
60647,20220422104986,1,Aircraft-Aircraft oper/perf/capability-Perform...,
24604,20130624X31512,1,Environmental issues-Physical environment-Terr...,


In [350]:
## Keep only findings flagged as a cause ('C') or a factor ('F');
## rows whose flag is blank/NaN (or anything else) are discarded
is_cause_or_factor = df['Cause_Factor'].isin(['C', 'F'])
df = df.loc[is_cause_or_factor].copy()

## Build one id per aircraft per event: "<ev_id>_<Aircraft_Key>",
## then retire the two columns it was built from
event_part = df['ev_id'].astype(str)
aircraft_part = df['Aircraft_Key'].astype(str)
df['accident_id'] = event_part.str.cat(aircraft_part, sep='_')
df = df.drop(columns=['ev_id', 'Aircraft_Key'])


In [351]:
## Peek at 30 random rows after filtering and re-keying; the fixed seed keeps the display stable across re-runs
df.sample(30, random_state=42)

Unnamed: 0,finding_description,Cause_Factor,accident_id
12901,Personnel issues-Task performance-Use of equip...,C,20100925X00525_1
44008,Environmental issues-Conditions/weather/phenom...,C,20171118X25754_1
50309,Personnel issues-Task performance-Planning/pre...,C,20190505X61703_1
23734,Personnel issues-Miscellaneous-Intentional act...,C,20130420X22041_1
31580,Aircraft-Aircraft systems-Fuel system-Fuel sel...,C,20150310X74323_1
10676,Personnel issues-Task performance-Planning/pre...,C,20100425X31208_1
27873,Aircraft-Aircraft power plant-Engine (turbine/...,C,20140508X30821_1
31627,Personnel issues-Task performance-Use of equip...,C,20150313X83817_1
49479,Personnel issues-Task performance-Maintenance-...,C,20190311X63755_1
12622,Personnel issues-Psychological-Attention/monit...,C,20100906X40636_1


In [352]:
## Remove rows with an uninformative finding description such as
## "Not determined-Not determined-(general)-(general)-Unknown/Not determined - C".
## na=True treats a missing description as undetermined too, so such rows are
## dropped here instead of making the `~` mask fail on NaN.
## .copy() yields an independent frame, so columns can be added to it afterwards
## without writing into a slice of the previous frame.
is_undetermined = df['finding_description'].str.startswith('Not determined', na=True)
df = df[~is_undetermined].copy()

In [353]:
## Finding descriptions are hierarchical, with the levels separated by '-'.
## Top levels seen in this data:
##   Personnel issues      - includes e.g. fatigue of a person
##   Environmental issues  - status of the ground, weather conditions, etc.
##   Aircraft              - fuel, engine, ...
##   Organizational issues - seems to cover regulatory factors
##   main system           - only four rows among ~16k cases; I don't know what this is
## For simplicity only the top level is kept as the category.
top_level = df['finding_description'].str.split('-', n=1).str[0]
df = df.assign(finding_description_category=top_level)

In [354]:
## Peek at 30 random rows with the new category column; the fixed seed keeps the display stable across re-runs
df.sample(30, random_state=42)

Unnamed: 0,finding_description,Cause_Factor,accident_id,finding_description_category
46255,Environmental issues-Conditions/weather/phenom...,C,20180611X85348_1,Environmental issues
38149,Personnel issues-Task performance-Use of equip...,C,20160812X50604_1,Personnel issues
23004,Personnel issues-Action/decision-Action-Incorr...,C,20130127X54833_1,Personnel issues
5466,Aircraft-Aircraft oper/perf/capability-Perform...,F,20090222X33307_1,Aircraft
4154,Environmental issues-Physical environment-Obje...,C,20081009X71752_1,Environmental issues
26915,Aircraft-Aircraft systems-Hydraulic power syst...,C,20140128X90012_1,Aircraft
46975,Aircraft-Aircraft oper/perf/capability-Perform...,C,20180725X00713_1,Aircraft
9140,Aircraft-Aircraft oper/perf/capability-Perform...,C,20091031X95957_1,Aircraft
41842,Personnel issues-Task performance-Use of equip...,C,20170604X74546_1,Personnel issues
55483,Aircraft-Aircraft oper/perf/capability-Perform...,C,20200914X02320_1,Aircraft


In [355]:
## Show every 'main system' finding in full, one record at a time separated by a blank line
main_system_rows = df.loc[df['finding_description_category'] == 'main system']
for _, finding in main_system_rows.iterrows():
    print(finding, end='\n\n')

finding_description             main system-Simulated malf/failure - C
Cause_Factor                                                         C
accident_id                                           20120119X11221_1
finding_description_category                               main system
Name: 18378, dtype: object

finding_description             main system-Damaged/degraded - C
Cause_Factor                                                   C
accident_id                                     20120621X34639_1
finding_description_category                         main system
Name: 20216, dtype: object

finding_description             main system-Capability exceeded - C
Cause_Factor                                                      C
accident_id                                        20120621X34639_1
finding_description_category                            main system
Name: 20217, dtype: object

finding_description             main system-Inoperative - F
Cause_Factor                            

In [356]:
## Discard the handful of 'main system' findings inspected above
df = df.loc[df['finding_description_category'] != 'main system']

In [357]:
## Count, for each accident_id, how many findings fall into each top-level category.
## (Cause_Factor is not used here: causes and factors are counted together.)

## One indicator column per category, one row per finding
findings_dummies = pd.get_dummies(df['finding_description_category'], dtype=int)
per_finding = pd.concat([df[['accident_id', 'finding_description']], findings_dummies], axis=1)

## Aggregate each column explicitly. Summing the text column would glue the
## descriptions together with no delimiter, so they are joined with '; ' instead;
## the indicator columns are summed into per-category counts.
aggregations = {'finding_description': lambda texts: '; '.join(texts.dropna())}
aggregations.update({category: 'sum' for category in findings_dummies.columns})

new_df = per_finding.groupby('accident_id').agg(aggregations)
new_df

Unnamed: 0_level_0,finding_description,Aircraft,Environmental issues,Organizational issues,Personnel issues
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20080107X00026_1,Personnel issues-Action/decision-Info processi...,0,0,0,1
20080107X00026_2,Personnel issues-Action/decision-Info processi...,0,0,0,1
20080107X00027_1,Environmental issues-Conditions/weather/phenom...,0,1,0,1
20080109X00036_1,Environmental issues-Conditions/weather/phenom...,1,1,0,0
20080114X00044_1,Personnel issues-Task performance-Inspection-S...,0,0,0,1
...,...,...,...,...,...
20200915X34450_1,Aircraft-Fluids/misc hardware-Fluids-Fuel-Flui...,1,0,0,0
20200915X70606_1,Aircraft-Aircraft oper/perf/capability-Perform...,1,0,0,1
20200917X42444_1,Personnel issues-Task performance-Use of equip...,0,0,0,1
20200920X32151_1,Aircraft-Aircraft oper/perf/capability-Perform...,1,0,0,1


In [380]:
## Number of output rows that belong to an aircraft other than aircraft 1 of its event.
## accident_id is "<ev_id>_<Aircraft_Key>", so those are the ids that do not end in "_1".
## Matching the whole "_1" suffix (not just the last character) keeps a key such as 11
## from being mistaken for aircraft 1, and the pandas-only form needs no numpy import.
(~new_df.index.str.endswith('_1')).sum()

np.int64(267)

In [358]:
## Persist the per-accident table; the accident_id index is written as the first CSV column
cleaned_findings_csv = '../data/ntsb/cleaned/findings_data.csv'
new_df.to_csv(cleaned_findings_csv)