In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the data from the sample CSV.

# Passing keep_default_na=False makes pandas read empty cells as '' instead
# of NaN. I found that temporarily helpful; we may want to take it out later.

flights = pd.read_csv(
    '../data/ntsb-sample-2013.csv',
    keep_default_na=False,
)

In [3]:
# Dropped columns and reasons for dropping them

# Not relevant
irrelevant_columns = ['Mkey', 'ReportNo', 'N#', 'SerialNumber', 'ReportType', 'ReportStatus',
                      'RepGenFlag', 'MostRecentReportType', 'OriginalPublishedDate',
                      'DocketOriginalPublishedDate', 'Operator', 'EventID', 'NtsbNo',
                      'DocketUrl', 'ReportUrl']

# (Almost) all rows have same value
near_constant_columns = ['Country', 'Mode', 'HasSafetyRec']

# Redundant
redundant_columns = ['HighestInjuryLevel']

# Drop everything in one call and rebind the name instead of using inplace=True,
# so a failure cannot leave the frame half-modified.
# A missing column still raises KeyError (the default errors='raise'), so a
# typo in the lists above is not silently ignored.
flights = flights.drop(columns=irrelevant_columns + near_constant_columns + redundant_columns)


### Possible issues with other variables

Redundancy: 
- FatalInjuryCount + SeriousInjuryCount + MinorInjuryCount = OnboardInjuryCount + OnGroundInjuryCount
- City, State, Latitude, Longitude, AirportID, AirportName

Requires significant processing:
- ProbableCause, Findings
- Make, Model (hundreds of distinct values)
- PurposeOfFlight (26 distinct values, 17 of which have < 1% occurrence rate)

Possible data leakage:
- EventType (accident vs. incident)

### Preliminary recommendations (C.J.)
- Drop the location columns (City, State, Latitude, Longitude, AirportID, AirportName) unless they are necessary for the train/test split
- Drop the Make and Model columns
- Drop 31 rows with EventType == INC, then drop column EventType
- Investigate meanings of entries in PurposeOfFlight column, group into smaller categories
- Drop rows involving multiple aircraft (identifiable by two comma-separated values in a single cell, e.g. in the FAR column)

In [4]:
# Preview the first five rows of the trimmed frame
flights.head(n=5)

Unnamed: 0,EventType,EventDate,City,State,FatalInjuryCount,SeriousInjuryCount,MinorInjuryCount,OnboardInjuryCount,OnGroundInjuryCount,ProbableCause,...,AirportName,AmateurBuilt,NumberOfEngines,EngineType,Scheduled,PurposeOfFlight,FAR,AirCraftDamage,WeatherCondition,BroadPhaseofFlight
0,ACC,2013-12-31T13:00:00Z,Salinas,California,0,0,0,0,0,The pilot's failure to maintain directional co...,...,SALINAS MUNI,False,1,REC,,PERS,91,Substantial,VMC,Landing
1,ACC,2013-12-30T12:40:00Z,Concordia,Kansas,0,0,2,2,0,The pilot's failure to maintain a proper glide...,...,BLOSSER MUNI,False,1,REC,,PERS,91,Substantial,VMC,Landing
2,ACC,2013-12-29T15:10:00Z,Paradis,Louisiana,0,0,0,0,0,The collision with a bird which resulted in su...,...,LOUIS ARMSTRONG NEW ORLEANS IN,False,1,REC,,PERS,91,Substantial,VMC,Enroute
3,ACC,2013-12-29T13:53:00Z,Pacoima,California,0,0,2,2,0,The flight instructor’s inadequate compensatio...,...,WHITEMAN,False,1,REC,,INST,91,Substantial,VMC,Maneuvering
4,ACC,2013-12-29T13:30:00Z,Roberts,Montana,0,0,0,0,0,The pilot's selection of unsuitable terrain du...,...,Private,False,1,REC,,PERS,91,Substantial,VMC,Taxi


In [5]:
# Drop rows corresponding to multi-aircraft accidents,
# indicated by a comma in the column FAR.
#
# astype(str) keeps the check working if FAR is parsed as a numeric column or
# contains missing values (e.g. if keep_default_na=False is removed from
# read_csv), so the mask is always a plain boolean Series and ~ is safe.
# regex=False matches the comma as a literal substring.
multiple_aircraft = flights['FAR'].astype(str).str.contains(',', regex=False)
flights = flights[~multiple_aircraft]