In [1]:
import pandas as pd
import numpy as np

# Display options: show every column when a wide frame is rendered.
# Optional, left disabled: pd.set_option('display.max_rows', None) to show every row too.
pd.set_option('display.max_columns', None)
# To restore the defaults: pd.reset_option('display.max_rows'), pd.reset_option('display.max_columns')

In [2]:
# Load the raw export. low_memory=False asks pandas to process the file in one pass
# rather than in chunks, which avoids mixed-type inference within a column.
ak = pd.read_csv('../datasets/alaska_single_engine.csv', low_memory=False)

In [3]:
# inspect the dtypes inferred for the raw columns
ak.dtypes

Unnamed: 0               int64
NtsbNo                  object
EventType               object
Mkey                     int64
EventDate               object
City                    object
State                   object
Country                 object
ReportNo                object
N                       object
HasSafetyRec              bool
ReportType              object
OriginalPublishDate     object
HighestInjuryLevel      object
FatalInjuryCount         int64
SeriousInjuryCount       int64
MinorInjuryCount         int64
ProbableCause           object
EventID                float64
Latitude               float64
Longitude              float64
Make                    object
Model                   object
AirCraftCategory        object
AirportID               object
AirportName             object
AmateurBuilt            object
NumberOfEngines         object
Scheduled               object
PurposeOfFlight         object
FAR                     object
AirCraftDamage          object
WeatherC

In [4]:
# count missing values per raw column
ak.isna().sum()

Unnamed: 0                0
NtsbNo                    0
EventType                 0
Mkey                      0
EventDate                 0
City                      0
State                     0
Country                   0
ReportNo               5645
N                         0
HasSafetyRec              0
ReportType               22
OriginalPublishDate     885
HighestInjuryLevel     4044
FatalInjuryCount          0
SeriousInjuryCount        0
MinorInjuryCount          0
ProbableCause          1452
EventID                5659
Latitude                  0
Longitude                 0
Make                      0
Model                     1
AirCraftCategory          0
AirportID              3510
AirportName            3196
AmateurBuilt              0
NumberOfEngines           0
Scheduled              4342
PurposeOfFlight         414
FAR                       0
AirCraftDamage           52
WeatherCondition         20
Operator               2842
ReportStatus              0
RepGenFlag          

In [5]:
# NOTE: this binds a second name to the same DataFrame; it does not copy it.
# The column assignments in the following cells therefore also modify `ak`,
# and the later `ak.drop(...)` cell relies on that to see the engineered Event* columns.
ak_clean = ak

In [6]:
# convert to datetime
# EventDate is loaded as object dtype; the .dt accessors used in the following cells
# need a datetime column.
ak_clean['EventDate'] = pd.to_datetime(ak_clean['EventDate'])

In [7]:
# engineer year
ak_clean['EventYear'] = ak_clean['EventDate'].dt.year

In [8]:
# engineer month (1-12); the season lookup in a later cell is derived from this column
ak_clean['EventMonth'] = ak_clean['EventDate'].dt.month

In [9]:
# engineer day of month
ak_clean['EventDay'] = ak_clean['EventDate'].dt.day

In [10]:
# engineer time of day; .dt.time yields Python datetime.time objects, so this column is object dtype
ak_clean['EventTime'] = ak_clean['EventDate'].dt.time

In [11]:
# engineer hour
# Read the hour straight from the datetime column (vectorized) instead of looping
# in Python over the datetime.time objects stored in EventTime.
ak_clean['EventHour'] = ak_clean['EventDate'].dt.hour

In [12]:
def get_season(month):
    """Return the season name for a month number.

    Months 3-5 map to "Spring", 6-8 to "Summer" and 9-11 to "Fall".
    Every other value (including 12, 1 and 2) maps to "Winter".
    """
    season_ranges = (
        (3, 5, "Spring"),
        (6, 8, "Summer"),
        (9, 11, "Fall"),
    )
    for first_month, last_month, season in season_ranges:
        if first_month <= month <= last_month:
            return season
    # Anything outside the three ranges above falls through to winter.
    return "Winter"

In [13]:
# engineer season
# element-wise lookup of the season name for each event month
ak_clean['EventSeason'] = ak_clean['EventMonth'].map(get_season)

In [14]:
# drop unnecessary columns
# The drop reads from `ak`; that frame already carries the engineered Event* columns
# because `ak_clean` was bound to the same object when they were added.
ak_clean = ak.drop(
    columns=[
        'Unnamed: 0',
        'Unnamed: 37',
        'DocketUrl',
        'DocketPublishDate',
        'State',
        'OriginalPublishDate',
        'NumberOfEngines',
        'ReportNo',
        'Country',
        'EventID',
        'EventDate',
        'ReportStatus',
        'RepGenFlag',
    ]
)

In [15]:
# replace extra commas
# Commas that belong to an operator's name are removed/replaced so that the comma-based
# splitting in split_and_explode does not cut a single operator name into several parts.
# Patterns are literal text, hence regex=False. (The 'SPARKS,' replacement used to be
# applied twice; the second pass was a no-op and has been removed.)
ak_clean['Operator'] = (
    ak_clean['Operator']
    .str.replace(', LLC', ' LLC', regex=False)
    .str.replace(', Inc', ' Inc', regex=False)
    .str.replace(', INC', ' INC', regex=False)
    .str.replace('SPARKS,', 'SPARKS_', regex=False)
    .str.replace('STUHR,', 'STUHR_', regex=False)
)

In [16]:
def _contains_comma(value):
    """Return True only for strings that contain a comma (NaN/None/numbers give False)."""
    return isinstance(value, str) and ',' in value


def split_and_explode(row):
    """Split one comma-packed row into one row per comma-separated entry.

    A row is split only when *every* per-aircraft column listed in
    ``columns_to_split`` holds a string containing a comma. Any other row,
    including rows where one of those cells is missing (NaN/None) or is not a
    string, is returned untouched.

    Parameters
    ----------
    row : pandas.Series or other mapping
        Must support ``row[column]`` for each split column and ``row.copy()``.

    Returns
    -------
    list
        ``[row]`` (the same object) when the row is not split; otherwise a
        list of copies of ``row``, one per position, where each split column
        holds the stripped value at that position, or ``None`` when that
        column has fewer entries than the longest one. Columns outside
        ``columns_to_split`` are copied unchanged. The input row is never
        modified.
    """
    columns_to_split = ['N', 'Make', 'Model', 'AirCraftCategory', 'PurposeOfFlight', 'FAR', 'AirCraftDamage', 'Operator', 'AmateurBuilt']

    # Guard with a type check: `',' in value` raises TypeError on float NaN / None cells.
    if not all(_contains_comma(row[column]) for column in columns_to_split):
        return [row]

    # Split every column once up front instead of once per generated row.
    split_values = {
        column: [part.strip() for part in row[column].split(',')]
        for column in columns_to_split
    }
    max_len = max(len(parts) for parts in split_values.values())

    result = []
    for i in range(max_len):
        new_row = row.copy()
        for column, parts in split_values.items():
            # Columns with fewer entries than the longest one are padded with None.
            new_row[column] = parts[i] if i < len(parts) else None
        result.append(new_row)

    return result

In [17]:
# split and explode rows with multiple aircraft
# Each input row contributes either itself or its per-aircraft copies, in order.
new_rows = [
    exploded_row
    for _, event_row in ak_clean.iterrows()
    for exploded_row in split_and_explode(event_row)
]

# Exploded copies share their parent's index label, so renumber from 0.
ak_clean = pd.DataFrame(new_rows).reset_index(drop=True)

In [18]:
# remap booleans
# The column is expected to hold the strings 'FALSE'/'TRUE' at this point. Cast explicitly
# instead of relying on replace() to down-cast the object column to integers (that implicit
# down-casting is deprecated in recent pandas), matching the HasSafetyRec conversion.
ak_clean['AmateurBuilt'] = ak_clean['AmateurBuilt'].replace({'FALSE': 0, 'TRUE': 1}).astype(int)

In [19]:
# HasSafetyRec is loaded as real booleans (bool dtype), not the strings 'False'/'True',
# so a string replace never matches; casting True/False to 1/0 is all that is needed.
ak_clean['HasSafetyRec'] = ak_clean['HasSafetyRec'].astype(int)

In [20]:
# Turn empty strings into missing values so the fillna cells below can label them.
# np.nan is used rather than None: with value=None, pandas releases before 1.4 ignore the
# value and pad-fill from the previous row instead, which would silently copy data across events.
ak_clean = ak_clean.replace('', np.nan)

In [21]:
# fill na
# Missing injury level / aircraft damage entries are labelled 'None Reported'.
ak_clean = ak_clean.fillna(
    value={
        'HighestInjuryLevel': 'None Reported',
        'AirCraftDamage': 'None Reported',
    }
)

In [22]:
# Remaining gaps in these descriptive columns are labelled 'Unknown'.
ak_clean = ak_clean.fillna(
    dict.fromkeys(
        [
            'WeatherCondition',
            'Scheduled',
            'Operator',
            'AirportID',
            'AirportName',
            'PurposeOfFlight',
            'ReportType',
            'ProbableCause',
            'Model',
        ],
        'Unknown',
    )
)

In [23]:
def camel_to_snake(column_name):
    """Convert a CamelCase name to snake_case.

    Every uppercase character is lower-cased. An underscore is inserted in
    front of it when it is not the first character and the character before
    it is not uppercase, so runs of capitals stay together
    (``'EventID'`` -> ``'event_id'``, ``'FAR'`` -> ``'far'``).
    All other characters are copied through unchanged.
    """
    pieces = []
    for position, char in enumerate(column_name):
        if not char.isupper():
            pieces.append(char)
            continue
        # Start a new word only at a lower/other -> upper boundary.
        starts_new_word = position > 0 and not column_name[position - 1].isupper()
        if starts_new_word:
            pieces.append('_')
        pieces.append(char.lower())
    return ''.join(pieces)

In [24]:
# camel to snake case feature names
ak_clean.columns = list(map(camel_to_snake, ak_clean.columns))

In [25]:
# 'AirCraft...' is split into 'air_craft_...' by the case conversion; join it back into one word.
ak_clean = ak_clean.rename(
    columns={
        'air_craft_category': 'aircraft_category',
        'air_craft_damage': 'aircraft_damage',
    }
)

In [27]:
# engineer injury bool
# 1 when an injury level is recorded, 0 for the 'None Reported' placeholder.
# Vectorized comparison instead of a per-row lambda.
ak_clean['has_injury'] = ak_clean['highest_injury_level'].ne('None Reported').astype('int64')

In [28]:
# engineer aircraft damage bool
# 1 when a damage level is recorded, 0 for the 'None Reported' placeholder.
# Vectorized comparison instead of a per-row lambda.
ak_clean['has_aircraft_damage'] = ak_clean['aircraft_damage'].ne('None Reported').astype('int64')

In [29]:
# engineer event type bool
# 0 for event_type 'INC', 1 for every other event type.
# Vectorized comparison instead of a per-row lambda.
ak_clean['is_accident'] = ak_clean['event_type'].ne('INC').astype('int64')

In [30]:
# preview the first rows of the cleaned frame
ak_clean.head()

Unnamed: 0,ntsb_no,event_type,mkey,city,n,has_safety_rec,report_type,highest_injury_level,fatal_injury_count,serious_injury_count,minor_injury_count,probable_cause,latitude,longitude,make,model,aircraft_category,airport_id,airport_name,amateur_built,scheduled,purpose_of_flight,far,aircraft_damage,weather_condition,operator,event_year,event_month,event_day,event_time,event_hour,event_season,has_injury,has_aircraft_damage,is_accident
0,ANC23LA086,ACC,193153,Trimble River / Skwentna,N2586R,0,DirectorBrief,None Reported,0,0,0,Unknown,61.77516,-152.15263,CESSNA,182K,AIR,Unknown,Trimble River,0,Unknown,PERS,91,Substantial,Unknown,Unknown,2023,9,24,08:30:00,8,Fall,0,1,1
1,ANC23LA084,ACC,193128,Bethel,N8192D,0,DirectorBrief,None Reported,0,0,0,Unknown,60.805019,-161.78648,PIPER,PA-18-150,AIR,Unknown,Unknown,0,Unknown,PERS,91,Substantial,Unknown,Unknown,2023,9,20,12:00:00,12,Fall,0,1,1
2,ANC23LA080,ACC,193097,Homer,N7558H,0,DirectorBrief,None Reported,0,0,0,Unknown,59.646929,-151.49323,CESSNA,A185F,AIR,5BL,HOMER-BELUGA LAKE,0,Unknown,BUS,91,Substantial,VMC,Adventure Airways,2023,9,18,13:00:00,13,Fall,0,1,1
3,ANC23LA082,ACC,193105,Beaver Creek,N713C,0,DirectorBrief,None Reported,0,0,0,Unknown,64.267579,-147.68704,HELIO,H-295,AIR,Unknown,Unknown,0,NSCH,BUS,135,Substantial,Unknown,WRIGHT AIR SERVICE INC,2023,9,16,16:50:00,16,Fall,0,1,1
4,ANC23LA078,ACC,193088,NENANA,N907W,0,DirectorBrief,None Reported,0,0,0,Unknown,64.650753,-149.83639,RHODES STEVEN D,SR3500,AIR,Unknown,Unknown,1,Unknown,PERS,91,Substantial,VMC,Unknown,2023,9,16,15:00:00,15,Fall,0,1,1


In [31]:
# verify that no missing values remain after cleaning
ak_clean.isna().sum()

ntsb_no                 0
event_type              0
mkey                    0
city                    0
n                       0
has_safety_rec          0
report_type             0
highest_injury_level    0
fatal_injury_count      0
serious_injury_count    0
minor_injury_count      0
probable_cause          0
latitude                0
longitude               0
make                    0
model                   0
aircraft_category       0
airport_id              0
airport_name            0
amateur_built           0
scheduled               0
purpose_of_flight       0
far                     0
aircraft_damage         0
weather_condition       0
operator                0
event_year              0
event_month             0
event_day               0
event_time              0
event_hour              0
event_season            0
has_injury              0
has_aircraft_damage     0
is_accident             0
dtype: int64

In [32]:
# check the dtypes of the cleaned columns
ak_clean.dtypes

ntsb_no                  object
event_type               object
mkey                      int64
city                     object
n                        object
has_safety_rec            int64
report_type              object
highest_injury_level     object
fatal_injury_count        int64
serious_injury_count      int64
minor_injury_count        int64
probable_cause           object
latitude                float64
longitude               float64
make                     object
model                    object
aircraft_category        object
airport_id               object
airport_name             object
amateur_built             int64
scheduled                object
purpose_of_flight        object
far                      object
aircraft_damage          object
weather_condition        object
operator                 object
event_year                int64
event_month               int64
event_day                 int64
event_time               object
event_hour                int64
event_se

In [34]:
# final look at the first rows of the cleaned frame
ak_clean.head()

Unnamed: 0,ntsb_no,event_type,mkey,city,n,has_safety_rec,report_type,highest_injury_level,fatal_injury_count,serious_injury_count,minor_injury_count,probable_cause,latitude,longitude,make,model,aircraft_category,airport_id,airport_name,amateur_built,scheduled,purpose_of_flight,far,aircraft_damage,weather_condition,operator,event_year,event_month,event_day,event_time,event_hour,event_season,has_injury,has_aircraft_damage,is_accident
0,ANC23LA086,ACC,193153,Trimble River / Skwentna,N2586R,0,DirectorBrief,None Reported,0,0,0,Unknown,61.77516,-152.15263,CESSNA,182K,AIR,Unknown,Trimble River,0,Unknown,PERS,91,Substantial,Unknown,Unknown,2023,9,24,08:30:00,8,Fall,0,1,1
1,ANC23LA084,ACC,193128,Bethel,N8192D,0,DirectorBrief,None Reported,0,0,0,Unknown,60.805019,-161.78648,PIPER,PA-18-150,AIR,Unknown,Unknown,0,Unknown,PERS,91,Substantial,Unknown,Unknown,2023,9,20,12:00:00,12,Fall,0,1,1
2,ANC23LA080,ACC,193097,Homer,N7558H,0,DirectorBrief,None Reported,0,0,0,Unknown,59.646929,-151.49323,CESSNA,A185F,AIR,5BL,HOMER-BELUGA LAKE,0,Unknown,BUS,91,Substantial,VMC,Adventure Airways,2023,9,18,13:00:00,13,Fall,0,1,1
3,ANC23LA082,ACC,193105,Beaver Creek,N713C,0,DirectorBrief,None Reported,0,0,0,Unknown,64.267579,-147.68704,HELIO,H-295,AIR,Unknown,Unknown,0,NSCH,BUS,135,Substantial,Unknown,WRIGHT AIR SERVICE INC,2023,9,16,16:50:00,16,Fall,0,1,1
4,ANC23LA078,ACC,193088,NENANA,N907W,0,DirectorBrief,None Reported,0,0,0,Unknown,64.650753,-149.83639,RHODES STEVEN D,SR3500,AIR,Unknown,Unknown,1,Unknown,PERS,91,Substantial,VMC,Unknown,2023,9,16,15:00:00,15,Fall,0,1,1


In [33]:
# Write the cleaned data next to the raw file; index=False keeps the row index out of the CSV.
ak_clean.to_csv('../datasets/alaska_single_engine_clean.csv', index=False)