# Imports

In [96]:
# Clear the interactive namespace (-f: no confirmation prompt) so the notebook
# does not depend on variables left over from earlier runs.
%reset -f
# Load the autoreload extension; mode 2 lets edits to imported modules be
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
import re

import pandas as pd

from controller import Controller

In [98]:
# Project controller; this notebook uses it to look up the original-data and
# prepared-data directories (get_path_data_original / get_path_data_prepared).
# NOTE(review): the meaning of the 'i01' argument is not visible here — confirm
# against the controller module.
c = Controller('i01')

# Load data

In [99]:
def load_split(file_tag: str, split_label: str) -> pd.DataFrame:
    """Read one ASRS reports split from the original-data directory.

    Args:
        file_tag: The `<file_tag>` part of the file name
            `asrs-aviation-reports-<file_tag>.jsonl`.
        split_label: Value stored in the `train_val_test_split` column so
            every row records which split it came from.

    Returns:
        The split as a dataframe with the extra `train_val_test_split` column.
    """
    filepath = fr'{c.get_path_data_original()}/asrs-aviation-reports-{file_tag}.jsonl'
    # The files are JSON Lines (one record per line), hence lines=True.
    df_split = pd.read_json(path_or_buf=filepath, lines=True)
    df_split['train_val_test_split'] = split_label
    return df_split


df_train = load_split('train', 'Train')
print(f'{df_train.shape=}')

df_test = load_split('test', 'Test')
print(f'{df_test.shape=}')

df_validation = load_split('validation', 'Validation')
print(f'{df_validation.shape=}')

df_train.shape=(38655, 112)
df_test.shape=(4773, 112)
df_validation.shape=(4295, 112)


# Combine into one dataframe

In [100]:
# Stack the three splits (train, validation, test order); ignore_index gives
# the combined frame a fresh consecutive index.
df_train_val_test = pd.concat([df_train, df_validation, df_test], ignore_index=True)
print(f'{df_train_val_test.shape=}')

# Rows per split, as absolute counts and as fractions of the total.
# A notebook cell only renders its final expression automatically, so this
# table has to be displayed explicitly; as a bare mid-cell expression it was
# computed and then silently discarded.
split_counts = pd.concat([
    df_train_val_test['train_val_test_split'].value_counts(dropna=False),
    df_train_val_test['train_val_test_split'].value_counts(dropna=False, normalize=True),
], axis=1, keys=['Total #', 'Total %'])
display(split_counts)

# Cross-check against the train split on its own.
print('Train')
df_train['train_val_test_split'].value_counts()

df_train_val_test.shape=(47723, 112)
Train


Train    38655
Name: train_val_test_split, dtype: int64

# Normalize column names: upper case, with whitespace and punctuation replaced by underscores

In [101]:
def clean_column_names(name: str) -> str:
    """Return `name` as an upper-case, underscore-separated label.

    Every run of whitespace and/or non-word characters is collapsed into a
    single underscore, then the result is upper-cased. Underscores already
    present are word characters and are kept as they are.
    """
    return re.sub(r'[\s\W]+', '_', name).upper()

In [102]:
# Dry run: show the column labels as they are now, then as they would look
# after cleaning. The dataframe itself is not modified in this cell.
current_columns = df_train_val_test.columns
print('############## Before ##############')
print(current_columns)

cleaned_columns = current_columns.map(clean_column_names)
print()
print('############## After ##############')
print(cleaned_columns)

############## Before ##############
Index(['acn_num_ACN', 'Time_Date', 'Time.1_Local Time Of Day',
       'Place_Locale Reference', 'Place.1_State Reference',
       'Place.2_Relative Position.Angle.Radial',
       'Place.3_Relative Position.Distance.Nautical Miles',
       'Place.4_Altitude.AGL.Single Value',
       'Place.5_Altitude.MSL.Single Value', 'Environment_Flight Conditions',
       ...
       'Events.4_When Detected', 'Events.5_Result',
       'Assessments_Contributing Factors / Situations',
       'Assessments.1_Primary Problem', 'Report 1_Narrative',
       'Report 1.1_Callback', 'Report 2_Narrative', 'Report 2.1_Callback',
       'Report 1.2_Synopsis', 'train_val_test_split'],
      dtype='object', length=112)

############## After ##############
Index(['ACN_NUM_ACN', 'TIME_DATE', 'TIME_1_LOCAL_TIME_OF_DAY',
       'PLACE_LOCALE_REFERENCE', 'PLACE_1_STATE_REFERENCE',
       'PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL',
       'PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILE

In [103]:
# Apply the cleaned labels to the dataframe (this changes it in place), then
# end on the frame so the notebook renders it.
new_column_labels = df_train_val_test.columns.map(clean_column_names)
df_train_val_test.columns = new_column_labels
df_train_val_test

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,EVENTS_4_WHEN_DETECTED,EVENTS_5_RESULT,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,In-flight,Air Traffic Control Issued New Clearance; Flig...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,In-flight,General None Reported / Taken,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,In-flight,Air Traffic Control Provided Assistance; Air T...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,In-flight,Flight Crew Became Reoriented; General Mainten...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,In-flight,Air Traffic Control Issued New Clearance; Flig...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,In-flight,Air Traffic Control Provided Assistance; Fligh...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,In-flight,Air Traffic Control Issued Advisory / Alert; F...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test
47720,1756601,202008,,,,,,0.0,,,...,,General None Reported / Taken,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,In-flight,Air Traffic Control Issued New Clearance; Flig...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test


# Save to file

In [None]:
# Write the combined dataframe as a pickle into the prepared-data directory.
filepath = fr'{c.get_path_data_prepared()}/01_df_train_val_test.pkl'
df_train_val_test.to_pickle(path=filepath)