In [1]:
import pandas as pd
import numpy as np
import typing
from nltk.tokenize import sent_tokenize

In [2]:
# Load the accident narratives. The file is decoded as mac-roman, and every
# missing cell is replaced with an empty string so no NaN values remain.
aviationData = (
    pd.read_csv("narrativesAviationCsv.csv", encoding="mac-roman")
    .fillna("")
)
aviationData

Unnamed: 0,ev_id,Aircraft_Key,narr_accf,narr_cause,Unnamed: 4,lchg_date,lchg_userid
0,20001204X00000,1,The certificated commercial pilot reported tha...,Loss of engine power due to fracture of the en...,,12/12/2000,dbo
1,20001204X00001,1,"After landing, the number four engine thrust r...",The improper overhaul of the thrust reverser p...,,12/12/2000,dbo
2,20001204X00002,1,"The certificated airline transport pilot, with...",The failure of the driver of a vehicle to main...,,4/27/2001,dbo
3,20001204X00003,1,The certificated private pilot departed the ac...,The pilot's attempted flight into adverse weat...,,4/27/2001,dbo
4,20001204X00004,1,The certificated commercial pilot reported tha...,The pilot's continued flight into adverse weat...,,12/12/2000,dbo
...,...,...,...,...,...,...,...
63907,20101006X81741,1,,,,11/10/2011,
63908,20120502X05030,1,,,,5/2/2012,
63909,20121004X53806,1,"While maneuvering at 3,000 feet over the Atlan...",Extensive salt buildup in the inlets of all fo...,,5/23/2013,
63910,20170710X10920,1,While on a cross-country flight over mountaino...,"Failure by the pilot-in-command, for unknown r...",,9/19/2017,


In [3]:
# Tokenize each narrative (narr_accf column) into sentences; the result is a
# NumPy array holding one tokenized report per row of aviationData.
trueReports = (
    aviationData["narr_accf"]
    .astype("string")
    .apply(sent_tokenize)
    .to_numpy()
)

In [4]:
# Keep only the reports that tokenized into exactly 11 sentences, then cap the
# selection at the first 100 of them (in original row order).
eleven_sentence_reports = list(
    filter(lambda sentences: len(sentences) == 11, trueReports)
)[:100]

In [11]:
import random

# Fixed seed so the generated shuffles are reproducible across runs.
random.seed(487)
num_permutations_per_report = 20
# Upper bound on shuffle attempts per report, so a report with too few distinct
# orderings fails loudly instead of looping forever.
max_shuffle_attempts_per_report = 100 * num_permutations_per_report

# Flat list of shuffled sentence lists: exactly num_permutations_per_report
# distinct, non-original orderings for every report, grouped in report order.
permutations = []
for report in eleven_sentence_reports:
    # Seeded with the original ordering so a shuffle that happens to reproduce
    # the true report is rejected like any other duplicate.
    perms_of_curr_report = [report]
    attempts = 0
    # Retry on duplicates rather than silently yielding fewer permutations;
    # downstream labelling assumes a fixed count per report.
    while len(perms_of_curr_report) <= num_permutations_per_report:
        if attempts >= max_shuffle_attempts_per_report:
            raise ValueError(
                f"Could not generate {num_permutations_per_report} distinct "
                f"permutations for a report with {len(report)} sentences"
            )
        attempts += 1
        shuffled = report.copy()
        random.shuffle(shuffled)
        if shuffled not in perms_of_curr_report:
            permutations.append(shuffled)
            perms_of_curr_report.append(shuffled)

# Spot check: first shuffled report next to its original ordering.
print(permutations[0])
print(eleven_sentence_reports[0])

['The airplane cruising endurance based on 75% power and 90 gallons of fuel on board is 5.4 hours.', 'During this time, the engine began running smoothly whereupon the pilot continued to his initial destination.', 'Shortly after departing, the engine began running rough upon which the pilot began a deviation to another airport.', 'The pilot performed a forced landing causing substantial damage.', 'However, the usable fuel capacity on this airplane when the fuel tanks are filled to the bottom of the filler necks is approximately 80 gallons.', 'The pilot operated the spring loaded valve handle inside the airplane which apparently stopped the leak.', 'After starting the engine, fuel began leaking from the fuel sump drain hose.', 'The preflight inspection of the fuel tanks by the pilot revealed the tanks were filled to the bottom of the filler neck, which the pilot believed was appropriately full.', 'A Federal Aviation Administration inspector who examined the airplane noted only residual 

In [12]:
# should be true: every report contributed exactly num_permutations_per_report
# shuffles (integer comparison tied to the configured constant, not a literal)
len(permutations) == len(eleven_sentence_reports) * num_permutations_per_report

True

In [13]:
# Glue each sentence list back into a single space-separated paragraph, for
# both the original reports and their shuffled counterparts.
joined_reports = list(map(' '.join, eleven_sentence_reports))
joined_perms = list(map(' '.join, permutations))
# Print the first original paragraph followed by the first shuffled one.
print(joined_reports[0], joined_perms[0], sep='\n')

The preflight inspection of the fuel tanks by the pilot revealed the tanks were filled to the bottom of the filler neck, which the pilot believed was appropriately full. After starting the engine, fuel began leaking from the fuel sump drain hose. The pilot operated the spring loaded valve handle inside the airplane which apparently stopped the leak. Shortly after departing, the engine began running rough upon which the pilot began a deviation to another airport. During this time, the engine began running smoothly whereupon the pilot continued to his initial destination. During descent, the engine began running rough and the pilot decided the tanks had been exhausted. The pilot performed a forced landing causing substantial damage. The airplane cruising endurance based on 75% power and 90 gallons of fuel on board is 5.4 hours. However, the usable fuel capacity on this airplane when the fuel tanks are filled to the bottom of the filler necks is approximately 80 gallons. The actual flight

In [14]:
# Labelled dataset: original paragraphs are marked coherent (1), shuffled ones
# incoherent (0). Label counts are derived from the lists themselves so they
# always line up with the paragraphs, whatever the number of reports or shuffles.
d = {
    'paragraph': joined_reports + joined_perms,
    'is_coherent': [1] * len(joined_reports) + [0] * len(joined_perms),
}
pars_with_label = pd.DataFrame(d)
pars_with_label

Unnamed: 0,paragraph,is_coherent
0,The preflight inspection of the fuel tanks by ...,1
1,"The pilot reported that he was cleared to 4,00...",1
2,The instrument-rated private pilot lost contro...,1
3,The non-instrument rated private pilot was rec...,1
4,The commercial pilot reported a partial power ...,1
...,...,...
2095,THE FUEL SELECTOR WAS POSITIONED ON THE RIGHT ...,0
2096,DURING AN INTENTIONAL MANEUVER TO AVOID A HOME...,0
2097,RESIDUAL FUEL WAS FOUND IN THE CARBURETOR BOWL...,0
2098,"AT A 75% POWER SETTING, FUEL FLOW IS ABOUT 10....",0


In [15]:
# Persist the labelled paragraphs; the DataFrame index is not written out.
pars_with_label.to_csv(path_or_buf='aviationPerms.csv', index=False)

In [16]:
# Read the exported file back in and display it to confirm the round trip.
reconstructed = pd.read_csv(filepath_or_buffer='aviationPerms.csv')
reconstructed

Unnamed: 0,paragraph,is_coherent
0,The preflight inspection of the fuel tanks by ...,1
1,"The pilot reported that he was cleared to 4,00...",1
2,The instrument-rated private pilot lost contro...,1
3,The non-instrument rated private pilot was rec...,1
4,The commercial pilot reported a partial power ...,1
...,...,...
2095,THE FUEL SELECTOR WAS POSITIONED ON THE RIGHT ...,0
2096,DURING AN INTENTIONAL MANEUVER TO AVOID A HOME...,0
2097,RESIDUAL FUEL WAS FOUND IN THE CARBURETOR BOWL...,0
2098,"AT A 75% POWER SETTING, FUEL FLOW IS ABOUT 10....",0
