In [1]:
from docx import Document
import pandas as pd
import os
import spacy
import textacy.extract
from collections import Counter
import numpy as np

In [2]:
df = pd.read_csv('aggregate_nasa_data.csv')
segments = ['time / day', 'place', 'environment', 'aircraft', 'component', 'person', 'events', 'assessments', 'narrative: 1', 'synopsis']
segment_lens = [len(x) for x in segments]
segment_cnt_dict = {k:0 for k in segments}

In [3]:
'''
This functions retrieves the string index of the next segment (segments[i+1])
Unfortunately, the inconsistencies in the data cause make this function insufficient in retrieving all the segments
properly. This issue is resolved later by retrieving several segments with custom code
'''

def get_next_segment_idx(narrative, segments, i):
    segment = segments[i]
    while(i<len(segments)-1):
        next_segment_start = narrative.find(segments[i+1])
        if next_segment_start != -1:
            return next_segment_start
        i+=1
    return -1

In [4]:
segment_dict = {}
for narrative in df['narratives']:
    for i in range(len(segments)):
        segment = segments[i]
        segment_start = narrative.find(segment)
                
        if segment_start==0: #segment should always be 0 since we cut off the last segment
            if (i < len(segments)-1):
                next_segment_start = get_next_segment_idx(narrative, segments, i)
                next_segment = narrative[next_segment_start:next_segment_start+10]
            text_start = segment_start+segment_lens[i]       
            text = narrative[text_start : next_segment_start]
        else:
            text = " " #have to append something for the lists for each field to be of equal length
       
        if segment in segment_dict:
            segment_dict[segment].append(text)
        else:
            segment_dict[segment] = [text]
        
        narrative = narrative[next_segment_start:] #cut off last segment
        
df_parsed = pd.DataFrame(segment_dict)
df_parsed['type'] = df['type']
df_parsed
'''
Clearly, some fields are missing from the dataframe. This is fixed in the cells below via code specific to each field.
'''

Unnamed: 0,time / day,place,environment,aircraft,component,person,events,assessments,narrative: 1,synopsis,type
0,date : 201710 local time of day : 1201-1800,locale reference.airport : zzzz.airport state ...,light : daylight,reference : x aircraft operator : air carrier ...,,,,,,,cabin_fumes_fire_smoke
1,date : 201708 local time of day : 0601-1200,locale reference.airport : zzz.airport state r...,flight conditions : vmc light : daylight,reference : x atc / advisory.tower : zzz aircr...,,,,,,,cabin_fumes_fire_smoke
2,date : 201708 local time of day : 1201-1800,locale reference.airport : zzz.airport state r...,light : daylight,reference : x aircraft operator : air carrier ...,aircraft component : air conditioning and pres...,: 1reference : 1 location of person.aircraft ...,anomaly.aircraft equipment problem : less seve...,contributing factors / situations : aircraft c...,upon entering the jet bridge the f/as (flight ...,"a319 flight crew reported a strong ""dirty sock...",cabin_fumes_fire_smoke
3,date : 201708 local time of day : 1201-1800,locale reference.atc facility : zzz.artcc stat...,flight conditions : vmc light : daylight,reference : x atc / advisory.center : zzz airc...,aircraft component : turbine engine aircraft r...,reference : 1 location of person.aircraft : x ...,anomaly.aircraft equipment problem : critical ...,contributing factors / situations : aircraft p...,"the aircraft was on a test flight, operating w...",a first officer reported that during testing t...,cabin_fumes_fire_smoke
4,date : 201707 local time of day : 0601-1200,locale reference.atc facility : zzz.artcc stat...,flight conditions : vmc weather elements / vis...,reference : x atc / advisory.center : zzz airc...,aircraft component : navigational equipment an...,reference : 1 location of person.aircraft : x ...,\nanomaly.aircraft equipment problem : less se...,contributing factors / situations : aircraft p...,"in cruise, having just passed over zzz airport...",sr22 pilot reported that during cruise the mul...,cabin_fumes_fire_smoke
5,date : 201708 local time of day : 1201-1800,locale reference.airport : zzz.airport state r...,light : daylight,reference : x atc / advisory.tower : zzz aircr...,aircraft component : apu aircraft reference : ...,: 1reference : 1 location of person.aircraft ...,\nanomaly.aircraft equipment problem : less se...,contributing factors / situations : aircraft p...,when we leaving [the departure airport] and pe...,\na319 flight attendants reported a dirty sock...,cabin_fumes_fire_smoke
6,date : 201708 local time of day : 0601-1200,locale reference.airport : zzz.airport state r...,flight conditions : vmc light : daylight,reference : x atc / advisory.ramp : zzz aircra...,aircraft component : apu aircraft reference : ...,reference : 1 location of person.aircraft : x ...,anomaly.aircraft equipment problem : less seve...,\ncontributing factors / situations : aircraft...,about to push out of gate and then i smelt smo...,crj200 captain reported an apu fire and shutdo...,cabin_fumes_fire_smoke
7,date : 201708 local time of day : 1201-1800,locale reference.airport : mhlm.airport state ...,flight conditions : vmc light : daylight,reference : x atc / advisory.ground : mhlm air...,,,,,,,cabin_fumes_fire_smoke
8,date : 201708 local time of day : 1801-2400,locale reference.atc facility : kzak.artcc sta...,flight conditions : vmc light : dusk,reference : x atc / advisory.center : kzak air...,aircraft component : traffic collision avoidan...,: 1reference : 1 location of person.aircraft ...,anomaly.aircraft equipment problem : critical ...,contributing factors / situations : aircraft p...,"at fl370, smoke was observed emanating from th...",b767-300 flight crew reported smoke from an el...,cabin_fumes_fire_smoke
9,date : 201708 local time of day : 0601-1200,locale reference.atc facility : zzz.tower stat...,flight conditions : vmc light : daylight,reference : x atc / advisory.tower : zzz aircr...,aircraft component : turbine engine aircraft r...,: 1reference : 1 location of person.aircraft ...,anomaly.aircraft equipment problem : critical ...,contributing factors / situations : aircraft p...,"sleep, layover, and crew brief all uneventful ...",b737 flight crew reported that they smelled sm...,cabin_fumes_fire_smoke


In [6]:
'''
Unfortunately, do to the inconsistencies in the data (specifically, the inconsistencies in the segments of each report)
the function we wrote to extract each segment from each accident report is not able to get to perform well. The issue
is that the inconsistencies make it difficult to generalize the approach. Therefore, we've opted to segment-specific
code to properly extract these segments. These segments are the following --> assessments, narrative, synopsis, events
'''

#Extract the US State information from the 'place' field
state_list = []
for text in df_parsed['place']:
    start_idx = text.find('airport state reference')+len('airport state reference')
    end_idx = start_idx+5
    ret = text[start_idx+3:end_idx]
    state_list.append(ret)
df_parsed['state'] = state_list     

assm_list = []
for text in df['narratives']:
    assess_idx = text.find('assessments')
    narr_idx = text.find('narrative: 1')
    ret_text = text[assess_idx:narr_idx]
    assm_list.append(ret_text)
df_parsed['assessments'] = assm_list

narr_list = []
for text in df['narratives']:
    start_idx = text.find('narrative: 1')
    end_idx = text.find('synopsis')
    ret_text = text[start_idx : end_idx]
    narr_list.append(ret_text[12:])
df_parsed['narrative: 1'] = narr_list

syn_list = []
for text in df['narratives']:
    start_idx = text.find('synopsis')
#   end_idx = text.find('synopsis')
    ret_text = text[start_idx:]
    syn_list.append(ret_text[8:])
df_parsed['synopsis'] = syn_list

event_list = []
for text in df['narratives']:
    start_idx = text.find('events')
    end_idx = text.find('assessments')
    ret_text = text[start_idx: end_idx]
    event_list.append(ret_text[6:])
df_parsed['events'] = event_list

In [7]:
'''
Extract /month/year/time information from "time / day" field
'''
def get_month(text):
    return text[11:13]

def get_year(text):
    return text[6:11]

def get_local_time(text):
    return text[-10:]

df_parsed['month'] = df_parsed['time / day'].apply(get_month)
df_parsed['year'] = df_parsed['time / day'].apply(get_year)
df_parsed['local_time'] = df_parsed['time / day'].apply(get_local_time)#LOCAL

In [8]:
def get_prim_problem(text):
    end_index = text.find('primary problem')
    start_index = text.rfind(':', 0, end_index)
    prim_problem = text[start_index:end_index]
    return prim_problem[2:]
df_parsed['primary_problem'] = df_parsed['assessments'].apply(get_prim_problem)

In [9]:
'''
If a keyword in env_list is found in the 'environment' field of a row, then that keyword is added to a list, which 
is then recorded in the column called 'env'
'''
env_list = ['night', 'daytime','haze', 'smoke', 'dusk', 'dawn', 'turbulence', 'thunderstorm', 'rain', 'light', 'cloudy', 'fog', 'icing', 'snow', 'birds']
def get_env_info(text):
    ret = ''
    text = text.split()
    for word in text:
        if word in env_list:
            ret += ' '
            ret += word
    return ret
df_parsed['env'] = df_parsed['environment'].apply(get_env_info)

In [10]:
'''
This one is tricky. Essentially we create a column for every element in env_list (defined in the cell above). Then, we
populate each of those columns (which are initialized to all 0s) with 1s depending on whether or not a specific row
(each row denote one accident) has a certain keyword in it's 'env' column. This is essentially a manual way to do
dummies, which I could not do because dummies does not work with lists.
'''
for elem in env_list:
    df_parsed[elem] = 0
for i in range(len(df_parsed['env'])):
    cell = df_parsed['env'].iloc[i]
    cell = cell.split()
    for word in cell:
        df_parsed[word].iloc[i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
'''
We've extracted all the necessary information from the columns so now we can drop certain columns
'''
df_parsed = df_parsed.drop(columns = ['time / day', 'place', 'component', 'aircraft', 'environment', 'assessments', 'person'])
df_parsed = df_parsed.rename(columns = {'narrative: 1': 'narrative'})
df_parsed

Unnamed: 0,events,narrative,synopsis,type,state,month,year,local_time,primary_problem,env,...,dawn,turbulence,thunderstorm,rain,light,cloudy,fog,icing,snow,birds
0,anomaly.aircraft equipment problem : critical ...,i was one of the four crew members who were al...,air carrier pilot reported performing an evacu...,cabin_fumes_fire_smoke,fo,10,2017,1201-1800,procedure,light,...,0,0,0,0,1,0,0,0,0,0
1,anomaly.flight deck / cabin / aircraft event :...,a few seconds after v1 the cockpit filled with...,b757 flight crew reported smoke in the cockpit...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,procedure,light,...,0,0,0,0,1,0,0,0,0,0
2,anomaly.aircraft equipment problem : less seve...,upon entering the jet bridge the f/as (flight ...,"a319 flight crew reported a strong ""dirty sock...",cabin_fumes_fire_smoke,us,08,2017,1201-1800,procedure,light,...,0,0,0,0,1,0,0,0,0,0
3,anomaly.aircraft equipment problem : critical ...,"the aircraft was on a test flight, operating w...",a first officer reported that during testing t...,cabin_fumes_fire_smoke,li,08,2017,1201-1800,aircraft,light,...,0,0,0,0,1,0,0,0,0,0
4,\nanomaly.aircraft equipment problem : less se...,"in cruise, having just passed over zzz airport...",sr22 pilot reported that during cruise the mul...,cabin_fumes_fire_smoke,li,07,2017,0601-1200,aircraft,light,...,0,0,0,0,1,0,0,0,0,0
5,\nanomaly.aircraft equipment problem : less se...,when we leaving [the departure airport] and pe...,\na319 flight attendants reported a dirty sock...,cabin_fumes_fire_smoke,us,08,2017,1201-1800,aircraft,light,...,0,0,0,0,1,0,0,0,0,0
6,anomaly.aircraft equipment problem : less seve...,about to push out of gate and then i smelt smo...,crj200 captain reported an apu fire and shutdo...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,aircraft,light,...,0,0,0,0,1,0,0,0,0,0
7,anomaly.flight deck / cabin / aircraft event :...,"after parking at the gate, ecam: ""smoke aft ca...",airbus captain reported receiving a smoke aft ...,cabin_fumes_fire_smoke,fo,08,2017,1201-1800,procedure,light,...,0,0,0,0,1,0,0,0,0,0
8,anomaly.aircraft equipment problem : critical ...,"at fl370, smoke was observed emanating from th...",b767-300 flight crew reported smoke from an el...,cabin_fumes_fire_smoke,li,08,2017,1801-2400,aircraft,light dusk,...,0,0,0,0,1,0,0,0,0,0
9,anomaly.aircraft equipment problem : critical ...,"sleep, layover, and crew brief all uneventful ...",b737 flight crew reported that they smelled sm...,cabin_fumes_fire_smoke,li,08,2017,0601-1200,aircraft,light,...,0,0,0,0,1,0,0,0,0,0


In [13]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def hasChar(inputString):
    l = [',']
    for char in inputString:
        if char.isdigit() == False and char not in l:
            return True
        
def notPlane(word):
    for char in word:
        if char == '-' and hasNumbers(word) ==False:
            return True

In [14]:
'''
The 'aircraft' field is so inconsistent that it has proven to be unreliable. However, the synopsis field does contain
the name of the aicraft for virtually every accident. So the following code is a very specialized attempt at extracting 
the name of the aircraft from the synopsis field. Of course, the main challenge is that there is no dependable list
of aircraft names available. However, because nearly all aircraft names are of the form ccc-nn where c denotes chars
and n denotes numbers, it was possible to figure out some rules to extract the aircraft names. 

freq_dict contains a wide range of elements which all have one of the following forms: cn, c-n, n-n, c-c
good_freq_dict contains a more specific set of element, but the rules for good_freq_dict alone do not eliminate
all non-plane works. Therefore, to create the list of planes, we take every element from good_freq_dict 
that IS NOT in freq_dict. the result is a list of all plane names mentioned in the data. 
'''

freq_dict={}
for text in df_parsed['synopsis']:
    text = text.split()
    for word in text:
        if '-' in word or hasNumbers(word):
            if hasChar(word):
                if notPlane(word) == True:
                    if word in freq_dict:
                        freq_dict[word]+=1
                    else:
                        freq_dict[word]=1
                        
good_freq_dict={}
for gtext in df_parsed['synopsis']:
    gtext = gtext.split()
    for gword in gtext:
        if '-' in gword or hasNumbers(gword):
            if hasChar(gword):
                if gword in good_freq_dict:
                    good_freq_dict[gword]+=1
                else:
                    good_freq_dict[gword]=1

                    
plane_names_dict = {k: v for k, v in good_freq_dict.items() if k not in freq_dict}
plane_names_dict = {k: v for k, v in plane_names_dict.items() if v >3 }
plane_list = [k for k in plane_names_dict.items()]
plane_list = [item[0] for item in plane_list]
plane_list

['b757',
 'a319',
 'sr22',
 'crj200',
 'b767-300',
 'b737',
 'md-11',
 'md80',
 'b767',
 'a320',
 'crj700',
 'crj-700',
 'emb-145',
 'b747',
 'crj-200',
 'md11',
 'c150',
 'c172',
 'ce-750',
 'pc-12',
 'sr20',
 'pa28',
 'a300',
 'c90',
 '#1',
 'pa-28',
 'be35',
 'b737-700',
 'a321',
 'emb-175',
 'be-400',
 'erj-170',
 'crj-900',
 'erj-175',
 'c182',
 'b777',
 'emb175',
 'cl60',
 'ce-560',
 'b737-800',
 'b777.',
 '22l',
 'g200',
 'a330',
 'b737ng',
 'b787',
 'a320.',
 'b757.']

In [15]:
'''
Essentially we create a column for every element in plane_list. Then, we
populate each of those columns (which are initialized to all 0s) with 1s depending on whether or not a specific row
(each row denote one accident) has that plane's name in it's 'synopsis'. This is essentially a manual and more custom
way to do dummies.
'''
for elem in plane_list:
    df_parsed[elem] = 0
for i in range(len(df_parsed['synopsis'])):
    text = df_parsed['synopsis'].iloc[i]
    for word in text:
        if word in plane_list:
            df_parsed[elem].iloc[i] = 1

Unnamed: 0,events,narrative,synopsis,type,state,month,year,local_time,primary_problem,env,...,ce-560,b737-800,b777.,22l,g200,a330,b737ng,b787,a320.,b757.
0,anomaly.aircraft equipment problem : critical ...,i was one of the four crew members who were al...,air carrier pilot reported performing an evacu...,cabin_fumes_fire_smoke,fo,10,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,0
1,anomaly.flight deck / cabin / aircraft event :...,a few seconds after v1 the cockpit filled with...,b757 flight crew reported smoke in the cockpit...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,procedure,light,...,0,0,0,0,0,0,0,0,0,0
2,anomaly.aircraft equipment problem : less seve...,upon entering the jet bridge the f/as (flight ...,"a319 flight crew reported a strong ""dirty sock...",cabin_fumes_fire_smoke,us,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,0
3,anomaly.aircraft equipment problem : critical ...,"the aircraft was on a test flight, operating w...",a first officer reported that during testing t...,cabin_fumes_fire_smoke,li,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,0,0
4,\nanomaly.aircraft equipment problem : less se...,"in cruise, having just passed over zzz airport...",sr22 pilot reported that during cruise the mul...,cabin_fumes_fire_smoke,li,07,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,0
5,\nanomaly.aircraft equipment problem : less se...,when we leaving [the departure airport] and pe...,\na319 flight attendants reported a dirty sock...,cabin_fumes_fire_smoke,us,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,0,0
6,anomaly.aircraft equipment problem : less seve...,about to push out of gate and then i smelt smo...,crj200 captain reported an apu fire and shutdo...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,0
7,anomaly.flight deck / cabin / aircraft event :...,"after parking at the gate, ecam: ""smoke aft ca...",airbus captain reported receiving a smoke aft ...,cabin_fumes_fire_smoke,fo,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,0
8,anomaly.aircraft equipment problem : critical ...,"at fl370, smoke was observed emanating from th...",b767-300 flight crew reported smoke from an el...,cabin_fumes_fire_smoke,li,08,2017,1801-2400,aircraft,light dusk,...,0,0,0,0,0,0,0,0,0,0
9,anomaly.aircraft equipment problem : critical ...,"sleep, layover, and crew brief all uneventful ...",b737 flight crew reported that they smelled sm...,cabin_fumes_fire_smoke,li,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,0


In [16]:
'''
turn month information into seasonal information. This turned out to be unhelpful in our classification models
'''
def monthify(text):
    winter=['12','01','02']
    spring=['03','04','05']
    summer=['06','07','08']
    fall=['09','10','11']
    if text in winter:
        return 0
    if text in spring:
        return 1
    if text in summer:
        return 2
    else:
        return 3
df_parsed['season'] = df_parsed['month'].apply(monthify)

Unnamed: 0,events,narrative,synopsis,type,state,month,year,local_time,primary_problem,env,...,b737-800,b777.,22l,g200,a330,b737ng,b787,a320.,b757.,season
0,anomaly.aircraft equipment problem : critical ...,i was one of the four crew members who were al...,air carrier pilot reported performing an evacu...,cabin_fumes_fire_smoke,fo,10,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,3
1,anomaly.flight deck / cabin / aircraft event :...,a few seconds after v1 the cockpit filled with...,b757 flight crew reported smoke in the cockpit...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,procedure,light,...,0,0,0,0,0,0,0,0,0,2
2,anomaly.aircraft equipment problem : less seve...,upon entering the jet bridge the f/as (flight ...,"a319 flight crew reported a strong ""dirty sock...",cabin_fumes_fire_smoke,us,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,2
3,anomaly.aircraft equipment problem : critical ...,"the aircraft was on a test flight, operating w...",a first officer reported that during testing t...,cabin_fumes_fire_smoke,li,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,0,2
4,\nanomaly.aircraft equipment problem : less se...,"in cruise, having just passed over zzz airport...",sr22 pilot reported that during cruise the mul...,cabin_fumes_fire_smoke,li,07,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,2
5,\nanomaly.aircraft equipment problem : less se...,when we leaving [the departure airport] and pe...,\na319 flight attendants reported a dirty sock...,cabin_fumes_fire_smoke,us,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,0,2
6,anomaly.aircraft equipment problem : less seve...,about to push out of gate and then i smelt smo...,crj200 captain reported an apu fire and shutdo...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,2
7,anomaly.flight deck / cabin / aircraft event :...,"after parking at the gate, ecam: ""smoke aft ca...",airbus captain reported receiving a smoke aft ...,cabin_fumes_fire_smoke,fo,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,0,2
8,anomaly.aircraft equipment problem : critical ...,"at fl370, smoke was observed emanating from th...",b767-300 flight crew reported smoke from an el...,cabin_fumes_fire_smoke,li,08,2017,1801-2400,aircraft,light dusk,...,0,0,0,0,0,0,0,0,0,2
9,anomaly.aircraft equipment problem : critical ...,"sleep, layover, and crew brief all uneventful ...",b737 flight crew reported that they smelled sm...,cabin_fumes_fire_smoke,li,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,0,2


In [None]:
event_dict = {}
for event in df_parsed['events']:
    event = event.split(':')
    for elem in event:
        if elem in event_dict:
            event_dict[elem]+=1
        else:
            event_dict[elem] = 1
event_dict

In [18]:
#Create Vectorized df (use dummies)
df_vectorized = pd.DataFrame()
category_list = 'primary_problem month state'.split()
for category in df_parsed:
    if category in category_list:
        df_new = pd.get_dummies(df_parsed[category], prefix = category)
        df_vectorized = pd.concat([df_vectorized, df_new], axis=1)

#Concatenate env columns and plane columns into vectorized df
df_vectorized = pd.concat([df_vectorized, df_parsed[env_list] ], axis=1)
df_vectorized = pd.concat([df_vectorized, df_parsed[plane_list] ], axis=1)
df_vectorized

Unnamed: 0,state_,state_ :,state_ak,state_al,state_ar,state_az,state_ca,state_co,state_dc,state_fl,...,ce-560,b737-800,b777.,22l,g200,a330,b737ng,b787,a320.,b757.
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
#Vectorize type column
unique_type_list = list(df_parsed['type'].unique())
df_parsed['type_vectorized'] = 0
#unique_list = []
for i in range(len(df_parsed['type'])):
    cell = df_parsed['type'].iloc[i]
    for k in range(len(unique_type_list)):  #k is the unique int value of that type, k range i 0-29 inclusive
        if cell == unique_type_list[k]:
            df_parsed['type_vectorized'].iloc[i] = k 
df_parsed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,events,narrative,synopsis,type,state,month,year,local_time,primary_problem,env,...,b777.,22l,g200,a330,b737ng,b787,a320.,b757.,season,type_vectorized
0,anomaly.aircraft equipment problem : critical ...,i was one of the four crew members who were al...,air carrier pilot reported performing an evacu...,cabin_fumes_fire_smoke,fo,10,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,3,0
1,anomaly.flight deck / cabin / aircraft event :...,a few seconds after v1 the cockpit filled with...,b757 flight crew reported smoke in the cockpit...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,procedure,light,...,0,0,0,0,0,0,0,0,2,0
2,anomaly.aircraft equipment problem : less seve...,upon entering the jet bridge the f/as (flight ...,"a319 flight crew reported a strong ""dirty sock...",cabin_fumes_fire_smoke,us,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,2,0
3,anomaly.aircraft equipment problem : critical ...,"the aircraft was on a test flight, operating w...",a first officer reported that during testing t...,cabin_fumes_fire_smoke,li,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,2,0
4,\nanomaly.aircraft equipment problem : less se...,"in cruise, having just passed over zzz airport...",sr22 pilot reported that during cruise the mul...,cabin_fumes_fire_smoke,li,07,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,2,0
5,\nanomaly.aircraft equipment problem : less se...,when we leaving [the departure airport] and pe...,\na319 flight attendants reported a dirty sock...,cabin_fumes_fire_smoke,us,08,2017,1201-1800,aircraft,light,...,0,0,0,0,0,0,0,0,2,0
6,anomaly.aircraft equipment problem : less seve...,about to push out of gate and then i smelt smo...,crj200 captain reported an apu fire and shutdo...,cabin_fumes_fire_smoke,us,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,2,0
7,anomaly.flight deck / cabin / aircraft event :...,"after parking at the gate, ecam: ""smoke aft ca...",airbus captain reported receiving a smoke aft ...,cabin_fumes_fire_smoke,fo,08,2017,1201-1800,procedure,light,...,0,0,0,0,0,0,0,0,2,0
8,anomaly.aircraft equipment problem : critical ...,"at fl370, smoke was observed emanating from th...",b767-300 flight crew reported smoke from an el...,cabin_fumes_fire_smoke,li,08,2017,1801-2400,aircraft,light dusk,...,0,0,0,0,0,0,0,0,2,0
9,anomaly.aircraft equipment problem : critical ...,"sleep, layover, and crew brief all uneventful ...",b737 flight crew reported that they smelled sm...,cabin_fumes_fire_smoke,li,08,2017,0601-1200,aircraft,light,...,0,0,0,0,0,0,0,0,2,0


In [20]:
df_parsed['primary_problem'].value_counts()

procedure                                        515
human factors                                    451
weather                                          208
aircraft                                         176
environment - non weather related                 53
staffing                                          24
company policy                                    15
airspace structure                                14
chart or publication                              14
equipment / tooling                                7
airport                                            6
incorrect / not installed / unavailable part       5
manuals                                            5
atc equipment / nav facility / buildings           5
mel                                                1
                                                   1
Name: primary_problem, dtype: int64

In [21]:
df_vectorized['type_target'] = df_parsed['type_vectorized']

In [22]:
'''
run tfidf on the 'narrative', 'synopsis', 'events' fields
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer(stop_words='english', min_df=4, max_df=0.8)
svd = TruncatedSVD(n_components=1, n_iter=5) #reduce down to 1 dimension (why not 2?)

tfidf_data_narr = vectorizer.fit_transform(df_parsed['narrative'])
tfidf_data_syn = vectorizer.fit_transform(df_parsed['synopsis'])
tfidf_data_event = vectorizer.fit_transform(df_parsed['events'])

svd_data_narr = svd.fit_transform(tfidf_data_narr) 
svd_data_syn = svd.fit_transform(tfidf_data_syn)
svd_data_event = svd.fit_transform(tfidf_data_event)

df_vectorized['narr_svd'] = svd_data_narr
df_vectorized['syn_svd'] = svd_data_syn
df_vectorized['event_svd'] = svd_data_event


In [23]:
df_scrambled = df_vectorized.sample(frac=1)
df_scrambled

Unnamed: 0,state_,state_ :,state_ak,state_al,state_ar,state_az,state_ca,state_co,state_dc,state_fl,...,g200,a330,b737ng,b787,a320.,b757.,type_target,narr_svd,syn_svd,event_svd
1231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,24,0.208400,0.183013,0.733861
479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,9,0.161642,0.220395,0.356799
624,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,12,0.162531,0.119641,0.469560
315,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,6,0.265814,0.305829,0.698014
1226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,24,0.135513,0.165756,0.771774
1242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,24,0.207777,0.257561,0.759980
1285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,25,0.139629,0.097730,0.397724
255,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,5,0.338852,0.265873,0.146041
700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,14,0.137422,0.146205,0.336420
723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,14,0.211938,0.084696,0.594836


In [24]:
'''
Create subsets for x test and train data, and y test and train data
'''
df_xtrain = df_scrambled.drop(columns = ['type_target'])[:1200]
df_ytrain = df_scrambled['type_target'][:1200]

df_xtest = df_scrambled.drop(columns = ['type_target'])[1200:]
df_ytest = df_scrambled['type_target'][1200:]

In [25]:
from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc.fit(df_xtrain, df_ytrain)
y_pred_test = dtc.predict(df_xtest)
score = dtc.score(df_xtest, df_ytest)
print(score)

0.14333333333333334


In [26]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
logreg.fit(df_xtrain, df_ytrain)
logreg_preict = logreg.predict(df_xtest)
score = logreg.score(df_xtest, df_ytest)
print(score)

0.27


In [27]:
from sklearn.neighbors import KNeighborsClassifier

def get_kmeans_acc(df_xtrain, df_ytrain, df_xtest, df_ytest):
    score = 0.0
    max_score = 0.0
    max_k = 0
    for k in range(1,8): # this part optimizes k
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(df_xtrain, df_ytrain)
        y_pred_test = knn.predict(df_xtest)
        score = knn.score(df_xtest, df_ytest)
        if score > max_score:
            max_score = score
            max_k = k
#   print('k = ', k, 'Accuracy on test data: {}'.format(score))
    return score
get_kmeans_acc(df_xtrain, df_ytrain, df_xtest, df_ytest)

0.24

In [28]:
from sklearn import svm
def get_svm_acc(df_xtrain, df_ytrain, df_xtest, df_ytest):
    svc = svm.SVC(kernel='linear')
    df_svc_xtrain = df_xtrain
    df_svc_ytrain = df_ytrain
    svc.fit(df_svc_xtrain, df_svc_ytrain)
    y_pred_test = svc.predict(df_xtest)
    score = svc.score(df_xtest, df_ytest)
    return score
get_svm_acc(df_xtrain, df_ytrain, df_xtest, df_ytest)

0.27

In [None]:
'''
SVM classification and logistic regression has the highest accuracy
'''