In [137]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [138]:
#show every column whenever a frame is displayed (this table is wide)
pd.options.display.max_columns = None

#read in the cleaned csv and display it
alaska = pd.read_csv('../datasets/alaska_single_engine_clean.csv')
alaska

Unnamed: 0,NtsbNo,EventType,Mkey,City,N,HasSafetyRec,ReportType,HighestInjuryLevel,FatalInjuryCount,SeriousInjuryCount,MinorInjuryCount,ProbableCause,Latitude,Longitude,Make,Model,AirCraftCategory,AirportID,AirportName,AmateurBuilt,Scheduled,PurposeOfFlight,FAR,AirCraftDamage,WeatherCondition,Operator,EventYear,EventMonth,EventDay,EventTime,EventSeason
0,ANC23LA086,ACC,193153,Trimble River / Skwentna,N2586R,0,DirectorBrief,None Reported,0,0,0,Unknown,61.775160,-152.152630,CESSNA,182K,AIR,Unknown,Trimble River,0,Unknown,PERS,091,Substantial,Unknown,Unknown,2023,9,24,08:30:00,Fall
1,ANC23LA084,ACC,193128,Bethel,N8192D,0,DirectorBrief,None Reported,0,0,0,Unknown,60.805019,-161.786480,PIPER,PA-18-150,AIR,Unknown,Unknown,0,Unknown,PERS,091,Substantial,Unknown,Unknown,2023,9,20,12:00:00,Fall
2,ANC23LA080,ACC,193097,Homer,N7558H,0,DirectorBrief,None Reported,0,0,0,Unknown,59.646929,-151.493230,CESSNA,A185F,AIR,5BL,HOMER-BELUGA LAKE,0,Unknown,BUS,091,Substantial,VMC,Adventure Airways,2023,9,18,13:00:00,Fall
3,ANC23LA082,ACC,193105,Beaver Creek,N713C,0,DirectorBrief,None Reported,0,0,0,Unknown,64.267579,-147.687040,HELIO,H-295,AIR,Unknown,Unknown,0,NSCH,BUS,135,Substantial,Unknown,WRIGHT AIR SERVICE INC,2023,9,16,16:50:00,Fall
4,ANC23LA078,ACC,193088,NENANA,N907W,0,DirectorBrief,None Reported,0,0,0,Unknown,64.650753,-149.836390,RHODES STEVEN D,SR3500,AIR,Unknown,Unknown,1,Unknown,PERS,091,Substantial,VMC,Unknown,2023,9,16,15:00:00,Fall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,ANC82DA014,ACC,69764,ANCHORAGE,N63516,0,DirectorBrief,Minor,0,0,1,Unknown,61.219909,-149.850982,CESSNA,150,AIR,MRI,MERRILL FIELD,0,Unknown,INST,091,Destroyed,VMC,"AERO TECH FLIGHT SERVICE, WC",1982,1,17,10:12:00,Winter
5655,ANC82DA015,ACC,71905,NEAR NUIQSUT,N1459T,0,DirectorBrief,None Reported,0,0,0,Unknown,71.280578,-156.779296,de Havilland,DHC-2-MK3,AIR,Unknown,Unknown,0,NSCH,UNK,135,Substantial,VMC,"SEA AIRMOTIVE, INC.",1982,1,15,17:00:00,Winter
5656,ANC82DA013,ACC,72330,QUINHAGAK,N756YN,0,DirectorBrief,Minor,0,0,3,Unknown,59.740913,-161.889205,CESSNA,U206,AIR,Unknown,Unknown,0,NSCH,UNK,135,Substantial,VMC,TRI-CITY AIR SERVICE,1982,1,10,17:55:00,Winter
5657,ANC82FA012,ACC,69937,BETHEL,N3343S,0,DirectorBrief,Serious,0,2,0,Unknown,60.809120,-161.849258,CESSNA,A185F,AIR,BET,BETHEL,0,NSCH,UNK,135,Substantial,VMC,EXECUTIVE CHARTER SERVICE,1982,1,5,09:39:00,Winter


In [139]:
#function to preprocess the 'ProbableCause' column
#this function takes the text from the csv and preprocesses it
#(changes to lower case, sentence tokenizes, word tokenizes,
#removes unnecessary characters, lemmatizes, and puts it in a dataframe)
#help for this function was used from https://git.generalassemb.ly/dsi-911/504-lesson-nlp-i/blob/master/solution-code/solution-code.ipynb and my previous project 3
def preprocessing(dataframe, prob_cause):
    """Lemmatize a free-text column and return it next to the modelling columns.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Source frame. Besides `prob_cause` it must contain 'Mkey',
        'WeatherCondition', 'EventSeason', 'Make', 'Model', 'AirCraftDamage',
        'Latitude', 'Longitude' and 'HighestInjuryLevel'.
    prob_cause : str
        Name of the text column to clean.

    Returns
    -------
    pd.DataFrame
        One row per input row with the columns 'probable_cause' (lower-cased,
        lemmatized words joined by single spaces), 'mkey', 'weather', 'season',
        'make', 'model', 'damage', 'latitude', 'longitude' and 'highest_injury'.

    Raises
    ------
    KeyError
        If `prob_cause` or any of the required columns is missing.
    """
    lemmatizer = WordNetLemmatizer()
    #raw string so '\w' reaches the regex engine instead of being read as a string escape
    tokenizer = RegexpTokenizer(r'\w+')

    #missing narratives become '' so sent_tokenize is never handed a float NaN
    narratives = dataframe[prob_cause].fillna('').astype(str)

    new_data = []
    for narrative in narratives:
        #sentence tokenize, then word tokenize, then lower-case and lemmatize each word
        words_lemmatized = [
            lemmatizer.lemmatize(word.lower())
            for sentence in sent_tokenize(narrative)
            for word in tokenizer.tokenize(sentence)
        ]
        new_data.append(' '.join(words_lemmatized))

    #every column is taken from the frame that was passed in, not from a notebook global,
    #so the function works for any frame and does not depend on hidden kernel state
    new_df = pd.DataFrame({
        'probable_cause': new_data,
        'mkey': dataframe['Mkey'],
        'weather': dataframe['WeatherCondition'],
        'season': dataframe['EventSeason'],
        'make': dataframe['Make'],
        'model': dataframe['Model'],
        'damage': dataframe['AirCraftDamage'],
        'latitude': dataframe['Latitude'],
        'longitude': dataframe['Longitude'],
        'highest_injury': dataframe['HighestInjuryLevel'],
    })

    return new_df

In [140]:
#build the modelling frame from the 'ProbableCause' text and preview the first rows
new_alaska = preprocessing(alaska, 'ProbableCause')
new_alaska.head()

Unnamed: 0,probable_cause,mkey,weather,season,make,model,damage,latitude,longitude,highest_injury
0,unknown,193153,Unknown,Fall,CESSNA,182K,Substantial,61.77516,-152.15263,None Reported
1,unknown,193128,Unknown,Fall,PIPER,PA-18-150,Substantial,60.805019,-161.78648,None Reported
2,unknown,193097,VMC,Fall,CESSNA,A185F,Substantial,59.646929,-151.49323,None Reported
3,unknown,193105,Unknown,Fall,HELIO,H-295,Substantial,64.267579,-147.68704,None Reported
4,unknown,193088,VMC,Fall,RHODES STEVEN D,SR3500,Substantial,64.650753,-149.83639,None Reported


In [141]:
#features are the preprocessed probable-cause text only; the target is the highest injury level
X, y = new_alaska['probable_cause'], new_alaska['highest_injury']

In [142]:
#hold out a test set with a fixed seed so the split is the same on every run
#NOTE(review): the split is not stratified on y; if the injury classes are imbalanced, consider stratify=y -- confirm class counts first
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# CountVectorizer

In [143]:
#bag-of-words counts with scikit-learn's built-in English stop-word list removed
cvec = CountVectorizer(stop_words='english')

In [144]:
#learn the vocabulary from the training text only, so the test text does not influence it
cvec.fit(X_train)

In [145]:
#replace the training text with its term-count matrix
#NOTE(review): X_train is rebound from text to counts here, so this cell presumably cannot be re-run without re-running the split first -- confirm
X_train = cvec.transform(X_train)

In [146]:
#encode the test text with the vocabulary fitted on the training text
#NOTE(review): X_test is rebound from text to counts here, so this cell presumably cannot be re-run without re-running the split first -- confirm
X_test = cvec.transform(X_test)

In [147]:
#dense dataframe view of the training counts, one column per vocabulary term
X_train_df = pd.DataFrame(
    X_train.toarray(),
    columns=cvec.get_feature_names_out(),
)

# Logistic Regression Model

In [148]:
#L1-penalised logistic regression on the term counts
#random_state pins liblinear's internal data shuffling so the scores are reproducible on a fresh run
#(42 matches the seed used for the train/test split)
lr = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

lr.fit(X_train, y_train)

In [149]:
#predicted injury level for each held-out record
preds = lr.predict(X_test)

In [150]:
#logistic regression score on the training data (compare with the test score to gauge overfitting)
lr.score(X_train, y_train)

0.782516493873704

In [151]:
#logistic regression score on the held-out data
#NOTE(review): worth comparing against the majority-class share of y_test before interpreting this number -- baseline not shown in this notebook section
lr.score(X_test, y_test)

0.7279151943462897

# Random Forest Classifier

In [152]:
#random forest on the same term counts; oob_score=True keeps an out-of-bag estimate for later inspection
#random_state fixes the bootstrap samples and feature subsets so the test score, OOB score and
#feature importances are reproducible on a fresh run (42 matches the seed used for the split)
rf = RandomForestClassifier(oob_score=True, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)

In [153]:
#random forest score on the held-out data
rf.score(X_test, y_test)

0.734982332155477

In [154]:
#out-of-bag score, available because the forest was created with oob_score=True
rf.oob_score_

0.7363336475023563

In [155]:
#the fifteen vocabulary terms with the highest random-forest importance, largest first
(
    pd.DataFrame(
        {
            'features': cvec.get_feature_names_out(),
            'importance': rf.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
    .iloc[:15]
)

Unnamed: 0,features,importance
1229,landing,0.020085
914,flight,0.014556
2073,stall,0.012902
2197,terrain,0.011929
1609,pilot,0.009785
1373,meteorological,0.009745
2180,takeoff,0.009702
123,airplane,0.009657
1831,resulted,0.009516
455,condition,0.009133
