**Table of contents**<a id='toc0_'></a>    
- [Inputs](#toc1_)    
- [Load data](#toc2_)    
- [Preprocessing](#toc3_)    
- [Modelling](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
# ! pip install inflect

# <a id='toc1_'></a>[Inputs](#toc0_)

In [2]:
import re
import textwrap

import inflect
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = stopwords.words('english')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import resample

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# import pycaret.classification as pc

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# <a id='toc2_'></a>[Load data](#toc0_)

In [3]:
filename = r'/kaggle/input/asrs-aeroguard/01_df_train_val_test.pkl'
df_dev = pd.read_pickle(filename)
df_dev

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test,2,Medium risk


# <a id='toc3_'></a>[Preprocessing](#toc0_)

In [5]:
p = inflect.engine()


def remove_stopwords_and_punctuation(text):
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def convert_numbers_to_words(text):
    parts = re.findall(r'\d+|\D+', text)
    parts = [p.number_to_words(int(part)) if part.isdigit() else part for part in parts]
    text_num2word = ' '.join(parts)
    return text_num2word

def preprocess_inputs(df):
    df = df.copy()
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].str.lower()
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].apply(convert_numbers_to_words)
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].apply(remove_stopwords_and_punctuation)
    return df

def upsample_data(df, target_col):
    class_counts = df[target_col].value_counts()
    max_count = class_counts.max()

    upsampled_dfs = []
    for label, count in class_counts.items():
        class_df = df[df[target_col] == label]
        upsampled_df = resample(class_df, replace=True, n_samples=max_count, random_state=42)
        upsampled_dfs.append(upsampled_df)

    upsampled_df = pd.concat(upsampled_dfs)

    return upsampled_df

In [6]:
df = preprocess_inputs(df_dev)
df_upsampled = upsample_data(df, 'EVENT_RISK')
df

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,b seven hundred thirty-seven seven hundred fli...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,captain reports sighting drone one zero feet a...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,sr twenty-two pilot became disoriented approac...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,fifty-eight pilot experiences pitot heat failu...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,b seven hundred thirty-seven first officer rep...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,crj- nine hundred flight crew reported dispatc...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,ce seven hundred fifty flight crew departing p...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,air carrier first officer reported re-qualific...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,initial climb via goetz rnav sid mem flight cr...,Test,2,Medium risk


In [7]:
df_upsampled

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
45523,1677539,201908,1201-1800,ZZZ.Airport,US,,,,2300.0,VMC,...,Aircraft,Aircraft,On visual approach Runway XR ZZZ approximately...,,On final to [Runway] XR at ZZZ we attempted to...,,three hundred twenty flight crew reported land...,Test,2,Medium risk
2436,1602786,201812,1201-1800,SJC.Airport,CA,,,,8000.0,VMC,...,Chart Or Publication; Procedure,Chart Or Publication,I inputted the SJC2 Departure which has a tran...,,I inputted the SJC2 Departure which has a tran...,,embraer aircraft flight crew reported confusio...,Train,2,Medium risk
15575,1021250,201207,0601-1200,ZZZZ.ARTCC,FO,,,,33000.0,VMC,...,Aircraft,Aircraft,While enroute at 33;000 FT; Engine EICAS indic...,,,,b seven hundred forty-seven four hundred capta...,Train,2,Medium risk
34424,1829993,202108,0601-1200,MHT.Airport,NH,90.0,35.0,,12000.0,,...,Procedure; Weather,Procedure,We had spoken with Dispatch two hours before t...,,,,air carrier captain reported filing reroute av...,Train,2,Medium risk
32550,1502244,201712,1801-2400,SEA.Airport,WA,,,800.0,,VMC,...,Airspace Structure; Procedure,Procedure,We had TCAS/RA on short final to runway 34L at...,,,,air carrier captain reported ra final approach...,Train,2,Medium risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11144,1237585,201502,,,,,,0.0,,,...,Aircraft; Company Policy,Ambiguous,Arrived at aircraft and discovered maintenance...,,,,b seven hundred thirty-seven captain believes ...,Train,1,Moderately medium risk
2804,1069537,201302,0601-1200,DEN.Airport,CO,,,0.0,,,...,Weather; Aircraft,Ambiguous,Snow in Denver. We began our pushback and stop...,,,,b seven hundred thirty-seven captain used reve...,Train,1,Moderately medium risk
19010,1032109,201208,1201-1800,ACK.Airport,MA,158.0,25.0,,3000.0,VMC,...,Aircraft,Aircraft,While en route (VFR) to ACK at 3;000 FT; I not...,Reporter states that maintenance found a loose...,,,c one hundred seventy-two lost power three zer...,Train,1,Moderately medium risk
9971,1280787,201507,0001-0600,ZAB.ARTCC,NM,,,,,,...,Procedure; Weather; Human Factors,Weather,I was working sector 17 as an R-side. There w...,,,,controller receives call aircraft n't airspace...,Train,1,Moderately medium risk


# <a id='toc4_'></a>[Modelling](#toc0_)

In [8]:
df = df_upsampled.sample(frac=0.2).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.6196483143385563


In [9]:
df = df_upsampled.sample(frac=0.4).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.6845458485195945


In [10]:
df = df_upsampled.sample(frac=0.6).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.7480353929653658


In [11]:
df = df_upsampled.sample(frac=0.8).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.7797465738198522


In [12]:
df = df_upsampled.copy()

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.8243793926744251


In [13]:
test_predictions = svm.predict(test_tfidf)

f1 = f1_score(test_df['EVENT_RISK'], test_predictions, average='weighted')
print(f'{f1=}')

f1=0.8245770193192952
