In [1]:
# ! pip install inflect
# ! pip install pycaret

# Inputs

In [2]:
import re
import textwrap

import inflect

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = stopwords.words('english')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# import pycaret.classification as pc

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load data

In [3]:
filename = r'/kaggle/input/asrs-aeroguard/01_df_train_val_test.pkl'
df_dev = pd.read_pickle(filename)
df_dev

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test,2,Medium risk


In [4]:
# df_dev = df_dev.sample(frac=0.5).reset_index(drop=True)
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(df_dev['REPORT_1_2_SYNOPSIS'])
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# tfidf_df['target'] = df_dev['EVENT_RISK']
# s1 = pc.setup(data=tfidf_df, target='target')
# best = pc.compare_models()

# Preprocessing

In [5]:
p = inflect.engine()


def remove_stopwords(text):
    tokens = word_tokenize(text)  # Tokenize the text into words
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]  # Filter out stopwords
    filtered_text = ' '.join(filtered_tokens)  # Join the filtered tokens back into a text string
    return filtered_text

def convert_numbers_to_words(text):
    parts = re.findall(r'\d+|\D+', text)
    parts = [p.number_to_words(int(part)) if part.isdigit() else part for part in parts]
    text_num2word = ' '.join(parts)
    return text_num2word

def preprocess_inputs(df):
    df = df.copy()
    
    # Make any numbers be their word equivalent
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].apply(convert_numbers_to_words)
    
    # Remove stopwords
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].apply(remove_stopwords)
    
    return df

In [6]:
df = preprocess_inputs(df_dev)
df

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B seven hundred thirty-seven - seven hundred f...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting drone one ; zero feet...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR twenty-two pilot became disoriented approac...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,fifty-eight pilot experiences pitot heat failu...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B seven hundred thirty-seven First Officer rep...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ- nine hundred flight crew reported dispatc...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE seven hundred fifty flight crew departing P...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported re-qualific...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,initial climb via GOETZ RNAV SID MEM ; flight ...,Test,2,Medium risk


# Modelling

In [7]:
df = df_dev.sample(frac=0.2).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.5042925212409883


In [8]:
df = df_dev.sample(frac=0.4).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.5376660127725034


In [9]:
df = df_dev.sample(frac=0.6).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
print(f'{f1=}')

f1=0.5190737161563364


In [10]:
# df = df_dev.sample(frac=0.8).reset_index(drop=True)

# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# vectorizer = TfidfVectorizer()

# train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

# val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
# test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

# svm = SVC()

# svm.fit(train_tfidf, train_df['EVENT_RISK'])

# val_predictions = svm.predict(val_tfidf)

# f1 = f1_score(val_df['EVENT_RISK'], val_predictions, average='weighted')
# print(f'{f1=}')