In [1]:
# ! pip install inflect
# ! pip install pycaret

# Inputs

In [6]:
import re
import textwrap

import inflect

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# import pycaret.classification as pc

# Load data

In [3]:
filename = r'/kaggle/input/asrs-aeroguard/01_df_train_val_test.pkl'
df_dev = pd.read_pickle(filename)
df_dev

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test,2,Medium risk


# Preprocessing

In [None]:
p = inflect.engine()


def convert_numbers_to_words(text):
    parts = re.findall(r'\d+|\D+', text)
    parts = [p.number_to_words(int(part)) if part.isdigit() else part for part in parts]
    text_num2word = ' '.join(parts)
    return text_num2word


def preprocess_inputs(df):
    df = df.copy()
    
    # Make any numbers be their word equivalent
    df['REPORT_1_2_SYNOPSIS'] = df['REPORT_1_2_SYNOPSIS'].apply(convert_numbers_to_words)
        
    return df

In [None]:
df = preprocess_inputs(df)
df

# Modelling

In [7]:
df = df_dev.sample(frac=0.2).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

val_report = classification_report(val_df['EVENT_RISK'], val_predictions)
print("Validation Report:")
print(val_report)

Validation Report:
              precision    recall  f1-score   support

           0       0.52      0.49      0.50       327
           1       0.65      0.24      0.35       214
           2       0.48      0.80      0.60       522
           3       0.48      0.18      0.26       220
           4       0.68      0.51      0.58       245

    accuracy                           0.52      1528
   macro avg       0.56      0.44      0.46      1528
weighted avg       0.54      0.52      0.49      1528

Test Report:
              precision    recall  f1-score   support

           0       0.53      0.51      0.52       421
           1       0.61      0.24      0.35       224
           2       0.50      0.78      0.61       691
           3       0.41      0.14      0.21       283
           4       0.66      0.53      0.59       290

    accuracy                           0.53      1909
   macro avg       0.54      0.44      0.45      1909
weighted avg       0.53      0.53      0.50  

In [8]:
df = df_dev.sample(frac=0.4).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

val_report = classification_report(val_df['EVENT_RISK'], val_predictions)
print("Validation Report:")
print(val_report)

Validation Report:
              precision    recall  f1-score   support

           0       0.58      0.54      0.56       744
           1       0.64      0.30      0.41       397
           2       0.48      0.77      0.59      1027
           3       0.49      0.22      0.31       445
           4       0.64      0.49      0.56       442

    accuracy                           0.53      3055
   macro avg       0.57      0.47      0.49      3055
weighted avg       0.55      0.53      0.52      3055

Test Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.58       807
           1       0.59      0.32      0.41       458
           2       0.51      0.79      0.62      1318
           3       0.54      0.21      0.30       587
           4       0.70      0.52      0.60       648

    accuracy                           0.55      3818
   macro avg       0.58      0.48      0.50      3818
weighted avg       0.57      0.55      0.53  

In [9]:
df = df_dev.sample(frac=0.6).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

val_report = classification_report(val_df['EVENT_RISK'], val_predictions)
print("Validation Report:")
print(val_report)

Validation Report:
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1049
           1       0.60      0.32      0.41       562
           2       0.52      0.76      0.62      1585
           3       0.50      0.27      0.35       662
           4       0.70      0.57      0.63       724

    accuracy                           0.56      4582
   macro avg       0.58      0.50      0.52      4582
weighted avg       0.57      0.56      0.54      4582

Test Report:
              precision    recall  f1-score   support

           0       0.56      0.56      0.56      1312
           1       0.59      0.33      0.42       686
           2       0.52      0.76      0.62      2016
           3       0.52      0.26      0.35       865
           4       0.69      0.54      0.60       848

    accuracy                           0.55      5727
   macro avg       0.58      0.49      0.51      5727
weighted avg       0.56      0.55      0.54  

In [10]:
df = df_dev.sample(frac=0.8).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['REPORT_1_2_SYNOPSIS'])

val_tfidf = vectorizer.transform(val_df['REPORT_1_2_SYNOPSIS'])
test_tfidf = vectorizer.transform(test_df['REPORT_1_2_SYNOPSIS'])

svm = SVC()

svm.fit(train_tfidf, train_df['EVENT_RISK'])

val_predictions = svm.predict(val_tfidf)

val_report = classification_report(val_df['EVENT_RISK'], val_predictions)
print("Validation Report:")
print(val_report)

Validation Report:
              precision    recall  f1-score   support

           0       0.59      0.59      0.59      1401
           1       0.54      0.31      0.39       738
           2       0.51      0.76      0.61      2093
           3       0.49      0.23      0.31       927
           4       0.71      0.55      0.62       950

    accuracy                           0.55      6109
   macro avg       0.57      0.49      0.51      6109
weighted avg       0.56      0.55      0.54      6109

Test Report:
              precision    recall  f1-score   support

           0       0.57      0.56      0.57      1684
           1       0.55      0.35      0.42       895
           2       0.53      0.78      0.63      2634
           3       0.55      0.26      0.35      1217
           4       0.70      0.56      0.62      1206

    accuracy                           0.56      7636
   macro avg       0.58      0.50      0.52      7636
weighted avg       0.57      0.56      0.55  