In [1]:
# ! pip install -U git+https://github.com/huggingface/transformers.git
# ! pip install -U git+https://github.com/huggingface/accelerate.git
# ! pip install inflect

# Inputs

In [2]:
import re
import textwrap

import inflect

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


'cuda'

# Load data

In [3]:
filename = r'/kaggle/input/asrs-aeroguard/01_df_train_val_test.pkl'
df_dev = pd.read_pickle(filename)
df_dev

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test,2,Medium risk


# Preprocessing

In [4]:
MODELLING_COLUMNS = ['REPORT_1_NARRATIVE', 'EVENT_RISK', 'TRAIN_VAL_TEST_SPLIT']

p = inflect.engine()


def convert_numbers_to_words(text):
    parts = re.findall(r'\d+|\D+', text)
    parts = [p.number_to_words(int(part)) if part.isdigit() else part for part in parts]
    text_num2word = ' '.join(parts)
    return text_num2word


def preprocess_inputs(df):
    df = df.copy()
    
    # Leave only needed columns
    df = df[MODELLING_COLUMNS]
    
    # Make any numbers be their word equivalent
    df['REPORT_1_NARRATIVE'] = df['REPORT_1_NARRATIVE'].apply(convert_numbers_to_words)
    
    # Make each text entry in 'REPORT_1_NARRATIVE' be of maximum length "max_len"
    max_len = 512
    df['REPORT_1_NARRATIVE'] = df['REPORT_1_NARRATIVE'].apply(lambda s: textwrap.wrap(s, max_len))
    df = df.explode(column='REPORT_1_NARRATIVE')
    
    df = df.sample(frac=0.5)
    
    # Split into X and y
    X = df['REPORT_1_NARRATIVE']
    y = df['EVENT_RISK']
    
    # Train-val-test split
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, train_size=0.5, shuffle=True)
    
    # Compute class weights
    class_weights = (1 - (y_train.value_counts().sort_index() / y_train.shape[0])).values
    
    # Convert to "Trainer"-friendly data types
    X_train = X_train.tolist()
    y_train = y_train.values
    X_val = X_val.tolist()
    y_val = y_val.values
    X_test = X_test.tolist()
    y_test = y_test.values
    
    return X_train, y_train, X_val, y_val, X_test, y_test, class_weights

In [5]:
X_train, y_train, X_val, y_val, X_test, y_test, class_weights = preprocess_inputs(df_dev)
X_train[:5]

["eventful.I don't have good reasoning or excuse except to say that I was afflicted with some 'get-there-itis' while operating in these conditions; which should not have happened. I know better and should have exercised better judgment; for I put myself in a dangerous position and ATC in the awkward position of trying to help out a 'poor fool' - this is the part I am most ashamed of. 'Never again' are the operative words in this case.",
 'someone actually does land on the taxiway; please have Jeppesen page XX- zero - two  fixed to show that RWY  thirty-one L actually has a PAPI on the right; not a VASI on the left.',
 "I was providing flight instruction in order for the student to obtain complex time. We completed multiple maneuvers and had returned to Chino Airport (CNO) and perform a couple of takeoffs and landings on Runway  twenty-six L. We had just performed a touch-and-go and were approximately  five hundred  FT AGL when the engine instantly stopped. I asked the student; 'What di

In [6]:
print(f'{len(X_train)=}')
print(f'{len(y_train)=}')
print(f'{len(X_val)=}')
print(f'{len(y_val)=}')
print(f'{len(X_test)=}')
print(f'{len(y_test)=}')


len(X_train)=80307
len(y_train)=80307
len(X_val)=4462
len(y_val)=4462
len(X_test)=4462
len(y_test)=4462


In [7]:
print(y_train[:5])

class_weights = torch.from_numpy(class_weights).float().to(device)
class_weights

[2 2 3 1 2]


tensor([0.7976, 0.8785, 0.6438, 0.8387, 0.8414], device='cuda:0')

# Dataset

In [8]:
class ASRSDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# Tokenize

In [9]:
# MODEL_NAME = 'microsoft/MiniLM-L12-H384-uncased'
MODEL_NAME = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_encodings = tokenizer(X_train, truncation=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, max_length=512)
test_encodings = tokenizer(X_test, truncation=True, max_length=512)

train_dataset = ASRSDataset(train_encodings, y_train)
val_dataset = ASRSDataset(val_encodings, y_val)
test_dataset = ASRSDataset(test_encodings, y_test)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

# Custom loss

In [10]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits', None)
        labels = inputs.get('labels', None)
        
        if logits is None:
            raise ValueError('logits was None')
        if labels is None:
            raise ValueError('labels was None')

        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Load Pretrained Model

In [11]:
num_labels = len(set(y_train))
print(f'{num_labels=}')

id2label = {
    0: 'Low risk',
    1: 'Moderately medium risk',
    2: 'Medium risk',
    3: 'Moderately high risk',
    4: 'High risk',
}
print(f'{id2label=}')

label2id = {v:k for k,v in id2label.items()}
print(f'{label2id=}')

num_labels=5
id2label={0: 'Low risk', 1: 'Moderately medium risk', 2: 'Medium risk', 3: 'Moderately high risk', 4: 'High risk'}
label2id={'Low risk': 0, 'Moderately medium risk': 1, 'Medium risk': 2, 'Moderately high risk': 3, 'High risk': 4}


In [12]:
# model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Custom metrics

In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    return {'f1': f1}

# Modelling

In [14]:
batch_size= 128

logging_steps = len(train_dataset) // batch_size
output_dir = '/kaggle/working/'

training_args = TrainingArguments(
    report_to="none", # Turn off logging to weights and biases (WanDB)
    output_dir=output_dir,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_steps=logging_steps,
    fp16=True, # Faster training if cuda is enabled
)

In [15]:
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.388,1.325658,0.443869
2,1.2785,1.290948,0.468907
3,1.1976,1.279182,0.475182
4,1.1186,1.305081,0.479017
