# Inputs

In [1]:
%reset -f

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW

from transformers import RobertaTokenizer

from controller import Controller

In [3]:
LOAD_TENSORS = True
c = Controller('i02')
device = device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# Load data

In [4]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [5]:
filepath = fr'{c.get_path_data_prepared()}/01_df_train_val_test.pkl'
df_train_val_test = pd.read_pickle(filepath)

df_train = df_train_val_test.query('TRAIN_VAL_TEST_SPLIT == "Train"')
print(f'{df_train.shape=}')
df_val = df_train_val_test.query('TRAIN_VAL_TEST_SPLIT == "Validation"')
print(f'{df_val.shape=}')
df_test = df_train_val_test.query('TRAIN_VAL_TEST_SPLIT == "Test"')
print(f'{df_test.shape=}')

df_train

df_train.shape=(38655, 114)
df_val.shape=(4295, 114)
df_test.shape=(4773, 114)


Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38650,1093770,201306,,MRY.Airport,CA,330.0,3.0,,2500.0,IMC,...,Human Factors; Aircraft,Ambiguous,Maintain directional control during climbout t...,,,,C172 instructor pilot reports difficulties wit...,Train,2,Medium risk
38651,1580993,201809,1201-1800,DTW.Airport,MI,,,250.0,,,...,Environment - Non Weather Related; Procedure,Procedure,Given takeoff clearance from 21R in DTW behind...,,,,CRJ-900 Captain reported encountering wake tur...,Train,2,Medium risk
38652,1438881,201704,,ZZZ.Airport,US,,,0.0,,,...,Company Policy; Human Factors,Company Policy,I did not witness passenger boarding as I was ...,,,,Boeing 787 Flight Attendant reported an adult ...,Train,0,Low risk
38653,1614309,201901,0601-1200,ZZZ.ARTCC,US,,,,45000.0,VMC,...,Aircraft; Human Factors,Aircraft,Extreme cold cockpit temperatures were experie...,,[Report narrative contained no additional info...,,EMB-505 flight crew reported the loss of tempe...,Train,2,Medium risk


# Prepare data

In [6]:
features = ['REPORT_1_NARRATIVE', 'REPORT_1_1_CALLBACK', 'REPORT_2_NARRATIVE', 'REPORT_2_1_CALLBACK', 'REPORT_1_2_SYNOPSIS']

## Check missing values

In [7]:
for feature in features:
    missing = df_train.query(f'{feature} == "NA"').shape[0]
    total = df_train.shape[0]
    print(f'{feature}, {missing=}, {total=} => {missing / total:.2%}')

REPORT_1_NARRATIVE, missing=0, total=38655 => 0.00%
REPORT_1_1_CALLBACK, missing=37474, total=38655 => 96.94%
REPORT_2_NARRATIVE, missing=29544, total=38655 => 76.43%
REPORT_2_1_CALLBACK, missing=38592, total=38655 => 99.84%
REPORT_1_2_SYNOPSIS, missing=0, total=38655 => 0.00%


## Get max length for narratives

In [8]:
for feature in features:
    lens = df_train[feature].map(len)
    print(f'{feature} => max={np.max(lens)} avg={np.mean(lens):.2f}')

REPORT_1_NARRATIVE => max=11991 avg=1586.71


REPORT_1_1_CALLBACK => max=3959 avg=20.95
REPORT_2_NARRATIVE => max=11955 avg=228.82
REPORT_2_1_CALLBACK => max=2749 avg=3.04
REPORT_1_2_SYNOPSIS => max=959 avg=171.53


In [9]:
def preprocess_inputs(df):
    df = df.copy()
    
    X = df['REPORT_1_NARRATIVE']
    y = df['EVENT_RISK']
    
    X = X.apply(lambda x: tokenizer.encode(x, padding='max_length', truncation=True))
    X = X.apply(pd.Series)
    
    return X, y

In [10]:
if not LOAD_TENSORS:
    X_train, y_train = preprocess_inputs(df_train)
    X_val, y_val = preprocess_inputs(df_val)
    X_test, y_test = preprocess_inputs(df_test)
    
    X_train = torch.tensor(X_train.values).to(device)
    y_train = torch.tensor(y_train.values).long().to(device)
    X_val = torch.tensor(X_val.values).to(device)
    y_val = torch.tensor(y_val.values).long().to(device)
    X_test = torch.tensor(X_test.values).to(device)
    y_test = torch.tensor(y_test.values).long().to(device)
else:
    X_train = torch.load(fr'{c.get_path_iteration()}/X_train.pt').to(device)
    y_train = torch.load(fr'{c.get_path_iteration()}/y_train.pt').to(device)
    X_val = torch.load(fr'{c.get_path_iteration()}/X_val.pt').to(device)
    y_val = torch.load(fr'{c.get_path_iteration()}/y_val.pt').to(device)
    X_test = torch.load(fr'{c.get_path_iteration()}/X_test.pt').to(device)
    y_test = torch.load(fr'{c.get_path_iteration()}/y_test.pt').to(device)

# Model development

Use word embeddings for `REPORT_1_NARRATIVE`.

In [11]:
tokenizer.vocab_size

50265

In [12]:
class WordEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, sequence_length, embedding_size, hidden_size, output_size):
        super(WordEmbeddingModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        self.fcs = [
            nn.Linear(sequence_length * embedding_size, hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.Linear(hidden_size, output_size),
        ]
        
        self.leaky_relu = nn.LeakyReLU()
    
    def forward(self, x):
        # print(f'{x.shape=}')
        embedded = self.embedding(x)
        # print(f'{embedded.shape=}')
        
        flattened = torch.flatten(embedded, 1)
        # print(f'{flattened.shape=}')
        
        out = flattened
        
        for fc in self.fcs:
            out = fc(out)
            out = self.leaky_relu(out)
            # print(f'{out.shape=}')
    
        return out

In [13]:
vocab_size = tokenizer.vocab_size
embedding_size = 2048
hidden_size = 32
output_size = 5
sequence_length = 512

learning_rate = 0.01
REG_PARAM = 0.02
num_epochs = 100
batch_size = 1024

model = WordEmbeddingModel(vocab_size, sequence_length, embedding_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=REG_PARAM)

In [None]:
# Create data loaders for train, validation, and test sets
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(num_epochs):
    train_losses = []
    val_losses = []
    
    # Train
    model.train()
    for inputs, labels in train_loader:
        inputs.to(device)
        labels.to(device)
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        train_losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs.to(device)
            labels.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            val_losses.append(val_loss)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Report metrics
    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    val_accuracy = 100.0 * correct / total
    print(f'epoch={epoch+1:02d} | train_loss={train_loss:.5f} | val_loss={val_loss:.5f} | val_acc={val_accuracy:.2f}%')

# Validate model

In [199]:
model.eval()
test_losses = []
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs.to(device)
        labels.to(device)
        outputs = model(inputs)
        test_losses.append(criterion(outputs, labels).item())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Print test metrics
test_loss = np.mean(test_losses)
test_accuracy = 100.0 * correct / total
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

Test Loss: 1.4459, Test Accuracy: 39.93%


# Saving to files

In [101]:
# Save the tensors
torch.save(X_train, fr'{c.get_path_iteration()}/X_train.pt')
torch.save(y_train, fr'{c.get_path_iteration()}/y_train.pt')
torch.save(X_val, fr'{c.get_path_iteration()}/X_val.pt')
torch.save(y_val, fr'{c.get_path_iteration()}/y_val.pt')
torch.save(X_test, fr'{c.get_path_iteration()}/X_test.pt')
torch.save(y_test, fr'{c.get_path_iteration()}/y_test.pt')