In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW, Adam

from transformers import RobertaTokenizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.metrics import f1_score

device = device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

'cuda'

In [4]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [5]:
filepath = fr'/kaggle/input/asrs-aeroguard/01_df_train_val_test.pkl'
df_train_val_test = pd.read_pickle(filepath)
df_train_val_test

Unnamed: 0,ACN_NUM_ACN,TIME_DATE,TIME_1_LOCAL_TIME_OF_DAY,PLACE_LOCALE_REFERENCE,PLACE_1_STATE_REFERENCE,PLACE_2_RELATIVE_POSITION_ANGLE_RADIAL,PLACE_3_RELATIVE_POSITION_DISTANCE_NAUTICAL_MILES,PLACE_4_ALTITUDE_AGL_SINGLE_VALUE,PLACE_5_ALTITUDE_MSL_SINGLE_VALUE,ENVIRONMENT_FLIGHT_CONDITIONS,...,ASSESSMENTS_CONTRIBUTING_FACTORS_SITUATIONS,ASSESSMENTS_1_PRIMARY_PROBLEM,REPORT_1_NARRATIVE,REPORT_1_1_CALLBACK,REPORT_2_NARRATIVE,REPORT_2_1_CALLBACK,REPORT_1_2_SYNOPSIS,TRAIN_VAL_TEST_SPLIT,EVENT_RISK,EVENT_RISK_STR
0,1574675,201808,0601-1200,SNA.Airport,CA,,,,5000.0,,...,Human Factors,Human Factors,SNA RNP-Z to Runway 20R. The FMC was properly ...,,We were cleared for the RNP RNAV Z 20R Approac...,,B737-700 flight crew reported failing to make ...,Train,2,Medium risk
1,1224894,201412,0601-1200,MSY.Airport,LA,,,1000.0,,VMC,...,Human Factors,Human Factors,On base to final turn to runway 1 in MSY at ap...,,,,Captain reports sighting of a drone at 1;000 f...,Train,0,Low risk
2,1134202,201312,1201-1800,ZZZ.ARTCC,US,,,,2600.0,IMC,...,Human Factors; Aircraft; Procedure; Weather,Aircraft,I climbed to my filed altitude of 5;000 FT; an...,,,,SR22 pilot became disoriented on approach in I...,Train,3,Moderately high risk
3,1222074,201411,1201-1800,CWA.Airport,WI,,20.0,,4000.0,,...,Aircraft,Aircraft,I had my pitot heat checked prior to winter an...,,,,BE58 pilot experiences pitot heat failure desc...,Train,2,Medium risk
4,1733019,202003,1801-2400,ZDV.ARTCC,CO,,,,32000.0,,...,Airspace Structure; Weather,Weather,At 32000 ft. just north of PUB the aircraft ex...,,,,B737 First Officer reported unexpected moderat...,Train,3,Moderately high risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47718,1341108,201603,0601-1200,ZZZ.Airport,US,,,,,IMC,...,Company Policy; Human Factors,Human Factors,We did an originator out of ZZZ and had a main...,,[Report narrative contained no additional info...,,CRJ-900 flight crew reported being dispatched ...,Test,3,Moderately high risk
47719,1087474,201305,1201-1800,PHX.Airport,AZ,,,,8000.0,VMC,...,Aircraft; Human Factors; Procedure,Ambiguous,PHX takeoff Runway 25R flaps 5. CLEARANCE: MAX...,,There was an Airbus that departed before us an...,,CE750 flight crew departing PHX Runway 25R on ...,Test,3,Moderately high risk
47720,1756601,202008,,,,,,0.0,,,...,Environment - Non Weather Related; Company Pol...,Company Policy,I was scheduled to complete the one day traini...,,,,Air carrier First Officer reported that re-qua...,Test,0,Low risk
47721,1102938,201307,0601-1200,MEM.Airport,TN,,,,,VMC,...,Airspace Structure; Chart Or Publication; Proc...,Human Factors,We departed Runway 36C in MEM via the GOETZ TW...,,,,On initial climb out via the GOETZ RNAV SID fr...,Test,2,Medium risk


In [6]:
def preprocess_inputs(df):
    df = df[['REPORT_1_NARRATIVE', 'EVENT_RISK', 'TRAIN_VAL_TEST_SPLIT']].copy()
    
    df['REPORT_1_NARRATIVE'] = df['REPORT_1_NARRATIVE'].str.split('.')
    df = df.explode('REPORT_1_NARRATIVE', ignore_index=True)
    
    df_train = df.query('TRAIN_VAL_TEST_SPLIT == "Train"')
    print(f'{df_train.shape=}')
    df_val = df.query('TRAIN_VAL_TEST_SPLIT == "Validation"')
    print(f'{df_val.shape=}')
    df_test = df.query('TRAIN_VAL_TEST_SPLIT == "Test"')
    print(f'{df_test.shape=}')
    
    X_train = df_train['REPORT_1_NARRATIVE']
    y_train = df_train['EVENT_RISK']
    X_val = df_val['REPORT_1_NARRATIVE']
    y_val = df_val['EVENT_RISK']
    X_test = df_test['REPORT_1_NARRATIVE']
    y_test = df_test['EVENT_RISK']
    
    X_train = tokenizer(X_train.tolist(), padding='max_length', truncation=True)
    X_val = tokenizer(X_val.tolist(), padding='max_length', truncation=True)
    X_test = tokenizer(X_test.tolist(), padding='max_length', truncation=True)
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [7]:
X_train, y_train, X_val, y_val, X_test, y_test = preprocess_inputs(df_train_val_test)

df_train.shape=(647711, 3)
df_val.shape=(71733, 3)
df_test.shape=(79254, 3)


In [8]:
y_train.value_counts(normalize=True).sort_index()

0    0.196832
1    0.120687
2    0.355859
3    0.165231
4    0.161391
Name: EVENT_RISK, dtype: float64

In [9]:
y_val.value_counts(normalize=True).sort_index()

0    0.187501
1    0.114159
2    0.354885
3    0.169016
4    0.174439
Name: EVENT_RISK, dtype: float64

In [10]:
y_test.value_counts(normalize=True).sort_index()

0    0.199700
1    0.123325
2    0.351414
3    0.168761
4    0.156800
Name: EVENT_RISK, dtype: float64

In [11]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
weights = torch.tensor(class_weights,dtype=torch.float).to(device)
print(f'{weights=}')

weights=tensor([1.0161, 1.6572, 0.5620, 1.2104, 1.2392], device='cuda:0')


In [12]:
X_train = torch.tensor(X_train['input_ids']).to(device)
y_train = torch.tensor(y_train.values).long().to(device)
X_val = torch.tensor(X_val['input_ids']).to(device)
y_val = torch.tensor(y_val.values).long().to(device)
X_test = torch.tensor(X_test['input_ids']).to(device)
y_test = torch.tensor(y_test.values).long().to(device)

In [13]:
torch.save(X_train, 'X_train.pt')
torch.save(y_train, 'y_train.pt')
torch.save(X_val, 'X_val.pt')
torch.save(y_val, 'y_val.pt')
torch.save(X_test, 'X_test.pt')
torch.save(y_test, 'y_test.pt')

In [14]:
tokenizer.vocab_size

50265

In [18]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.4):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        self.bilstm1 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        output1, _ = self.bilstm1(embedded)
#         output1 = self.dropout1(output1)  
        output2, _ = self.bilstm2(output1)
#         output2 = self.dropout2(output2)  
        output = self.fc(output2[:, -1, :])
        return output

In [19]:
vocab_size = tokenizer.vocab_size
hidden_size = 64
num_layers = 2
output_size = 5
learning_rate = 0.1
batch_size = 64
num_epochs = 10

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = BiLSTM(vocab_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = AdamW(model.parameters(), lr=learning_rate)

model = model.to(device)
criterion = criterion.to(device)

for epoch in range(num_epochs):
    train_losses = []
    val_losses = []
    val_predictions = []
    val_true_labels = []
    
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
    
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_losses.append(loss.item())
            
            _, predicted = torch.max(outputs.data, 1)
            
            val_predictions.extend(predicted.tolist())
            val_true_labels.extend(labels.tolist())
    
    train_loss = sum(train_losses) / len(train_losses)
    val_loss = sum(val_losses) / len(val_losses)
    val_f1_score = f1_score(val_true_labels, val_predictions, average='weighted')
    
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1-Score: {val_f1_score:.4f}")

Epoch 1/10 | Train Loss: 1.7123 | Val Loss: 1.7127 | Val F1-Score: 0.0234
Epoch 2/10 | Train Loss: nan | Val Loss: nan | Val F1-Score: 0.0592
Epoch 3/10 | Train Loss: nan | Val Loss: nan | Val F1-Score: 0.0592


KeyboardInterrupt: 