In [None]:
# !pip install torch transformers
!pip install pandas --upgrade

In [1]:
# import section
import time, datetime, random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import(
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split
)
import transformers
from transformers import(
    LongformerTokenizer,
    LongformerForSequenceClassification,
    get_linear_schedule_with_warmup,
    AdamW
)
from nltk.metrics import ConfusionMatrix # Looks better than the sklearn CM
from sklearn.metrics import classification_report

import boto3
from boto3 import client
import pandas as pd 
from io import StringIO 
import pickle

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Pick the compute device once; later cells read the global `device`.
if torch.cuda.is_available():
    # Assign the device for ANY number of GPUs. Previously it was only set when
    # more than one GPU was present, leaving `device` undefined on 1-GPU hosts.
    device = torch.device("cuda") # cuda:0 for multi-gpu
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [4]:
### LOAD DATA ###

def _split_root_causes(cell):
    """Split one raw 'Root Cause' cell into a list of cleaned label names.

    The cell is split on commas and every piece is stripped of surrounding
    whitespace. An empty piece becomes the placeholder label 'NA', and a
    missing (NaN/None) cell yields a single 'NA'.
    """
    if pd.isna(cell):
        return ['NA']
    return [piece.strip() or 'NA' for piece in str(cell).split(',')]


# Create the S3 client
s3 = boto3.client('s3')
# Read the raw incident data. (The categorization CSV used to be fetched here
# as well, but its response was overwritten before ever being read.)
data = s3.get_object(Bucket='wp-safety-incidents', Key='Data_2018_to_MAY_2020_UTF.csv')
# Convert to dataframe
dffinal  = pd.read_csv(data['Body'], delimiter = ',')

# Parse every 'Root Cause' cell exactly once so that the label vocabulary and
# the per-row targets are guaranteed to come from the same split rule.
root_causes = [_split_root_causes(cell) for cell in dffinal['Root Cause']]

# Get all labels: sorted, de-duplicated vocabulary; a label's position is its target id
labels = sorted({label for causes in root_causes for label in causes})
print(labels)
label_to_target = {label: position for position, label in enumerate(labels)}

# Emit one training row per (incident, root cause) pair
targetList = []
textList = []
labelList = []
for (index, row), causes in zip(dffinal.iterrows(), root_causes):
    text = row['Event Title'] + ': ' + row['Brief Event Description']
    for label in causes:
        textList.append(text)
        targetList.append(label_to_target[label])
        labelList.append(label)

#Convert to training format
df = pd.DataFrame()
df['Text'] = textList
df['Target'] = targetList
df['Label'] = labelList
df.to_pickle('df_root_causes.pkl')
print(df.groupby(['Label','Target'])['Target'].count())

['Equipment', 'Fitness for Duty', 'Hazard Identified / Not Eliminated or Controlled', 'Hazard not Identified', 'NA', 'OTHER Root Cause', 'Personal Protective Equipment (PPE)', 'Poor Design - Equipment / Work Area', 'Poor Design - Work Process', 'Poor Housekeeping', 'To Be Determined (Temporary Placeholder Selection)', 'Training', 'Unsafe Practices or Unsafe Behavior']
Label                                               Target
Equipment                                           0         817
Fitness for Duty                                    1          15
Hazard Identified / Not Eliminated or Controlled    2         209
Hazard not Identified                               3         342
NA                                                  4         773
OTHER Root Cause                                    5         318
Personal Protective Equipment (PPE)                 6         104
Poor Design - Equipment / Work Area                 7         489
Poor Design - Work Process                

In [5]:
# Index (into the alphabetically sorted label names) of the class to detect.
target = 0
# Build the one-vs-rest target from the 'Label' column, which this cell never
# modifies. Remapping the 'Target' column in place made the cell flip the
# classes every time it was re-run; this version gives the same result on
# every run. Sorting the distinct names reproduces the original numbering.
positive_label = sorted(df['Label'].unique())[target]
df['Target'] = (df['Label'] == positive_label).astype('int64')
df.to_pickle('df_root_causes_'+str(target)+'.pkl')
print(df.groupby(['Label','Target'])['Target'].count())

Label                                               Target
Equipment                                           1         817
Fitness for Duty                                    0          15
Hazard Identified / Not Eliminated or Controlled    0         209
Hazard not Identified                               0         342
NA                                                  0         773
OTHER Root Cause                                    0         318
Personal Protective Equipment (PPE)                 0         104
Poor Design - Equipment / Work Area                 0         489
Poor Design - Work Process                          0         413
Poor Housekeeping                                   0          82
To Be Determined (Temporary Placeholder Selection)  0          58
Training                                            0         344
Unsafe Practices or Unsafe Behavior                 0         781
Name: Target, dtype: int64


In [6]:
def calculate_stat(predictions, actual) :
    """Print a confusion matrix and classification report for batched output.

    Args:
        predictions: iterable of per-batch score arrays; the arg-max of each
            row is taken as the predicted class.
        actual: iterable of per-batch label arrays (each must offer `.tolist()`).

    Returns:
        Tuple `(preds, true_labels_1d)`: the flat list of predicted classes and
        the flat list of true labels.
    """
    # Highest-scoring class for every row of every batch
    preds = []
    for batch_scores in predictions:
        for row_scores in batch_scores:
            preds.append(np.argmax(row_scores))

    # Flatten the per-batch label arrays into one plain list
    true_labels_1d = [label for batch_labels in actual for label in batch_labels.tolist()]

    # Build both reports first, then print them
    cm = ConfusionMatrix(true_labels_1d, preds)
    class_rep = classification_report(true_labels_1d, preds)
    print(cm)
    print(class_rep)
    return (preds, true_labels_1d)

In [7]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Max is 512 if using BERT-based models, higher for longformer (2000+)
def toke_and_enc(sentences, max_len):
    """Encode each sentence to fixed-length token ids plus an attention mask.

    Uses the notebook-level `tokenizer`, which must be defined before calling.

    Args:
        sentences: iterable of text strings.
        max_len: every sequence is truncated or padded to exactly this many tokens.

    Returns:
        Tuple `(input_ids, attention_masks)`, each the per-sentence tensors
        concatenated along dimension 0.
    """
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sent,
                            add_special_tokens = True,
                            max_length = max_len,
                            # `pad_to_max_length=True` is deprecated;
                            # `padding='max_length'` is the supported spelling.
                            padding = 'max_length',
                            return_attention_mask = True,
                            return_tensors = 'pt',
                            truncation = True
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Stack the single-sentence tensors into one batch tensor each
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [8]:
def create_data_loader(train_dataset, batch_size=16) :
    """Return a DataLoader over `train_dataset` that samples in random order.

    Args:
        train_dataset: dataset to draw batches from.
        batch_size: number of samples per batch (default 16).
    """
    shuffling_sampler = RandomSampler(train_dataset)
    return DataLoader(train_dataset, sampler=shuffling_sampler, batch_size=batch_size)

In [9]:
def train_model(train_dataset, num_labels, epochs = 4, batch_size = 64) : 
    """Fine-tune a Longformer sequence classifier and return the trained model.

    Uses the notebook-level `device` and the `create_data_loader` helper.

    Args:
        train_dataset: dataset whose batches index as
            [0] input ids, [1] attention mask, [2] labels.
        num_labels: number of output classes for the classification head.
        epochs: number of full passes over the data.
        batch_size: samples per optimisation step.

    Returns:
        The fine-tuned model (left on `device`).
    """
    train_dataloader = create_data_loader(train_dataset, batch_size = batch_size)

    #Change the model name and num_labels depending on the task.
    model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                                gradient_checkpointing=True, # New to v3 - doesn't work with DataParallel
                                                                num_labels=num_labels)
    model.to(device)
    optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

    # One scheduler step per batch, so the learning rate decays linearly to zero
    # over the whole run (no warm-up).
    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

    for epoch_i in range(0, epochs):

        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

        total_train_loss = 0

        model.train()
        print("here")

        for step, batch in enumerate(train_dataloader):
            print('.', end ="")
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()        

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,  labels=b_labels)
            # Index instead of tuple-unpacking: this works for a plain tuple and
            # for a ModelOutput alike. Unpacking a ModelOutput yields its key
            # names (strings), which made `loss.item()` fail.
            loss = outputs[0]

            total_train_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the optimiser step
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)            
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

    print("Training complete!")
    return model

In [10]:
def test_model(test_dataset, model, batch_size = 64) : 
    """Run `model` over `test_dataset` in order, collecting scores and labels.

    Uses the notebook-level `device`.

    Args:
        test_dataset: dataset whose batches index as
            [0] input ids, [1] attention mask, [2] labels.
        model: the classifier to evaluate; it is moved to `device` and put
            into eval mode.
        batch_size: samples per forward pass.

    Returns:
        Tuple `(predictions, true_labels)`: two lists holding one numpy array
        per batch (model scores and label ids respectively).
    """
    ordered_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
    )

    print('here')
    model.to(device)
    model.eval()

    predictions = []
    true_labels = []

    for batch in ordered_loader:
        print(".", end =" ")
        ids_on_device = batch[0].to(device)
        mask_on_device = batch[1].to(device)
        labels_on_device = batch[2].to(device)

        # Forward pass only, with gradient tracking disabled
        with torch.no_grad():
            outputs = model(ids_on_device, token_type_ids=None, attention_mask=mask_on_device)

        # The first element of the model output holds the class scores
        batch_scores = outputs[0].detach().cpu().numpy()
        batch_label_ids = labels_on_device.to('cpu').numpy()

        predictions.append(batch_scores)
        true_labels.append(batch_label_ids)

    print('DONE.')
    return (predictions, true_labels)

In [None]:
# Seed every random number generator involved so the train/test split, the
# freshly initialised classifier head and the batch order are reproducible.
# Otherwise the saved model and the outtrain/outtest CSVs cannot be
# regenerated consistently.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Divide the dataset by randomly selecting samples.
dftrain=df.sample(frac=0.8, random_state=seed)
dftest=df.drop(dftrain.index)
print('{:>5,} training samples'.format(len(dftrain)))
print('{:>5,} test samples'.format(len(dftest)))

# set tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Longest tokenised text in the whole corpus; every sequence is padded to this length
max_len = 0
for text in df['Text'].values:
    len_txt = len(tokenizer.encode(text, add_special_tokens=True))
    max_len = max(max_len, len_txt)
print('Max sentence length: ', max_len)

# construct the input for the training phase
text_train = dftrain['Text'].values # Use appropriate column names
labels_train = dftrain['Target'].values
Input_ids_train, Attention_masks_train = toke_and_enc(text_train, max_len)
labels_train = torch.tensor(labels_train)
Train_dataset = TensorDataset(Input_ids_train, Attention_masks_train, labels_train)
model_train = train_model(Train_dataset, num_labels = 2)
# Save model (optional) - comment this line out to skip saving
model_train.save_pretrained('model_'+str(target))

3,796 training samples
  949 test samples
Max sentence length:  1267


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

here
.....

In [None]:
# Run the model on the training data set to collect stat and output
print('Training result')
# test_model iterates with a SequentialSampler, so the predictions come back in
# dataset order; Train_dataset was built from dftrain's rows in order, which is
# what lets the flat prediction lists be attached to dftrain as columns below.
trainpred, trainactual = test_model(Train_dataset, model_train)
# calculate_stat prints the confusion matrix / classification report and
# returns the flattened predicted and true labels.
train_pred, train_actual  = calculate_stat(trainpred, trainactual)
dftrain['Mpred'] = train_pred
dftrain['Mactual'] = train_actual
# Output file name carries the index of the class used as the positive target
dftrain.to_csv('outtrain_'+str(target)+'.csv', encoding='utf-8')
print()

# Run the model on the test data set to collect stat and output
print('Testing result')
text_test = dftest['Text'].values
labels_test = dftest['Target'].values
# Reuse the max_len computed for training so test sequences have the same length
Input_ids_test, Attention_masks_test = toke_and_enc(text_test, max_len)
labels_test = torch.tensor(labels_test)
Test_dataset = TensorDataset(Input_ids_test, Attention_masks_test, labels_test)
testpred, testactual = test_model(Test_dataset, model_train)
test_pred, test_actual  = calculate_stat(testpred, testactual)
dftest['Mpred'] = test_pred
dftest['Mactual'] = test_actual
dftest.to_csv('outtest_'+str(target)+'.csv', encoding='utf-8')