### BERT model with fine tuning ###

In [None]:
###### install the required packages (uncomment on first run) #####
#!pip3 install transformers
#!pip3 install pytorch-lightning

In [None]:
#### import libraries

import numpy as np
import pandas as pd
import re
from stop_words import get_stop_words
import calendar
import datetime
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import random

import time
from sklearn.metrics import classification_report,f1_score,precision_score,recall_score,confusion_matrix,accuracy_score
import itertools
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
#### Load the train, validation and test splits (in that order) and report their shapes
split_paths = (
    'GL_subtypes_data/Train_data_GL.csv',
    'GL_subtypes_data/Val_data_GL.csv',
    'GL_subtypes_data/Test_data_GL.csv',
)
# df1 = train, df2 = validation, df3 = test
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in split_paths)
for split_frame in (df1, df2, df3):
    print(split_frame.shape)

In [None]:
# Bar chart of how many training rows carry each label value.
df1['Label'].value_counts().plot.bar()

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer (do_lower_case=True is passed explicitly).
# The encoding helpers in this notebook read it as a module-level global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Prepare dataset to be given as input to bert model
# Function for preparing Training , Validation & Test dataset for model input
def get_embeddings(df, max_len=145):
    """Encode a labelled DataFrame into fixed-length BERT inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Text`` column and a ``Label`` column.
        NOTE: ``df['Label']`` is cast to ``int`` in place, i.e. the caller's
        frame is modified by this call.
    max_len : int, default 145
        Length every encoded sequence is padded or truncated to.

    Returns
    -------
    input_ids : array returned by ``pad_sequences`` (one row per text,
        ``max_len`` columns), padded with 0 at the end.
    labels : the integer values of ``df.Label``.
    att_masks : list of lists of int, 1 where the token id is > 0 and 0 on padding.

    Uses the module-level ``tokenizer`` and prints 'Done!' when finished.
    """
    df['Label'] = df['Label'].astype(int)

    # Sentences and their labels, in row order.
    texts = df.Text.values
    labels = df.Label.values

    # `encode` tokenizes the sentence, adds the special [CLS]/[SEP] tokens
    # (add_special_tokens=True) and maps the tokens to their vocabulary ids.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True) for sent in texts]

    # Pad with 0 / truncate at the END of each sequence ("post").
    # NOTE(review): truncating here (after encoding) presumably drops the trailing
    # [SEP] token of any sequence longer than max_len — confirm this is acceptable.
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long",
                              value=0, truncating="post", padding="post")

    # Attention mask: 1 for real tokens (id > 0), 0 for padding (id == 0).
    att_masks = (input_ids > 0).astype(int).tolist()
    print('Done!')
    return input_ids, labels, att_masks

In [None]:
# Encode every split; each call prints 'Done!' and returns
# (input_ids, labels, attention_masks) in that order.
train_arrays = get_embeddings(df1)
val_arrays = get_embeddings(df2)
test_arrays = get_embeddings(df3)

# The model consumes PyTorch tensors, so convert each returned piece.
train_x, train_y, train_m = (torch.tensor(part) for part in train_arrays)
test_x, test_y, test_m = (torch.tensor(part) for part in test_arrays)
val_x, val_y, val_m = (torch.tensor(part) for part in val_arrays)

# Shapes of (inputs, labels, masks) for train, test and validation.
for x_part, y_part, m_part in (
    (train_x, train_y, train_m),
    (test_x, test_y, test_m),
    (val_x, val_y, val_m),
):
    print(x_part.shape, y_part.shape, m_part.shape)

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 32


def _make_loader(inputs, masks, targets, sampler_cls):
    """Bundle tensors into a dataset, attach a sampler and wrap both in a DataLoader."""
    dataset = TensorDataset(inputs, masks, targets)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order ...
train_data, train_sampler, train_dataloader = _make_loader(
    train_x, train_m, train_y, RandomSampler)
# ... validation batches in their stored order.
val_data, val_sampler, val_dataloader = _make_loader(
    val_x, val_m, val_y, SequentialSampler)

# Number of distinct label values in the training frame.
num_labels = len(set(df1.Label))
print(num_labels)

In [None]:
# Pretrained uncased BERT with a sequence-classification head sized to
# `num_labels`; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Prefer the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

model = model.to(device)

##### count parameters
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter total, then the model's own repr.
trainable_param_count = count_parameters(model)
print('Number of trainable parameters:', trainable_param_count, '\n', model)

In [None]:
# All of the model's parameters as (name, tensor) tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, slice of `params`) for each group that gets listed.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, section in param_sections:
    print(header)
    for param_name, param_tensor in section:
        # Name left-aligned in 55 columns, shape right-aligned in 12.
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

In [None]:
learning_rate = 1e-5
# eps is a very small number that prevents division by zero inside the optimizer.
adam_epsilon = 1e-8

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# The per-group key must be 'weight_decay': that is the key the optimizer reads.
# The previous 'weight_decay_rate' key was stored but never used, so the 0.2
# decay below was silently not applied.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.2},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# specify optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

num_epochs = 6
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
# NOTE(review): this runs after the model was instantiated in an earlier cell, so
# any random initialisation done there is presumably not covered — confirm.
seed_val = 111

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
##### Run this cell for training the model ######

# Mean training / validation loss per epoch, stored as 0-dim CPU tensors.
train_losses = []
val_losses = []
# The same validation losses as plain floats (used by the early-stopping check).
val_losses1 = []
num_mb_train = len(train_dataloader)
num_mb_val = len(val_dataloader)

# Avoid a division by zero when the validation loader has no batches.
if num_mb_val == 0:
    num_mb_val = 1

# Early-stopping state, initialised explicitly before the loop: stop once the
# validation loss has failed to match or beat the best value `patience` epochs in a row.
patience = 3
best_loss = float('inf')
check_without_progress = 0

for n in range(num_epochs):
    # Tensor accumulators, so `.cpu()` below also works if a loader yields no batches.
    train_loss = torch.zeros((), device=device)
    val_loss = torch.zeros((), device=device)
    start_time = time.time()

    ###################
    # train the model #
    ###################
    # prep model for training (once per epoch; eval() is set again below)
    model.train()
    for mb_x, mb_m, mb_y in train_dataloader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        mb_x = mb_x.to(device)
        mb_m = mb_m.to(device)
        mb_y = mb_y.to(device)

        # forward pass: passing `labels` makes the model return the loss first
        outputs = model(mb_x, attention_mask=mb_m, labels=mb_y)
        loss = outputs[0]
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # perform a single optimization step, then advance the LR schedule
        optimizer.step()
        scheduler.step()

        # running mean over the epoch's mini-batches
        train_loss += loss.detach() / num_mb_train

    print ("\nTrain loss after itaration %i: %f" % (n+1, train_loss))
    # record training loss
    train_losses.append(train_loss.cpu())

    ######################
    # validate the model #
    ######################
    # prep model for evaluation
    model.eval()
    with torch.no_grad():
        for mb_x, mb_m, mb_y in val_dataloader:
            mb_x = mb_x.to(device)
            mb_m = mb_m.to(device)
            mb_y = mb_y.to(device)

            outputs = model(mb_x, attention_mask=mb_m, labels=mb_y)
            loss = outputs[0]

            val_loss += loss.detach() / num_mb_val

    print ("Validation loss after itaration %i: %f" % (n+1, val_loss))
    # record validation loss
    val_losses.append(val_loss.cpu())

    # Report the epoch duration before the early-stopping check so that the
    # final (stopping) epoch is timed as well.
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Time: {epoch_mins}m {epoch_secs}s')

    ###### early stopping #####
    val_loss1 = float(val_loss.cpu())
    val_losses1.append(val_loss1)
    if val_loss1 <= best_loss:
        best_loss = val_loss1
        check_without_progress = 0
    else:
        check_without_progress += 1
        if check_without_progress >= patience:
            print("Early stopping!")
            break

In [None]:
####### SAVE THE MODEL ######
import pickle
import os

# NOTE(review): the LOAD cell in this notebook reads from
# './subtype_bert_EO_prop_merge_final', a different directory — confirm which is intended.
out_dir = './subtype_bert_EO_prop_merge_upsampled_final'

# Create the output directory if needed (no separate exists-check, so no
# check-then-create race).
os.makedirs(out_dir, exist_ok=True)

# If the model is wrapped (exposes `.module`), save the wrapped model instead.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

# Persist the per-epoch loss histories next to the model files.
with open(os.path.join(out_dir, 'train_losses.pkl'), 'wb') as f:
    pickle.dump(train_losses, f)

with open(os.path.join(out_dir, 'val_losses.pkl'), 'wb') as f:
    pickle.dump(val_losses, f)

In [None]:
####### LOAD THE MODEL ###### only to be run when model is saved
import pickle
import os

# NOTE(review): this differs from the directory used by the SAVE cell
# ('./subtype_bert_EO_prop_merge_upsampled_final'); running this cell replaces
# `model` with whatever is stored here — confirm this is intended.
out_dir = './subtype_bert_EO_prop_merge_final'


def _load_pickle(path):
    """Unpickle one object from `path`. Only use on files you created yourself:
    unpickling untrusted data can execute arbitrary code."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the fine-tuned weights and move them to the GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained(out_dir).to(device)

# Restore the loss histories written alongside the model.
train_losses = _load_pickle(out_dir + '/train_losses.pkl')
val_losses = _load_pickle(out_dir + '/val_losses.pkl')

#### Predicting the results and analyzing the metrics ###

In [None]:
# Training loss per epoch, drawn on a fresh figure.
fig, ax = plt.subplots()
ax.plot(train_losses)

In [None]:
# Validation loss per epoch, drawn on a fresh figure.
fig, ax = plt.subplots()
ax.plot(val_losses)

In [None]:
batch_size = 32

# Inference needs only token ids and attention masks (no labels).
test_data = TensorDataset(test_x, test_m)
# Sequential sampling keeps the predictions in the same order as the test rows.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# prep model for evaluation
model.eval()
logit_batches = []
with torch.no_grad():
    for mb_x, mb_m in test_dataloader:
        mb_x = mb_x.to(device)
        mb_m = mb_m.to(device)
        # First element of the model output is collected and moved to the CPU.
        output = model(mb_x, attention_mask=mb_m)
        logit_batches.append(output[0].cpu())

# One tensor holding the outputs of every batch, in row order.
outputs = torch.cat(logit_batches)

In [None]:
# Predicted class = column index of the largest value in each output row.
predicted_values = torch.max(outputs, dim=1).indices.numpy()
true_values = test_y.numpy()

# Fraction of test rows whose predicted class equals the true label.
n_correct = np.sum(predicted_values == true_values)
test_accuracy = n_correct / len(true_values)
print ("Test Accuracy:", test_accuracy)

In [None]:
# Macro-averaged F1, precision and recall, printed in that order.
for macro_metric in (f1_score, precision_score, recall_score):
    print(macro_metric(true_values, predicted_values, average='macro'))

In [None]:
# Recall for each class separately, displayed as a single row.
per_class_recall = recall_score(true_values, predicted_values, average=None)
pd.DataFrame(per_class_recall).T

In [None]:
# plot confusion matrix
# code borrowed from scikit-learn.org
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix (rows = true labels, columns = predicted labels).
    classes : sequence
        Tick labels for both axes.
    normalize : bool, default False
        If True, each row is divided by its sum before plotting and cells are
        printed with two decimals; otherwise raw counts are printed as integers.
        (Previously this flag only changed the number format and the matrix
        was never normalised.)
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any samples stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype(float), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Distinct label values in the training frame.
# NOTE: `num_labels` is rebound here from the class count (an int, set in an
# earlier cell) to the list of label values.
unique_labels = np.unique(df1.Label)
num_labels = list(unique_labels)
total_labels = len(unique_labels)

# Pick the display names from the number of classes present in the training data.
if total_labels == 19:
    #### Labels for Property
    labels = ["Acc","Acc_Clean","Acc_Install","Acc_Mowing","BOC_Work","Data/Cyber","Equip","Equip/Water",
              "Equip/drop","Fire","Misc","Missing","Natural","Theft","Unknown","Vehicle","Water","Water/leak","Water/pipe"]
elif total_labels == 16:
    #### Labels For GL
    labels = ["Acc","Acc_Clean","Acc_Install","Acc_Mowing","BI-Slip/Fall","BI-others","BOC/Work","Equip","Equip/drop",
              "Fire","Misc","Theft/Missing","Unknown","Vehicle","Water","Water/leak"]
elif total_labels == 18:
    #### Labels for E&O
    labels = ["BOC","BI","Copyright","Data_Cyber","Disclose","Disc_Miscon","Financial","Fin_Debt_Cr","Fin_For","Fraud_Misrep",
              "Misc","Misrep","Negl","Negl_Def_Const","Neg_Tax","Property","Unknown","Work"]
else:
    # Fail here with a clear message instead of a NameError on `labels` in a later cell.
    raise ValueError(
        "No label names defined for %d classes (expected 16, 18 or 19)" % total_labels
    )

In [None]:
# Show floats with two decimals in NumPy output from here on.
np.set_printoptions(precision=2)

# Confusion matrix of the test predictions, drawn on a 9x9 inch figure.
# NOTE(review): `confusion_matrix` is called without an explicit label list, so the
# matrix size presumably follows the labels present in these arrays — confirm it
# matches len(labels).
cm_test = confusion_matrix(true_values, predicted_values)
plt.figure(figsize=(9, 9))
plot_confusion_matrix(
    cm_test,
    classes=labels,
    title='Confusion Matrix - Test Dataset',
)

##### Predicted Dataset ####

In [None]:
# Pair every test text with its predicted class, then swap the numeric class
# values (num_labels) for their display names (labels).
data_pred = pd.DataFrame({"Text": df3.Text, "Prediction": predicted_values})
data_pred = data_pred.assign(
    Prediction=data_pred["Prediction"].replace(num_labels, labels)
)
data_pred

##### CLASSIFICATION MODEL "TEST & USE" and "INTERPRETATION" #########

In [None]:
# Get sample test data for interpretability
def get_test_data(docs, max_len=145):
    """Encode raw documents into fixed-length BERT input ids and attention masks.

    Parameters
    ----------
    docs : iterable of str
        Texts to encode.
    max_len : int, default 145
        Length every encoded sequence is padded or truncated to.

    Returns
    -------
    input_ids : array returned by ``pad_sequences`` (one row per document,
        ``max_len`` columns), padded with 0 at the end.
    att_masks : list of lists of int, 1 where the token id is > 0 and 0 on padding.

    Uses the module-level ``tokenizer`` and the ``pad_sequences`` imported at the
    top of the notebook; prints 'Done!' when finished.
    """
    # Tokenize, add the special tokens and map tokens to ids.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True) for sent in docs]

    # Pad with 0 / truncate at the END of each sequence ("post").
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long",
                              value=0, truncating="post", padding="post")

    # Attention mask: 1 for real tokens (id > 0), 0 for padding (id == 0).
    att_masks = (input_ids > 0).astype(int).tolist()
    print('Done!')
    return input_ids, att_masks

In [None]:
# Get prediction outputs(probabilities) on test data set to be used for interpretation
def prediction(docs, batch_size=32):
    """Return class probabilities for raw documents.

    Parameters
    ----------
    docs : sequence of str
        Texts to classify.
    batch_size : int, default 32
        Number of documents sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray with one row per document; each row is the softmax over the
    model's output for that document, so it sums to 1.

    Uses the module-level ``model``, ``device`` and ``get_test_data``.
    """
    test_x, test_m = get_test_data(docs)
    test_x = torch.tensor(test_x)
    test_m = torch.tensor(test_m)
    test_data = TensorDataset(test_x, test_m)
    # Sequential sampling keeps the rows aligned with `docs`.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    batch_outputs = []
    model.eval()
    with torch.no_grad():
        for mb_x, mb_m in test_dataloader:
            mb_x = mb_x.to(device)
            mb_m = mb_m.to(device)
            output = model(mb_x, attention_mask=mb_m)
            batch_outputs.append(output[0].to('cpu'))

    outputs = torch.cat(batch_outputs)
    # Normalise across the class dimension explicitly; calling softmax without
    # `dim` relies on a deprecated implicit choice and emits a warning.
    outputs = torch.nn.functional.softmax(outputs, dim=1)
    return outputs.detach().numpy()

In [None]:
# Load the Property test split with integer labels and pull out its raw texts.
# NOTE(review): this rebinds `df3`, which earlier cells use for the GL test split.
df3 = pd.read_csv('Property Datasets/Simple_Data/Test_data_property.csv').astype({'Label': int})
test_texts = df3['Text'].values

In [None]:
#y_pred = prediction(test_texts)

In [None]:
# Display names handed to the explainers, one list per line of business.
# NOTE(review): both lists hold 11 names, while the label sets defined for the
# confusion matrix hold 16-19 — confirm they match the loaded model's classes.
PD_labels = [
    "BOC/Work", "Data/Cyber", "Equipment", "Fire", "Misc", "Natural",
    "Property", "Theft/Missing", "Unknown", "Vehicle", "Water",
]
GL_labels = [
    "Accidental", "BI-Others", "BI-Slip/Fall", "BOC/Work", "Equipment", "Fire",
    "Miscellaneous", "Theft/Missing", "Unknown", "Vehicle", "Water",
]

In [None]:
import eli5
from eli5.lime import TextExplainer

# Fit eli5's TextExplainer on one test document, using `prediction` as the
# probability function, then display the explanation for the top 4 targets.
sample_doc = test_texts[1]
te = TextExplainer(random_state=2019)
te.fit(sample_doc, prediction)
te.show_prediction(target_names=PD_labels, top=4)

In [None]:
# LIME text explainer; class_names supplies the display name for each class index.
from lime import lime_text
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=PD_labels)

In [None]:
# Show the raw text of test document 19 (the document explained with LIME in the next cell).
print(test_texts[19])

In [None]:
# Explain one test document with LIME: up to 5 features for each of the 4 top labels.
instance_text = test_texts[19]
print(instance_text)
exp = explainer.explain_instance(
    instance_text,
    prediction,
    num_features=5,
    top_labels=4,
)
exp.show_in_notebook(text=True)

In [None]:
##### For Saving File #######
# Write the LIME explanation held in `exp` to an HTML file in the working directory.
exp.save_to_file('lime1_PD2.html')

In [3]:
# Copy a notebook file to the Cloud Storage bucket path via the gsutil CLI.
# NOTE(review): the file copied is "OCR_cloud_vision_API.ipynb", not this notebook or
# the saved model directories, although the target is "BERT_final_models" — confirm.
!gsutil -m cp -r "OCR_cloud_vision_API.ipynb" "gs://sftp-uploaded-files/BERT_final_models"

Copying file://OCR_cloud_vision_API.ipynb [Content-Type=application/octet-stream]...
/ [1/1 files][  7.6 KiB/  7.6 KiB] 100% Done                                    
Operation completed over 1 objects/7.6 KiB.                                      
