In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np

Using TensorFlow backend.


In [3]:
def encodeValues(encoder,df):
    df_encoded = pd.DataFrame()
    temp = pd.Series()
    
    df_encoded['ORT LEVEL 1'] = encoder.fit_transform(df['ORT Level 1'])
    encoded_values = df_encoded['ORT LEVEL 1'].unique()
    
    for element in encoded_values:
        temp = temp.append(pd.Series(encoder.fit_transform(df['ORT Level 2'].where(df_encoded['ORT LEVEL 1'] == element).dropna())),ignore_index=True)
    
    df_encoded['ORT LEVEL 2'] = temp
    
    return df_encoded

In [4]:
def createDictFunc(df,df_encoded):
    ORT_LEVEL_1_DICT = dict(zip(df['ORT Level 1'].unique(),df_encoded['ORT LEVEL 1'].unique()))
    
    elements = df['ORT Level 2'].unique()
    indices = []
    
    for element in elements:
        indices.append(df[df['ORT Level 2'] == element].index[0])

    ORT_LEVEL_2_DICT = dict(zip(df['ORT Level 2'].unique(),df_encoded['ORT LEVEL 2'].iloc[indices]))
    
    return ORT_LEVEL_1_DICT,ORT_LEVEL_2_DICT

In [5]:
def encoderFunc(df,level1_dict,level2_dict):
    df['ORT LEVEL 1'] = df['ORT LEVEL 1'].map(level1_dict)
    df['ORT LEVEL 2'] = df['ORT LEVEL 2'].map(level2_dict)
    return df

In [6]:
def training(model,dataLoader):
    train_loss_set = []

    # Number of training epochs
    epochs = 4

    for _ in trange(epochs, desc="Epoch"):
      # Set our model to training mode
      model.train()
  
      # Tracking variables
      tr_loss = 0
      nb_tr_examples, nb_tr_steps = 0, 0
  
      # Train the data for one epoch
    for step, batch in enumerate(dataLoader):
        b_input_ids, b_labels = batch
        # Clear out the gradients
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids.to(torch.int64), token_type_ids=None,labels=b_labels.to(torch.int64))
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
    
        # Update variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

In [7]:
def validate(model,dataLoader):
    model.eval()
    eval_accuracy = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in dataLoader:
        # Unpack the inputs from our dataloader
        #b_input_ids, b_input_mask, b_labels = batch
        b_input_ids, b_labels = batch
        # do not compute or store gradients
        with torch.no_grad():
            # calculate logit predictions
            logits = model(b_input_ids.to(torch.int64), token_type_ids=None)
        tmp_eval_accuracy = accuracy(logits, b_labels.to(torch.int64))
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [8]:
def accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return torch.sum(pred_flat == labels_flat) / len(labels_flat)

In [9]:
df = pd.read_excel('D:/RiskProject/RCATestDataForBERT.csv',encoding='ISO-8859-1', usecols=['RISK DESCRIPTION','ORT LEVEL 1','ORT LEVEL 2'])
df_mapData = pd.read_excel("D:/RiskProject/RiskCategories.xlsx",usecols = ['ORT Level 1','ORT Level 2'],)

In [10]:
df_encoded = encodeValues(LabelEncoder(),df_mapData)

In [11]:
ORT_LEVEL_1_DICT,ORT_LEVEL_2_DICT = createDictFunc(df_mapData,df_encoded)

In [12]:
df = encoderFunc(df,ORT_LEVEL_1_DICT,ORT_LEVEL_2_DICT)

In [13]:
df['RISK DESCRIPTION'] = ["[CLS] " + desc + " [SEP]" for desc in df['RISK DESCRIPTION'].values]

In [14]:
num_labels_L1 = len(df['ORT LEVEL 1'].unique())
num_labels_L20 = len(df['ORT LEVEL 2'].where(df['ORT LEVEL 1'] == 0).dropna().unique())
num_labels_L21 = len(df['ORT LEVEL 2'].where(df['ORT LEVEL 1'] == 1).dropna().unique())
num_labels_L22 = len(df['ORT LEVEL 2'].where(df['ORT LEVEL 1'] == 2).dropna().unique())

In [15]:
tokenizer = BertTokenizer.from_pretrained('D:/Bert_Models/bert-base-uncased')
df['RISK DESCRIPTION'] = [tokenizer.tokenize(description) for description in df['RISK DESCRIPTION'].values]

In [16]:
max_length = 512
df['RISK DESCRIPTION'] = [tokenizer.convert_tokens_to_ids(x) for x in df['RISK DESCRIPTION'].values]
df['RISK DESCRIPTION'] = pad_sequences(df['RISK DESCRIPTION'].values,maxlen=max_length,dtype='long', truncating='post',padding='post').tolist()

In [17]:
batch_size = 2

In [18]:
model1 = BertForSequenceClassification.from_pretrained("D:/Bert_Models/bert-base-uncased", num_labels=num_labels_L1)

In [19]:
param_optimizer = list(model1.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [20]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

W1010 07:38:13.629604  7204 optimization.py:46] t_total value of -1 results in schedule not being applied


In [21]:
train_inputs_L1, validation_inputs_L1, train_labels_L1, validation_labels_L1 = train_test_split(df['RISK DESCRIPTION'].values, df['ORT LEVEL 1'].values, test_size=0.1,random_state = 42,shuffle = True, stratify = df['ORT LEVEL 1'])

In [22]:
train_inputs_L1 = torch.tensor(list(train_inputs_L1))
validation_inputs_L1 = torch.tensor(list(validation_inputs_L1))
train_labels_L1 = torch.tensor(list(train_labels_L1))
validation_labels_L1 = torch.tensor(list(validation_labels_L1))

In [23]:
train_data_L1 = TensorDataset(train_inputs_L1, train_labels_L1)
train_sampler_L1 = RandomSampler(train_data_L1)
train_dataloader_L1 = DataLoader(train_data_L1, sampler=train_sampler_L1, batch_size=batch_size)

validation_data_L1 = TensorDataset(validation_inputs_L1, validation_labels_L1)
validation_sampler_L1 = SequentialSampler(validation_data_L1)
validation_dataloader_L1 = DataLoader(validation_data_L1, sampler=validation_sampler_L1, batch_size=batch_size)

In [None]:
training(model1,train_dataloader_L1)

Epoch: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


In [None]:
validate(model1,validation_dataloader_L1)

# GROUP THE DATA BY ORT LEVEL 1

In [None]:
grouped_data = df.groupby('ORT LEVEL 1')

In [None]:
df_data_0 = grouped_data.get_group(0)

In [None]:
df_data_0.columns

In [None]:
df_data_1 = grouped_data.get_group(1)

In [None]:
df_data_2 = grouped_data.get_group(2)

# CLASSIFY ORT LEVEL 2 for ORT LEVEL 1 == 0

In [None]:
model20 = BertForSequenceClassification.from_pretrained("D:/Bert_Models/bert-base-uncased", num_labels=num_labels_L20)

In [None]:
param_optimizer = list(model20.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

In [None]:
train_inputs_L20, validation_inputs_L20, train_labels_L20, validation_labels_L20 = train_test_split(df_data_0['RISK DESCRIPTION'].values, df_data_0['ORT LEVEL 2'].values, test_size=0.1,random_state = 42,shuffle = True, stratify = df_data_0['ORT LEVEL 2'])

In [None]:
train_inputs_L20 = torch.tensor(list(train_inputs_L20))
validation_inputs_L20 = torch.tensor(list(validation_inputs_L20))
train_labels_L20 = torch.tensor(list(train_labels_L20))
validation_labels_L20 = torch.tensor(list(validation_labels_L20))

In [None]:
train_data_L20 = TensorDataset(train_inputs_L20, train_labels_L20)
train_sampler_L20 = RandomSampler(train_data_L20)
train_dataloader_L20 = DataLoader(train_data_L20, sampler=train_sampler_L20, batch_size=batch_size)

validation_data_L20 = TensorDataset(validation_inputs_L20, validation_labels_L20)
validation_sampler_L20 = SequentialSampler(validation_data_L20)
validation_dataloader_L20 = DataLoader(validation_data_L20, sampler=validation_sampler_L20, batch_size=batch_size)

In [None]:
training(model20,train_dataloader_L20)

In [None]:
validate(model20,validation_dataloader_L20)

# CLASSIFY ORT LEVEL 2 for ORT LEVEL 1 == 1

In [None]:
model21 = BertForSequenceClassification.from_pretrained("D:/Bert_Models/bert-base-uncased", num_labels=num_labels_L21)

In [None]:
param_optimizer = list(model21.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

In [None]:
train_inputs_L21, validation_inputs_L21, train_labels_L21, validation_labels_L21 = train_test_split(df_data_1['RISK DESCRIPTION'].values, df_data_1['ORT LEVEL 2'].values, test_size=0.1,random_state = 42,shuffle = True, stratify = df_data_1['ORT LEVEL 2'])

In [None]:
train_inputs_L21 = torch.tensor(list(train_inputs_L21))
validation_inputs_L21 = torch.tensor(list(validation_inputs_L21))
train_labels_L21 = torch.tensor(list(train_labels_L21))
validation_labels_L21 = torch.tensor(list(validation_labels_L21))

In [None]:
train_data_L21 = TensorDataset(train_inputs_L21, train_labels_L21)
train_sampler_L21 = RandomSampler(train_data_L21)
train_dataloader_L21 = DataLoader(train_data_L21, sampler=train_sampler_L21, batch_size=batch_size)

validation_data_L21 = TensorDataset(validation_inputs_L21, validation_labels_L21)
validation_sampler_L21 = SequentialSampler(validation_data_L21)
validation_dataloader_L21 = DataLoader(validation_data_L21, sampler=validation_sampler_L21, batch_size=batch_size)

In [None]:
training(model20,train_dataloader_L20)

In [None]:
validate(model20,validation_dataloader_L20)

# CLASSIFY ORT LEVEL 2 for ORT LEVEL 1 == 2

In [None]:
model22 = BertForSequenceClassification.from_pretrained("D:/Bert_Models/bert-base-uncased", num_labels=num_labels_L22)

In [None]:
param_optimizer = list(model22.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

In [None]:
train_inputs_L22, validation_inputs_L22, train_labels_L22, validation_labels_L22 = train_test_split(df_data_2['RISK DESCRIPTION'].values, df_data_2['ORT LEVEL 2'].values, test_size=0.1,random_state = 42,shuffle = True, stratify = df_data_2['ORT LEVEL 2'])

In [None]:
train_inputs_L22 = torch.tensor(list(train_inputs_L22))
validation_inputs_L22 = torch.tensor(list(validation_inputs_L22))
train_labels_L22 = torch.tensor(list(train_labels_L22))
validation_labels_L22 = torch.tensor(list(validation_labels_L22))

In [None]:
train_data_L22 = TensorDataset(train_inputs_L22, train_labels_L22)
train_sampler_L22 = RandomSampler(train_data_L22)
train_dataloader_L22 = DataLoader(train_data_L22, sampler=train_sampler_L22, batch_size=batch_size)

validation_data_L22 = TensorDataset(validation_inputs_L22, validation_labels_L22)
validation_sampler_L22 = SequentialSampler(validation_data_L22)
validation_dataloader_L22 = DataLoader(validation_data_L22, sampler=validation_sampler_L22, batch_size=batch_size)

In [None]:
training(model20,train_dataloader_L20)

In [None]:
validate(model20,validation_dataloader_L20)

# Single input test

In [None]:
with open('F:/unique_desc/test_description.csv') as fin:
    description_test = fin.readline()
description_test = "[CLS] " + description_test + " [SEP]"
print(description_test)
tokenized_test_text = tokenizer.tokenize(description_test)
print(tokenized_test_text)
test_input_ids = tokenizer.convert_tokens_to_ids(tokenized_test_text)
print(test_input_ids)
test_input_ids = pad_sequences([test_input_ids],maxlen = max_length,dtype='long', truncating='post', padding='post')
test_input_ids = torch.tensor(test_input_ids)
predictionsL1 = model1(test_input_ids.to(torch.int64))
predicted_class_L1 = torch.argmax(predictionsL1).flatten()

In [None]:
predicted_class_L1.item()