In [21]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertConfig,
    BertForTokenClassification,
    BertTokenizerFast,
)

In [40]:
# Parse the CoNLL training file into one DataFrame row per sentence.
# Sentences are separated by blank lines; every token line is split on the
# literal '_ _ ' separator into (token text, NER tag).
with open('../input/ner-hindi/hi_train.conll', 'r', encoding='utf8') as conll_file:
    file = conll_file.read()  # context manager closes the handle

dataset = file.split('\n\n')

# Collect plain records first and build the frame once at the end; appending
# with `data.loc[len(data.index)]` re-allocates the frame on every sentence.
records = []
for line in dataset:
    # The first line of every chunk is dropped, exactly as before
    # (presumably the per-sentence "# id ..." header — TODO confirm).
    # Blank lines (e.g. a trailing newline at end of file) are ignored
    # instead of raising IndexError on the separator split.
    token_lines = [row for row in line.split('\n')[1:] if row.strip()]
    if not token_lines:
        # An empty chunk would otherwise yield a row with an empty label,
        # which later fails the labels_to_ids lookup.
        continue

    sentences = []
    ner = []
    for each in token_lines:
        split3 = each.split('_ _ ')
        sentences.append(split3[0])
        ner.append(split3[1])
    records.append({'sentence': ' '.join(sentences), 'word_labels': ",".join(ner)})

data = pd.DataFrame(records, columns=['sentence', 'word_labels'])

In [41]:
# Display the parsed frame (columns: 'sentence', 'word_labels') as the cell output.
data

In [42]:
# NER tag -> integer class id used as the training target.
# The ids are not in tag order; they are kept exactly as assigned here because
# the model's output indices are interpreted through this table.
_TAG_ID_PAIRS = (
    ('B-CORP', 1),
    ('B-CW', 0),
    ('B-GRP', 3),
    ('B-LOC', 2),
    ('B-PROD', 5),
    ('B-PER', 6),
    ('I-CORP', 4),
    ('I-CW', 7),
    ('I-GRP', 8),
    ('I-LOC', 9),
    ('I-PROD', 10),
    ('I-PER', 11),
    ('O', 12),
)
labels_to_ids = dict(_TAG_ID_PAIRS)

In [43]:
# Reverse lookup (class id -> NER tag), derived from labels_to_ids.
ids_to_labels = {class_id: tag for tag, class_id in labels_to_ids.items()}

In [44]:
# --- Sequence length -------------------------------------------------------
MAX_LEN = 128            # passed to the dataset as the pad/truncate length

# --- Batching --------------------------------------------------------------
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2

# --- Optimisation ----------------------------------------------------------
EPOCHS = 1
LEARNING_RATE = 1e-5
MAX_GRAD_NORM = 10       # max_norm handed to gradient clipping in train()

# --- Tokenizer -------------------------------------------------------------
# NOTE(review): the IndicBERT project suggests keep_accents=True so that
# Devanagari vowel signs survive tokenization — confirm before changing.
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')

In [46]:
class dataset(Dataset):
    """Token-classification dataset over a frame of sentences and tag strings.

    Each row of ``dataframe`` must provide ``sentence`` (whitespace-separated
    words) and ``word_labels`` (comma-separated tags, one per word).

    Args:
        dataframe: Frame with ``sentence`` and ``word_labels`` columns, indexed
            so that ``dataframe.sentence[i]`` works for ``0 <= i < len``.
        tokenizer: Callable tokenizer that accepts pre-split words and exposes
            ``convert_ids_to_tokens``.
        max_len: Length every example is padded / truncated to.
        label_map: Optional tag -> id mapping. Defaults to the notebook-level
            ``labels_to_ids``.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Resolved once here; the global is only touched when no map is given.
        self.label_map = labels_to_ids if label_map is None else label_map

    def __getitem__(self, index):
        """Encode one sentence and align its word tags to the word pieces.

        Returns:
            dict of numpy arrays: every field produced by the tokenizer plus
            ``labels``, where each piece of a word carries that word's class
            id and special / padding positions carry -100.

        Raises:
            KeyError: if a tag in ``word_labels`` is missing from the label map.
        """
        # step 1: get the words and their tags
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")
        # Converting up front makes an unknown tag fail loudly.
        label_ids = [self.label_map[label] for label in word_labels]

        # step 2: encode with the tokenizer this instance was given
        # (not the notebook-level one), padding/truncating to self.max_len
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"])

        # step 3: align tags to pieces; -100 marks positions to ignore.
        # Sized from the encoded sequence so labels always match input_ids.
        encoded_labels = np.full(len(tokens), -100, dtype=int)
        word_idx = -1
        for pos, token in enumerate(tokens):
            if token == "<pad>":
                break  # only padding follows
            if token.startswith("["):
                continue  # bracketed special tokens such as [CLS] / [SEP]
            if token.startswith("▁"):
                word_idx += 1  # a piece starting with '▁' opens the next word
            # Explicit bounds check instead of a bare except: pieces that
            # precede the first word, or run past the supplied tags, stay -100.
            if 0 <= word_idx < len(label_ids):
                encoded_labels[pos] = label_ids[word_idx]

        # step 4: package everything as numpy arrays
        item = {key: np.array(val) for key, val in encoding.items()}
        item['labels'] = encoded_labels
        return item

    def __len__(self):
        """Number of sentences in the wrapped frame."""
        return self.len

In [47]:
# Hold out 20% of the sentences for evaluation (fixed seed for repeatability).
train_size = 0.8
sampled_rows = data.sample(frac=train_size, random_state=200)

# Everything that was not sampled becomes the test split; both splits get a
# fresh 0..n-1 index so the dataset class can address rows by position.
test_dataset = data.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

for template, frame in (
    ("FULL Dataset: {}", data),
    ("TRAIN Dataset: {}", train_dataset),
    ("TEST Dataset: {}", test_dataset),
):
    print(template.format(frame.shape))

training_set, testing_set = (
    dataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

In [48]:
# Show the first encoded training example (tokenizer fields plus 'labels').
training_set[0]

In [49]:
# Print each word piece of the first training example next to its label id.
first_example = training_set[0]
first_tokens = tokenizer.convert_ids_to_tokens(first_example["input_ids"])
for token, label in zip(first_tokens, first_example["labels"]):
    print('{0:10}  {1}'.format(token, label))

In [50]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order: valid() averages per-batch accuracy and
# returns predictions in loader order, so shuffling here made both vary from
# run to run without any benefit.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

Preparing Dataset

In [51]:
from torch import cuda

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [52]:
# 'ai4bharat/indic-bert' is an ALBERT checkpoint. Loading it through
# BertForTokenClassification mismatches the architecture, so the pretrained
# encoder weights are not picked up properly. The Auto class reads the
# checkpoint's config and instantiates the matching token-classification head.
model = AutoModelForTokenClassification.from_pretrained(
    'ai4bharat/indic-bert', num_labels=len(labels_to_ids)
)
# Assigning the result keeps the (very long) module repr out of the cell output.
model = model.to(device)

In [54]:
# Adam over every model parameter, using the learning rate from the config cell.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
)

In [55]:
def train(epoch):
    """Run one fine-tuning pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``. Prints the running mean loss every 100
    steps, then the epoch's mean loss and mean per-batch token accuracy
    (computed only where the label is not -100).

    Args:
        epoch: Epoch index supplied by the caller; not used by the loop itself.

    Returns:
        None.
    """
    tr_loss, tr_accuracy = 0.0, 0.0
    nb_tr_steps = 0

    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # One forward pass yields both loss and logits. Calling the model twice
        # doubled the compute and, with dropout active, produced logits from a
        # different stochastic pass than the loss.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs[0], outputs[1]

        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy; bookkeeping only, so no graph is needed
        with torch.no_grad():
            flattened_targets = labels.reshape(-1)                            # (batch * seq_len,)
            flattened_predictions = torch.argmax(tr_logits, dim=-1).reshape(-1)
            active = flattened_targets != -100                                # positions that carry a label
            targets = flattened_targets[active]
            predictions = flattened_predictions[active]
            n_active = targets.numel()
            if n_active:
                tr_accuracy += (predictions == targets).sum().item() / n_active

        # backward pass; clip AFTER backward so the clip acts on the gradients
        # that optimizer.step() is about to apply
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    if nb_tr_steps == 0:
        # nothing was iterated; avoid dividing by zero below
        print("Training loader produced no batches.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [56]:
# Fine-tune for EPOCHS passes; the banner shows a 1-based epoch number while
# train() receives the 0-based index.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

In [57]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating it.

    Uses the notebook-level ``device`` and ``ids_to_labels``. Prints the
    running mean loss every 100 steps, then the mean loss and the mean
    per-batch token accuracy (computed only where the label is not -100).

    Args:
        model: Token-classification model called with ``input_ids``,
            ``attention_mask`` and ``labels``; its output must expose the loss
            at index 0 and the logits at index 1.
        testing_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        tuple[list[str], list[str]]: gold tags and predicted tags for every
        labelled position, flattened across all batches in loader order.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0.0, 0.0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Index the output rather than tuple-unpacking it: unpacking a
            # dict-style model output yields its keys, not the tensors.
            # Indexing works for both tuple and dict-style outputs and matches
            # how train() reads them.
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[0], outputs[1]

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.reshape(-1)                              # (batch * seq_len,)
            flattened_predictions = torch.argmax(eval_logits, dim=-1).reshape(-1)
            active = flattened_targets != -100                                  # positions that carry a label
            targets = flattened_targets[active]
            predictions = flattened_predictions[active]

            # one device-to-host transfer per batch instead of .item() per token
            eval_labels.extend(targets.cpu().tolist())
            eval_preds.extend(predictions.cpu().tolist())

            n_active = targets.numel()
            if n_active:
                eval_accuracy += (predictions == targets).sum().item() / n_active

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    if nb_eval_steps == 0:
        # nothing was iterated; avoid dividing by zero below
        print("Validation loader produced no batches.")
        return labels, predictions

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [58]:
# Evaluate on the held-out loader; keeps the flat gold / predicted tag lists.
labels, predictions = valid(model, testing_loader)

In [59]:
from seqeval.metrics import classification_report

# seqeval expects a list of tag sequences, while valid() returns flat lists.
# Each tag is wrapped as its own one-element sequence, as before, but under
# new names: overwriting `labels` / `predictions` made this cell fail when
# re-run, because the lists were nested one level deeper each time.
# NOTE(review): with one tag per sequence, multi-token entities cannot be
# matched as a whole — confirm this is the intended granularity.
label_sequences = [[tag] for tag in labels]
prediction_sequences = [[tag] for tag in predictions]
print(classification_report(label_sequences, prediction_sequences))