In [0]:
import pandas as pd
import numpy as np
import copy
import string

from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification

## Data preprocessing

In [172]:
# Load the annotated sample. Empty cells are forward-filled, i.e. they inherit
# the last non-null value above them in the same column.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data = (
    pd.read_csv("./data/sample_0.csv")
    .ffill()
    .drop(columns=['Number'])  # 'Number' is not used further in this notebook
)

data.head()

Unnamed: 0,Tag,Start,End,Content
0,ЛОК,1048,1061,Південний Буг
1,ЛОК,1073,1080,Гіпаніс
2,ЛОК,1104,1109,Дунай
3,ЛОК,1185,1192,Дністер
4,ЛОК,1358,1368,Балканська


In [0]:
# Translate the Ukrainian tag names to the standard notation.
# The two lists are matched position by position.
# NOTE(review): the replacement is applied to every column of `data`, not only
# 'Tag' - a 'Content' cell exactly equal to one of these names would change too.
ukrainian_tags = ['ЛОК', 'ОРГ', 'ПЕРС', 'РІЗН']
standard_tags = ['LOC', 'ORG', 'PERS', 'MISC']
data.replace(ukrainian_tags, standard_tags, inplace=True)

To mark the type of an entity, the BIO notation is used, so that the position of every word of a Named Entity (NE) is expressed via the prefix of its 'Tag' attribute. The first (and sometimes only) part of an entity is marked with the prefix 'B' (beginning) and all of its following parts with 'I' (inner). If a word is not an entity, it is marked with 'O', meaning it stands outside of any named entity group.

In [178]:
def is_marked(tag):
  """Tell whether `tag` already carries a one-letter positional prefix.

  A tag counts as marked when its second character is '-', as in 'B-LOC'
  or 'I-PERS'.

  Args:
    tag: the tag string to inspect.

  Returns:
    True if the second character of `tag` is '-', otherwise False. Tags
    shorter than two characters (e.g. 'O' or '') are reported as unmarked.
  """
  # Slicing instead of indexing: tag[1] would raise IndexError on 'O' or ''.
  return tag[1:2] == '-'


def mark_tag_position(tag, bio_mark):
  """Prefix `tag` with `bio_mark` unless it is already marked.

  Args:
    tag: the tag string, e.g. 'LOC' or 'B-LOC'.
    bio_mark: the prefix to prepend, e.g. 'B-' or 'I-'.

  Returns:
    `tag` unchanged when `is_marked(tag)` is true, otherwise `bio_mark + tag`.
  """
  return tag if is_marked(tag) else bio_mark + tag


# Expand multi-word entities into one row per word and apply BIO prefixes:
# the first word stays in the original row and is tagged 'B-', every following
# word becomes a new row (other columns copied from the original row) tagged
# 'I-'. The new rows are appended after the existing ones.
continuation_rows = []
for index, row in data.iterrows():
  # split(' ') always yields at least one item, so the unpacking cannot fail;
  # for content without a space `first_word` is simply the whole content.
  first_word, *following_words = row['Content'].split(' ')

  # DataFrame.at replaces DataFrame.set_value, which was removed in pandas 1.0.
  data.at[index, 'Content'] = first_word
  data.at[index, 'Tag'] = mark_tag_position(row['Tag'], 'B-')

  for word in following_words:
    new_row = row.to_dict()
    new_row['Content'] = word
    new_row['Tag'] = mark_tag_position(row['Tag'], 'I-')
    continuation_rows.append(new_row)

# Append all continuation rows in one step instead of enlarging the frame
# inside the loop. ignore_index=True numbers the new rows len(data), len(data)+1,
# ... which assumes `data` has the default 0..n-1 index (as read_csv produces).
if continuation_rows:
  data = pd.concat(
      [data, pd.DataFrame(continuation_rows, columns=data.columns)],
      ignore_index=True,
  )

data.tail()



Unnamed: 0,Tag,Start,End,Content
35,I-LOC,2507,2525,губернію
36,I-LOC,2865,2883,держава
37,I-PERS,3766,3778,Лицар
38,I-LOC,4397,4412,Бугу
39,I-LOC,4759,4773,шляхи


In [0]:
# Load the tokenizer that belongs to the multilingual cased BERT checkpoint.
# do_lower_case=False keeps the original casing of the input text.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False,
)

## Get the pre-trained model 

Using a pre-trained model gives us a decent starting point for our model (it is used instead of random initial parameters), provided that the task it was trained for is quite similar to ours. Here we import a multilingual BERT token-classification model (used for NER) from the PyTorch library. A pre-trained model can also be downloaded from Google AI's official repo https://github.com/google-research/bert and then set up with the config file from the archive. However, the Transformers wrapper library provides a convenient way to import this model in both PyTorch and TensorFlow.

In [0]:
# Load the pre-trained multilingual cased BERT with a token-classification head.
# NOTE(review): num_labels=4 equals the number of raw entity types
# (LOC, ORG, PERS, MISC); the BIO-prefixed tags built in the preprocessing
# section ('B-LOC', 'I-LOC', ...) form a larger label set - TODO confirm that
# 4 is the intended number of output classes.
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=4,
)

## Train our model