In [1]:
import csv
import logging

import pandas as pd
from simpletransformers.ner import NERModel
from transformers import AutoTokenizer

In [2]:
# Convert a tab-separated token/label text file (train or test split) into a
# CoNLL-style DataFrame.
def txt_convert_conll_format(filename):
    """Load a two-column, tab-separated token/label file into a DataFrame.

    The file is read as one ``word<TAB>label`` pair per line; an empty line
    marks the boundary between two sentences.

    Args:
        filename: Path of the file (anything ``pd.read_csv`` accepts).

    Returns:
        A DataFrame with the columns ``words``, ``labels`` and
        ``sentence_id``. Blank separator rows are removed, and
        ``sentence_id`` is the number of blank lines seen up to that row.
    """
    df = pd.read_csv(
        filename,
        sep='\t', header=None, keep_default_na=False,
        names=['words', 'labels'], skip_blank_lines=False,
        # Tokens are raw text: a lone '"' must be read as a token and not as
        # the start of a quoted field that swallows the following lines.
        quoting=csv.QUOTE_NONE,
        # Keep digit-only tokens as text so that the comparison with ''
        # below always operates on strings.
        dtype=str,
    )
    # Every blank line increments the running sentence counter.
    df['sentence_id'] = (df.words == '').cumsum()
    # Drop the blank separator rows themselves.
    return df[df.words != '']

In [3]:
# Some NER datasets from HuggingFace store their tags as numeric ids;
# this helper maps such an id back to the corresponding tag string.
def ner_id2tag(id):
    """Return the NER tag string for a numeric tag id.

    Args:
        id: Tag id, one of 0..6.

    Returns:
        The tag name, e.g. ``'O'`` for 0 or ``'B-PER'`` for 1.

    Raises:
        KeyError: If ``id`` is not one of the known ids.
    """
    # The position of a tag in this tuple is its numeric id.
    tags_in_id_order = ('O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC')
    lookup = dict(enumerate(tags_in_id_order))
    return lookup[id]

In [4]:
# Load the train and test splits.
# The training split is a CSV file read as-is. The test split is a
# tab-separated token/label text file converted by txt_convert_conll_format,
# which adds a `sentence_id` column.
# NOTE(review): a later cell reads train_data['sentence_id'], so the CSV is
# assumed to already contain that column — confirm against the file.
train_data = pd.read_csv('../Datasets/final_version_dataset/train_data.csv')
test_data = txt_convert_conll_format('../Datasets/final_version_dataset/test_data.txt')

In [None]:
# Number of distinct sentence ids in each split, shown as a one-row table.
data = [[split['sentence_id'].nunique() for split in (train_data, test_data)]]
pd.DataFrame(data, columns=["Train", "Test"])

In [6]:
# Training arguments (option name -> value).
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    sliding_window=True,
    max_seq_length=64,
    num_train_epochs=15,
    train_batch_size=32,
    fp16=True,
    # NOTE(review): absolute path at the filesystem root — confirm this is
    # the intended output location.
    output_dir='/outputs/',
)

In [7]:
# The original label set only contains GPE, ORG, PER, MISC and O.

# custom_label = ['B-GPE','I-GPE','B-PER','I-PER','B-DATE','I-DATE','B-ORG','I-ORG','B-CARDINAL','I-CARDINAL','B-NORP','I-NORP','B-LOC','I-LOC','B-TIME','I-TIME','B-FAC','I-FAC','B-MONEY','I-MONEY','B-ORDINAL','I-ORDINAL','B-EVENT','I-EVENT','B-WFA','I-WFA','B-QUANTITY','I-QUANTITY','B-PERCENT','I-PERCENT','B-LANGUAGE','I-LANGUAGE','B-PRODUCT','I-PRODUCT','B-LAW','I-LAW','O']

In [None]:
# Configure logging: the root logger is set to DEBUG, while the
# 'transformers' logger is limited to WARNING and above.
logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

# Load the tokenizer of the bert-base-chinese pre-trained model.
# NOTE(review): `tokenizer` is not used by any of the cells shown here —
# confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# NERModel(model type, pre-trained model name or path to a saved model).
# The model is loaded from a saved checkpoint directory; `train_args` is NOT
# passed to the constructor here.
# NOTE(review): presumably the checkpoint carries its own saved arguments and
# label list — confirm.
model = NERModel('bert', '../model_output/checkpoint-35835-epoch-15')

# Training is disabled; uncomment the next line to train on train_data.
# model.train_model(train_data, output_dir='../model_output')

# Evaluate the loaded model on the test split.
# NOTE(review): presumably `result` holds the precision, recall and F1
# scores — confirm against the simpletransformers documentation.
result, model_outputs, preds_list = model.eval_model(test_data)

In [None]:
# Predict labels for a raw text: split it into sentences, run the model on
# them and print the prediction of every sentence, one sentence per block.
strs = """騎車在路上如果十多隻狗擋在路中央，相信沒有人敢騎過去。台南四鯤鯓上的一條道路出現十多隻狗群聚遊蕩，讓附近的居民沒人敢經過，狗群還會跑進周邊的社區裏，而養狗的婦人不讓動保處的人處理犬隻，也不關籠繫繩，真的讓居民很頭痛。民眾開車來到台南四鯤鯓，一間國小後方的這條道路，被路中央的景象嚇了一跳，擋了好幾十隻的狗，有的走來走去有的則躺在路中，車輛開過狗群的旁邊，躺在地上的狗完全不理會。不過就有民眾投訴，騎車經過狗群就會追上來，往往把騎士嚇得半死，有人還會摔車。周邊居民：「有很多人來向她抗議，因為要從這裡經過都被擋住，會怕那麼多狗，動保的來有跟他們反映，他們過來後說，你騎機車過去錄影一下，牠如果衝來你有錄影，他們就可以處理，結果沒有人敢這樣做。」而這群狗平常是由這名撿拾回收物的婦人飼養，也有愛狗人士會來餵食，因為有人會把狗丟過來，變成數量越來越多，曾有鄰居想架圍籬，把狗關在一起但是遭到拒絕。台南市鯤鯓里長陳金江：「動保處要處理她就是不讓人家處理，都和人家吵架，所以附近的居民很困擾，流浪狗都跑來裡面。」里長也很頭痛不知道怎麼處理，而成群的狗到處流竄，也真的是很危險的一件事。（封面圖／東森新聞）"""
# Split on the Chinese full stop. Empty or whitespace-only fragments (e.g. the
# one produced when a text ends with '。') are dropped so that no empty
# sentence is sent to the model.
samples = [fragment for fragment in strs.split('。') if fragment.strip()]
# The second return value is bound to a real name instead of `_`, which would
# shadow IPython's "last output" variable in the notebook.
# NOTE(review): with split_on_space=False each sample is presumably consumed
# character by character — confirm in the simpletransformers documentation.
predictions, raw_outputs = model.predict(samples, split_on_space=False)
for idx, sentence_prediction in enumerate(predictions):
    print('{}: '.format(idx))
    for word in sentence_prediction:
        print('{}'.format(word), end=' ')
    print()