In [1]:
import pandas as pd


def read_conll(filename):
    """Load a two-column, tab-separated CoNLL-style file into a DataFrame.

    Each non-blank line is expected to hold ``token<TAB>label``; blank lines
    separate sentences.

    Parameters
    ----------
    filename : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``words``, ``labels`` and ``sentence_id``. Separator rows are
        dropped; the original row index is kept, so it has gaps where the
        blank lines were. ``sentence_id`` is the number of blank lines seen
        before the row.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    df = pd.read_csv(filename,
                     sep='\t', header=None, keep_default_na=False,
                     names=['words', 'labels'], skip_blank_lines=False,
                     # Read every field as text so the '' comparison below
                     # never sees an inferred numeric column.
                     dtype=str,
                     # Tokens may be literal quote characters; without this a
                     # '"' token opens a quoted field that swallows the
                     # following lines into a single cell.
                     quoting=csv.QUOTE_NONE)
    # Blank lines are kept as rows with an empty token, so a running count of
    # them numbers the sentences.
    df['sentence_id'] = (df.words == '').cumsum()
    return df[df.words != '']

In [2]:
# Parse the train and test splits with the same reader, in that order.
train_df, test_df = map(read_conll, (
    '../Datasets/sighan_2006_NER_dataset/sig_train.txt',
    '../Datasets/sighan_2006_NER_dataset/sig_test.txt',
))
print(test_df)

       words labels  sentence_id
0          中  B-ORG            0
1          共  I-ORG            0
2          中  I-ORG            0
3          央  I-ORG            0
4          致      O            0
...      ...    ...          ...
176960     充      O         4364
176961     满      O         4364
176962     赞      O         4364
176963     誉      O         4364
176964     。      O         4364

[172601 rows x 3 columns]


In [3]:
# One row holding the number of distinct sentence ids in each split.
data = [[frame['sentence_id'].nunique() for frame in (train_df, test_df)]]
pd.DataFrame(data, columns=["Train", "Test"])

Unnamed: 0,Train,Test
0,46364,4365


In [6]:
# Options handed to NERModel through its ``args`` parameter.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    sliding_window=True,
    max_seq_length=64,
    num_train_epochs=2,
    train_batch_size=32,
    fp16=True,
    output_dir='/outputs/',
)

In [7]:
from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging

# Root logger at DEBUG; the 'transformers' logger is raised to WARNING so it
# only reports warnings and errors.
logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

# Fetch the tokenizer published as 'bert-base-chinese', then build the NER
# model from a local checkpoint directory rather than a hub model name.
# NOTE(review): `tokenizer` is not passed to NERModel and is not used again in
# the visible cells; confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = NERModel('bert', '../model_output_sighan2006/checkpoint-6000', args=train_args)

# Train the model, there is no development or validation set for this dataset
# https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
# NOTE(review): the recorded log reports resuming from epoch 4 / global step
# 6000 while train_args asks for 2 epochs; confirm further training actually
# ran. train_args also sets 'output_dir' to '/outputs/', yet the recorded log
# reports saving to ../model_output_sighan2006; confirm which is intended.
model.train_model(train_df, output_dir='../model_output_sighan2006')

# Evaluate on the test split. The recorded run logs eval_loss, precision,
# recall and f1_score for this call (no accuracy figure).
result, model_outputs, preds_list = model.eval_model(test_df)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (2): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-chinese/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/37 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:   Continuing training from checkpoint, will skip to saved global_step
INFO:simpletransformers.ner.ner_model:   Continuing training from epoch 4
INFO:simpletransformers.ner.ner_model:   Continuing training from global step 6000
INFO:simpletransformers.ner.ner_model:   Will skip the first 204 steps in the current epoch
INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to ../model_output_sighan2006.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/9 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/546 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.03916444929153584, 'precision': 0.940974866717441, 'recall': 0.9511162432640493, 'f1_score': 0.9460183767228177}


In [8]:
# Raw article text used as a single prediction sample.
strs = """曾被譽為牛樟芝大王的劉威甫，2016年起用直銷手法，招攬會員投資培植牛樟芝，誆稱每單位保證金7萬5元，期滿還可全額領回保證金，涉嫌吸金30億，檢方訊問後，董事長劉威甫100萬交保，台灣分公司總經理莊立平則是諭令200萬交保，遭限制出境。平頭、白髮，遭檢調人員帶回北檢，他是被封為牛樟芝大王的台商劉威甫，2014年在中國風光成立中國珍菌堂集團成了董事長，如今卻是檢調人員頭號鎖定對象。集團總經理莊立平和公司掛名負責人張桂銘也都到北檢複訊，就是因為他們宣稱以「椴木培植法」培育牛樟芝，甚至還能直接拿商品買賣，利用直銷方式吸引會員。珍菌堂廣告：「建設有牛樟樹，種苗培育基地 。」集團經理莊立平在大陸認識劉威甫後，說服董事長在2016年回台開設分公司，還誆稱牛樟芝能治癌症，做噱頭，吸引3萬人成為會員，更誆稱期滿後本金可全數退回，短短兩年涉嫌吸金30億。記者吳欣倫：「珍菌堂他賣的不只是成品，還主打可以種植牛樟芝，讓大家成為小農，也就是成為養菌培植戶，只要每一平方公尺，就能來培植而且可以形成所謂的互利互助概念，更主打說只要投資花7萬5千元，甚至你只要每周付7百元，你就可以拿回所謂的保證金。」珍菌堂董事長劉威甫：「我們投入一些開發資金也好，都是公司這個行業前所未見。」但2018年開始，台灣投資人陸續收不到紅利，莊立平似乎怕東窗事發，把共享獎金設為浮動制 ，看牛樟液銷量多少才依比例發放，投資人氣得提告。大樓保全：「搬很久了，好久了 至少有5年了。」而總經理莊立平遭指控是全案主謀，他全盤否認 ，只說自己是珍菌堂海外經銷商，最後被檢方諭令200萬交保並遭限制出境，董事長劉威甫則供稱，所有買賣合約都是總經理負責，與他無關，最後被諭令百萬交保。●東森新聞關心您
不良行為，請勿模仿（封面圖／東森新聞）"""

# Put a space between every character of the text so each one is a separate
# whitespace-delimited item in the sample string.
# NOTE(review): train_args sets max_seq_length to 64 while this text is far
# longer; confirm the predictions cover the whole article and not only its
# beginning.
samples = [' '.join(char for char in strs)]
predictions, _ = model.predict(samples)

# Print the sample index, then one predicted entry per line.
for idx, tagged_entries in enumerate(predictions):
    print(f'{idx}: ')
    for word in tagged_entries:
        print(str(word))

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

0: 
{'曾': 'O'}
{'被': 'O'}
{'譽': 'O'}
{'為': 'O'}
{'牛': 'O'}
{'樟': 'O'}
{'芝': 'O'}
{'大': 'O'}
{'王': 'O'}
{'的': 'O'}
{'劉': 'B-PER'}
{'威': 'I-PER'}
{'甫': 'I-PER'}
{'，': 'O'}
{'2': 'O'}
{'0': 'O'}
{'1': 'O'}
{'6': 'O'}
{'年': 'O'}
{'起': 'O'}
{'用': 'O'}
{'直': 'O'}
{'銷': 'O'}
{'手': 'O'}
{'法': 'O'}
{'，': 'O'}
{'招': 'O'}
{'攬': 'O'}
{'會': 'O'}
{'員': 'O'}
{'投': 'O'}
{'資': 'O'}
{'培': 'O'}
{'植': 'O'}
{'牛': 'O'}
{'樟': 'O'}
{'芝': 'O'}
{'，': 'O'}
{'誆': 'O'}
{'稱': 'O'}
{'每': 'O'}
{'單': 'O'}
{'位': 'O'}
{'保': 'O'}
{'證': 'O'}
{'金': 'O'}
{'7': 'O'}
{'萬': 'O'}
{'5': 'O'}
{'元': 'O'}
{'，': 'O'}
{'期': 'O'}
{'滿': 'O'}
{'還': 'O'}
{'可': 'O'}
{'全': 'O'}
{'額': 'O'}
{'領': 'O'}
{'回': 'O'}
{'保': 'O'}
{'證': 'O'}
{'金': 'O'}
