In [13]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

In [14]:
print(nltk.corpus.conll2002.fileids())

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']


In [15]:
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

print(1)
print(train_sents[0])

1
[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [16]:
'''
Features
'''


# define some features. In this example we use word identity(BOS), word suffix(单词后缀), word shape and word POS tag(NP);
# also, some information from nearby words is used.
# This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results

def word2features(sent, i):  # for i in range(len(sent))
    word = sent[i][0]
    postag = sent[i][1]
    features = [  # 每个当前的单词的features
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.extend([  # 前一个单词的features
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.extend([  # 后一个单词的features
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


print(sent2features(train_sents[0])[0])  # 第一个list的第一个位置上的word,按照自己定义的特征提取出一些信息
print(sent2labels(train_sents[0])[0])  # 第一个list的第一个位置上的label

['bias', 'word.lower=melbourne', 'word[-3:]=rne', 'word[-2:]=ne', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NP', 'postag[:2]=NP', 'BOS', '+1:word.lower=(', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=Fpa', '+1:postag[:2]=Fp']
B-LOC


In [17]:
X_train = [sent2features(s) for s in train_sents]  # 不加[]是generator类型，不会输出。加了[]，可直接输出加了[]之后的结果。
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]  # 格式 [[[每个word的特征],[每个word的特征]],（到此处是一个s）,[(每个word的特征)],...]，
y_test = [sent2labels(s) for s in test_sents]  # 最外面的一层[]是加上去的。

In [18]:
'''
Train the model
'''
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):  # zip,打包为元组的列表
    trainer.append(xseq, yseq)  # 要train的data

trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_transitions': True
})

print('trainer.params():', trainer.params())

trainer.params(): ['feature.minfreq', 'feature.possible_states', 'feature.possible_transitions', 'c1', 'c2', 'max_iterations', 'num_memories', 'epsilon', 'period', 'delta', 'linesearch', 'max_linesearch']


In [19]:
trainer.train('conll2002_NER-esp.crfsuite')  # #含义是训练出的模型名为：conll2002_NER-esp.crfsuite。

print('trainer.logparser.last_iteration:', trainer.logparser.last_iteration)
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

trainer.logparser.last_iteration: {'num': 50, 'scores': {}, 'loss': 14807.577946, 'feature_norm': 79.110017, 'error_norm': 1262.912078, 'active_features': 11346, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.25}
50 {'num': 50, 'scores': {}, 'loss': 14807.577946, 'feature_norm': 79.110017, 'error_norm': 1262.912078, 'active_features': 11346, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.25}


In [20]:
'''
Make predictions
'''
tagger = pycrfsuite.Tagger()
tagger.open('conll2002_NER-esp.crfsuite')  # 使用训练后的模型，创建用于测试的标注器。

example_sent = test_sents[0]
print('example_sent:', example_sent)
print("token of example_sent:", ' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

example_sent: [('La', 'DA', 'B-LOC'), ('Coruña', 'NC', 'I-LOC'), (',', 'Fc', 'O'), ('23', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFECOM', 'NP', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]
token of example_sent: La Coruña , 23 may ( EFECOM ) .

Predicted: B-LOC I-LOC O O O O B-ORG O O
Correct:   B-LOC I-LOC O O O O B-ORG O O


In [36]:
print(sent2features(example_sent)[1])

['bias', 'word.lower=coruña', 'word[-3:]=uña', 'word[-2:]=ña', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NC', 'postag[:2]=NC', '-1:word.lower=la', '-1:word.istitle=True', '-1:word.isupper=False', '-1:postag=DA', '-1:postag[:2]=DA', '+1:word.lower=,', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=Fc', '+1:postag[:2]=Fc']


In [21]:
'''
Evalute the model
'''


def bio_classification_report(y_true, y_pred):
    '''
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    '''
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))


    tagset = set(lb.classes_) - {'O'}  # 从9个labels中去除大写字母O
    print(tagset)  # {'I-LOC', 'B-MISC', 'B-ORG', 'I-ORG', 'B-LOC', 'I-MISC', 'I-PER', 'B-PER'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    print(tagset)  # ['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}  # idx:0,1,2,3,4,5,6,7; cls:8个labels;

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )


y_pred = [tagger.tag(xseq) for xseq in X_test]


print(bio_classification_report(y_test, y_pred))

{'B-LOC', 'B-ORG', 'B-PER', 'I-ORG', 'I-LOC', 'I-PER', 'B-MISC', 'I-MISC'}
['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
             precision    recall  f1-score   support

      B-LOC       0.78      0.75      0.76      1084
      I-LOC       0.66      0.60      0.63       325
     B-MISC       0.69      0.47      0.56       339
     I-MISC       0.61      0.49      0.54       557
      B-ORG       0.79      0.81      0.80      1400
      I-ORG       0.80      0.79      0.80      1104
      B-PER       0.82      0.87      0.84       735
      I-PER       0.87      0.93      0.90       634

avg / total       0.77      0.76      0.76      6178



In [22]:
print('len of y_pred:', len(y_pred))

len of y_pred: 1517
['O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [26]:
print(len(X_test[0]))

9


In [27]:
print(y_test[0])

['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']


In [24]:
print(y_pred[0])

['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']


In [31]:
label_B_LOC = 0

for sent in y_pred:
    for label in sent:
        if label == 'B-LOC':
            label_B_LOC += 1
print('预测结果中label是B_LOC的单词共:', label_B_LOC)

预测结果中label是B_LOC的单词共: 1044


In [34]:
label_B_LOC1 = 0

for sent in test_sents:
    for pair in sent:
        if pair[2] == 'B-LOC':
            label_B_LOC1 += 1
print('测试集中label是B_LOC的单词共:', label_B_LOC1)

测试集中label是B_LOC的单词共: 1084


In [35]:
print("Predicted:", ' '.join(tagger.tag(example_sent)))

Predicted: B-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG
