In [12]:
import nltk
import sklearn
import sklearn_crfsuite
import numpy as np
import scipy.stats
import pandas
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

PreProcess

In [63]:
# Preprocess
def to_3arr(initial_conll_path, save=False, save_conll_path=None): 
    sentences = []
    sent_tmp = []   
    bio_tmp = []
    _3arr = []
    try:
        file = open(initial_conll_path, encoding = 'utf8')
        lines = file.readlines()
        for index, line in enumerate(lines):
            parse_word = line.strip().replace(u'\ufeff', '').split('\t')
            if len(parse_word) == 2:
                word = parse_word[0]
                word_bio = parse_word[1]
                sent_tmp.append(word)
                bio_tmp.append(word_bio)
                if index == len(lines) - 1:
                    sent_pos_tag = nltk.pos_tag(sent_tmp)
                    for index, item in enumerate(sent_pos_tag):
                        tmp_tup = item + (bio_tmp[index], )
                        _3arr.append(tmp_tup)
                    if save:
                        _3arr.append((np.nan,)) # a sentence end with NaN
                        sentences.extend(_3arr)
                    else:
                        sentences.append(_3arr)
            else:
                sent_pos_tag = nltk.pos_tag(sent_tmp)
                for index, item in enumerate(sent_pos_tag):
                    tmp_tup = item + (bio_tmp[index], )
                    _3arr.append(tmp_tup)
                if save:
                        _3arr.append((np.nan,)) # a sentence end with NaN
                        sentences.extend(_3arr)
                else:
                    sentences.append(_3arr)
                sent_tmp = []
                bio_tmp = []
                _3arr = []
    finally:
        if file:
            file.close()

    if save:
        df = pandas.DataFrame(data=sentences)
        df.to_csv(save_conll_path, index=None, header=None, sep='\t')
        return
    return sentences

Word2features

In [64]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def sent2pos(sent):
    return [postag for token, postag, label in sent]

<font size=5>Get statistics</font>

In [45]:
# obtain triple array
train_sents = to_3arr('wnut17train.conll')
test_sents = to_3arr('emerging.test.annotated')
dev_sents = to_3arr('emerging.dev.conll')

In [59]:
# sents
print('numer of sentences: ', len(train_sents))
print('numer of sentences: ', len(dev_sents))
print('numer of sentences: ', len(test_sents))


numer of sentences:  3394
numer of sentences:  1009
numer of sentences:  1287


In [58]:
# tokens/words
train_file = open('wnut17train.conll', encoding = 'utf8')
dev_file = open('emerging.dev.conll', encoding = 'utf8')
test_file = open('emerging.test.annotated', encoding = 'utf8')
train_lines = train_file.readlines()
dev_lines = dev_file.readlines()
test_lines = test_file.readlines()
print(len(train_lines))
print(len(dev_lines))
print(len(test_lines))

66123
16741
24680


Get types for POS

In [32]:
import string
y_train_pos = [sent2pos(s) for s in train_sents]
tmp_pos = []
for pos in y_train_pos:
    tmp_pos += list(set(pos))
type_pos = list(set(tmp_pos))
type_pos = [type for type in type_pos if type not in string.punctuation]
type_pos

['JJR',
 'NNPS',
 'VBG',
 'PRP',
 'NNS',
 'VBZ',
 'VBP',
 'RBS',
 'CD',
 'DT',
 'WRB',
 'POS',
 'WP',
 'MD',
 'IN',
 'RP',
 "''",
 'VB',
 'FW',
 'JJS',
 'PRP$',
 'RBR',
 'PDT',
 'NN',
 'VBN',
 'JJ',
 'EX',
 'RB',
 'NNP',
 'TO',
 'SYM',
 'UH',
 'CC',
 'WDT',
 'VBD']

In [65]:
# extract features and labels of sets
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

<font size=5>Run a baseline run (train -> test) with the features directly copied from the tutorial. </font>

In [66]:
# train and use L-BFGS
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')
print(labels) # get statistics

['B-location', 'I-location', 'B-group', 'B-corporation', 'B-person', 'B-creative-work', 'B-product', 'I-person', 'I-creative-work', 'I-corporation', 'I-group', 'I-product']


In [67]:
# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.13532451620051908
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.308     0.028     0.052       142
I-creative-work      0.333     0.032     0.059       218
        B-group      0.263     0.030     0.054       165
        I-group      0.200     0.029     0.050        70
     B-location      0.391     0.227     0.287       150
     I-location      0.250     0.064     0.102        94
       B-person      0.563     0.135     0.218       429
       I-person      0.560     0.214     0.309       131
      B-product      0.500     0.024     0.045       127
      I-product      0.333     0.048     0.083       126

      micro avg      0.432     0.088     0.146      1740
      macro avg      0.308     0.069     0.105      1740
   weighted avg      0.389     0.088     0.135      1740



<font size=5>Hyperparameters Optimization</font>

In [49]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002B5024CC7F0>},
                   pre_dispatch='2*n_jobs', random_state=123, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(flat_f1_score, labels=['B-location', 'I-location', 'B-group', 'B-c

In [51]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c2': 0.04512332140157257, 'c1': 0.008130349084185253}
best CV score: 0.3629082087042693
model size: 0.63M


In [52]:
crf = rs.best_estimator_
# crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.21047584355082335
                 precision    recall  f1-score   support

  B-corporation      0.500     0.015     0.029        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.181     0.092     0.121       142
I-creative-work      0.196     0.225     0.209       218
        B-group      0.333     0.006     0.012       165
        I-group      0.600     0.043     0.080        70
     B-location      0.400     0.187     0.255       150
     I-location      0.238     0.053     0.087        94
       B-person      0.461     0.375     0.414       429
       I-person      0.437     0.344     0.385       131
      B-product      0.143     0.079     0.102       127
      I-product      0.078     0.040     0.053       126

      micro avg      0.318     0.184     0.233      1740
      macro avg      0.297     0.121     0.146      1740
   weighted avg      0.324     0.184     0.210      1740



Custom Features

In [46]:
def word2features_custom(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i == 1:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    elif i == 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })
    elif i > 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        word3 = sent[i-3][0]
        postag3 = sent[i-3][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-3:word.lower()': word3.lower(),
            '-3:word.istitle()': word3.istitle(),
            '-3:word.isupper()': word3.isupper(),
            '-3:postag': postag3,
            '-3:postag[:2]': postag3[:2],
        })
    else:
        features['BOS'] = True

    if i == len(sent)-2:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    elif i == len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    elif i < len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        word3 = sent[i+3][0]
        postag3 = sent[i+3][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+3:word.lower()': word3.lower(),
            '+3:word.istitle()': word3.istitle(),
            '+3:word.isupper()': word3.isupper(),
            '+3:postag': postag3,
            '+3:postag[:2]': postag3[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features_custom(sent):
    return [word2features_custom(sent, i) for i in range(len(sent))]

In [5]:
# obtain triple array
train_sents = to_3arr('wnut17train.conll')
test_sents = to_3arr('emerging.test.annotated')
dev_sents = to_3arr('emerging.dev.conll')

In [47]:
# extract features and labels of sets
X_train = [sent2features_custom(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_custom(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_custom(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

In [55]:
# train and use L-BFGS
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.13791016470926343
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.200     0.021     0.038       142
I-creative-work      0.265     0.041     0.071       218
        B-group      0.353     0.036     0.066       165
        I-group      0.400     0.086     0.141        70
     B-location      0.405     0.200     0.268       150
     I-location      0.214     0.064     0.098        94
       B-person      0.512     0.147     0.228       429
       I-person      0.451     0.244     0.317       131
      B-product      0.250     0.008     0.015       127
      I-product      0.400     0.032     0.059       126

      micro avg      0.405     0.092     0.150      1740
      macro avg      0.288     0.073     0.109      1740
   weighted avg      0.353     0.092     0.138      1740



In [49]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.6min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000211F3F17400>},
                   pre_dispatch='2*n_jobs', random_state=123, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(flat_f1_score, labels=['B-location', 'I-location', 'B-group', 'B-c

In [50]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.18781321926080463
                 precision    recall  f1-score   support

  B-corporation      0.500     0.015     0.029        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.151     0.056     0.082       142
I-creative-work      0.183     0.138     0.157       218
        B-group      0.000     0.000     0.000       165
        I-group      0.000     0.000     0.000        70
     B-location      0.359     0.153     0.215       150
     I-location      0.280     0.074     0.118        94
       B-person      0.435     0.361     0.395       429
       I-person      0.390     0.298     0.338       131
      B-product      0.150     0.047     0.072       127
      I-product      0.180     0.071     0.102       126

      micro avg      0.324     0.160     0.214      1740
      macro avg      0.219     0.101     0.126      1740
   weighted avg      0.261     0.160     0.188      1740



In [8]:
# train and use L-BFGS
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.14508507753093186
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.267     0.028     0.051       142
I-creative-work      0.273     0.041     0.072       218
        B-group      0.357     0.030     0.056       165
        I-group      0.500     0.100     0.167        70
     B-location      0.431     0.207     0.279       150
     I-location      0.241     0.074     0.114        94
       B-person      0.511     0.156     0.239       429
       I-person      0.479     0.260     0.337       131
      B-product      0.250     0.008     0.015       127
      I-product      0.400     0.032     0.059       126

      micro avg      0.427     0.097     0.158      1740
      macro avg      0.309     0.078     0.116      1740
   weighted avg      0.369     0.097     0.145      1740



In [9]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.0min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F2EF5C5E48>},
                   pre_dispatch='2*n_jobs', random_state=123, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-location', 'I-location

In [23]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.18072571686206085
                 precision    recall  f1-score   support

  B-corporation      1.000     0.015     0.030        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.155     0.063     0.090       142
I-creative-work      0.182     0.133     0.154       218
        B-group      0.000     0.000     0.000       165
        I-group      0.000     0.000     0.000        70
     B-location      0.371     0.153     0.217       150
     I-location      0.292     0.074     0.119        94
       B-person      0.445     0.338     0.384       429
       I-person      0.443     0.267     0.333       131
      B-product      0.125     0.031     0.050       127
      I-product      0.128     0.040     0.061       126

      micro avg      0.329     0.148     0.204      1740
      macro avg      0.262     0.093     0.120      1740
   weighted avg      0.283     0.148     0.181      1740



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


<font size=5>Extend features-->train->test, and dev->test using RS</font>

0. Custom features adding punctuation-train->test

In [27]:
import string
def word2features_custom_punctuation(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.ispunctuation()': 1 if word.lower() in string.punctuation else 0,
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features_punctuation(sent):
    return [word2features_custom_punctuation(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# extract features and labels of sets
X_train = [sent2features_punctuation(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_punctuation(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_punctuation(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


0.14399274320874095
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.250     0.028     0.051       142
I-creative-work      0.303     0.046     0.080       218
        B-group      0.286     0.036     0.065       165
        I-group      0.312     0.071     0.116        70
     B-location      0.387     0.240     0.296       150
     I-location      0.304     0.074     0.120        94
       B-person      0.569     0.135     0.218       429
       I-person      0.583     0.214     0.313       131
      B-product      0.600     0.024     0.045       127
      I-product      0.545     0.048     0.088       126

      micro avg      0.439     0.094     0.154      1740
      macro avg      0.345     0.076     0.116      1740
   weighted avg      0.415     0.094     0.144      1740



0. Custom features adding punctuation-dev->test

In [28]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.3min finished


0.19827866061584476
                 precision    recall  f1-score   support

  B-corporation      1.000     0.015     0.030        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.143     0.077     0.100       142
I-creative-work      0.151     0.202     0.173       218
        B-group      0.000     0.000     0.000       165
        I-group      0.000     0.000     0.000        70
     B-location      0.425     0.207     0.278       150
     I-location      0.208     0.053     0.085        94
       B-person      0.474     0.357     0.407       429
       I-person      0.434     0.328     0.374       131
      B-product      0.138     0.071     0.094       127
      I-product      0.060     0.040     0.048       126

      micro avg      0.289     0.174     0.217      1740
      macro avg      0.253     0.112     0.132      1740
   weighted avg      0.280     0.174     0.198      1740



1) Custom features adding stem

In [31]:
from nltk.stem import SnowballStemmer
stemmer=SnowballStemmer("english")

def word2features_custom_stem(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.stem()': stemmer.stem(word.lower()),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.stem()': stemmer.stem(word1.lower()),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.stem()': stemmer.stem(word1.lower()),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features_stem(sent):
    return [word2features_custom_stem(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# extract features and labels of sets
X_train = [sent2features_stem(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_stem(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_stem(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.13846223169257973
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.357     0.035     0.064       142
I-creative-work      0.393     0.050     0.089       218
        B-group      0.250     0.024     0.044       165
        I-group      0.286     0.029     0.052        70
     B-location      0.396     0.240     0.299       150
     I-location      0.407     0.117     0.182        94
       B-person      0.556     0.128     0.208       429
       I-person      0.562     0.206     0.302       131
      B-product      0.500     0.016     0.031       127
      I-product      0.167     0.032     0.053       126

      micro avg      0.435     0.090     0.149      1740
      macro avg      0.323     0.073     0.110      1740
   weighted avg      0.398     0.090     0.138      1740



In [32]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.5min finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.21340399245544067
                 precision    recall  f1-score   support

  B-corporation      0.500     0.015     0.029        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.176     0.092     0.120       142
I-creative-work      0.189     0.225     0.205       218
        B-group      0.333     0.006     0.012       165
        I-group      0.600     0.043     0.080        70
     B-location      0.431     0.187     0.260       150
     I-location      0.300     0.064     0.105        94
       B-person      0.477     0.364     0.413       429
       I-person      0.462     0.366     0.409       131
      B-product      0.120     0.071     0.089       127
      I-product      0.103     0.056     0.072       126

      micro avg      0.320     0.184     0.234      1740
      macro avg      0.308     0.124     0.150      1740
   weighted avg      0.335     0.184     0.213      1740



2) Custom features adding stopwords

In [33]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def word2features_custom_stopwords(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isstopword()': 1 if word.lower() in stop_words else 0,
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features_stopwords(sent):
    return [word2features_custom_stopwords(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# extract features and labels of sets
X_train = [sent2features_stopwords(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_stopwords(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_stopwords(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.14096956171765285
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.286     0.028     0.051       142
I-creative-work      0.350     0.032     0.059       218
        B-group      0.294     0.030     0.055       165
        I-group      0.222     0.029     0.051        70
     B-location      0.358     0.227     0.278       150
     I-location      0.269     0.074     0.117        94
       B-person      0.606     0.147     0.236       429
       I-person      0.592     0.221     0.322       131
      B-product      0.600     0.024     0.045       127
      I-product      0.353     0.048     0.084       126

      micro avg      0.446     0.092     0.152      1740
      macro avg      0.327     0.072     0.108      1740
   weighted avg      0.413     0.092     0.141      1740



In [34]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.4min finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.2000262215769624
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.171     0.085     0.113       142
I-creative-work      0.157     0.170     0.163       218
        B-group      0.200     0.006     0.012       165
        I-group      0.200     0.043     0.071        70
     B-location      0.342     0.167     0.224       150
     I-location      0.227     0.053     0.086        94
       B-person      0.451     0.378     0.411       429
       I-person      0.433     0.397     0.414       131
      B-product      0.117     0.071     0.088       127
      I-product      0.058     0.040     0.047       126

      micro avg      0.293     0.179     0.222      1740
      macro avg      0.196     0.117     0.136      1740
   weighted avg      0.259     0.179     0.200      1740



3) Custom features adding -+3 words

In [35]:
def word2features_custom_3words(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i == 1:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    elif i == 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })
    elif i > 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        word3 = sent[i-3][0]
        postag3 = sent[i-3][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-3:word.lower()': word3.lower(),
            '-3:word.istitle()': word3.istitle(),
            '-3:word.isupper()': word3.isupper(),
            '-3:postag': postag3,
            '-3:postag[:2]': postag3[:2],
        })
    else:
        features['BOS'] = True

    if i == len(sent)-2:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    elif i == len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    elif i < len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        word3 = sent[i+3][0]
        postag3 = sent[i+3][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+3:word.lower()': word3.lower(),
            '+3:word.istitle()': word3.istitle(),
            '+3:word.isupper()': word3.isupper(),
            '+3:postag': postag3,
            '+3:postag[:2]': postag3[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features_3words(sent):
    return [word2features_custom_3words(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# extract features and labels of sets
X_train = [sent2features_3words(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_3words(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_3words(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.13811565438887777
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.200     0.021     0.038       142
I-creative-work      0.281     0.041     0.072       218
        B-group      0.333     0.030     0.056       165
        I-group      0.429     0.086     0.143        70
     B-location      0.413     0.207     0.276       150
     I-location      0.250     0.074     0.115        94
       B-person      0.508     0.147     0.228       429
       I-person      0.456     0.237     0.312       131
      B-product      0.250     0.008     0.015       127
      I-product      0.400     0.032     0.059       126

      micro avg      0.412     0.092     0.150      1740
      macro avg      0.293     0.074     0.109      1740
   weighted avg      0.356     0.092     0.138      1740



In [36]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.8min finished


0.18781321926080463
                 precision    recall  f1-score   support

  B-corporation      0.500     0.015     0.029        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.151     0.056     0.082       142
I-creative-work      0.183     0.138     0.157       218
        B-group      0.000     0.000     0.000       165
        I-group      0.000     0.000     0.000        70
     B-location      0.359     0.153     0.215       150
     I-location      0.280     0.074     0.118        94
       B-person      0.435     0.361     0.395       429
       I-person      0.390     0.298     0.338       131
      B-product      0.150     0.047     0.072       127
      I-product      0.180     0.071     0.102       126

      micro avg      0.324     0.160     0.214      1740
      macro avg      0.219     0.101     0.126      1740
   weighted avg      0.261     0.160     0.188      1740



4. Combination of additions of feature

In [37]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
stop_words = set(stopwords.words('english'))
stemmer=SnowballStemmer("english")

def word2features_custom_comb(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.ispunctuation()': 1 if word.lower() in string.punctuation else 0,
        'word.stem()': stemmer.stem(word.lower()),
        'word.isstopword()': 1 if word.lower() in stop_words else 0,
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i == 1:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '-1:word.stem()': stemmer.stem(word1.lower()),
            '-1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    elif i == 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '-1:word.stem()': stemmer.stem(word1.lower()),
            '-1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.ispunctuation()': 1 if word2.lower() in string.punctuation else 0,
            '-2:word.stem()': stemmer.stem(word2.lower()),
            '-2:word.isstopword()': 1 if word2.lower() in stop_words else 0,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
        })
    elif i > 2:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        word3 = sent[i-3][0]
        postag3 = sent[i-3][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '-1:word.stem()': stemmer.stem(word1.lower()),
            '-1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.ispunctuation()': 1 if word2.lower() in string.punctuation else 0,
            '-2:word.stem()': stemmer.stem(word2.lower()),
            '-2:word.isstopword()': 1 if word2.lower() in stop_words else 0,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-3:word.lower()': word3.lower(),
            '-3:word.istitle()': word3.istitle(),
            '-3:word.isupper()': word3.isupper(),
            '-3:word.ispunctuation()': 1 if word3.lower() in string.punctuation else 0,
            '-3:word.stem()': stemmer.stem(word3.lower()),
            '-3:word.isstopword()': 1 if word3.lower() in stop_words else 0,
            '-3:postag': postag3,
            '-3:postag[:2]': postag3[:2],
        })
    else:
        features['BOS'] = True

    if i == len(sent)-2:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '+1:word.stem()': stemmer.stem(word1.lower()),
            '+1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    elif i == len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '+1:word.stem()': stemmer.stem(word1.lower()),
            '+1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.ispunctuation()': 1 if word2.lower() in string.punctuation else 0,
            '+2:word.stem()': stemmer.stem(word2.lower()),
            '+2:word.isstopword()': 1 if word2.lower() in stop_words else 0,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
    elif i < len(sent)-3:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        word2 = sent[i+2][0]
        postag2 = sent[i+2][1]
        word3 = sent[i+3][0]
        postag3 = sent[i+3][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.ispunctuation()': 1 if word1.lower() in string.punctuation else 0,
            '+1:word.stem()': stemmer.stem(word1.lower()),
            '+1:word.isstopword()': 1 if word1.lower() in stop_words else 0,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.ispunctuation()': 1 if word2.lower() in string.punctuation else 0,
            '+2:word.stem()': stemmer.stem(word2.lower()),
            '+2:word.isstopword()': 1 if word2.lower() in stop_words else 0,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+3:word.lower()': word3.lower(),
            '+3:word.istitle()': word3.istitle(),
            '+3:word.isupper()': word3.isupper(),
            '+3:word.ispunctuation()': 1 if word3.lower() in string.punctuation else 0,
            '+3:word.stem()': stemmer.stem(word3.lower()),
            '+3:word.isstopword()': 1 if word3.lower() in stop_words else 0,
            '+3:postag': postag3,
            '+3:postag[:2]': postag3[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features_comb(sent):
    return [word2features_custom_comb(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# extract features and labels of sets
X_train = [sent2features_comb(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features_comb(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features_comb(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# ignore entities labelled with 'Other'
labels = list(crf.classes_)
labels.remove('O')

# prediction
y_pred = crf.predict(X_test)

# F1-score
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

0.14242344432653975
                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.278     0.035     0.062       142
I-creative-work      0.270     0.046     0.078       218
        B-group      0.500     0.036     0.068       165
        I-group      0.400     0.086     0.141        70
     B-location      0.390     0.213     0.276       150
     I-location      0.250     0.074     0.115        94
       B-person      0.535     0.142     0.225       429
       I-person      0.485     0.244     0.325       131
      B-product      0.500     0.008     0.016       127
      I-product      0.800     0.032     0.061       126

      micro avg      0.430     0.094     0.155      1740
      macro avg      0.367     0.076     0.114      1740
   weighted avg      0.430     0.094     0.142      1740



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [38]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=123)
rs.fit(X_dev, y_dev)
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

# cls-report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.7min finished


0.19723798395691833
                 precision    recall  f1-score   support

  B-corporation      1.000     0.015     0.030        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.138     0.056     0.080       142
I-creative-work      0.177     0.147     0.160       218
        B-group      0.250     0.006     0.012       165
        I-group      0.500     0.043     0.079        70
     B-location      0.438     0.187     0.262       150
     I-location      0.280     0.074     0.118        94
       B-person      0.464     0.357     0.403       429
       I-person      0.398     0.282     0.330       131
      B-product      0.130     0.047     0.069       127
      I-product      0.145     0.071     0.096       126

      micro avg      0.328     0.164     0.218      1740
      macro avg      0.327     0.107     0.137      1740
   weighted avg      0.332     0.164     0.197      1740



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
