In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [5]:
from itertools import chain
import csv

import pandas as pd
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## Load training and test data

In [33]:
# Read the pre-computed CRF features/labels for the training and test splits.
# These are pickles, so only load files produced by this project's own pipeline.
X_train, y_train, X_test, y_test = map(
    pd.read_pickle,
    [
        "../data/interim/crf_training_features.pickle",
        "../data/interim/crf_training_labels.pickle",
        "../data/interim/crf_test_features.pickle",
        "../data/interim/crf_test_labels.pickle",
    ],
)

In [34]:
# Peek at the first training sequence: a list of feature dicts (see the output below).
X_train[0]

[{'bias': 1.0,
  'lemma': '1.25',
  'pos': 'NUM',
  'tag': 'CD',
  'dep': 'nummod',
  'shape': 'd.dd',
  'is_alpha': False,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  'BOS': True,
  '+1:lemma': 'cup',
  '+1:pos': 'NOUN',
  '+1:tag': 'NNS',
  '+1:dep': 'ROOT',
  '+1:shape': 'xxxx',
  '+1:is_alpha': True,
  '+1:is_stop': False,
  '+1:is_title': False,
  '+1:is_right_punct': False,
  '+2:lemma': 'cook',
  '+2:pos': 'VERB',
  '+2:tag': 'VBN',
  '+2:dep': 'acl',
  '+2:shape': 'xxxx',
  '+2:is_alpha': True,
  '+2:is_stop': False,
  '+2:is_title': False,
  '+2:is_right_punct': False},
 {'bias': 1.0,
  'lemma': 'cup',
  'pos': 'NOUN',
  'tag': 'NNS',
  'dep': 'ROOT',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  '-1:lemma': '1.25',
  '-1:pos': 'NUM',
  '-1:tag': 'CD',
  '-1:dep': 'nummod',
  '-1:shape': 'd.dd',
  '-1:is_alpha': False,
  '-1:is_stop': False,
  '-1:is_title': False,
  '-1:is_left_punct': False,
  '+1:l

In [35]:
# Labels of the first training sequence (B-/I- prefixed tags);
# presumably aligned one-to-one with the feature dicts of X_train[0] — TODO confirm.
y_train[0]

['B-QTY',
 'B-UNIT',
 'B-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'B-NAME',
 'I-NAME',
 'B-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'I-COMMENT',
 'B-NAME',
 'B-COMMENT',
 'I-COMMENT']

## Training

In [36]:
%%time
# Baseline model: L-BFGS training with fixed c1/c2 regularization coefficients,
# capped at 100 iterations, with all label-to-label transitions enabled.
baseline_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**baseline_params)
crf.fit(X_train, y_train)

CPU times: user 5min 1s, sys: 8.51 s, total: 5min 9s
Wall time: 5min 53s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Evaluation 

In [37]:
# Snapshot of every label the fitted model knows, reused by the metrics below.
labels = [*crf.classes_]

In [38]:
# Predict the held-out sequences, then score them with a support-weighted F1
# computed over the flattened (token-level) labels.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.7904400486865897

Inspect per-class results in more detail:

In [29]:
# Order labels by entity name first and B/I prefix second, so the B-X and I-X
# rows of each entity sit next to each other in the report.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))

print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        digits=3,
        labels=sorted_labels,
    )
)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

   B-COMMENT      0.700     0.750     0.724     26640
   I-COMMENT      0.758     0.832     0.793     50271
      B-NAME      0.837     0.838     0.837     37386
      I-NAME      0.740     0.672     0.704     30915
     B-OTHER      0.819     0.684     0.746     20623
     I-OTHER      0.432     0.374     0.401     12379
       B-QTY      0.980     0.978     0.979     23724
       I-QTY      0.000     0.000     0.000         0
 B-RANGE_END      0.549     0.797     0.650       311
      B-UNIT      0.929     0.964     0.946     23471
      I-UNIT      0.000     0.000     0.000        38

   micro avg      0.792     0.792     0.792    225758
   macro avg      0.613     0.626     0.616    225758
weighted avg      0.790     0.792     0.789    225758



## Hyperparameter Optimization

To improve quality, try selecting the regularization parameters with a randomized search and 3-fold cross-validation.

It takes quite a lot of CPU time and RAM, so grab a tea and be patient, reduce n_iter in RandomizedSearchCV, or fit the model on only a subset of the training data.

In [24]:
%%time
# define fixed parameters and parameters to search
# The search template gets its own name: RandomizedSearchCV fits clones of the
# estimator it is given, so rebinding `crf` here would replace the fitted
# baseline model with an unfitted one for every later cell that uses `crf`.
crf_search = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
# c1 and c2 are sampled from exponential distributions with these scales
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search: 50 sampled candidates x 3 folds; random_state pins the sampled
# c1/c2 candidates so the search is reproducible across runs
rs = RandomizedSearchCV(crf_search, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=42)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

Best result:

In [25]:
# Report the winning hyperparameters, their cross-validated score and the size
# of the refitted best model (in millions).
for caption, value in (
    ('best params:', rs.best_params_),
    ('best CV score:', rs.best_score_),
):
    print(caption, value)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

## Check parameter space 

A chart showing which c1 and c2 values RandomizedSearchCV checked. Red means better results, blue means worse.

In [None]:
# Sampled c1/c2 values and the mean cross-validation score each one achieved
sampled_c1 = rs.cv_results_["param_c1"]
sampled_c2 = rs.cv_results_["param_c2"]
mean_scores = rs.cv_results_["mean_test_score"]
lowest, highest = min(mean_scores), max(mean_scores)

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    lowest, highest
))

# Pin a blue-to-red colormap: the description and the message printed below
# promise "blue = worse, red = better", which the default colormap does not give.
points = ax.scatter(sampled_c1, sampled_c2, c=mean_scores, cmap='coolwarm',
                    s=60, alpha=0.9, edgecolors=[0, 0, 0])
# The colorbar lets the figure stand on its own without the printed legend
fig.colorbar(points, ax=ax, label='mean CV score')

print("Dark blue => {:0.4}, dark red => {:0.4}".format(lowest, highest))


## Check best estimator on our test data 

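If the randomized search completes, the tuned model (`rs.best_estimator_`) can be evaluated here to check whether quality improved. In the run recorded below the search was interrupted, so the report is for the baseline model and matches the earlier evaluation.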

In [30]:
# Evaluate the tuned model when the randomized search ran to completion.
# If the search was interrupted (`rs` has no `best_estimator_`), keep the
# current `crf` so the cell still runs instead of raising AttributeError.
crf = getattr(rs, 'best_estimator_', crf)
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

   B-COMMENT      0.700     0.750     0.724     26640
   I-COMMENT      0.758     0.832     0.793     50271
      B-NAME      0.837     0.838     0.837     37386
      I-NAME      0.740     0.672     0.704     30915
     B-OTHER      0.819     0.684     0.746     20623
     I-OTHER      0.432     0.374     0.401     12379
       B-QTY      0.980     0.978     0.979     23724
       I-QTY      0.000     0.000     0.000         0
 B-RANGE_END      0.549     0.797     0.650       311
      B-UNIT      0.929     0.964     0.946     23471
      I-UNIT      0.000     0.000     0.000        38

   micro avg      0.792     0.792     0.792    225758
   macro avg      0.613     0.626     0.616    225758
weighted avg      0.790     0.792     0.789    225758



## Let's check what the classifier learned

In [31]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` row per label transition.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)``
            pairs, e.g. the result of ``Counter(...).most_common()``.

    The two label columns are sized to the longest label present, and are
    never narrower than the previous fixed widths of 6 and 7 characters.
    Long tags such as ``B-RANGE_END`` therefore no longer push the weight
    column out of alignment, while output for short labels is unchanged.
    """
    rows = list(trans_features)  # materialize: widths need a first pass
    from_width = max([6] + [len(str(label_from)) for (label_from, _), _ in rows])
    to_width = max([7] + [len(str(label_to)) for (_, label_to), _ in rows])
    for (label_from, label_to), weight in rows:
        print("%-*s -> %-*s %0.6f" % (from_width, label_from, to_width, label_to, weight))

# Rank every learned transition by weight once, then show both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
B-COMMENT -> I-COMMENT 5.187159
I-COMMENT -> I-COMMENT 4.588627
I-UNIT -> I-UNIT  3.368051
B-NAME -> I-NAME  3.170405
I-NAME -> I-NAME  3.130641
I-OTHER -> I-OTHER 2.649233
B-UNIT -> I-UNIT  2.599272
B-OTHER -> I-OTHER 2.169976
I-QTY  -> B-OTHER 1.632003
B-COMMENT -> B-NAME  1.423569
B-NAME -> B-COMMENT 1.180673
B-COMMENT -> B-OTHER 1.035739
B-OTHER -> B-NAME  0.965027
B-NAME -> B-OTHER 0.962771
B-QTY  -> I-QTY   0.958218
B-OTHER -> B-RANGE_END 0.913052
B-UNIT -> B-OTHER 0.889211
B-UNIT -> B-COMMENT 0.881716
B-RANGE_END -> B-UNIT  0.864725
B-QTY  -> B-OTHER 0.700816

Top unlikely transitions:
B-COMMENT -> I-NAME  -6.972959
B-OTHER -> I-COMMENT -7.028093
B-UNIT -> I-COMMENT -7.185151
B-COMMENT -> B-COMMENT -7.198270
B-UNIT -> I-OTHER -7.402736
B-QTY  -> I-OTHER -7.697871
I-NAME -> I-COMMENT -7.721009
B-OTHER -> B-OTHER -7.798965
B-NAME -> I-OTHER -7.816984
B-QTY  -> I-NAME  -7.946545
I-COMMENT -> I-OTHER -8.134926
B-OTHER -> I-NAME  -8.175268
I-OTHER -> I-NAME  -

Check the state features:

In [32]:
def print_state_features(state_features):
    """Print one aligned ``weight label attribute`` row per state feature.

    Args:
        state_features: iterable of ``((attr, label), weight)`` pairs, e.g.
            the result of ``Counter(...).most_common()``.

    The label column is sized to the longest label present and is never
    narrower than the previous fixed width of 8 characters. Labels such as
    ``B-COMMENT`` therefore no longer shift the attribute column, while
    output for short labels is unchanged.
    """
    rows = list(state_features)  # materialize: the width needs a first pass
    label_width = max([8] + [len(str(label)) for (_, label), _ in rows])
    for (attr, label), weight in rows:
        print("%0.6f %-*s %s" % (weight, label_width, label, attr))

# Rank every learned state feature by weight once, then show both ends of the ranking.
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:30])

print("\nTop negative:")
print_state_features(ranked_state_features[-30:])

Top positive:
7.056011 B-QTY    shape:d.dd
6.122754 B-COMMENT -1:shape:xxxx/d
6.118293 B-OTHER  shape:d.ddd
5.923576 B-NAME   +1:lemma:consistency
5.784169 B-COMMENT -1:shape:xxx/ddd
5.705577 B-COMMENT -1:shape:xxx/dd
5.444852 B-QTY    shape:d
5.131788 B-OTHER  lemma:consistency
4.815221 B-UNIT   lemma:bunch
4.693732 I-OTHER  shape:xxx/ddd
4.615509 B-UNIT   lemma:pint
4.595689 B-UNIT   lemma:cups
4.537405 B-UNIT   lemma:bunche
4.522022 B-OTHER  lemma:slicer
4.517121 B-COMMENT -1:shape:xxxx/dd
4.485842 B-COMMENT -2:lemma:addition
4.485539 B-UNIT   -2:lemma:kaffir
4.454177 B-QTY    lemma:second
4.425728 B-UNIT   +2:lemma:25
4.422309 B-OTHER  shape:d.d-
4.417530 B-COMMENT lemma:plastic
4.414790 I-OTHER  +1:lemma:muesli
4.390513 B-NAME   -1:lemma:Note
4.372077 B-UNIT   -1:lemma:per
4.369732 B-QTY    shape:dd
4.359035 B-OTHER  lemma:kingarthurflourcom
4.336040 B-OTHER  lemma:ciltantro
4.313580 B-QTY    shape:ddd
4.301885 B-OTHER  lemma:washe
4.287260 B-QTY    -2:lemma:reduce

Top negative:


In [None]:
from joblib import dump

# Persist the model currently bound to `crf` so it can be reloaded later
# without retraining.
dump(value=crf, filename='../models/crf_model.joblib')