In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
from itertools import chain
import csv

import pandas as pd
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.model_selection import train_test_split

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## Load training data

In [3]:
features = pd.read_pickle("../data/interim/crf_training_features.pickle")
lables = pd.read_pickle("../data/interim/crf_training_labels.pickle")


#X_train, X_test, y_train, y_test = train_test_split(features, lables, test_size=0.2)
#X_test = pd.read_pickle("../data/interim/crf_test_features.pickle")
#y_test = pd.read_pickle("../data/interim/crf_test_labels.pickle")

In [4]:
X_train[0]

NameError: name 'X_train' is not defined

In [None]:
y_train[0]

## Training

In [5]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.43,
    c2=0.012,
    max_iterations=100,
    all_possible_states= True,
    all_possible_transitions = True,
    linesearch =  'StrongBacktracking'
)
crf.fit(features, lables)

CPU times: user 4min 20s, sys: 5.15 s, total: 4min 25s
Wall time: 4min 47s


CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=True,
    averaging=None, c=None, c1=0.43, c2=0.012, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch='StrongBacktracking',
    max_iterations=100, max_linesearch=None, min_freq=None, model_filename=None,
    num_memories=None, pa_type=None, period=None, trainer_cls=None,
    variance=None, verbose=False)

## Evaluation 

In [None]:
labels = list(crf.classes_)

In [None]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

Inspect per-class results in more detail:

In [None]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

## Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

It takes quite a lot of CPU time and RAM, so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.

In [None]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    all_possible_states = True,
    linesearch =  'StrongBacktracking'
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Best result:

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

## Check parameter space 

A chart which shows which c1 and c2 values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [None]:
_x = rs.cv_results_["param_c1"]
_y = rs.cv_results_["param_c2"]
_c = rs.cv_results_["mean_test_score"]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))


## Check best estimator on our test data 

As you can see, quality is improved.

In [None]:
#crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

## Let's check what classifier learned

In [6]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
I-NAME -> I-NAME  2.571770
I-COMMENT -> I-COMMENT 2.508770
B-NAME -> I-NAME  2.489292
B-COMMENT -> I-COMMENT 2.381683
B-NAME -> B-COMMENT 1.242102
B-UNIT -> B-COMMENT 1.171713
B-COMMENT -> B-NAME  1.041518
B-QTY  -> B-UNIT  0.977503
I-NAME -> B-COMMENT 0.890107
B-UNIT -> B-NAME  0.887822
B-QTY  -> B-NAME  0.818354
I-COMMENT -> B-NAME  0.322757
B-QTY  -> B-COMMENT 0.205802
I-COMMENT -> B-QTY   -0.000600
B-NAME -> B-UNIT  -0.042445
B-COMMENT -> B-UNIT  -0.549676
B-COMMENT -> B-QTY   -0.588682
I-COMMENT -> B-UNIT  -0.633832
I-NAME -> B-UNIT  -0.886721
B-QTY  -> B-QTY   -2.596278

Top unlikely transitions:
B-COMMENT -> B-QTY   -0.588682
I-COMMENT -> B-UNIT  -0.633832
I-NAME -> B-UNIT  -0.886721
B-QTY  -> B-QTY   -2.596278
B-NAME -> B-QTY   -4.537193
I-NAME -> B-QTY   -4.753589
B-UNIT -> B-QTY   -6.100325
B-NAME -> I-COMMENT -9.074924
B-COMMENT -> B-COMMENT -9.166639
I-NAME -> B-NAME  -9.197420
I-NAME -> I-COMMENT -9.213901
B-COMMENT -> I-NAME  -9.807339
I-COMMENT ->

Check the state features:

In [7]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
7.183379 B-UNIT   lemma:quart
7.119389 B-UNIT   lemma:pint
6.847831 B-COMMENT -1:lemma:ranchero
6.652601 B-NAME   +2:lemma:Pimm
6.403273 B-COMMENT lemma:lot
6.367340 I-COMMENT -1:shape:xxx/ddd
6.239520 B-UNIT   lemma:ounce
6.199581 B-UNIT   lemma:sprig
6.088236 B-NAME   -1:lemma:tablespoons/21
6.052733 B-NAME   -2:lemma:ingredient
5.998698 B-UNIT   lemma:ounces
5.920678 B-UNIT   lemma:pinch
5.918084 B-UNIT   lemma:teaspoon
5.898419 B-UNIT   lemma:tablespoon
5.850259 B-UNIT   lemma:cups
5.777448 B-UNIT   lemma:bunche
5.728816 B-UNIT   lemma:quarts
5.700073 B-UNIT   lemma:cup
5.689977 I-COMMENT -1:shape:xxxx/d
5.592567 B-UNIT   lemma:stalk
5.505698 B-UNIT   lemma:gallon
5.465947 B-UNIT   -2:lemma:jarred
5.408447 I-COMMENT -1:shape:xxxx/ddd
5.408415 B-NAME   -1:lemma:flatbread
5.359547 B-UNIT   lemma:fillet
5.281059 B-UNIT   lemma:envelope
5.265977 I-COMMENT +2:lemma:bakery
5.242218 B-UNIT   lemma:loaf
5.208715 I-COMMENT lemma:lengthwise
5.123494 B-UNIT   lemma:bar

Top nega

In [8]:
from joblib import dump, load
dump(crf, '../models/crf_model.joblib') 

['../models/crf_model.joblib']