In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
from itertools import chain
import csv

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## Load training data

In [3]:
with open('input.csv') as csvfile:
    lines = fname.readlines()
    items = [line.strip('\n').split('\t') for line in lines]
    items = [item for item in items if len(item)==6]

In [4]:
items[:10]

[]

As we can see, each line of the train_file follows the format:

- token
- position on the phrase. (I1 would be first word, I2 the second, and so on)
- LX , being the length group of the token (defined by [LengthGroup](https://github.com/NYTimes/ingredient-phrase-tagger/blob/master/lib/training/utils.py#L140))
- NoCAP or YesCAP, whether the token is capitalized or not
- YesParen or NoParen, whether the token is inside parenthesis or not

PyCRFSuite expects the input to be a list of the structured items and their respective tags. So we process the items from the train file and bucket them into sentences

In [5]:
sentences = []

sent = [items[0]]
for item in items[1:]:
    if 'I1' in item:
        sentences.append(sent)
        sent = [item]
    else:
        sent.append(item)
len(sentences)

179062

In [6]:
import random
random.shuffle(sentences)
test_size = 0.1
data_size = len(sentences)

test_data = sentences[:int(test_size*data_size)]
train_data = sentences[int(test_size*data_size):]

## Features

In [7]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'position': sent[i][1],
        'length_group': sent[i][2],
        'CAP': sent[i][3],
        'Paren': sent[i][4]
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2labels(sent):
    return [word[-1] for word in sent]

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2tokens(sent):
    return [word[0] for word in sent]   

Extract features from the data:

In [8]:
%%time
X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]

X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

CPU times: user 4.34 s, sys: 322 ms, total: 4.66 s
Wall time: 4.7 s


In [9]:
X_train[0]

[{'bias': 1.0,
  'word.lower()': 'kosher',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'position': 'I1',
  'length_group': 'L4',
  'CAP': 'YesCAP',
  'Paren': 'NoPAREN',
  'BOS': True,
  '+1:word.lower()': 'salt',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'salt',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'position': 'I2',
  'length_group': 'L4',
  'CAP': 'NoCAP',
  'Paren': 'NoPAREN',
  '-1:word.lower()': 'kosher',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  'EOS': True}]

## Training

In [10]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 2min 3s, sys: 611 ms, total: 2min 4s
Wall time: 2min 4s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Evaluation 

In [11]:
labels = list(crf.classes_)
labels

['B-NAME',
 'I-NAME',
 'B-QTY',
 'B-UNIT',
 'B-COMMENT',
 'I-COMMENT',
 'OTHER',
 'B-RANGE_END',
 'I-UNIT',
 'B-INDEX']

In [12]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.7792505986622222

Inspect per-class results in more detail:

In [13]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

   B-COMMENT      0.655     0.752     0.700     11703
   I-COMMENT      0.725     0.799     0.760     21685
     B-INDEX      0.000     0.000     0.000         0
      B-NAME      0.839     0.841     0.840     18526
      I-NAME      0.686     0.695     0.690     13896
       B-QTY      0.982     0.984     0.983     15055
 B-RANGE_END      0.574     0.787     0.664       202
      B-UNIT      0.928     0.965     0.946     12073
      I-UNIT      0.000     0.000     0.000        16
       OTHER      0.660     0.442     0.529     14390

   micro avg      0.784     0.784     0.784    107546
   macro avg      0.605     0.627     0.611    107546
weighted avg      0.782     0.784     0.779    107546



## Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

It takes quite a lot of CPU time and RAM, so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.

In [14]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

Best result:

In [15]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

## Check parameter space 

A chart which shows which c1 and c2 values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [None]:
_x = rs.cv_results_["param_c1"]
_y = rs.cv_results_["param_c2"]
_c = rs.cv_results_["mean_test_score"]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))


## Check best estimator on our test data 

As you can see, quality is improved.

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

## Let's check what classifier learned

In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Check the state features:

In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

In [14]:
from joblib import dump, load
dump(crf, 'crf_model.joblib') 

['crf_model.joblib']