In [1]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [2]:
import logging
import math
import os
from pathlib import Path

In [None]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from crf_neural import ConditionalRandomFieldNeural as ConditionalRandomField



Set up logging.

In [4]:
logging.root.setLevel(level=logging.INFO)
log = logging.getLogger("test_en")       # For usage, see findsim.py in earlier assignment.
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG

Switch working directory to the directory where the data live.  You may need to edit this line.

In [5]:
os.chdir("../data")

In [29]:
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # all training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
enraw =   TaggedCorpus(Path("enraw"), tagset=entrain.tagset, vocab=entrain.vocab) 
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996


In [30]:
known_vocab = TaggedCorpus(Path("ensup")).vocab    # words seen with supervised tags; used in evaluation
log.info(f"Tagset: f{list(entrain.tagset)}")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']


Make an HMM.  Let's do some pre-training to approximately maximize the
regularized log-likelihood on supervised training data.  In other words, the
probabilities at the M step will just be supervised count ratios.

On each epoch, you will see two progress bars: first it collects counts from
all the sentences (E step), and then after the M step, it evaluates the loss
function, which is the (unregularized) cross-entropy on the training set.

The parameters don't actually matter during the E step because there are no
hidden tags to impute.  The first M step will jump right to the optimal
solution.  The code will try a second epoch with the revised parameters, but
the result will be identical, so it will detect convergence and stop.

We arbitrarily choose λ=1 for our add-λ smoothing at the M step, but it would
be better to search for the best value of this hyperparameter.

In [31]:
def train_sup_then_eval(model, train_corpus, eval_corpus, save_as, λ=1.0):
    loss = lambda m: model_cross_entropy(m, eval_corpus=train_corpus)  # 监督期常用CE
    model.train(corpus=train_corpus, loss=loss, λ=λ, save_path=save_as)
    return model

def train_semisup_stop_on_dev(model, train_corpus, save_as, λ=1.0):
    # 半监督阶段：用dev上的Viterbi error作早停（题目建议）
    loss = lambda m: viterbi_error_rate(m, eval_corpus=endev, known_vocab=known_vocab)
    model.train(corpus=train_corpus, loss=loss, λ=λ, save_path=save_as)
    return model

In [32]:
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)
train_semisup_stop_on_dev(hmm, enraw, "hmm_raw_only.pkl", λ=1.0)

100%|██████████| 996/996 [00:03<00:00, 281.28it/s]
INFO : Cross-entropy: 12.6494 nats (= perplexity 311591.795)
100%|██████████| 996/996 [00:04<00:00, 213.83it/s]
INFO : Tagging accuracy: all: 8.176%, known: 8.809%, seen: 4.882%, novel: 0.330%
100%|██████████| 4013/4013 [00:33<00:00, 119.10it/s]
100%|██████████| 996/996 [00:03<00:00, 292.27it/s]
INFO : Cross-entropy: 10.8648 nats (= perplexity 52301.519)
100%|██████████| 996/996 [00:04<00:00, 222.90it/s]
INFO : Tagging accuracy: all: 9.332%, known: 9.761%, seen: 5.219%, novel: 4.756%
100%|██████████| 4013/4013 [00:34<00:00, 116.56it/s]
100%|██████████| 996/996 [00:03<00:00, 295.25it/s]
INFO : Cross-entropy: 10.8643 nats (= perplexity 52278.880)
100%|██████████| 996/996 [00:04<00:00, 223.48it/s]
INFO : Tagging accuracy: all: 9.412%, known: 10.000%, seen: 4.377%, novel: 2.906%
INFO : Saved model to hmm_raw_only.pkl


<hmm.HiddenMarkovModel at 0x2d2d5808670>

In [33]:
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)
train_semisup_stop_on_dev(hmm, entrain, "hmm_sup_raw.pkl", λ=1.0)


100%|██████████| 996/996 [00:03<00:00, 289.14it/s]
INFO : Cross-entropy: 12.6499 nats (= perplexity 311727.861)
100%|██████████| 996/996 [00:04<00:00, 217.41it/s]
INFO : Tagging accuracy: all: 4.280%, known: 4.253%, seen: 6.061%, novel: 3.963%
100%|██████████| 8064/8064 [01:15<00:00, 107.47it/s]
100%|██████████| 996/996 [00:03<00:00, 268.09it/s]
INFO : Cross-entropy: 7.9108 nats (= perplexity 2726.691)
100%|██████████| 996/996 [00:04<00:00, 212.08it/s]
INFO : Tagging accuracy: all: 88.463%, known: 92.537%, seen: 46.633%, novel: 46.103%
100%|██████████| 8064/8064 [01:13<00:00, 109.17it/s]
100%|██████████| 996/996 [00:03<00:00, 274.09it/s]
INFO : Cross-entropy: 7.4819 nats (= perplexity 1775.550)
100%|██████████| 996/996 [00:04<00:00, 223.50it/s]
INFO : Tagging accuracy: all: 87.110%, known: 91.452%, seen: 43.771%, novel: 41.480%
INFO : Saved model to hmm_sup_raw.pkl


<hmm.HiddenMarkovModel at 0x2d2d3b886d0>

In [34]:
ensupx3_enraw = TaggedCorpus(
    Path("ensup"), Path("ensup"), Path("ensup"), Path("enraw"),
    tagset=entrain.tagset, vocab=entrain.vocab
)
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)
train_semisup_stop_on_dev(hmm, ensupx3_enraw, "hmm_supx3_raw.pkl", λ=1.0)


100%|██████████| 996/996 [00:03<00:00, 283.54it/s]
INFO : Cross-entropy: 12.6506 nats (= perplexity 311959.728)
100%|██████████| 996/996 [00:04<00:00, 223.24it/s]
INFO : Tagging accuracy: all: 3.516%, known: 3.219%, seen: 3.535%, novel: 7.794%
100%|██████████| 16166/16166 [02:32<00:00, 105.93it/s]
100%|██████████| 996/996 [00:03<00:00, 299.40it/s]
INFO : Cross-entropy: 7.3557 nats (= perplexity 1565.039)
100%|██████████| 996/996 [00:04<00:00, 225.62it/s]
INFO : Tagging accuracy: all: 90.609%, known: 95.312%, seen: 41.582%, novel: 42.008%
100%|██████████| 16166/16166 [02:47<00:00, 96.29it/s] 
INFO : Saved model to hmm_supx3_raw-32332.pkl
100%|██████████| 996/996 [00:04<00:00, 244.36it/s]
INFO : Cross-entropy: 7.1453 nats (= perplexity 1268.093)
100%|██████████| 996/996 [00:04<00:00, 215.21it/s]
INFO : Tagging accuracy: all: 90.208%, known: 94.913%, seen: 42.929%, novel: 40.885%
INFO : Saved model to hmm_supx3_raw.pkl


<hmm.HiddenMarkovModel at 0x2d2d582bfd0>

In [35]:
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)
train_semisup_stop_on_dev(hmm, enraw, "hmm_raw_1.pkl", λ=1.0)

hmm = HiddenMarkovModel.load("hmm_raw_1.pkl")
train_sup_then_eval(hmm, ensup, endev, "hmm_raw_then_sup.pkl", λ=1.0)


100%|██████████| 996/996 [00:04<00:00, 238.59it/s]
INFO : Cross-entropy: 12.6493 nats (= perplexity 311550.766)
100%|██████████| 996/996 [00:04<00:00, 207.90it/s]
INFO : Tagging accuracy: all: 2.969%, known: 2.926%, seen: 3.535%, novel: 3.369%
100%|██████████| 4013/4013 [00:38<00:00, 104.59it/s]
100%|██████████| 996/996 [00:03<00:00, 257.70it/s]
INFO : Cross-entropy: 10.8651 nats (= perplexity 52317.236)
100%|██████████| 996/996 [00:04<00:00, 216.61it/s]
INFO : Tagging accuracy: all: 1.866%, known: 1.863%, seen: 2.189%, novel: 1.783%
INFO : Saved model to hmm_raw_1.pkl
INFO : Loaded model from hmm_raw_1.pkl
100%|██████████| 4051/4051 [00:20<00:00, 195.14it/s]
INFO : Cross-entropy: 10.8774 nats (= perplexity 52967.296)
100%|██████████| 4051/4051 [00:50<00:00, 80.64it/s] 
100%|██████████| 4051/4051 [00:21<00:00, 192.22it/s]
INFO : Cross-entropy: 7.4505 nats (= perplexity 1720.767)
100%|██████████| 4051/4051 [00:41<00:00, 97.35it/s] 
100%|██████████| 4051/4051 [00:17<00:00, 232.54it/s]
IN

<hmm.HiddenMarkovModel at 0x2d2d5c59d60>

In [37]:
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)
train_sup_then_eval(hmm, ensup, endev, "hmm_sup.pkl", λ=1.0)

100%|██████████| 4051/4051 [00:14<00:00, 287.23it/s]
INFO : Cross-entropy: 12.6437 nats (= perplexity 309809.268)
100%|██████████| 4051/4051 [00:39<00:00, 102.53it/s]
100%|██████████| 4051/4051 [00:14<00:00, 277.12it/s]
INFO : Cross-entropy: 7.4505 nats (= perplexity 1720.760)
100%|██████████| 4051/4051 [00:38<00:00, 105.37it/s]
100%|██████████| 4051/4051 [00:13<00:00, 307.93it/s]
INFO : Cross-entropy: 7.4505 nats (= perplexity 1720.767)
INFO : Saved model to hmm_sup.pkl


<hmm.HiddenMarkovModel at 0x2d2d5d17550>

In [38]:
hmm = HiddenMarkovModel.load("hmm_sup.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
print(loss_dev(hmm))

INFO : Loaded model from hmm_sup.pkl
100%|██████████| 996/996 [00:04<00:00, 221.03it/s]
INFO : Cross-entropy: 7.5995 nats (= perplexity 1997.182)
100%|██████████| 996/996 [00:05<00:00, 180.25it/s]
INFO : Tagging accuracy: all: 88.663%, known: 93.059%, seen: 44.108%, novel: 42.734%


0.11336590254290368


Now let's throw in the unsupervised training data as well, and continue
training as before, in order to increase the regularized log-likelihood on
this larger, semi-supervised training set.  It's now the *incomplete-data*
log-likelihood.

This time, we'll use a different evaluation loss function: we'll stop when the
*tagging error rate* on a held-out dev set stops getting better.  Also, the
implementation of this loss function (`viterbi_error_rate`) includes a helpful
side effect: it logs the *cross-entropy* on the held-out dataset as well, just
for your information.

We hope that held-out tagging accuracy will go up for a little bit before it
goes down again (see Merialdo 1994). (Log-likelihood on training data will
continue to improve, and that improvement may generalize to held-out
cross-entropy.  But getting accuracy to increase is harder.)

In [26]:
ensup =   TaggedCorpus(Path("ensup"))  # supervised training
endev =   TaggedCorpus(Path("endev"))  # supervised training
hmm = HiddenMarkovModel(ensup.tagset, ensup.vocab)  
loss_raw = lambda model: viterbi_error_rate(model, eval_corpus=ensup)
hmm.train(corpus=ensup, loss=loss_raw, λ=1.0,
          save_path="en_hmm_sup.pkl")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Read 23949 tokens from endev
INFO : Created 23 tag types
INFO : Created 4959 word types
100%|██████████| 4051/4051 [00:14<00:00, 276.47it/s]
INFO : Cross-entropy: 12.2669 nats (= perplexity 212533.383)
100%|██████████| 4051/4051 [00:22<00:00, 182.80it/s]
INFO : Tagging accuracy: all: 3.749%, seen: 3.749%, novel: nan%
100%|██████████| 4051/4051 [00:39<00:00, 101.32it/s]
100%|██████████| 4051/4051 [00:14<00:00, 285.40it/s]
INFO : Cross-entropy: 7.2282 nats (= perplexity 1377.693)
100%|██████████| 4051/4051 [00:22<00:00, 180.52it/s]
INFO : Tagging accuracy: all: 92.571%, seen: 92.571%, novel: nan%
100%|██████████| 4051/4051 [00:41<00:00, 97.01it/s] 
100%|██████████| 4051/4051 [00:15<00:00, 267.01it/s]
INFO : Cross-entropy: 7.2282 nats (= perplexity 1377.695)
100%|██████████| 4051/4051 [00:22<00:00, 178.92it/s]
INFO : Tagging accuracy: all: 92.571%, seen: 92.571%, novel: nan%
INFO : Saved

In [19]:
# hmm = HiddenMarkovModel.load("en_hmm_raw_2.pkl")  # reset to supervised model (in case you're re-executing this bit)
hmm = HiddenMarkovModel(enraw.tagset, enraw.vocab)  
loss_raw = lambda model: model_cross_entropy(model, eval_corpus=enraw)
hmm.train(corpus=enraw, loss=loss_raw, λ=1.0,
          save_path="en_hmm_raw_0.pkl")


100%|██████████| 4013/4013 [00:13<00:00, 303.49it/s]
INFO : Cross-entropy: 9.5973 nats (= perplexity 14724.539)
100%|██████████| 4013/4013 [00:35<00:00, 114.31it/s]
100%|██████████| 4013/4013 [00:12<00:00, 327.13it/s]
INFO : Cross-entropy: 7.7891 nats (= perplexity 2414.232)
100%|██████████| 4013/4013 [00:35<00:00, 114.62it/s]
100%|██████████| 4013/4013 [00:12<00:00, 331.13it/s]
INFO : Cross-entropy: 7.7891 nats (= perplexity 2414.222)
INFO : Saved model to en_hmm_raw_0.pkl


You can also retry the above workflow where you start with a worse supervised
model (like Merialdo).  Does EM help more in that case?  It's easiest to rerun
exactly the code above, but first make the `ensup` file smaller by copying
`ensup-tiny` over it.  `ensup-tiny` is only 25 sentences (that happen to cover
all tags in `endev`).  Back up your old `ensup` and your old `*.pkl` models
before you do this.

More detailed look at the first 10 sentences in the held-out corpus,
including Viterbi tagging.

In [11]:
def look_at_your_data(model, dev, N):
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        
        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")

In [14]:
def look_at_your_data_compare(model, model_new, dev, N):
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        viterbi_new = model_new.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        counts_new = eval_tagging(predicted=viterbi_new, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        num_new = counts_new['NUM', 'ALL']
        denom_new = counts_new['DENOM', 'ALL']
        
        log.info(f"Gold:    {sentence}")
        log.info(f"Supervised-Viterbi: {viterbi}")
        log.info(f"Semi-Supervised-Viterbi: {viterbi_new}")
        log.info(f"Supervised-Loss:    {denom - num}/{denom}")
        log.info(f"Semi-Supervised-Loss:    {denom_new - num_new}/{denom_new}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        xent_new = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        log.info(f"Supervised-Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")
        log.info(f"Semi-supervised-Cross-entropy: {xent_new/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")

In [15]:
hmm_sup = HiddenMarkovModel.load("en_hmm.pkl")
look_at_your_data_compare(hmm_sup, hmm, endev, N=10)

INFO : Loaded model from en_hmm.pkl
INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Supervised-Viterbi: ``/` We/P 're/V strongly/D _OOV_/N that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/I this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/, added/V ,/, ``/` and/C that/I means/V virtually/R everyone/, who/W works/V here/R ./.
INFO : Semi-Supervised-Viterbi: ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/T added/V ,/, ``/` and/C that/I means/V virtually/R everyone/, who/W works/V here/R ./.
INFO : Supervised-Loss:    6/34
INFO : Semi-Supervised-Loss:    3/34
INFO : Supervised-Cross-entropy: 11.143198013305664 nats (= perplexity 2261.7089960764656)
---

In [None]:
def look_at_your_data(model, dev, N):
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        
        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")

In [19]:
for item in endev:
    print(list(item))
    break

[('_BOS_WORD_', '_BOS_TAG_'), ('``', '`'), ('We', 'P'), ("'re", 'V'), ('strongly', 'R'), ('_OOV_', 'V'), ('that', 'I'), ('anyone', 'N'), ('who', 'W'), ('has', 'V'), ('eaten', 'V'), ('in', 'I'), ('the', 'D'), ('cafeteria', 'N'), ('this', 'D'), ('month', 'N'), ('have', 'V'), ('the', 'D'), ('shot', 'N'), (',', ','), ("''", "'"), ('Mr.', 'N'), ('Mattausch', 'N'), ('added', 'V'), (',', ','), ('``', '`'), ('and', 'C'), ('that', 'D'), ('means', 'V'), ('virtually', 'R'), ('everyone', 'N'), ('who', 'W'), ('works', 'V'), ('here', 'R'), ('.', '.'), ('_EOS_WORD_', '_EOS_TAG_')]


In [34]:
def show_fixed_tokens(model1, model2, dev):
    for m, sent in enumerate(dev):
        pred1 = model1.viterbi_tagging(sent.desupervise(), endev)
        pred2 = model2.viterbi_tagging(sent.desupervise(), endev)
        i = 0
        for (word, gold_tag) in sent:
            # print(word)
            # print(gold_tag, pred1[i], pred2[i])
            if gold_tag != pred1[i][1] and gold_tag == pred2[i][1]:
                print(f"[FIXED] {word}: {pred1[i]} → {pred2[i]} (gold={gold_tag})")
            i += 1

In [35]:
show_fixed_tokens(hmm_sup, hmm, endev)

[FIXED] strongly: ('strongly', 'D') → ('strongly', 'R') (gold=R)
[FIXED] _OOV_: ('_OOV_', 'N') → ('_OOV_', 'V') (gold=V)
[FIXED] cafeteria: ('cafeteria', 'I') → ('cafeteria', 'N') (gold=N)
[FIXED] exclusive: ('exclusive', 'D') → ('exclusive', 'J') (gold=J)
[FIXED] _OOV_: ('_OOV_', 'N') → ('_OOV_', 'V') (gold=V)
[FIXED] up: ('up', 'I') → ('up', 'R') (gold=R)
[FIXED] previously: ('previously', 'D') → ('previously', 'R') (gold=R)
[FIXED] assumed: ('assumed', 'N') → ('assumed', 'V') (gold=V)
[FIXED] offices: ('offices', 'V') → ('offices', 'N') (gold=N)
[FIXED] packaging: ('packaging', 'D') → ('packaging', 'N') (gold=N)
[FIXED] checks: ('checks', ',') → ('checks', 'N') (gold=N)
[FIXED] checks: ('checks', 'P') → ('checks', 'N') (gold=N)
[FIXED] 65: ('65', 'N') → ('65', 'C') (gold=C)
[FIXED] _OOV_: ('_OOV_', 'V') → ('_OOV_', 'R') (gold=R)
[FIXED] reckons: ('reckons', 'D') → ('reckons', 'V') (gold=V)
[FIXED] out: ('out', 'I') → ('out', 'R') (gold=R)
[FIXED] Latin: ('Latin', 'N') → ('Latin', 'J

In [39]:
import collections
def trace_word_fix_context(model, target_word, correct_tag, topk=10):
    counter = collections.defaultdict(float)

    for prev_w, next_w, s, t, contrib in model.A_contrib_log:
        # 只看目标词的上下文
        if next_w == target_word and model.tagset[t] == correct_tag:
            counter[(model.tagset[s], prev_w)] += contrib

    # 排序并输出前 topk 个最重要的上下文贡献
    top = sorted(counter.items(), key=lambda kv: kv[1], reverse=True)[:topk]
    print(f"\nContextual contributors for word '{target_word}' ({correct_tag}):")
    for (prev_tag, prev_word), val in top:
        print(f"  ({prev_tag}→{correct_tag}) in '{prev_word} {target_word}'  contrib={val:.3f}")
trace_word_fix_context(hmm, "tells", "V")

AttributeError: 'HiddenMarkovModel' object has no attribute 'A_contrib_log'

In [12]:
look_at_your_data(hmm, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/T added/V ,/, ``/` and/C that/I means/V virtually/R everyone/, who/W works/V here/R ./.
INFO : Loss:    3/34
INFO : Cross-entropy: 10.617977142333984 nats (= perplexity 1571.554982254888)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/, ``/` _OOV_/P 's/V _OOV_/D _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: 10.876399040222168 nats (= perple

Now let's try supervised training of a CRF (this doesn't use the unsupervised
part of the data, so it is comparable to the supervised pre-training we did
for the HMM).  We will use SGD to approximately maximize the regularized
log-likelihood. 

As with the semi-supervised HMM training, we'll periodically evaluate the
tagging accuracy (and also print the cross-entropy) on a held-out dev set.
We use the default `eval_interval` and `tolerance`.  If you want to stop
sooner, then you could increase the `tolerance` so the training method decides
sooner that it has converged.

We arbitrarily choose reg = 1.0 for L2 regularization, learning rate = 0.05,
and a minibatch size of 10, but it would be better to search for the best
value of these hyperparameters.

Note that the logger reports the CRF's *conditional* cross-entropy, log p(tags
| words) / n.  This is much lower than the HMM's *joint* cross-entropy log
p(tags, words) / n, but that doesn't mean the CRF is worse at tagging.  The
CRF is just predicting less information.

In [14]:
log.info("*** Conditional Random Field (CRF)\n")
crf = ConditionalRandomField(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
crf.train(corpus=ensup, loss=loss_dev, reg=1.0, lr=0.05, minibatch_size=10,
          save_path="ensup_crf.pkl")

INFO : *** Conditional Random Field (CRF)

100%|██████████| 996/996 [00:06<00:00, 160.34it/s]
INFO : Cross-entropy: 3.0507 nats (= perplexity 21.131)
100%|██████████| 996/996 [00:04<00:00, 222.31it/s]
INFO : Tagging accuracy: all: 6.764%, known: 6.831%, seen: 4.209%, novel: 6.803%
100%|██████████| 500/500 [00:09<00:00, 51.92it/s]
100%|██████████| 996/996 [00:06<00:00, 155.69it/s]
INFO : Cross-entropy: 0.9112 nats (= perplexity 2.487)
100%|██████████| 996/996 [00:04<00:00, 227.68it/s]
INFO : Tagging accuracy: all: 72.542%, known: 73.513%, seen: 58.754%, novel: 63.937%
100%|██████████| 500/500 [00:09<00:00, 50.71it/s]
100%|██████████| 996/996 [00:06<00:00, 152.10it/s]
INFO : Cross-entropy: 0.7513 nats (= perplexity 2.120)
100%|██████████| 996/996 [00:04<00:00, 217.26it/s]
INFO : Tagging accuracy: all: 75.310%, known: 77.061%, seen: 55.892%, novel: 57.662%
100%|██████████| 500/500 [00:09<00:00, 50.83it/s]
100%|██████████| 996/996 [00:06<00:00, 164.51it/s]
INFO : Cross-entropy: 0.6580 nats

Let's examine how the CRF does on individual sentences. 
(Do you see any error patterns here that would inspire additional CRF features?)

In [15]:
look_at_your_data(crf, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/J _OOV_/N that/I anyone/N who/W has/V eaten/N in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/N ,/, ``/` and/C that/I means/J virtually/N everyone/N who/W works/V here/R ./.
INFO : Loss:    7/34
INFO : Cross-entropy: 0.7668604254722595 nats (= perplexity 1.7015628106627378)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/J Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Loss:    1/21
INFO : Cross-entropy: 0.4758842885494232 nats (= perpl

In [16]:
hmm = ConditionalRandomField.load("en_crf.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, reg=1.0, lr=0.05, minibatch_size=10,
          save_path="en_crf_raw.pkl")

INFO : Loaded model from en_crf.pkl
100%|██████████| 996/996 [00:10<00:00, 98.56it/s] 
INFO : Cross-entropy: 0.3986 nats (= perplexity 1.490)
100%|██████████| 996/996 [00:05<00:00, 182.65it/s]
INFO : Tagging accuracy: all: 86.283%, known: 88.490%, seen: 62.963%, novel: 63.606%
100%|██████████| 500/500 [00:10<00:00, 46.07it/s]
100%|██████████| 996/996 [00:10<00:00, 94.32it/s] 
INFO : Cross-entropy: 0.3972 nats (= perplexity 1.488)
100%|██████████| 996/996 [00:05<00:00, 181.47it/s]
INFO : Tagging accuracy: all: 86.350%, known: 88.284%, seen: 64.983%, novel: 66.843%
100%|██████████| 500/500 [00:10<00:00, 49.15it/s]
100%|██████████| 996/996 [00:10<00:00, 97.72it/s] 
INFO : Cross-entropy: 0.3918 nats (= perplexity 1.480)
100%|██████████| 996/996 [00:05<00:00, 183.46it/s]
INFO : Tagging accuracy: all: 86.964%, known: 89.103%, seen: 63.468%, novel: 65.324%
100%|██████████| 500/500 [00:10<00:00, 49.68it/s]
100%|██████████| 996/996 [00:09<00:00, 103.99it/s]
INFO : Cross-entropy: 0.3867 nats (= 