In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [3]:
import logging
import math
import os
from pathlib import Path
import torch

In [4]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

Set up logging.

In [5]:
# Let INFO-and-above records through the root logger (logging.getLogger() with no name is the root).
logging.getLogger().setLevel(logging.INFO)
# Named logger for this notebook's own messages.  For usage, see findsim.py in earlier assignment.
log = logging.getLogger("test_en")
# Message layout: "<LEVEL> : <message>".  Could change INFO to DEBUG for more detail.
logging.basicConfig(
    format="%(levelname)s : %(message)s",
    level=logging.INFO,
)

Switch working directory to the directory where the data live.  You may need to edit this line.

In [6]:
# The corpus files used below are named by relative path, so make the data directory current.
os.chdir("../data")

In [7]:
# The combined corpus is read first: the other two corpora reuse its tagset and vocabulary.
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))  # all training
ensup = TaggedCorpus(
    Path("ensup"),
    tagset=entrain.tagset,
    vocab=entrain.vocab,
)  # supervised training
endev = TaggedCorpus(
    Path("endev"),
    tagset=entrain.tagset,
    vocab=entrain.vocab,
)  # evaluation
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996


In [8]:
# Vocabulary of the supervised file alone: words seen with supervised tags; used in evaluation.
known_vocab = TaggedCorpus(Path("ensup")).vocab
# Only the expression goes inside the braces; a letter placed before them is printed literally.
log.info(f"Tagset: {list(entrain.tagset)}")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']


Make an HMM.  Let's do some pre-training to approximately maximize the
regularized log-likelihood on supervised training data.  In other words, the
probabilities at the M step will just be supervised count ratios.

On each epoch, you will see two progress bars: first it collects counts from
all the sentences (E step), and then after the M step, it evaluates the loss
function, which is the (unregularized) cross-entropy on the training set.

The parameters don't actually matter during the E step because there are no
hidden tags to impute.  The first M step will jump right to the optimal
solution.  The code will try a second epoch with the revised parameters, but
the result will be identical, so it will detect convergence and stop.

We arbitrarily choose λ=1 for our add-λ smoothing at the M step, but it would
be better to search for the best value of this hyperparameter.

In [10]:
def negative_log_likelihood(model):
    """Loss passed to training: `model_cross_entropy` of `model` on `icraw`.

    `icraw` is resolved each time this is called, not when it is defined,
    so the corpus only has to exist by the time training starts.
    """
    return model_cross_entropy(model, icraw)

In [11]:
from eval import model_cross_entropy, write_tagging

# Ice cream corpora.  The combined corpus is read first so that the other two
# can share its tagset and vocabulary.
ictrain = TaggedCorpus(Path("icsup"), Path("icraw"))                               # all training
icsup =   TaggedCorpus(Path("icsup"), tagset=ictrain.tagset, vocab=ictrain.vocab)  # supervised training
icdev =   TaggedCorpus(Path("icdev"), tagset=ictrain.tagset, vocab=ictrain.vocab)  # evaluation

print(f"{len(ictrain)=}  {len(icsup)=}  {len(icdev)=}")

# Build a model and overwrite its parameters with hard-coded values.
# As printed by printAB(), rows are the tags C, H, _EOS_TAG_, _BOS_TAG_.
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab)
hmm.B = torch.tensor(
    [
        [0.7000, 0.2000, 0.1000],  # emission probabilities
        [0.1000, 0.2000, 0.7000],
        [0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000],
    ]
)
hmm.A = torch.tensor(
    [
        [0.8000, 0.1000, 0.1000, 0.0000],  # transition probabilities
        [0.1000, 0.8000, 0.1000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
    ]
)

log.info(
    "*** Current A, B matrices (using initalizations from the ice cream spreadsheet)"
)
hmm.printAB()

# Try it out on the raw data from the spreadsheet, available in `icraw`.
log.info("*** Viterbi results on icraw with hard coded parameters")
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(
    hmm, icraw, Path("icraw_hmm.output")
)  # calls hmm.viterbi_tagging on each sentence
# Print the file we just created.  Reading it from Python (rather than shelling out
# to `cat`) works on every platform and sends the text through Python's own stdout.
print(Path("icraw_hmm.output").read_text(), end="")

# Show the parameters before and after re-estimating them on the raw corpus.
hmm.printAB()
hmm.train(corpus=icraw, loss=negative_log_likelihood, tolerance=0.0001, max_steps=50000)
hmm.printAB()

INFO : Read 73 tokens from icsup, icraw
INFO : Created 4 tag types
INFO : Created 6 word types


INFO : *** Current A, B matrices (using initalizations from the ice cream spreadsheet)
INFO : *** Viterbi results on icraw with hard coded parameters


len(ictrain)=5  len(icsup)=4  len(icdev)=1
Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 30.91it/s]


2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H
Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 200.85it/s]
INFO : Cross-entropy: 1.2217 nats (= perplexity 3.393)
100%|██████████| 1/1 [00:00<00:00, 3609.56it/s]
100%|██████████| 1/1 [00:00<00:00, 619.54it/s]
INFO : Cross-entropy: 1.7714 nats (= perplexity 5.879)
INFO : Saved model to my_hmm.pkl


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.333	0.333	0.333	0.000
H	0.333	0.333	0.333	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.333	0.333	0.333	0.000

Emission matrix B:
	1	2	3	_OOV_
C	0.250	0.250	0.250	0.250
H	0.250	0.250	0.250	0.250
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000	0.000




In [14]:
from tqdm import tqdm
import sys
sys.path.append("../code")
# A module is imported by its name, without the ".py" suffix: `test_ic.py` would be
# read as submodule `py` of a package `test_ic`, which cannot resolve for a plain file.
# NOTE: importing test_ic executes that script's top-level code as a side effect.
from test_ic import negative_log_likelihood

corpus = icraw

# for i in range(10):
#     for sentence in tqdm(corpus, total=len(corpus), leave=True):
#         isent = hmm._integerize_sentence(sentence, corpus)
#         hmm.E_step(isent)
#         hmm.M_step(isent)

log.info("*** Viterbi results on icraw with hard coded parameters")
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(
    hmm, icraw, Path("icraw_hmm.output"), tagger_method="posterior"
)  # requests the "posterior" tagger method instead of write_tagging's default

INFO : Read 40 tokens from icsup
INFO : Created 4 tag types
INFO : Created 5 word types
INFO : Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO : Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']
INFO : 1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
INFO : *** Hidden Markov Model (HMM) test

INFO : *** Current A, B matrices (using initalizations from the ice cream spreadsheet)
INFO : *** Viterbi results on icraw with hard coded parameters


1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 274.19it/s]
INFO : *** Compare to icdev corpus:
2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


100%|██████████| 1/1 [00:00<00:00, 284.32it/s]
INFO : Tagging accuracy: all: 100.000%, seen: 100.000%, novel: nan%
INFO : *** A, B matrices as randomly initialized close to uniform
INFO : *** Supervised HMM training on icsup


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.334	0.334	0.332	0.000
H	0.334	0.332	0.334	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.333	0.334	0.334	0.000

Emission matrix B:
	1	2	3
C	0.333	0.335	0.332
H	0.333	0.333	0.334
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 4/4 [00:00<00:00, 1377.44it/s]
INFO : Cross-entropy: 2.0979 nats (= perplexity 8.149)
100%|██████████| 4/4 [00:00<00:00, 305.44it/s]
100%|██████████| 4/4 [00:00<00:00, 1679.91it/s]
INFO : Cross-entropy: 1.3729 nats (= perplexity 3.947)
100%|██████████| 4/4 [00:00<00:00, 783.10it/s]
100%|██████████| 4/4 [00:00<00:00, 1480.39it/s]
INFO : Cross-entropy: 1.3729 nats (= perplexity 3.947)
INFO : Saved model to my_hmm.pkl
INFO : *** A, B matrices after training on icsup (should match initial params on spreadsheet [transposed])
INFO : *** Viterbi results on icraw


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 23.93it/s]
INFO : *** Forward algorithm on icraw (should approximately match iteration 0 on spreadsheet)
INFO : 9.12758979993639e-19 = p(2 3 3 2 3 2 3 2 2 3 1 3 3 1 1 1 2 1 1 1 3 1 2 1 1 1 2 3 3 2 3 2 2)
INFO : *** Reestimating on icraw (perplexity should improve on every iteration)


2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/C 3/H 3/H 2/H 3/H 2/H 2/H


100%|██████████| 1/1 [00:00<00:00, 494.61it/s]
INFO : Cross-entropy: 1.2217 nats (= perplexity 3.393)
100%|██████████| 1/1 [00:00<00:00, 123.65it/s]
100%|██████████| 1/1 [00:00<00:00, 183.72it/s]
INFO : Cross-entropy: 1.0839 nats (= perplexity 2.956)
100%|██████████| 1/1 [00:00<00:00, 76.28it/s]
100%|██████████| 1/1 [00:00<00:00, 606.03it/s]
INFO : Cross-entropy: 1.0739 nats (= perplexity 2.927)
100%|██████████| 1/1 [00:00<00:00, 223.37it/s]
100%|██████████| 1/1 [00:00<00:00, 701.86it/s]
INFO : Cross-entropy: 1.0919 nats (= perplexity 2.980)
INFO : Saved model to my_hmm.pkl
INFO : *** A, B matrices after reestimation on icraw should match final params on spreadsheet [transposed])
INFO : *** Viterbi results on icraw after reestimation on icraw


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.794	0.206	0.000	0.000
H	0.028	0.927	0.045	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.001	0.999	0.000	0.000

Emission matrix B:
	1	2	3
C	0.695	0.155	0.149
H	0.032	0.482	0.487
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 407.17it/s]
INFO : *** Conditional Random Field (CRF) test



2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/C 3/C 3/C 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


NotImplementedError: 

In [9]:
# Re-read the raw ice cream corpus using the supervised corpus's tagset and vocabulary,
# then write the model's tagging of it to icraw_hmm.output.
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(hmm, icraw, Path("icraw_hmm.output"))

NameError: name 'icsup' is not defined

In [None]:
# Imported here so the cell does not depend on another cell having run first.
import sys
from tqdm import tqdm

sys.path.append("../code")
# A module is imported by its name, without the ".py" suffix: `test_ic.py` would be
# read as submodule `py` of a package `test_ic`, which cannot resolve for a plain file.
# NOTE: importing test_ic executes that script's top-level code as a side effect.
from test_ic import negative_log_likelihood

# Run one E step and one M step on the first sentence only, then print the parameters.
for sentence in tqdm(corpus, total=len(corpus), leave=True):
    isent = hmm._integerize_sentence(sentence, corpus)
    hmm.E_step(isent=isent)
    hmm.M_step(isent=isent)
    hmm.printAB()
    break  # deliberately stop after the first sentence

INFO : Read 40 tokens from icsup
INFO : Created 4 tag types
INFO : Created 5 word types
INFO : Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO : Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']


INFO : 1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
INFO : *** Hidden Markov Model (HMM) test

INFO : *** Current A, B matrices (using initalizations from the ice cream spreadsheet)
INFO : *** Viterbi results on icraw with hard coded parameters


1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 344.81it/s]
INFO : *** Compare to icdev corpus:
2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


100%|██████████| 1/1 [00:00<00:00, 210.89it/s]
INFO : Tagging accuracy: all: 100.000%, seen: 100.000%, novel: nan%
INFO : *** A, B matrices as randomly initialized close to uniform
INFO : *** Supervised HMM training on icsup


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.334	0.333	0.333	0.000
H	0.334	0.334	0.332	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.332	0.335	0.333	0.000

Emission matrix B:
	1	2	3
C	0.333	0.333	0.335
H	0.334	0.333	0.333
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 4/4 [00:00<00:00, 1515.15it/s]
INFO : Cross-entropy: 2.0974 nats (= perplexity 8.145)
100%|██████████| 4/4 [00:00<00:00, 1320.21it/s]
100%|██████████| 4/4 [00:00<00:00, 2140.22it/s]
INFO : Cross-entropy: 1.3729 nats (= perplexity 3.947)
100%|██████████| 4/4 [00:00<00:00, 2544.70it/s]
100%|██████████| 4/4 [00:00<00:00, 1953.79it/s]
INFO : Cross-entropy: 1.3729 nats (= perplexity 3.947)
INFO : Saved model to my_hmm.pkl
INFO : *** A, B matrices after training on icsup (should match initial params on spreadsheet [transposed])
INFO : *** Viterbi results on icraw


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 334.47it/s]
INFO : *** Forward algorithm on icraw (should approximately match iteration 0 on spreadsheet)
INFO : 9.127694257509654e-19 = p(2 3 3 2 3 2 3 2 2 3 1 3 3 1 1 1 2 1 1 1 3 1 2 1 1 1 2 3 3 2 3 2 2)
INFO : *** Reestimating on icraw (perplexity should improve on every iteration)


2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H


100%|██████████| 1/1 [00:00<00:00, 565.57it/s]
INFO : Cross-entropy: 1.2217 nats (= perplexity 3.393)
100%|██████████| 1/1 [00:00<00:00, 1453.33it/s]
100%|██████████| 1/1 [00:00<00:00, 680.34it/s]
INFO : Cross-entropy: 1.4922 nats (= perplexity 4.447)
INFO : Saved model to my_hmm.pkl
INFO : *** A, B matrices after reestimation on icraw should match final params on spreadsheet [transposed])
INFO : *** Viterbi results on icraw after reestimation on icraw


Transition matrix A:
	(C|...)	(H|...)	(_EOS_TAG_|...)	(_BOS_TAG_|...)
C	0.333	0.333	0.333	0.000
H	0.333	0.333	0.333	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.333	0.333	0.333	0.000

Emission matrix B:
	1	2	3
C	0.333	0.333	0.333
H	0.333	0.333	0.333
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




100%|██████████| 1/1 [00:00<00:00, 369.71it/s]
INFO : *** Conditional Random Field (CRF) test



2/C 3/C 3/C 2/C 3/C 2/C 3/C 2/C 2/C 3/C 1/C 3/C 3/C 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/C 3/C 3/C 2/C 3/C 2/C 2/C


NotImplementedError: 

In [10]:
# Print the model's current transition (A) and emission (B) matrices.
hmm.printAB()

NameError: name 'hmm' is not defined

In [11]:
log.info("*** Hidden Markov Model (HMM)")
# Parameters start out randomly initialized.
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)


def loss_sup(model):
    """Loss used during supervised training: `model_cross_entropy` of `model` on `ensup`."""
    return model_cross_entropy(model, eval_corpus=ensup)


hmm.train(
    corpus=ensup,
    loss=loss_sup,
    λ=1.0,
    save_path="ensup_hmm.pkl",
)

INFO : *** Hidden Markov Model (HMM)
100%|██████████| 4051/4051 [00:13<00:00, 310.45it/s]
INFO : Cross-entropy: 12.6445 nats (= perplexity 310045.271)
100%|██████████| 4051/4051 [00:31<00:00, 129.50it/s]
100%|██████████| 4051/4051 [00:09<00:00, 422.11it/s]
INFO : Cross-entropy: 7.4503 nats (= perplexity 1720.395)
100%|██████████| 4051/4051 [00:29<00:00, 138.85it/s]
100%|██████████| 4051/4051 [00:09<00:00, 416.39it/s]
INFO : Cross-entropy: 7.4503 nats (= perplexity 1720.403)
INFO : Saved model to ensup_hmm.pkl


Now let's throw in the unsupervised training data as well, and continue
training as before, in order to increase the regularized log-likelihood on
this larger, semi-supervised training set.  It's now the *incomplete-data*
log-likelihood.

This time, we'll use a different evaluation loss function: we'll stop when the
*tagging error rate* on a held-out dev set stops getting better.  Also, the
implementation of this loss function (`viterbi_error_rate`) includes a helpful
side effect: it logs the *cross-entropy* on the held-out dataset as well, just
for your information.

We hope that held-out tagging accuracy will go up for a little bit before it
goes down again (see Merialdo 1994). (Log-likelihood on training data will
continue to improve, and that improvement may generalize to held-out
cross-entropy.  But getting accuracy to increase is harder.)

In [12]:
# Reset to the supervised model (in case you're re-executing this bit).
hmm = HiddenMarkovModel.load("ensup_hmm.pkl")


def loss_dev(model):
    """Loss used for early stopping: `viterbi_error_rate` of `model` on the held-out `endev`."""
    return viterbi_error_rate(model, eval_corpus=endev, known_vocab=known_vocab)


hmm.train(
    corpus=entrain,
    loss=loss_dev,
    λ=1.0,
    save_path="entrain_hmm.pkl",
)

INFO : Loaded model from ensup_hmm.pkl
100%|██████████| 996/996 [00:03<00:00, 313.12it/s]
INFO : Cross-entropy: 7.5993 nats (= perplexity 1996.797)
100%|██████████| 996/996 [00:05<00:00, 175.49it/s]
INFO : Tagging accuracy: all: 88.663%, known: 93.059%, seen: 44.108%, novel: 42.734%
100%|██████████| 8064/8064 [01:15<00:00, 106.45it/s]
100%|██████████| 996/996 [00:03<00:00, 320.06it/s]
INFO : Cross-entropy: 7.3485 nats (= perplexity 1553.848)
100%|██████████| 996/996 [00:14<00:00, 66.50it/s] 
INFO : Tagging accuracy: all: 87.031%, known: 91.397%, seen: 45.791%, novel: 40.225%
INFO : Saved model to entrain_hmm.pkl


You can also retry the above workflow where you start with a worse supervised
model (like Merialdo).  Does EM help more in that case?  It's easiest to rerun
exactly the code above, but first make the `ensup` file smaller by copying
`ensup-tiny` over it.  `ensup-tiny` is only 25 sentences (that happen to cover
all tags in `endev`).  Back up your old `ensup` and your old `*.pkl` models
before you do this.

More detailed look at the first 10 sentences in the held-out corpus,
including Viterbi tagging.

In [13]:
def look_at_your_data(model, dev, N):
    """Log a per-sentence report for the first `N` sentences of `dev`.

    For each sentence this logs the gold sentence, the model's Viterbi tagging,
    a loss of the form `(denom - num)/denom` taken from `eval_tagging`'s
    'ALL' counts, and the per-token cross-entropy with its perplexity.

    Args:
        model: object providing `viterbi_tagging(sentence, corpus)` and
            `logprob(sentence, corpus)`.
        dev: corpus to iterate over.  It is also the corpus handed to the
            model's methods, so the function works for any corpus and not
            only for the notebook-level `endev`.
        N: maximum number of sentences to report.

    Uses the notebook-level `known_vocab` and `log`.
    """
    for m, sentence in enumerate(dev):
        if m >= N:
            break
        viterbi = model.viterbi_tagging(sentence.desupervise(), dev)
        counts = eval_tagging(predicted=viterbi, gold=sentence,
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']

        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, dev) / len(sentence)  # measured in nats
        # Dividing by ln 2 converts nats to bits, so the logged figure is labelled in bits;
        # the perplexity is exp() of the value in nats.
        log.info(f"Cross-entropy: {xent/math.log(2)} bits (= perplexity {math.exp(xent)})\n---")

[autoreload of corpus failed: Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/nlp-class-tag/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/homebrew/Caskroom/miniconda/base/envs/nlp-class-tag/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/opt/homebrew/Caskroom/miniconda/base/envs/nlp-class-tag/lib/python3.9/importlib/__init__.py", line 168, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'corpus'
]


In [14]:
# Report on the first 10 sentences of endev, tagged by the HMM.
look_at_your_data(hmm, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/T added/V ,/, ``/` and/C that/I means/V virtually/R everyone/, who/W works/V here/R ./.
INFO : Loss:    3/34
INFO : Cross-entropy: 10.61779499053955 nats (= perplexity 1571.3564102110174)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/, ``/` _OOV_/P 's/V _OOV_/D _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: 10.876285552978516 nats (= perple

Now let's try supervised training of a CRF (this doesn't use the unsupervised
part of the data, so it is comparable to the supervised pre-training we did
for the HMM).  We will use SGD to approximately maximize the regularized
log-likelihood. 

As with the semi-supervised HMM training, we'll periodically evaluate the
tagging accuracy (and also print the cross-entropy) on a held-out dev set.
We use the default `eval_interval` and `tolerance`.  If you want to stop
sooner, then you could increase the `tolerance` so the training method decides
sooner that it has converged.

We arbitrarily choose reg = 1.0 for L2 regularization, learning rate = 0.05,
and a minibatch size of 10, but it would be better to search for the best
value of these hyperparameters.

Note that the logger reports the CRF's *conditional* cross-entropy, -log p(tags
| words) / n.  This is much lower than the HMM's *joint* cross-entropy -log
p(tags, words) / n, but that doesn't mean the CRF is better at tagging.  The
CRF is just predicting less information.

In [15]:
log.info("*** Conditional Random Field (CRF)\n")
# Parameters start out randomly initialized.
crf = ConditionalRandomField(entrain.tagset, entrain.vocab)
# Train on the supervised corpus only, using loss_dev as the evaluation loss.
crf.train(
    corpus=ensup,
    loss=loss_dev,
    reg=1.0,
    lr=0.05,
    minibatch_size=10,
    save_path="ensup_crf.pkl",
)

INFO : *** Conditional Random Field (CRF)

100%|██████████| 996/996 [00:07<00:00, 140.96it/s]
INFO : Cross-entropy: 3.0510 nats (= perplexity 21.136)
100%|██████████| 996/996 [00:04<00:00, 228.30it/s]
INFO : Tagging accuracy: all: 4.514%, known: 4.720%, seen: 5.219%, novel: 1.255%
100%|██████████| 500/500 [00:08<00:00, 60.60it/s]
100%|██████████| 996/996 [00:06<00:00, 156.50it/s]
INFO : Cross-entropy: 0.9115 nats (= perplexity 2.488)
100%|██████████| 996/996 [00:07<00:00, 137.38it/s]
INFO : Tagging accuracy: all: 72.396%, known: 73.582%, seen: 57.912%, novel: 60.964%
100%|██████████| 500/500 [00:10<00:00, 45.88it/s]
100%|██████████| 996/996 [00:06<00:00, 164.46it/s]
INFO : Cross-entropy: 0.7517 nats (= perplexity 2.121)
100%|██████████| 996/996 [00:05<00:00, 181.65it/s]
INFO : Tagging accuracy: all: 75.247%, known: 77.016%, seen: 55.892%, novel: 57.332%
100%|██████████| 500/500 [00:10<00:00, 46.62it/s]
100%|██████████| 996/996 [00:06<00:00, 149.54it/s]
INFO : Cross-entropy: 0.6582 nats

Let's examine how the CRF does on individual sentences. 
(Do you see any error patterns here that would inspire additional CRF features?)

In [None]:
# Report on the first 10 sentences of endev, tagged by the CRF.
look_at_your_data(crf, endev, 10)

In [None]:
# Results
HMM trained on ensup and evaluated on endev - Lambda 0
INFO:eval:Cross-entropy: 9.9119 nats (= perplexity 20169.835)
INFO:eval:Tagging accuracy: all: 90.455%, known: 96.786%, seen: nan%, novel: 24.858%

HMM trained on ensup and evaluated on endev - Lambda 0.5
INFO:eval:Cross-entropy: 7.1537 nats (= perplexity 1278.872)
INFO:eval:Tagging accuracy: all: 89.866%, known: 94.542%, seen: nan%, novel: 41.414%

HMM trained on ensup and evaluated on endev - Lambda 1
INFO:eval:Cross-entropy: 7.3767 nats (= perplexity 1598.299)
INFO:eval:Tagging accuracy: all: 88.421%, known: 92.871%, seen: nan%, novel: 42.315%

HMM trained on ensup and evaluated on endev - Lambda 2
INFO:eval:Cross-entropy: 7.6768 nats (= perplexity 2157.697)
INFO:eval:Tagging accuracy: all: 86.609%, known: 90.733%, seen: nan%, novel: 43.880%

CRF trained on ensup and evaluated on endev - LR 0.05
INFO:eval:Cross-entropy: 0.2564 nats (= perplexity 1.292)
INFO:eval:Tagging accuracy: all: 91.031%, known: 93.682%, seen: nan%, novel: 63.567%

CRF Batch Size 32 - LR 0.05
INFO:eval:Cross-entropy: 0.2717 nats (= perplexity 1.312)
INFO:eval:Tagging accuracy: all: 90.626%, known: 93.109%, seen: nan%, novel: 64.896%

CRF Batch Size 32 - LR 0.01
INFO:eval:Cross-entropy: 0.3600 nats (= perplexity 1.433)
INFO:eval:Tagging accuracy: all: 87.482%, known: 89.666%, seen: nan%, novel: 64.848%

CRF Batch Size 32 - LR 0.1
INFO:eval:Cross-entropy: 0.5340 nats (= perplexity 1.706)
INFO:eval:Tagging accuracy: all: 82.003%, known: 83.746%, seen: nan%, novel: 63.947%

CRF Batch Size 128 - LR 0.05
INFO:eval:Cross-entropy: 4.4629 nats (= perplexity 86.738)
INFO:eval:Tagging accuracy: all: 61.623%, known: 62.126%, seen: nan%, novel: 56.404%

CRF Batch Size 256 - LR 0.5
INFO:eval:Cross-entropy: 7.6683 nats (= perplexity 2139.514)
INFO:eval:Tagging accuracy: all: 63.318%, known: 66.471%, seen: nan%, novel: 30.645%

CRF Batch Size 32 - LR 0.01 Reg 1e-4
INFO:eval:Cross-entropy: 0.2716 nats (= perplexity 1.312)
INFO:eval:Tagging accuracy: all: 90.643%, known: 93.128%, seen: nan%, novel: 64.896%