In [1]:
#!/usr/bin/env python3
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import math
import os
from pathlib import Path
import torch

In [3]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

In [4]:
logging.root.setLevel(level=logging.INFO)
log = logging.getLogger("test_en")       # For usage, see findsim.py in earlier assignment.
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG

In [11]:
os.chdir("../data")
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # all training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
enraw =   TaggedCorpus(Path("enraw"), tagset=entrain.tagset, vocab=entrain.vocab)  # raw
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=} {len(enraw)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996 len(enraw)=4013


In [13]:
os.chdir("../data")
cztrain = TaggedCorpus(Path("czsup"), Path("czraw"))                               # all training
czsup =   TaggedCorpus(Path("czsup"), tagset=cztrain.tagset, vocab=cztrain.vocab)  # supervised training
czdev =   TaggedCorpus(Path("czdev"), tagset=cztrain.tagset, vocab=cztrain.vocab)  # evaluation
czraw =   TaggedCorpus(Path("czraw"), tagset=cztrain.tagset, vocab=cztrain.vocab)  # raw
print(f"{len(cztrain)=}  {len(czsup)=}  {len(czdev)=} {len(czraw)=}")

INFO : Read 283514 tokens from czsup, czraw
INFO : Created 67 tag types
INFO : Created 50542 word types


len(cztrain)=16469  len(czsup)=8439  len(czdev)=2148 len(czraw)=8030


In [None]:
* The perplexity of the hmm model trained on ensup data produces lower perplexity on enraw data than the endev data. The number of records observed in the endev dataset is too few to make a fair comparison against the enraw unlabeled data, which has 4x number of records.
The perplexity of the model when evaluated on ensup4k is ~534 much lesser than the other two datasets, which might be due to the presence of similar tagged sentences in the evaluation dataset to what the model was trained on.

* The perplexity of the hmm model trained on czsup data produces lower perpexity on the dev data than the czraw data.

> During the evaluation of model on raw data, the score computation block marginalizes over all possible tags 
and hence it might result in lower perplexity.
> During the evaluation of model on dev data, the score computation block scores on the joint distribution of word and tags, which might be higher.
> For language modeling/ language generation, the perplexity on the raw data matters more.
> For the supervised tagging task, the perplexity on the dev data matters more. But ultimately, the accuracy and F1 score give us a better picture than the perplexity on the dev data.





In [32]:
import os, sys, importlib
try:
    get_ipython().run_line_magic("autoreload", "0")
except Exception:
    pass

# Ensure we only have a single copy of these modules
for m in ["corpus", "hmm", "eval", "integerize", "tag", "crf", "crf_neural", "lexicon"]:
    if m in sys.modules:
        del sys.modules[m]

import corpus
import hmm

# Reload in this order so hmm binds to the SAME corpus.Sentence class
importlib.reload(corpus)
importlib.reload(hmm)

# Now import the names you use AFTER the reload
from corpus import TaggedCorpus  # avoid importing Sentence in helpers
from hmm import HiddenMarkovModel

from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple, Union
from pathlib import Path
import math, torch
import inspect
from corpus import TaggedCorpus, Sentence
from hmm import HiddenMarkovModel

Pathish = Union[str, Path]

@dataclass
class Phase:
    name: str
    corpus: Union[str, Pathish, List[Pathish], TaggedCorpus]  # "sup" | "raw" | paths | TaggedCorpus
    weight: float = 1.0
    tau: Optional[float] = None
    passes: int = 1

def _load_corpus(paths, tagset=None, vocab=None) -> TaggedCorpus:
    if isinstance(paths, TaggedCorpus):
        # already a corpus
        return paths
    if isinstance(paths, (str, Path)):
        paths = [Path(paths)]
    else:
        paths = [Path(p) for p in paths]
    return TaggedCorpus(*paths, tagset=tagset, vocab=vocab)

def _sentence_token_count(sent: Sentence) -> int:
    return max(0, len(sent) - 2)

def _eval_cross_entropy(model: HiddenMarkovModel, eval_corpus: TaggedCorpus) -> Tuple[float, float]:
    tot_logprob = 0.0
    tot_tokens = 0
    with torch.inference_mode():
        for sent in eval_corpus:
            if len(sent) <= 2: 
                continue
            tot_logprob += float(model.logprob(sent, eval_corpus))
            tot_tokens += _sentence_token_count(sent)
    xent = -tot_logprob / max(1, tot_tokens)
    return xent, math.exp(xent)

def _strip_tags(sent):
    n = len(sent)
    out = []
    for i, (w, t) in enumerate(sent):
        out.append((w, t if i in (0, n-1) else None))
    return type(sent)(out)


def _eval_accuracy_breakdown(model: HiddenMarkovModel,
                             eval_corpus: TaggedCorpus,
                             sup_vocab: set,
                             raw_vocab: set) -> Dict[str, float]:
    tot = corr = 0
    buckets = {"known":[0,0], "seen":[0,0], "novel":[0,0]}
    with torch.inference_mode():
        for sent in eval_corpus:
            if len(sent) <= 2: continue
            pred = model.viterbi_tagging(_strip_tags(sent), eval_corpus)
            for j in range(1, len(sent)-1):
                w, gold = sent[j]
                _, ptag = pred[j]
                tot += 1; corr += int(ptag == gold)
                if w in sup_vocab: b="known"
                elif w in raw_vocab: b="seen"
                else: b="novel"
                buckets[b][0] += 1; buckets[b][1] += int(ptag == gold)
    pct = lambda ok,n: 100.0*ok/n if n else float("nan")
    return {
        "all": pct(corr, tot),
        "known": pct(buckets["known"][1], buckets["known"][0]),
        "seen": pct(buckets["seen"][1], buckets["seen"][0]),
        "novel": pct(buckets["novel"][1], buckets["novel"][0]),
    }

def _resolve_phase_corpus(ph_corpus, model, sup_corpus, raw_corpus) -> TaggedCorpus:
    """Map 'sup'/'raw'/paths/TaggedCorpus → TaggedCorpus using model's tagset/vocab."""
    if isinstance(ph_corpus, TaggedCorpus):
        return ph_corpus
    if isinstance(ph_corpus, str) and ph_corpus.lower() == "sup":
        return sup_corpus
    if isinstance(ph_corpus, str) and ph_corpus.lower() == "raw":
        return raw_corpus
    # else treat as paths
    return _load_corpus(ph_corpus, tagset=model.tagset, vocab=model.vocab)

def _em_epoch(model: HiddenMarkovModel, phases: list, λ: float) -> None:
    model._zero_counts()

    # detect once whether E_step has a 'tau' parameter
    _esig = inspect.signature(model.E_step)
    _supports_tau = ("tau" in _esig.parameters)

    warned = False  # warn once per epoch if tau requested but unsupported
    for ph in phases:
        for _ in range(ph.passes):
            for sent in ph.corpus:
                isent = model._integerize_sentence(sent, ph.corpus)

                if _supports_tau and ph.tau is not None:
                    model.E_step(isent, mult=ph.weight, tau=ph.tau)
                else:
                    if (ph.tau is not None) and (not _supports_tau) and (not warned):
                        print("⚠️  tau requested but your HMM.E_step doesn't support it; "
                              "running without τ-sharpening.")
                        warned = True
                    model.E_step(isent, mult=ph.weight)

    model.M_step(λ)

def train_with_phases(
    sup_paths,
    raw_paths,
    eval_path,
    *,
    unigram: bool = False,
    λ: float = 0.1,
    init_from: Optional[HiddenMarkovModel] = None,
    phases: List[Phase],
    max_epochs: int = 5,
    tol: float = 1e-3,
    verbose: bool = True,
) -> Dict[str, Any]:
    # Build tagset/vocab from (sup ∪ raw) only
    build_corpus = _load_corpus(list(sup_paths) + list(raw_paths))
    model = init_from or HiddenMarkovModel(build_corpus.tagset, build_corpus.vocab, unigram=unigram)

    # Canonical corpora bound to model's vocab/tagset
    sup = _load_corpus(sup_paths, tagset=model.tagset, vocab=model.vocab)
    raw = _load_corpus(raw_paths, tagset=model.tagset, vocab=model.vocab)
    dev = _load_corpus(eval_path,   tagset=model.tagset, vocab=model.vocab)

    # Resolve each Phase's corpus now
    resolved_phases: List[Phase] = []
    for ph in phases:
        corpus_fixed = _resolve_phase_corpus(ph.corpus, model, sup, raw)
        resolved_phases.append(Phase(ph.name, corpus_fixed, ph.weight, ph.tau, ph.passes))

    # EM loop with simple dev x-ent early stopping
    prev_xent = float("inf")
    for epoch in range(1, max_epochs+1):
        _em_epoch(model, resolved_phases, λ=λ)
        xent, ppl = _eval_cross_entropy(model, dev)
        if verbose:
            print(f"epoch {epoch:02d}: x-ent={xent:.4f} nats, ppl={ppl:.1f}")
        if prev_xent - xent < tol:
            break
        prev_xent = xent

    # Metrics
    sup_vocab = set(sup.vocab); raw_vocab = set(raw.vocab)
    xent, ppl = _eval_cross_entropy(model, dev)
    accs = _eval_accuracy_breakdown(model, dev, sup_vocab, raw_vocab)
    return {
        "model": model,
        "xent_nats": xent,
        "perplexity": ppl,
        "accuracy_all": accs["all"],
        "accuracy_known": accs["known"],
        "accuracy_seen": accs["seen"],
        "accuracy_novel": accs["novel"],
    }



def run_strategy_ensup_only(ensup, enraw, endev, λ=0.1, epochs=5, unigram=False):
    return train_with_phases(
        [ensup], [enraw], endev,
        unigram=unigram, λ=λ, max_epochs=epochs,
        phases=[Phase("sup", "sup", weight=1.0, tau=None, passes=1)]
    )

def run_strategy_sup3_raw1(ensup, enraw, endev, λ=0.1, epochs=5, unigram=False):
    return train_with_phases(
        [ensup], [enraw], endev,
        unigram=unigram, λ=λ, max_epochs=epochs,
        phases=[Phase("sup×3", "sup", weight=3.0),
                Phase("raw×1", "raw", weight=1.0)]
    )

def run_strategy_staged_sup_raw_sup(ensup, enraw, endev, λ=0.1, epochs=5, unigram=False):
    return train_with_phases(
        [ensup], [enraw], endev,
        unigram=unigram, λ=λ, max_epochs=epochs,
        phases=[Phase("pre-sup",  "sup", weight=2.0),
                Phase("raw",      "raw", weight=1.0),
                Phase("post-sup", "sup", weight=2.0)]
    )

def run_strategy_tau_sharpened_raw(ensup, enraw, endev, λ=0.1, epochs=5, tau=0.7, unigram=False):
    return train_with_phases(
        [ensup], [enraw], endev,
        unigram=unigram, λ=λ, max_epochs=epochs,
        phases=[Phase("sup", "sup", weight=2.0, tau=None),
                Phase("raw-τ", "raw", weight=1.0, tau=tau)]  # needs E_step(..., tau=...) support
    )

def run_strategy_enraw_only_then_tiny_sup(ensup, enraw, endev, λ=0.1, epochs=5, sup_weight=10.0, unigram=False):
    return train_with_phases(
        [ensup], [enraw], endev,
        unigram=unigram, λ=λ, max_epochs=epochs,
        phases=[Phase("raw-only", "raw", weight=1.0),
                Phase("tiny-sup", "sup", weight=sup_weight)]
    )


In [31]:
ensup = "../data/ensup"
enraw = "../data/enraw"
endev = "../data/endev"

rows = []
rows.append(("ensup only", run_strategy_ensup_only(ensup, enraw, endev)))
rows.append(("ensup×3 + enraw", run_strategy_sup3_raw1(ensup, enraw, endev)))
rows.append(("staged: sup→raw→sup", run_strategy_staged_sup_raw_sup(ensup, enraw, endev)))
rows.append(("τ-sharpened raw (τ=0.7)", run_strategy_tau_sharpened_raw(ensup, enraw, endev, tau=0.7)))
rows.append(("enraw-only → tiny sup", run_strategy_enraw_only_then_tiny_sup(ensup, enraw, endev)))

for name, out in rows:
    print(f"\n{name}")
    print(f"  x-ent  : {out['xent_nats']:.4f} nats  (ppl={out['perplexity']:.1f})")
    print(f"  acc all/known/seen/novel : "
          f"{out['accuracy_all']:.2f} / {out['accuracy_known']:.2f} / "
          f"{out['accuracy_seen']:.2f} / {out['accuracy_novel']:.2f}")


INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


epoch 01: x-ent=7.2796 nats, ppl=1450.5
epoch 02: x-ent=7.2796 nats, ppl=1450.5


INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


epoch 01: x-ent=7.4620 nats, ppl=1740.7
epoch 02: x-ent=7.2455 nats, ppl=1401.8
epoch 03: x-ent=7.2120 nats, ppl=1355.5
epoch 04: x-ent=7.2064 nats, ppl=1348.1
epoch 05: x-ent=7.2063 nats, ppl=1347.9


INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


epoch 01: x-ent=7.4184 nats, ppl=1666.4
epoch 02: x-ent=7.2410 nats, ppl=1395.5
epoch 03: x-ent=7.2167 nats, ppl=1361.9
epoch 04: x-ent=7.2129 nats, ppl=1356.9
epoch 05: x-ent=7.2129 nats, ppl=1356.8


INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


⚠️  tau requested but your HMM.E_step doesn't support it; running without τ-sharpening.
epoch 01: x-ent=7.5499 nats, ppl=1900.6
⚠️  tau requested but your HMM.E_step doesn't support it; running without τ-sharpening.
epoch 02: x-ent=7.2689 nats, ppl=1434.9
⚠️  tau requested but your HMM.E_step doesn't support it; running without τ-sharpening.
epoch 03: x-ent=7.2160 nats, ppl=1361.1
⚠️  tau requested but your HMM.E_step doesn't support it; running without τ-sharpening.
epoch 04: x-ent=7.2065 nats, ppl=1348.2
⚠️  tau requested but your HMM.E_step doesn't support it; running without τ-sharpening.
epoch 05: x-ent=7.2062 nats, ppl=1347.7


INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


epoch 01: x-ent=7.3583 nats, ppl=1569.2
epoch 02: x-ent=7.2699 nats, ppl=1436.5
epoch 03: x-ent=7.2611 nats, ppl=1423.8
epoch 04: x-ent=7.2603 nats, ppl=1422.6

ensup only
  x-ent  : 7.2796 nats  (ppl=1450.5)
  acc all/known/seen/novel : 91.07 / 91.07 / nan / nan

ensup×3 + enraw
  x-ent  : 7.2063 nats  (ppl=1347.9)
  acc all/known/seen/novel : 89.72 / 89.72 / nan / nan

staged: sup→raw→sup
  x-ent  : 7.2129 nats  (ppl=1356.8)
  acc all/known/seen/novel : 89.79 / 89.79 / nan / nan

τ-sharpened raw (τ=0.7)
  x-ent  : 7.2062 nats  (ppl=1347.7)
  acc all/known/seen/novel : 89.35 / 89.35 / nan / nan

enraw-only → tiny sup
  x-ent  : 7.2603 nats  (ppl=1422.6)
  acc all/known/seen/novel : 90.23 / 90.23 / nan / nan
