# NER exercises

This is a modified, more compact version of (one variant of) the CRF tagger code from the NER notebook. You may wish to use this as a starting point for doing some of the exercises.

(This first bit is Jupyter magic and the required imports; feel free to ignore it.)

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%run ../lib/conllu.ipynb
%run ../lib/visualization.ipynb

import sklearn_crfsuite
import eli5

from collections import namedtuple

from sklearn_crfsuite.metrics import flat_f1_score as f1_score
from sklearn_crfsuite.metrics import flat_classification_report as classification_report
from eli5.sklearn_crfsuite.explain_weights import sorted_for_ner

## Data loading

Look here if you want to work on English.

In [None]:
# Finnish
# Four splits are loaded: train, development and test files from the
# digitoday set, plus a separate Wikipedia test file.
# NOTE(review): read_conll is not defined in this file -- presumably it comes
# from the `%run ../lib/conllu.ipynb` line in the first cell; confirm.

train_sentences = read_conll('../data/finer-featurized/digitoday.2014.train.conllu')
devel_sentences = read_conll('../data/finer-featurized/digitoday.2014.dev.conllu')
test_sentences = read_conll('../data/finer-featurized/digitoday.2015.test.conllu')
test_sentences_wiki = read_conll('../data/finer-featurized/wikipedia.test.conllu')

# English (uncomment if preferred)
# Uncommenting these lines rebinds the same four names, so the later cells
# need no changes.

# train_sentences = read_conll('../data/conll03-featurized/eng.train.conllu')
# devel_sentences = read_conll('../data/conll03-featurized/eng.dev.conllu')
# test_sentences = read_conll('../data/conll03-featurized/eng.test.conllu')
# test_sentences_wiki = []    # Sorry, we don't have this dataset for English

## Data mangling and featurization

You'll want to edit this bit if you want to try different features.

In [None]:
# Per-token record; the field order matches the row slice that
# reformat_sentences unpacks into it.
Word = namedtuple('Word', ('form', 'lemma', 'upos', 'xpos', 'morpho'))


def reformat_sentences(sentences):
    """Split parsed sentences into parallel tag and ``Word`` sequences.

    For every row of every sentence, field 0 is taken as the tag and
    fields 1-5 are packed into a ``Word``; any further fields (the parse
    columns) are dropped.

    Returns a ``(tags, data)`` pair of lists holding one inner list per
    sentence, in input order.
    """
    all_tags = []
    all_words = []
    for rows in sentences:
        all_tags.append([row[0] for row in rows])
        all_words.append([Word(*row[1:6]) for row in rows])
    return all_tags, all_words


def less_minimal_featurizer(sentence, idx):
    """Build the feature dict for the token at position ``idx``.

    Only the token itself is inspected (no context window). The features
    are the surface form, its lowercase version, its 2- and 3-character
    prefixes and suffixes, the lemma (raw and lowercased), both POS tags,
    and the morphological attributes.
    """
    token = sentence[idx]
    surface = token.form
    base = token.lemma
    feats = {
        'form': surface,
        'form.lower()': surface.lower(),
        'form[:2]': surface[:2],
        'form[:3]': surface[:3],
        'form[-2:]': surface[-2:],
        'form[-3:]': surface[-3:],
        'lemma': base,
        'lemma.lower()': base.lower(),
        'upos': token.upos,
        'xpos': token.xpos,
    }

    # morpho is either empty ("_") or like this: "Case=Nom|Number=Sing"
    if token.morpho == '_':
        feats['morpho'] = '_'
        return feats

    # one feature per attribute, e.g. 'morpho.Case' -> 'Nom'
    for item in token.morpho.split('|'):
        attr, val = item.split('=')
        feats['morpho.{}'.format(attr)] = val
    return feats


def featurize_sentence(sentence, featurizer=less_minimal_featurizer):
    """Apply ``featurizer(sentence, position)`` at every position.

    The featurizer receives the whole sentence plus an index, not just the
    token. Returns the results as a list in token order.
    """
    per_token = []
    for position in range(len(sentence)):
        per_token.append(featurizer(sentence, position))
    return per_token


# For each dataset: split into gold tags (*_y) and Word sequences (*_data),
# then featurize every sentence with the default featurizer (*_x).
train_y, train_data = reformat_sentences(train_sentences)
train_x = list(map(featurize_sentence, train_data))

devel_y, devel_data = reformat_sentences(devel_sentences)
devel_x = list(map(featurize_sentence, devel_data))

test_y, test_data = reformat_sentences(test_sentences)
test_x = list(map(featurize_sentence, test_data))

wiki_y, wiki_data = reformat_sentences(test_sentences_wiki)
wiki_x = list(map(featurize_sentence, wiki_data))

## CRF training and prediction

This is the place to try different hyperparameters.

Some of the parameters for `CRF()` have been filled in with their default values for reference: see the [API documentation](https://sklearn-crfsuite.readthedocs.io/en/latest/api.html#module-sklearn_crfsuite) for explanations.

In [None]:
%%time
# Build the CRF tagger. Edit the keyword arguments to try other settings;
# the per-argument notes below follow the sklearn-crfsuite API documentation.
crf = sklearn_crfsuite.CRF(
    min_freq=0,    # cut-off for feature occurrence frequency (0 keeps every feature)
    all_possible_states=False,    # don't generate state features unseen in training
    all_possible_transitions=True,    # also generate transitions unseen in training
    c1=0.0,    # L1 regularization coefficient
    c2=1.0,    # L2 regularization coefficient
    max_iterations=None    # None leaves the iteration limit to the training algorithm
)
# Train on the featurized training sentences and their gold tag sequences
crf.fit(train_x, train_y)

In [None]:
%%time
# Tag the featurized development sentences with the trained model
pred_y = crf.predict(devel_x)

## Evaluation and analysis

In [None]:
# Labels to report on: every class the model knows except the 'O' tag
exclude_O = [t for t in crf.classes_ if t != 'O']

# Per-label scores of the development-set predictions, limited to those labels
print(classification_report(devel_y, pred_y, labels=exclude_O))
# Learned weights (top=30); kept as the cell's last expression so that the
# notebook displays the result
eli5.show_weights(crf, targets=sorted_for_ner(crf.classes_), top=30)

## Extras

This may be useful if you're interested in looking at what was tagged. 

In [None]:
class Tagged(namedtuple('Tagged', 'text tag')):
    """A tagged span: its space-joined text and its type.

    As a namedtuple it is hashable and compares by value, so equal spans
    can be counted with ``collections.Counter``.
    """

    # Keep instances as light as the underlying namedtuple (no per-instance
    # __dict__), as recommended for namedtuple subclasses.
    __slots__ = ()

    def __repr__(self):
        # XML-like rendering, e.g. <LOC>New York</LOC>
        return '<{0}>{1}</{0}>'.format(self.tag, self.text)


def get_tagged(words, tags):
    """Get tagged spans from parallel sequences of words and BIO tags.

    Tags are expected to look like ``'B-TYPE'``, ``'I-TYPE'`` or ``'O'``.
    A span is opened by a ``B`` tag and extended by following ``I`` tags of
    the same type. Malformed sequences are handled leniently:

    * an ``I`` tag with no open span starts a new span;
    * an ``I`` tag whose type differs from the open span closes that span
      and starts a new one of the ``I`` tag's type, so words are never
      attached to a span of the wrong type.

    Args:
        words: the tokens of one sentence.
        tags: the BIO tags of the same sentence, aligned with ``words``.

    Returns:
        A list of ``Tagged`` spans in sentence order.

    Raises:
        AssertionError: if a tag is not ``'O'`` and does not start with
            ``'B'`` or ``'I'`` (this includes the empty string).
    """
    tagged, current, current_type = [], [], None
    for word, tag in zip(words, tags):
        # tag[:1] rather than tag[0], so an empty tag reaches the
        # "unexpected tag" check below instead of raising IndexError
        prefix, tag_type = tag[:1], tag[2:]

        if prefix == 'I' and current and tag_type == current_type:
            current.append(word)    # well-formed continuation
            continue

        # Anything else (O, B, a type-mismatched I, junk) ends an open span.
        if current:
            tagged.append(Tagged(' '.join(current), current_type))
            current, current_type = [], None

        if prefix in ('B', 'I'):
            # B, or an I that could not continue a span (no span open, or
            # type mismatch): start a new span here
            current, current_type = [word], tag_type
        else:
            assert tag == 'O', 'unexpected tag {}'.format(tag)
    if current:    # span open at sentence end
        tagged.append(Tagged(' '.join(current), current_type))
    return tagged

Example: print the most frequently tagged spans

In [None]:
from collections import Counter

# Tally every predicted span (text + type) over the development set.
counter = Counter()
for sentence, predictions in zip(devel_data, pred_y):
    words = [w.form for w in sentence]    # try w.lemma instead
    counter.update(get_tagged(words, predictions))

# Print the 50 most frequent spans, one "count<TAB>span" line each.
for span, freq in counter.most_common(50):
    print('{}\t{}'.format(freq, span))