# NER exercises

This is a modified, more compact version of (one variant of) the CRF tagger from the NER notebook. You may wish to use this as a starting point for doing some of the exercises.

(This first bit is Jupyter magic and required imports; feel free to ignore it.)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%run ../lib/conllu.ipynb
%run ../lib/visualization.ipynb

import sklearn_crfsuite
import eli5

from collections import namedtuple

from sklearn_crfsuite.metrics import flat_f1_score as f1_score
from sklearn_crfsuite.metrics import flat_classification_report as classification_report
from eli5.sklearn_crfsuite.explain_weights import sorted_for_ner

## Data loading

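The cell below loads the Finnish data by default. If you want to work on English instead, uncomment the English lines at the bottom of the cell.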

In [2]:
# Finnish (default).
# Train and dev come from digitoday.2014, the main test set from
# digitoday.2015, and a second test set from Wikipedia.
# NOTE(review): read_conll is not defined in this notebook; presumably it is
# brought in by the `%run ../lib/conllu.ipynb` line in the first cell.

train_sentences = read_conll('../data/finer-featurized/digitoday.2014.train.conllu')
devel_sentences = read_conll('../data/finer-featurized/digitoday.2014.dev.conllu')
test_sentences = read_conll('../data/finer-featurized/digitoday.2015.test.conllu')
test_sentences_wiki = read_conll('../data/finer-featurized/wikipedia.test.conllu')

# English (uncomment the four lines below if preferred; they rebind the same
# names, so they override the Finnish data loaded above)

# train_sentences = read_conll('../data/conll03-featurized/eng.train.conllu')
# devel_sentences = read_conll('../data/conll03-featurized/eng.dev.conllu')
# test_sentences = read_conll('../data/conll03-featurized/eng.test.conllu')
# test_sentences_wiki = []    # Sorry, we don't have this dataset for English

## Data mangling and featurization

You'll want to edit this bit if you want to try different features.

In [3]:
# Per-token record used by the featurizers. The five fields are filled, in
# this order, from columns 1..5 of each input row (column 0 is the tag and is
# kept separately by reformat_sentences).
Word = namedtuple('Word', ('form', 'lemma', 'upos', 'xpos', 'morpho'))


def reformat_sentences(sentences):
    """Split raw sentences into parallel tag and token lists.

    Each sentence is a sequence of rows. Column 0 of a row is taken as the
    tag, columns 1..5 are wrapped in a ``Word`` namedtuple, and any further
    columns (the parse fields) are dropped.

    Args:
        sentences: iterable of sentences, each a sequence of indexable rows.

    Returns:
        A ``(tags, data)`` tuple of two lists aligned sentence by sentence:
        ``tags[i][j]`` is the tag and ``data[i][j]`` the ``Word`` of token
        ``j`` in sentence ``i``.
    """
    all_tags = []
    all_words = []
    for rows in sentences:
        sentence_tags = []
        sentence_words = []
        for row in rows:
            sentence_tags.append(row[0])
            sentence_words.append(Word(*row[1:6]))
        all_tags.append(sentence_tags)
        all_words.append(sentence_words)
    return all_tags, all_words


def less_minimal_featurizer(sentence, idx):
    """Build the feature dict for the token at position ``idx``.

    Only the token itself is inspected; neighbouring tokens contribute no
    features.

    Args:
        sentence: indexable sequence of tokens exposing ``form``, ``lemma``,
            ``upos``, ``xpos`` and ``morpho`` attributes (e.g. ``Word``).
        idx: position in ``sentence`` of the token to featurize.

    Returns:
        dict mapping feature names to string values: the surface form (as-is
        and lowercased), its 2- and 3-character prefixes and suffixes, the
        lemma (as-is and lowercased), both POS tags, and either a single
        ``'morpho': '_'`` entry or one ``'morpho.<Attr>'`` entry per
        morphological attribute.

    Raises:
        ValueError: if a ``|``-separated morpho item contains no ``=``.
    """
    word = sentence[idx]
    features = {
        'form': word.form,
        'form.lower()': word.form.lower(),
        'form[:2]': word.form[:2],
        'form[:3]': word.form[:3],
        'form[-2:]': word.form[-2:],
        'form[-3:]': word.form[-3:],
        'lemma': word.lemma,
        'lemma.lower()': word.lemma.lower(),
        'upos': word.upos,
        'xpos': word.xpos,
    }
    # morpho is either empty ("_") or like this: "Case=Nom|Number=Sing"
    if word.morpho == '_':
        features['morpho'] = '_'
    else:
        for item in word.morpho.split('|'):
            # Split on the first '=' only, so a value that itself contains
            # '=' stays intact instead of breaking the two-way unpacking.
            attr, val = item.split('=', 1)
            features['morpho.{}'.format(attr)] = val
    return features


def featurize_sentence(sentence, featurizer=less_minimal_featurizer):
    """Apply ``featurizer`` to every position of ``sentence``.

    Args:
        sentence: sized sequence of tokens.
        featurizer: callable taking ``(sentence, idx)`` and returning the
            features for the token at ``idx``; it receives the whole sentence
            on every call.

    Returns:
        list with one featurizer result per token, in sentence order.
    """
    per_token = []
    for position in range(len(sentence)):
        per_token.append(featurizer(sentence, position))
    return per_token


# Split every dataset into two parallel lists: *_y holds the tag sequences,
# *_data the corresponding Word tuples.
train_y, train_data = reformat_sentences(train_sentences)
devel_y, devel_data = reformat_sentences(devel_sentences)
test_y, test_data = reformat_sentences(test_sentences)
wiki_y, wiki_data = reformat_sentences(test_sentences_wiki)

# Turn each sentence into a list of per-token feature dicts. The featurizer is
# passed explicitly (it is also the default) so it is easy to swap in your own.
train_x = [featurize_sentence(sent, less_minimal_featurizer) for sent in train_data]
devel_x = [featurize_sentence(sent, less_minimal_featurizer) for sent in devel_data]
test_x = [featurize_sentence(sent, less_minimal_featurizer) for sent in test_data]
wiki_x = [featurize_sentence(sent, less_minimal_featurizer) for sent in wiki_data]

## CRF training and prediction

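This is the place to try different hyperparameters. Note that only `all_possible_transitions=True` is passed to `CRF()` below; every other hyperparameter (for example `algorithm`, `c1`, `c2` and `max_iterations`) is left at its library default, so add it as a keyword argument if you want to change it.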

In [4]:
%%time
# all_possible_transitions=True is the only hyperparameter set explicitly;
# everything else is left to sklearn_crfsuite's defaults.
crf = sklearn_crfsuite.CRF(all_possible_transitions=True)
# Fit on the training split only; devel/test data is never seen here.
crf.fit(train_x, train_y)
# Predictions for the development split; compared against devel_y in the
# evaluation cell below.
pred_y = crf.predict(devel_x)

CPU times: user 1min 22s, sys: 876 ms, total: 1min 23s
Wall time: 1min 23s


## Evaluation and analysis

In [5]:
# Despite its name, exclude_O holds the labels to *report on*: every class the
# trained model knows except 'O'.
exclude_O = [t for t in crf.classes_ if t != 'O']

# Per-label scores on the development split, restricted to the labels above.
print(classification_report(devel_y, pred_y, labels=exclude_O))
# Last expression of the cell, so its result is rendered as the cell output.
# NOTE(review): top=30 presumably caps the number of features listed per
# label -- confirm against the eli5 documentation.
eli5.show_weights(crf, targets=sorted_for_ner(crf.classes_), top=30)

             precision    recall  f1-score   support

      B-ORG       0.83      0.80      0.81       544
      B-PER       0.88      0.77      0.82       185
      I-PER       0.85      0.90      0.88        94
      B-LOC       0.93      0.90      0.91       268
     B-DATE       0.96      0.88      0.92        51
      B-PRO       0.75      0.58      0.65       173
      I-ORG       0.78      0.69      0.73       185
     I-DATE       0.97      0.82      0.89        34
      I-PRO       0.50      0.67      0.57        82
    B-EVENT       0.00      0.00      0.00         2
    I-EVENT       0.00      0.00      0.00         2
      I-LOC       1.00      0.40      0.57        20

avg / total       0.83      0.77      0.79      1640



  'precision', 'predicted', average, warn_for)


From \ To,O,B-DATE,I-DATE,B-EVENT,I-EVENT,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER,B-PRO,I-PRO
O,1.079,1.39,-2.466,1.128,-1.336,1.844,-1.25,1.367,-1.806,1.103,-1.651,1.796,-1.666
B-DATE,-1.175,-0.732,4.013,0.604,-0.103,-0.047,-0.102,-0.185,-0.152,-0.556,-0.19,-0.828,-0.268
I-DATE,-1.03,-1.511,4.594,-0.063,-0.051,-0.143,-0.048,-0.225,-0.049,-0.206,-0.044,-0.49,-0.112
B-EVENT,-1.709,-0.171,-0.133,-0.049,4.16,0.46,-0.047,-0.645,-0.083,-0.546,-0.047,-0.517,-0.085
I-EVENT,-2.01,-0.114,-0.042,-0.109,4.68,0.149,-0.076,-0.717,-0.063,-0.433,-0.036,-0.383,-0.068
B-LOC,-0.773,-0.057,-0.344,1.039,-0.36,1.121,3.667,-0.579,-1.168,-1.597,-0.324,-0.667,-0.427
I-LOC,-1.512,0.129,-0.253,0.551,-0.211,0.318,3.664,-1.006,-0.303,-0.6,-0.076,-0.586,-0.1
B-ORG,-0.062,0.326,-0.586,-0.249,-0.695,0.129,-0.587,-1.114,5.372,-0.539,-0.736,0.365,-1.507
I-ORG,-0.725,-0.221,-0.269,-0.231,-0.338,-0.255,-0.313,-0.378,5.54,-1.01,-0.381,-1.258,-0.584
B-PER,-0.143,-0.299,-0.146,-0.151,-0.21,-0.667,-0.224,-1.052,-0.457,-2.486,6.34,-1.212,-0.316

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+2.886,morpho.NumType:Card,,,,,,,,,,,
+2.664,xpos:Adv,,,,,,,,,,,
+2.610,xpos:Punct,,,,,,,,,,,
+2.530,upos:PRON,,,,,,,,,,,
+2.483,morpho:_,,,,,,,,,,,
+2.453,upos:ADV,,,,,,,,,,,
+2.451,upos:ADP,,,,,,,,,,,
+2.434,form[:2]:va,,,,,,,,,,,
+2.415,xpos:Pron,,,,,,,,,,,
+2.353,form[:2]:ti,,,,,,,,,,,

Weight?,Feature
+2.886,morpho.NumType:Card
+2.664,xpos:Adv
+2.610,xpos:Punct
+2.530,upos:PRON
+2.483,morpho:_
+2.453,upos:ADV
+2.451,upos:ADP
+2.434,form[:2]:va
+2.415,xpos:Pron
+2.353,form[:2]:ti

Weight?,Feature
+2.018,form[:3]:tam
+1.929,xpos:Num
+1.651,form[:2]:he
+1.628,lemma:maaliskuu
+1.628,lemma.lower():maaliskuu
+1.627,form[-2:]:un
+1.624,upos:NOUN
+1.617,form[-3:]:uun
+1.608,form[:3]:lok
+1.605,lemma:kesäkuu

Weight?,Feature
+2.384,form[:2]:20
+2.204,form[-3:]:uta
+1.483,form[:3]:201
+1.203,form[:3]:200
+1.188,form[:2]:19
+1.155,form[:2]:ma
+1.102,lemma:päivä
+1.102,lemma.lower():päivä
+0.891,morpho.Case:Par
+0.852,form[:3]:päi

Weight?,Feature
+2.015,form[:3]:CES
+1.987,form[:2]:CE
+1.907,form[:3]:MWC
+1.903,form[:2]:MW
+1.241,form[:2]:9/
+1.241,form[:3]:9/1
+1.156,form[:3]:WWD
+1.156,form[:2]:WW
+1.130,form[:3]:Bui
+1.119,form[:2]:Tr

Weight?,Feature
+1.103,form[:3]:I/O
+1.103,form[:2]:I/
+0.900,form[:3]:Con
+0.803,form[:2]:-t
+0.801,form[:2]:Co
+0.774,form[-3:]:sin
+0.664,lemma.lower():i/o
+0.664,lemma:I/O
+0.524,form[:2]:-k
+0.509,lemma:Conference

Weight?,Feature
+1.788,form[:2]:Yh
+1.630,form[:3]:Suo
+1.630,form[:3]:Poh
+1.597,lemma.lower():suomi
+1.555,form[:3]:Yhd
+1.547,form[:3]:Ven
+1.462,upos:PROPN
+1.414,form[:2]:Eu
+1.414,form[:3]:Eur
+1.401,form[:2]:Ka

Weight?,Feature
+1.018,form[:2]:ka
+0.819,form[:3]:kan
+0.706,lemma:hotelli
+0.706,lemma.lower():hotelli
+0.683,lemma:Francisco
+0.683,lemma.lower():francisco
+0.682,lemma.lower():amerikka
+0.655,lemma:City
+0.655,lemma.lower():city
+0.655,form[:3]:Cit

Weight?,Feature
+1.883,form[:2]:OP
+1.735,form[:3]:Mic
+1.719,lemma.lower():apple
+1.699,form[:3]:Moz
+1.679,lemma.lower():google
+1.515,lemma.lower():samsung
+1.428,form[:3]:Jol
+1.422,form[:2]:NS
+1.417,form[:2]:Xi
+1.404,form[:3]:Rad

Weight?,Feature
+1.478,form[:3]:kul
+1.352,form[:2]:ko
+1.311,form[:3]:pol
+1.219,form[:2]:ku
+1.218,form[:2]:-y
+1.205,form[:3]:val
+1.162,form[-2:]:iö
+1.120,form[:3]:vie
+1.096,form[:3]:hal
+1.086,form[:3]:kon

Weight?,Feature
+1.415,form[:3]:Elo
+1.291,form[:2]:Mc
+1.276,form[:3]:Ste
+1.184,form[-2:]:ey
+1.183,lemma.lower():yli#ruusi
+1.183,lemma:yli#ruusi
+1.146,form[:2]:Jo
+1.088,form[:3]:Kar
+1.068,form[:2]:Aa
+1.068,form[:3]:Yli

Weight?,Feature
+0.990,upos:PROPN
+0.851,form[-2:]:ri
+0.812,morpho.Case:Par
+0.756,form[-3:]:son
+0.739,lemma.lower():ryan
+0.736,morpho.Derivation:Ja
+0.653,morpho.Case:Nom
+0.646,form[-2:]:in
+0.642,form[-2:]:er
+0.629,form[:2]:Mu

Weight?,Feature
+2.457,form[:2]:iP
+2.118,form[:3]:Lum
+1.853,form[:2]:Yo
+1.834,form[:3]:Jol
+1.610,form[:3]:Mac
+1.566,form[:3]:Bin
+1.496,form[:2]:iT
+1.494,form[:3]:ISS
+1.461,form[:2]:Op
+1.450,form[:2]:No

Weight?,Feature
+1.957,form[:2]:-p
+1.286,form[:2]:-s
+1.168,form[:2]:Pl
+0.989,form[:3]:Sto
+0.985,form[:2]:8.
+0.979,lemma:Watch
+0.977,form[:3]:-ki
+0.974,form[:2]:Wa
+0.974,form[:3]:8.1
+0.972,lemma.lower():watch
