In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')


In [2]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

Let's use CoNLL 2002 data to build a NER system

We use Spanish data.

In [32]:
# Fetch the CoNLL-2002 corpus into the local nltk_data directory so the
# corpus-loading cell below works on a fresh environment (the recorded output
# shows the call reports "already up-to-date" when the package is present).
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     /home/82068895153/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [3]:
#nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [34]:
%%time
#train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
#test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

CPU times: user 1.07 s, sys: 76.2 ms, total: 1.15 s
Wall time: 1.39 s


In [35]:
#train_sents[0]


[('rejects', 'NNS'), ('VBZ', 'NNP'), ('B-VP', 'NNP'), ('O', 'NNP')]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

Features

Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.



In [36]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each item of ``sent`` is indexed as ``item[0]`` (the word string) and
    ``item[1]`` (its POS tag string); any further fields are ignored here.

    The result always contains the token's own features (bias, lower-cased
    word, 3- and 2-character suffixes, case/digit flags, POS tag and its
    2-character prefix). Features of the previous and next token are added
    under ``-1:`` / ``+1:`` keys when those neighbours exist; otherwise the
    ``BOS`` / ``EOS`` flag is set instead.
    """
    current = sent[i]
    text, tag = current[0], current[1]

    # Features of the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = text.lower()
    feats['word[-3:]'] = text[-3:]
    feats['word[-2:]'] = text[-2:]
    feats['word.isupper()'] = text.isupper()
    feats['word.istitle()'] = text.istitle()
    feats['word.isdigit()'] = text.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context: either the previous token or the beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_text, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_text.lower()
        feats['-1:word.istitle()'] = prev_text.istitle()
        feats['-1:word.isupper()'] = prev_text.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: either the next token or the end-of-sentence flag.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_text, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_text.lower()
        feats['+1:word.istitle()'] = next_text.istitle()
        feats['+1:word.isupper()'] = next_text.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of each (token, postag, label) triple."""
    collected = []
    for _token, _postag, tag in sent:
        collected.append(tag)
    return collected

def sent2tokens(sent):
    """Return the token (first field) of each (token, postag, label) triple."""
    collected = []
    for word, _postag, _label in sent:
        collected.append(word)
    return collected

This is what word2features extracts:

In [37]:
# Feature dict of the first token of the first training sentence.
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'melbourne',
 'word[-3:]': 'rne',
 'word[-2:]': 'ne',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'BOS': True,
 '+1:word.lower()': '(',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp'}

Extract features from the data:

In [38]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 633 ms, sys: 27.7 ms, total: 661 ms
Wall time: 660 ms


Training

To see all possible CRF parameters, check its docstring. Here we are using the L-BFGS training algorithm (it is the default) with Elastic Net (L1 + L2) regularization.

In [39]:
%%time
# Configure and train the CRF on the extracted training features/labels.
# NOTE(review): c1 and c2 are presumably the L1 and L2 regularization
# coefficients (the accompanying text mentions Elastic Net) -- confirm
# against the CRF docstring.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# NOTE(review): the commented-out block below is a leftover alternative that
# would swallow an AttributeError raised by fit(); either delete it or restore
# it deliberately with an explanation of why the error is safe to ignore.
#try:
#    crf.fit(X_train, y_train)
#except AttributeError:
#    pass
#predictions = crf.predict(X_test)

CPU times: user 29.3 s, sys: 0 ns, total: 29.3 s
Wall time: 29.4 s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

Evaluation

There are many more O entities in the data set, but we're more interested in the other entities. To account for this we'll use an averaged F1 score computed over all labels except O. The sklearn-crfsuite.metrics package provides some useful metrics for sequence classification tasks, including this one.

In [40]:
# Labels to score: every class the model learned except the 'O' tag.
labels = list(crf.classes_)
del labels[labels.index('O')]
labels


['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [41]:
# Predict on the test features, then report the weighted F1 restricted to
# `labels` (the list from which 'O' was removed).
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)

0.7964686316443963

In [42]:
# NOTE(review): this rebinds X_test -- until here the per-sentence feature
# dicts built with sent2features -- to the raw text lines of the file, which is
# not the input format the earlier crf.predict(X_test) call was given.
# NOTE(review): absolute, user-specific path; consider a configurable data
# directory instead.
with open("/home/82068895153/POS/skweak/data/conll2003_dataset/train_out.txt", 'r') as file :
 X_test = file.readlines()
 

In [28]:
# NOTE(review): this rebinds y_test -- until here the per-sentence label lists
# built with sent2labels -- to the raw text lines of the file; the lines are
# not parsed into label sequences before being used as ground truth.
# NOTE(review): absolute, user-specific path; consider a configurable data
# directory instead.
with open("/home/82068895153/POS/skweak/data/conll2003_dataset/train.txt", 'r') as file :
 y_test = file.readlines()

In [45]:
# NOTE(review): this displays the entire training feature list; a small slice
# such as X_train[0][:2] would keep the cell output readable.
X_train

  'postag': 'AQ',
   'postag[:2]': 'AQ',
   '-1:word.lower()': 'm.',
   '-1:word.istitle()': True,
   '-1:word.isupper()': True,
   '-1:postag': 'VMI',
   '-1:postag[:2]': 'VM',
   '+1:word.lower()': 'w.',
   '+1:word.istitle()': True,
   '+1:word.isupper()': True,
   '+1:postag': 'NC',
   '+1:postag[:2]': 'NC'},
  {'bias': 1.0,
   'word.lower()': 'w.',
   'word[-3:]': 'W.',
   'word[-2:]': 'W.',
   'word.isupper()': True,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'NC',
   'postag[:2]': 'NC',
   '-1:word.lower()': 'w.',
   '-1:word.istitle()': True,
   '-1:word.isupper()': True,
   '-1:postag': 'AQ',
   '-1:postag[:2]': 'AQ',
   '+1:word.lower()': 'se',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'P0',
   '+1:postag[:2]': 'P0'},
  {'bias': 1.0,
   'word.lower()': 'se',
   'word[-3:]': 'se',
   'word[-2:]': 'se',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'P0',
   'postag[:2

In [29]:
# NOTE(review): repeats the earlier evaluation cell, but X_test and y_test now
# hold raw lines read from two different text files (see the `with open(...)`
# cells). That is presumably why the flattened sequences differ in length and
# the metric raises the ValueError recorded in this cell's output.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

ValueError: Found input variables with inconsistent numbers of samples: [3283493, 1134030]

Inspect per-class results in more detail:

In [14]:
from sklearn.metrics import classification_report

# Sort tags by entity type first and by the B/I prefix second, so that each
# B-X row is immediately followed by its I-X row in the report.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))
print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        digits=3,
        labels=sorted_labels,
    )
)

#print(sklearn.metrics.classification_report(
#    y_test, y_pred, labels=sorted_labels
#))



              precision    recall  f1-score   support

       B-LOC      0.810     0.784     0.797      1084
       I-LOC      0.690     0.637     0.662       325
      B-MISC      0.731     0.569     0.640       339
      I-MISC      0.699     0.589     0.639       557
       B-ORG      0.807     0.832     0.820      1400
       I-ORG      0.852     0.786     0.818      1104
       B-PER      0.850     0.884     0.867       735
       I-PER      0.893     0.943     0.917       634

   micro avg      0.813     0.787     0.799      6178
   macro avg      0.791     0.753     0.770      6178
weighted avg      0.809     0.787     0.796      6178

