NER model on CoNLL 2003 data  
https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system.
F-Score: 0.8396.
Let's see whether deleting stop-words and punctuation could improve F-Score result.

In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from collections import Counter

import nltk
from nltk.corpus import stopwords
import string

In [2]:
s = open("C:/Users/1/Desktop/eng.train.txt", "r", encoding="utf-8")
z = open("C:/Users/1/Desktop/eng.testa.txt", "r", encoding="utf-8")
v = open("C:/Users/1/Desktop/eng.testb.txt", "r", encoding="utf-8")

In [3]:
def get_sents(file):
    file = file.read()
    sents = file.split('\n\n')
    arr = []
    for sent in sents:
        lines = sent.split("\n")
        arr_lines = []
        for line in lines:
            tokens = line.split(" ")
            del tokens[2]
            tokens = tuple(tokens)
            arr_lines.append(tokens)
        arr.append(arr_lines)
    return (arr)

In [4]:
train_sents = get_sents(s)
test_sent_a = get_sents(z)
test_sent_b = get_sents(v)
test_sents = test_sent_a + test_sent_b

In [5]:
train_sents[0]

[('EU', 'NNP', 'I-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'I-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'I-MISC'),
 ('lamb', 'NN', 'O'),
 ('.', '.', 'O')]

In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],    
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.istitle()': word1.istitle(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

In [7]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [8]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [9]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [10]:
labels = list(crf.classes_)
labels.remove('O')

In [11]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)


0.83959623980977693

In [12]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-LOC      0.000     0.000     0.000         6
      I-LOC      0.873     0.808     0.839      4013
     B-MISC      0.500     0.154     0.235        13
     I-MISC      0.834     0.782     0.807      2173
      B-ORG      0.000     0.000     0.000         5
      I-ORG      0.798     0.780     0.789      4583
      I-PER      0.881     0.907     0.894      5922

avg / total      0.849     0.831     0.840     16715



In [13]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
O      -> O       4.275360
I-ORG  -> I-ORG   3.769444
B-ORG  -> B-ORG   3.408064
I-MISC -> B-MISC  2.868010
I-PER  -> I-PER   2.311753
I-MISC -> I-MISC  2.241533
I-LOC  -> I-LOC   2.239393
I-LOC  -> B-LOC   2.121518
O      -> I-PER   1.204007
I-ORG  -> O       1.097261

Top unlikely transitions:
B-MISC -> I-PER   -2.323036
I-ORG  -> I-MISC  -2.449459
I-MISC -> I-LOC   -3.124988
I-PER  -> I-LOC   -3.270511
I-LOC  -> I-ORG   -3.571491
I-PER  -> I-MISC  -3.645297
I-ORG  -> I-PER   -3.724477
I-PER  -> I-ORG   -4.213237
I-ORG  -> I-LOC   -4.418433
I-LOC  -> I-PER   -4.464769


In [14]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(10))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-10:])

Top positive:
7.126838 O        word.lower():minister
6.232585 I-LOC    +1:word.lower():1996-08-26
5.846777 I-ORG    -1:word.lower():v
5.846448 I-LOC    +1:word.lower():1996-08-27
5.743065 I-LOC    +1:word.lower():1996-08-25
5.611029 I-LOC    +1:word.lower():1996-08-23
5.449491 I-LOC    -1:word.lower():wisc
5.438845 I-MISC   word.lower():frenchman
5.413254 I-PER    word.lower():clinton
5.273166 I-LOC    +1:word.lower():1996-08-28

Top negative:
-2.611683 O        word.lower():serie
-2.616193 O        -1:word.lower():beat
-2.671363 I-PER    word[-3:]:ion
-2.833813 O        +1:word.lower():radio
-2.881788 I-MISC   -1:word.lower():french
-2.989782 O        -1:word.lower():moody
-3.074463 O        -1:word.lower():queen
-3.448288 O        word.isupper()
-4.130953 O        -1:word.lower():lloyd
-4.648834 O        word.istitle()


In [15]:
stop = set(stopwords.words('english'))

In [16]:
stop_new = []
for item in stop:
    stop_new.append(item)
    item_cap = item.replace(item[0], item[0].capitalize(), 1)
    stop_new.append(item_cap)

In [17]:
punct = []
for item in string.punctuation:
    punct.append(item)

In [18]:
s = open("C:/Users/1/Desktop/eng.train.txt", "r", encoding="utf-8")
z = open("C:/Users/1/Desktop/eng.testa.txt", "r", encoding="utf-8")
v = open("C:/Users/1/Desktop/eng.testb.txt", "r", encoding="utf-8")

In [19]:
def get_sents_no_stops_and_punct(file):
    file = file.read()
    sents = file.split('\n\n')
    arr = []
    for sent in sents:
        lines = sent.split("\n")
        arr_lines = []
        for line in lines:
            tokens = line.split(" ")
            del tokens[2]
            if tokens[0] not in stop_new and tokens[0] not in punct:
                tokens = tuple(tokens)
                arr_lines.append(tokens)
        arr.append(arr_lines)
    return (arr)

In [20]:
train_sents = get_sents_no_stops_and_punct(s)
test_sent_a = get_sents_no_stops_and_punct(z)
test_sent_b = get_sents_no_stops_and_punct(v)
test_sents = test_sent_a + test_sent_b

In [21]:
train_sents[0]

[('EU', 'NNP', 'I-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'I-MISC'),
 ('call', 'NN', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'I-MISC'),
 ('lamb', 'NN', 'O')]

In [22]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [23]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)


0.72614932739151983

As we see, stop words and punctuation are really important features for our model.
Adding 'word[-4:]' feature (for common suffixes like -ment or -ness) did not change the results.

Interestingly, some dates like '1996-08-26' or '1996-08-27' were mentioned quite often in the corpus, and the model has learnt that if the next word is '1996-08-26' etc., it is likely that the token means location or a part of location ('BRUSSELS', 'NEW YORK' etc.).