# Named Entity Recognition (NER) using Python and Keras


## Part 1: Only using CRF from sklearn_crfsuite

### Data Preprocessing

In [93]:
# Install sklearn_crfsuite. Using '!' makes the notebook cell behave like CLI.
!pip install sklearn_crfsuite
!pip install eli5

# pandas is needed by the data-loading cell (pd.read_csv) and train_test_split by the
# train/test split cell, so both imports must be active for a fresh-kernel Run All.
import pandas as pd
from sklearn.model_selection import train_test_split

# CRF() is also available within keras_contrib. My code was getting confused and referencing the keras CRF, thus needed to give an explicit name.
from sklearn_crfsuite import CRF as skcrf

from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Used for hyperparameter optimization
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# to explain model weights
import eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 14.6MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.8MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.3MB/s eta 0:00:01[K     |████████████▍                   | 40kB 2.5MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.3MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 2.8MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 2.9MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [15]:
# Source of the token-level NER dataset (one row per word); read with the encoding the
# original cell specified.
DATA_URL = r'https://raw.githubusercontent.com/RuchitaGarde/NLP_NER_using_Python_Keras_LSTM_CRF/master/ner_dataset.csv'
data = pd.read_csv(DATA_URL, encoding="ISO-8859-1")

In [17]:
# Peek at the first 10 rows: one row per word, with its POS tag and NER tag.
data.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


`Sentence #` indicates the sentence number, and each sentence consists of words that are labeled using the BIO scheme in the `Tag` column.

In [18]:
# Summary statistics per column; for this frame the output lists count / unique / top / freq.
data.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 37337,the,NN,O
freq,1,52573,145807,887908


In [19]:
# Display the distinct values of the 'Tag' column (the label set the model must predict).
data['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [21]:
# Count missing values per column.
data.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [23]:
# There are lots of missing values in the 'Sentence #' column.
# This is because 'Sentence #' is only filled in for the first word of each sentence;
# all following words of that sentence hold NaN.
# Forward-fill propagates the last valid 'Sentence #' down to the rows below it.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling and
# gives the same result.
data = data.ffill()

In [45]:
# Helper class that groups the token rows into sentences. Each sentence becomes a list of (word, POS, tag) tuples.
class sentence(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Sentence #", "Word", "POS" and "Tag", with
        "Sentence #" filled in on every row.

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by the "Sentence #" value; each element is that sentence's
        list of (word, POS, tag) tuples.
    sentences : list
        The elements of ``grouped`` as a plain list, in ``grouped`` order.
    n_sent : int
        Number of the sentence the next ``get_text()`` call will return.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False  # not read inside this class; kept so existing callers still find it

        def create_tuple(s):
            """Turn the rows of one sentence into a list of (word, POS, tag) tuples."""
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Explanation of how this works: https://pandas.pydata.org/pandas-docs/version/0.22.0/groupby.html#returning-a-series-to-propagate-names
        # One entry per distinct "Sentence #"; each entry is a list of (word, POS, tag) tuples.
        # NOTE: groupby sorts its keys, and the keys are strings, so the order is
        # lexicographic ("Sentence: 10" comes before "Sentence: 2").
        self.grouped = self.data.groupby("Sentence #").apply(create_tuple)
        # The index holds string labels, so take the first element by position with
        # .iloc rather than relying on the deprecated integer fallback of grouped[0].
        print(self.grouped.iloc[0])

        self.sentences = [s for s in self.grouped]
        print(self.sentences[0])

    def get_text(self):
        """Return the next sentence and advance the internal counter.

        Looks up the label 'Sentence: <n_sent>' in ``grouped``. Returns the
        list of (word, POS, tag) tuples, or None when no sentence with that
        number exists (the counter is then left unchanged).
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and should not be silently swallowed.
            return None
        self.n_sent += 1
        return s


In [48]:
# Build the sentence helper, then join the words of every sentence into plain text.
fetcher = sentence(data)
# The joined strings get their own name: `sentences` is assigned further down to the
# (word, POS, tag) tuple lists, and reusing one name for two different kinds of
# object makes the notebook depend on cell execution order.
sentence_texts = [" ".join([s[0] for s in sent]) for sent in fetcher.sentences]

# Display the first sentence as text
sentence_texts[0]


[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]
[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops'

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [46]:
# Fetch one sentence with its POS and NER tags.
# get_text() returns the sentence numbered fetcher.n_sent and then increments that
# counter, so re-running this cell prints the following sentence.
sent = fetcher.get_text()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [50]:
# Get all the sentences from the dataset.
# From here on `sentences` holds one list of (word, POS, tag) tuples per sentence.
sentences = fetcher.sentences
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

### Feature Preparation
These are the baseline features from the sklearn-crfsuite tutorial (sklearn-crfsuite is a standalone library, not part of nltk).

Reference:
https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

We use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.
sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [52]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the word at index 0 and its POS
    tag at index 1. The result describes the current token (lower-cased form,
    last 2 and 3 characters, case/digit flags, POS tag and its 2-character
    prefix) plus the lower-cased form, case flags and POS tag of the previous
    and next tokens. 'BOS' / 'EOS' flags replace the neighbour features at the
    start / end of the sentence.
    """
    token, pos = sent[i][0], sent[i][1]

    # Features of the token itself.
    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()
    features['postag'] = pos
    features['postag[:2]'] = pos[:2]

    # Left context: previous token, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:postag'] = prev_pos
        features['-1:postag[:2]'] = prev_pos[:2]

    # Right context: next token, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:postag'] = next_pos
        features['+1:postag[:2]'] = next_pos[:2]

    return features


def sent2features(sent):
    """Return one word2features() dict per token of ``sent``, in token order."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the tag (third field) of every (word, POS, tag) item in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the word (first field) of every (word, POS, tag) item in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [56]:
# One list of feature dicts (X) and one list of tags (y) per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

# Inspect the features generated for the first sentence
X[0]

[{'+1:postag': 'IN',
  '+1:postag[:2]': 'IN',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'of',
  'BOS': True,
  'bias': 1.0,
  'postag': 'NNS',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'thousands',
  'word[-2:]': 'ds',
  'word[-3:]': 'nds'},
 {'+1:postag': 'NNS',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'demonstrators',
  '-1:postag': 'NNS',
  '-1:postag[:2]': 'NN',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'thousands',
  'bias': 1.0,
  'postag': 'IN',
  'postag[:2]': 'IN',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'of',
  'word[-2:]': 'of',
  'word[-3:]': 'of'},
 {'+1:postag': 'VBP',
  '+1:postag[:2]': 'VB',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'have',
  '-1:postag': 'I

In [64]:
# Hold out 20% of the sentences for testing. The fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Here we are using L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization.
L-BFGS is a quasi-Newton method: it overcomes some of the limitations of plain gradient descent by using an approximation of the second derivative (the Hessian) of the cost function to search for a stationary point.
`c1` and `c2` are the coefficients for L1 and L2 regularization, respectively.
all_possible_transitions (bool, optional (default=False)) – Specify whether CRFsuite generates transition features that do not even occur in the training data (i.e., negative transition features). When True, CRFsuite generates transition features that associate all of possible label pairs. Suppose that the number of labels in the training data is L, this function will generate (L * L) transition features. This function is disabled by default.

### Training CRF algorithm

In [99]:
%%time
crf = skcrf(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False,
          )

crf.fit(X_train, y_train)



CPU times: user 3min 56s, sys: 357 ms, total: 3min 57s
Wall time: 3min 56s


In [70]:
#Predicting on the test set.
%%time
y_pred = crf.predict(X_test)

CPU times: user 3.14 s, sys: 11 ms, total: 3.15 s
Wall time: 3.15 s


### Evaluating the model performance.
We will use precision, recall and f1-score metrics to evaluate the performance of the model since the accuracy is not a good metric for this dataset because we have an unequal number of data points in each class.

In [67]:
# F1 over all tokens of the test sentences, averaged across labels with
# average='weighted'. The 'O' label has by far the largest support in the report
# below, so it dominates this single number.
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.971219262484722


In [68]:
# Per-label precision / recall / F1 / support on the test set.
report = flat_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       B-art       0.42      0.12      0.19        82
       B-eve       0.56      0.29      0.38        77
       B-geo       0.86      0.91      0.89      7551
       B-gpe       0.97      0.95      0.96      3181
       B-nat       0.58      0.26      0.35        43
       B-org       0.80      0.74      0.77      4043
       B-per       0.84      0.82      0.83      3332
       B-tim       0.93      0.89      0.91      4102
       I-art       0.07      0.01      0.02        68
       I-eve       0.50      0.19      0.28        78
       I-geo       0.82      0.80      0.81      1510
       I-gpe       0.89      0.60      0.72        40
       I-nat       0.83      0.42      0.56        12
       I-org       0.80      0.80      0.80      3296
       I-per       0.85      0.90      0.87      3437
       I-tim       0.84      0.78      0.81      1244
           O       0.99      0.99      0.99    176918

    accuracy              

In [97]:
# Inspect what the CRF learned: the output shows the label-to-label transition
# weights followed by the highest-weighted features per label (top = 30).
eli5.show_weights(crf, top = 30)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,4.16,1.54,0.0,2.018,0.0,2.15,0.0,1.275,0.0,1.053,0.0,2.089,0.0,4.28,0.0,2.709,0.0
B-art,-0.114,0.0,7.982,0.0,0.0,-0.283,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.07,0.0,0.315,0.0
I-art,-0.285,0.0,8.031,0.0,0.0,-0.216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364,0.0,-0.864,0.0
B-eve,-0.779,0.0,0.0,0.0,7.369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217,0.0
I-eve,-0.017,0.0,0.0,0.0,7.835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.754,0.282,0.0,0.729,0.0,0.0,8.864,0.766,0.0,0.0,0.0,1.039,0.0,1.16,0.0,2.229,0.0
I-geo,-0.376,0.881,0.0,0.0,0.0,0.0,7.308,-0.647,0.0,0.0,0.0,0.287,0.0,1.473,0.0,0.8,0.0
B-gpe,0.661,0.0,0.0,-0.016,0.0,1.008,0.0,0.0,6.951,0.0,0.0,1.86,0.0,1.64,0.0,0.65,0.0
I-gpe,-0.116,0.0,0.0,0.0,0.0,-0.659,0.0,0.0,6.441,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.0
B-nat,-0.414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.001,0.0,0.0,0.543,0.0,0.195,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+7.781,word.lower():last,,,,,,,,,,,,,,,
+7.422,word.lower():month,,,,,,,,,,,,,,,
+5.553,word.lower():chairman,,,,,,,,,,,,,,,
+5.424,word.lower():hurricane,,,,,,,,,,,,,,,
+5.005,word.lower():weeks,,,,,,,,,,,,,,,
+4.959,word.lower():week,,,,,,,,,,,,,,,
+4.947,word.lower():columbia,,,,,,,,,,,,,,,
+4.922,word.lower():internet,,,,,,,,,,,,,,,
+4.791,word.lower():early,,,,,,,,,,,,,,,
+4.658,word.lower():months,,,,,,,,,,,,,,,

Weight?,Feature
+7.781,word.lower():last
+7.422,word.lower():month
+5.553,word.lower():chairman
+5.424,word.lower():hurricane
+5.005,word.lower():weeks
+4.959,word.lower():week
+4.947,word.lower():columbia
+4.922,word.lower():internet
+4.791,word.lower():early
+4.658,word.lower():months

Weight?,Feature
+4.734,word.lower():twitter
+4.196,-1:word.lower():engine
+4.090,+1:word.lower():enkhbayar
+4.054,word.lower():nevirapine
+3.916,word.lower():english
+3.884,+1:word.lower():boots
+3.477,word.lower():spanish
+3.435,word.lower():canal
+3.334,+1:word.lower():al-arabiya
+3.326,word.lower():russian

Weight?,Feature
+2.413,+1:word.lower():gained
+2.394,-1:word.lower():boeing
+2.156,word.lower():notice
+2.148,+1:word.lower():times
+2.052,-1:word.lower():cajun
+1.999,word.lower():constitution
+1.909,-1:word.lower():dignity
+1.900,+1:word.lower():marks
+1.891,+1:word.lower():reported
+1.875,word.lower():flowers

Weight?,Feature
+4.193,word.lower():ramadan
+3.921,-1:word.lower():falklands
+3.718,word.lower():games
+3.650,-1:word.lower():typhoon
+3.505,word[-3:]:mes
+3.282,+1:word.lower():men
+3.095,+1:word.lower():dean
+3.064,word.lower():olympic
+2.904,-1:word.lower():happy
+2.862,-1:word.lower():war

Weight?,Feature
+4.013,+1:word.lower():mascots
+2.946,+1:word.lower():caused
+2.707,+1:word.lower():rally
+2.631,+1:word.lower():disaster
+2.428,word.lower():series
+2.370,+1:word.lower():continues
+2.311,+1:word.lower():without
+2.288,-1:word.lower():hurricane
+1.982,+1:word.lower():starts
+1.954,word.lower():games

Weight?,Feature
+6.442,word.lower():caribbean
+5.999,word.lower():mid-march
+5.199,-1:word.lower():hamas
+5.118,word.lower():martian
+5.010,word.lower():beijing
+4.546,-1:word.lower():mr.
+4.543,word.lower():persian
+4.451,word.lower():balkans
+4.291,word.lower():quake-zone
+4.124,+1:word.lower():phoned

Weight?,Feature
+4.245,word.lower():led-invasion
+3.970,word.lower():island
+3.883,word.lower():city
+3.654,+1:word.lower():possessions
+3.627,word.lower():subway
+3.442,word.lower():shogunate
+3.442,-1:word.lower():tokugawa
+3.419,+1:word.lower():regional
+3.289,+1:word.lower():produced
+3.252,word.lower():east

Weight?,Feature
+7.055,word.lower():niger
+6.444,word.lower():afghan
+6.006,word.lower():nepal
+5.358,word.lower():azerbaijan
+5.155,word.lower():iranian
+4.972,word.lower():spaniard
+4.810,word.lower():gibraltar
+4.803,word.lower():croats
+4.769,word.lower():korean
+4.692,word.lower():argentine

Weight?,Feature
+5.354,+1:word.lower():mayor
+3.895,-1:word.lower():democratic
+3.848,-1:word.lower():soviet
+3.800,word.lower():cypriots
+3.718,-1:word.lower():bosnian
+3.671,+1:word.lower():developed
+3.054,+1:word.lower():man
+3.011,word.lower():city
+2.815,word[-2:]:bs
+2.745,word.lower():britons

Weight?,Feature
+6.345,word.lower():katrina
+4.874,word.lower():marburg
+4.201,word.lower():rita
+3.731,word[-2:]:N1
+3.262,+1:word.lower():shot
+2.853,+1:word.lower():correctly
+2.841,word[-3:]:ita
+2.514,word.lower():ebola
+2.417,word[-3:]:mia
+2.366,word[-3:]:urg

Weight?,Feature
+2.532,word[-3:]:ita
+2.382,+1:word.lower():outbreak
+2.209,word.lower():rita
+1.981,-1:word.lower():hurricanes
+1.947,word.lower():flu
+1.942,+1:word.lower():relief
+1.811,word[-2:]:lu
+1.780,word[-2:]:ta
+1.585,word.lower():katrina
+1.419,-1:word.lower():jing

Weight?,Feature
+7.526,word.lower():philippine
+7.153,word.lower():hamas
+6.498,word.lower():al-qaida
+5.786,word.lower():mid-march
+5.078,word.lower():hezbollah
+4.901,-1:word.lower():rice
+4.716,word.lower():conocophillips
+4.667,word.lower():taleban
+4.570,word.lower():reuters
+4.356,word.lower():university

Weight?,Feature
+3.905,+1:word.lower():attained
+3.789,+1:word.lower():reporter
+3.737,word.lower():singapore
+3.577,-1:word.lower():english
+3.517,-1:word.lower():decathlon
+3.409,word.lower():member-countries
+3.404,+1:word.lower():separating
+3.244,word.lower():mccain
+3.231,+1:word.lower():mulgueta
+3.184,word.lower():member-states

Weight?,Feature
+6.752,word.lower():prime
+6.690,word.lower():president
+6.397,word.lower():obama
+5.422,word.lower():al-zarqawi
+4.971,word.lower():senator
+4.774,word.lower():clinton
+4.664,word.lower():hall
+4.522,word.lower():greenspan
+4.221,word.lower():cobain
+4.200,word.lower():spears

Weight?,Feature
+3.876,+1:word.lower():advisor
+3.441,word.lower():obama
+3.410,-1:word.lower():michael
+3.168,word.lower():al-zarqawi
+2.753,+1:word.lower():udi
+2.751,word.lower():gration
+2.727,word.lower():peter
+2.690,-1:word.lower():viktor
+2.650,word.lower():pressewednesday
+2.636,+1:word.lower():administration

Weight?,Feature
+6.781,word.lower():multi-candidate
+6.330,word.lower():2000
+6.012,word.lower():weekend
+5.660,word.lower():february
+5.656,word.lower():january
+5.484,word.lower():one-year
+5.478,word.lower():titan
+5.317,+1:word.lower():week
+5.124,word.lower():july
+5.087,word.lower():december

Weight?,Feature
+4.471,+1:word.lower():stocky
+4.309,word.lower():2000
+4.045,+1:word.lower():old
+3.963,-1:word.lower():past
+3.740,word.lower():january
+3.692,+1:word.lower():month
+3.573,+1:word.lower():reflected
+3.556,+1:word.lower():jose
+3.270,word.lower():evening
+3.244,word.lower():cease-fire


It looks like the model is just memorizing words to make predictions. We want the model to use better features.

### Hyperparameter optimization
To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

It takes quite a lot of CPU time and RAM: with n_iter = 50 and 3-fold cross-validation we would fit the model 50 * 3 = 150 times, so n_iter is reduced to 5 in RandomizedSearchCV below (5 * 3 = 15 fits).
Other option: fit model only on a subset of training data.



In [None]:
%%time
# define fixed parameters and parameters to search
crf = skcrf(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
# make_scorer factory function wraps scoring functions
f1_scorer = make_scorer(flat_f1_score,
                        average='weighted')

# search using Cross validation
# If you have more time, replace n_iter by a higher number like 50 to see significant benefits from randomizedSearch. Then, rerun all the following cells.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=5,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
# Summarise the best candidate found by the randomized search.
# (`crf` is switched to rs.best_estimator_ in the next cell, not here.)
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

### Check best estimator on test data

In [None]:
# Evaluate the best model from the randomized search on the held-out test set.
crf = rs.best_estimator_
y_pred = crf.predict(X_test)

# `sorted_labels` has to be defined before it is passed to the report. Sort the
# model's labels by entity type first and B-/I- prefix second, so that the B- and I-
# rows of the same entity sit next to each other ('O' sorts first).
sorted_labels = sorted(crf.classes_, key=lambda name: (name[1:], name[0]))
print(flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Let's check what the classifier learned

In [100]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned 'from -> to weight' line per transition.

    ``trans_features`` is an iterable of ((label_from, label_to), weight) pairs.
    """
    line_template = "%-6s -> %-7s %0.6f"
    for label_pair, weight in trans_features:
        label_from, label_to = label_pair
        print(line_template % (label_from, label_to, weight))

# Rank every learned transition once, from highest to lowest weight, then show both ends.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
B-geo  -> I-geo   8.864462
I-art  -> I-art   8.030507
B-art  -> I-art   7.981690
I-eve  -> I-eve   7.835009
B-tim  -> I-tim   7.463041
B-eve  -> I-eve   7.369190
I-geo  -> I-geo   7.307890
B-per  -> I-per   7.180239
I-tim  -> I-tim   7.131106
B-nat  -> I-nat   7.001023
B-gpe  -> I-gpe   6.951194
B-org  -> I-org   6.670141
I-gpe  -> I-gpe   6.441063
I-per  -> I-per   6.404732
I-org  -> I-org   6.347542
I-nat  -> I-nat   4.697836
O      -> B-per   4.279677
O      -> O       4.159847
O      -> B-tim   2.708905
B-per  -> B-org   2.264998

Top unlikely transitions:
I-art  -> B-geo   -0.215609
B-tim  -> B-art   -0.220577
B-tim  -> B-org   -0.223183
B-art  -> B-geo   -0.283115
I-art  -> O       -0.284826
I-tim  -> O       -0.368599
I-geo  -> O       -0.376494
B-nat  -> O       -0.413664
I-org  -> O       -0.612494
I-geo  -> B-gpe   -0.646828
I-gpe  -> B-geo   -0.659494
B-eve  -> O       -0.779355
I-art  -> B-tim   -0.863699
I-tim  -> B-org   -1.024305
I-per  -> B-gpe  

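Explanation: We can see that, for example, it is very likely that the beginning of a geographical entity name (B-geo) will be followed by a token inside a geographical entity name (I-geo). Transitions into I-geo from other labels such as O carry no weight in this model (0.0 in the transition table above), most likely because they do not occur in the training data and the model was trained with all_possible_transitions = False; with all_possible_transitions = True such transitions would typically receive negative weights, i.e. be penalized.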

In [None]:
# Recheck the weights after parameter optimization: at this point `crf` is the best
# estimator selected by the randomized search.
eli5.show_weights(crf, top = 30)