## Training: NER:CRF

#### Param optimization

In [41]:
import os
import sys

# Root of the HORUS checkout, needed so that `src.*` and `config` can be imported.
# Override with the HORUS_HOME environment variable; the default is the original
# author's local path.
HORUS_HOME = os.environ.get('HORUS_HOME', '/Users/diego.esteves/git/horus')

# Guarded so that re-running this cell does not keep appending duplicates.
if HORUS_HOME not in sys.path:
    sys.path.append(HORUS_HOME)

#### Training the model

In [42]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.model_selection import cross_val_predict
import eli5
from src.horus_meta import HorusSentence, WordFeaturesInterface
from src.definitions import PRE_PROCESSING_STATUS

# Single CRF estimator reused by every cross-validation / fit cell below.
# c1 and c2 are the L1 and L2 regularisation coefficients of the L-BFGS trainer.
_crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.088,
    c2=0.002,
    max_iterations=100,
    all_possible_transitions=True,
)

In [43]:
import numpy as np

def word2features_dynamic(sent: 'HorusSentence', i: int, pfeatures: dict):
    """Build the CRF feature dict for token ``i`` of ``sent`` from a feature selection.

    Args:
        sent: sentence object exposing ``tokens``; each token provides
            ``label_pos`` and ``features.lexical.values``.
        i: index of the token inside ``sent.tokens``.
        pfeatures: mapping ``feature id -> feature label``. Ids index into
            ``token.features.lexical.values``. Two labels are special:
            ``'postag'`` is read from ``token.label_pos`` rather than from the
            lexical values, and ``'word.lower'`` additionally produces the two
            suffix features ``'word[-3]:'`` and ``'word[-2]:'``.
            The mapping is never modified.

    Returns:
        dict: ``'bias'`` plus the selected features of the token itself, the
        same features of the previous / next token prefixed with ``'-1:'`` /
        ``'+1:'``, and ``'BOS'`` / ``'EOS'`` flags at the sentence borders.
    """
    tokens = sent.tokens
    selected_labels = pfeatures.values()
    has_pos = 'postag' in selected_labels
    has_word = 'word.lower' in selected_labels

    # The POS tag lives on the token, not in the lexical values array, so it is
    # dropped from the id -> label lookup. Filtering by label (instead of
    # deleting a hard-coded key) works whatever id the caller used for it.
    lexical = {f_id: f_label for f_id, f_label in pfeatures.items() if f_label != 'postag'}

    def pos_features(token, prefix=''):
        # POS tag and its two-character prefix, only when 'postag' was selected.
        if not has_pos:
            return {}
        return {prefix + 'postag': token.label_pos,
                prefix + 'postag[:2]': token.label_pos[:2]}

    def lexical_features(token, prefix=''):
        # One entry per selected lexical feature, read by id from the token.
        return {prefix + f_label: token.features.lexical.values[f_id]
                for f_id, f_label in lexical.items()}

    current = tokens[i]
    crf_features = {'bias': 1.0}
    crf_features.update(pos_features(current))

    if has_word:
        # Resolved lazily: the global lexical table is only needed for the suffixes.
        feature_dict_lex = WordFeaturesInterface.get_lexical()
        word_id = WordFeaturesInterface.get_key_by_value(dictionary=feature_dict_lex, feature_alias='word.lower')
        lowered = current.features.lexical.values[word_id]
        # NOTE: key spelling ('word[-3]:' rather than 'word[-3:]') is kept as-is
        # so feature names stay identical to previously produced outputs.
        crf_features['word[-3]:'] = lowered[-3:]
        crf_features['word[-2]:'] = lowered[-2:]

    crf_features.update(lexical_features(current))

    if i > 0:
        previous = tokens[i - 1]
        crf_features.update(pos_features(previous, '-1:'))
        crf_features.update(lexical_features(previous, '-1:'))
    else:
        crf_features['BOS'] = True

    if i < len(tokens) - 1:
        following = tokens[i + 1]
        crf_features.update(pos_features(following, '+1:'))
        crf_features.update(lexical_features(following, '+1:'))
    else:
        crf_features['EOS'] = True

    return crf_features

In [44]:
def word2features(sent, i):
    """Return the hand-written CRF feature dict for token ``i`` of ``sent``.

    The token contributes its 3- and 2-character suffixes, lower-cased text,
    title/upper/digit flags, POS tag and the first two characters of the POS
    tag. The previous and next tokens contribute the same features minus the
    suffixes, prefixed with ``'-1:'`` and ``'+1:'``. ``'BOS'`` / ``'EOS'`` are
    set when there is no previous / next token.
    """
    tokens = sent.tokens

    def shape_and_pos(token, prefix):
        # Features shared by the centre token and both of its neighbours.
        text, tag = token.text, token.label_pos
        return {
            prefix + 'word.lower()': text.lower(),
            prefix + 'word.istitle()': text.istitle(),
            prefix + 'word.isupper()': text.isupper(),
            prefix + 'word.isdigit()': text.isdigit(),
            prefix + 'postag': tag,
            prefix + 'postag[:2]': tag[:2],
        }

    centre_text = tokens[i].text
    crf_features = {
        'bias': 1.0,
        'word[-3:]': centre_text[-3:],
        'word[-2:]': centre_text[-2:],
    }
    crf_features.update(shape_and_pos(tokens[i], ''))

    # Left context, or the beginning-of-sentence marker.
    if i > 0:
        crf_features.update(shape_and_pos(tokens[i - 1], '-1:'))
    else:
        crf_features['BOS'] = True

    # Right context, or the end-of-sentence marker.
    is_last = not (i < len(tokens) - 1)
    if is_last:
        crf_features['EOS'] = True
    else:
        crf_features.update(shape_and_pos(tokens[i + 1], '+1:'))

    return crf_features

In [45]:
def sent2features_dynamic(sent, features: dict):
    """Return one ``word2features_dynamic`` dict per token of ``sent``, in token order.

    ``features`` is the id -> label selection forwarded unchanged to
    ``word2features_dynamic`` for every token.
    """
    per_token = []
    for position, _ in enumerate(sent.tokens):
        per_token.append(word2features_dynamic(sent, position, features))
    return per_token

In [46]:
def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    per_token = []
    for position, _ in enumerate(sent.tokens):
        per_token.append(word2features(sent, position))
    return per_token

In [47]:
def sent2labels(sent):
    """Return the gold NER label (``label_ner_gold``) of every token of ``sent``, in order."""
    gold = []
    for token in sent.tokens:
        gold.append(token.label_ner_gold)
    return gold

In [48]:
from config import HorusConfig
from src.horus_meta import HorusDataLoader, Horus
from src import definitions

# define the dataset to process
dataset_label = 'ritter.train'

config = HorusConfig()

# Entries of NER_DATASETS are indexed as [label, directory, file name].
# Look the entry up explicitly and fail loudly when the label is unknown,
# instead of silently falling through to whatever the loop visited last.
dataset = next((entry for entry in definitions.NER_DATASETS if entry[0] == dataset_label), None)
if dataset is None:
    raise ValueError('dataset label not found in definitions.NER_DATASETS: ' + dataset_label)

print(dataset)

conll_file = dataset[1] + dataset[2]
conll_label = dataset[0]
assert '.horusx' in conll_file, 'expected a .horusx file, got: ' + conll_file
# The pre-processed metadata sits next to the CoNLL file with a different suffix.
horus_file = conll_file.replace('.horusx', '.horus3.json')

# get data
horus = HorusDataLoader.load_metadata_from_file(file=horus_file)
print('ok.')

# make sure the horus file has been processed and features are in place
assert horus.processing_status in (PRE_PROCESSING_STATUS["FEATURE_LEXICAL"],
                                   PRE_PROCESSING_STATUS["FEATURE_NEWS"],
                                   PRE_PROCESSING_STATUS["FEATURE_IMG"],
                                   PRE_PROCESSING_STATUS["FEATURE_ALL"]), \
    'horus file has not reached a feature-extraction status: ' + horus_file



--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 1029, in emit
    self.flush()
  File "/usr/local/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 1009, in flush
    self.stream.flush()
OSError: [Errno 5] Input/output error
Call stack:
  File "/usr/local/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/diego.esteves/.local/share/virtualenvs/horus-MzGAWMx0/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/diego.esteves/.local/share/virtualenvs/horus-MzGAWMx0/lib/python3.7

:: reading ok /Users/diego.esteves/git/horus/horus.ini
039284589743ff3295825
['ritter.train', '/Volumes/dne5ssd/horus/resources/datasets/Ritter/', 'ner.txt.horusx']
ok.


In [49]:
# Labels of the features the dynamic extractor should use.
train_on_features_labels = ['word.lower', 'word.istitle', 'word.isupper', 'word.isdigit', 'postag']

# Table of all lexical features (id -> label), used to translate labels into ids.
feature_dict_lex = WordFeaturesInterface.get_lexical()
print('features set - lexical', feature_dict_lex, sep='\n')

# Resolve every requested label to its feature id, keeping the label order.
train_on_features_ids = list(map(
    lambda alias: WordFeaturesInterface.get_key_by_value(dictionary=feature_dict_lex,
                                                         feature_alias=alias),
    train_on_features_labels,
))

# id -> label selection handed to sent2features_dynamic.
features = {feature_id: alias
            for feature_id, alias in zip(train_on_features_ids, train_on_features_labels)}
print('training on features: ', features)

# Dynamic (selection-driven) features, then the hand-written ones, with the
# first sentence of each printed for a side-by-side look.
Xd = [sent2features_dynamic(sentence, features) for sentence in horus.sentences]
print('------------------------------------------------------------------------------------------------------------')
print(Xd[0])
print('------------------------------------------------------------------------------------------------------------')
X = [sent2features(sentence) for sentence in horus.sentences]
print(X[0])
print('------------------------------------------------------------------------------------------------------------')

features set - lexical
{0: 'word.lower', 1: 'word.lemma', 2: 'word.stem', 3: 'word.len.1', 4: 'word.has.special', 5: 'word[0].isupper', 6: 'word.isupper', 7: 'word.istitle', 8: 'word.isdigit', 9: 'word.stop', 10: 'word.len.issmall', 11: 'word.has.minus', 12: 'word.shape', 13: 'brown_320.1', 14: 'brown_320.2', 15: 'brown_320.3', 16: 'brown_320.4', 17: 'brown_320.5', 18: 'brown_640.1', 19: 'brown_640.2', 20: 'brown_640.3', 21: 'brown_640.4', 22: 'brown_640.5', 23: 'brown_1000.1', 24: 'brown_1000.2', 25: 'brown_1000.3', 26: 'brown_1000.4', 27: 'brown_1000.5'}
training on features:  {0: 'word.lower', 7: 'word.istitle', 6: 'word.isupper', 8: 'word.isdigit', 'postag': 'postag'}
------------------------------------------------------------------------------------------------------------
[{'bias': 1.0, 'postag': 'USR', 'postag[:2]': 'US', 'word[-3]:': 'alk', 'word[-2]:': 'lk', 'word.lower': '@paulwalk', 'word.istitle': 0, 'word.isupper': 0, 'word.isdigit': 0, 'BOS': True, '+1:postag': 'PRP', '+

In [103]:
def get_4muc_labels(y, keep=('person', 'geo-loc', 'company', 'O'), inplace=True):
    """Collapse every entity class outside ``keep`` into ``other``.

    A label is rewritten to ``<first character>-other`` (e.g. ``'B-movie'`` ->
    ``'B-other'``) when it is not ``'O'`` and the part after its first two
    characters is not listed in ``keep``.

    Args:
        y: list of per-sentence label lists.
        keep: class names (label text after the two-character prefix) that are
            left untouched. The default reproduces the original behaviour.
        inplace: when True (default, original behaviour) the lists inside ``y``
            are modified and ``y`` itself is returned, so the caller's variable
            changes too. When False, ``y`` is left untouched and a new nested
            list is returned.

    Returns:
        The nested label list with the collapsed classes.
    """
    target = y if inplace else [list(sent_labels) for sent_labels in y]
    for sent_labels in target:
        for position, label in enumerate(sent_labels):
            if label != 'O' and label[2:] not in keep:
                sent_labels[position] = label[0] + '-other'
    return target
    

In [104]:
# Gold NER labels, one list per sentence, and the label inventory before collapsing.
y = [sent2labels(sentence) for sentence in horus.sentences]
labels_set = [label for sent_labels in y for label in set(sent_labels)]
print(set(labels_set))

# NOTE: get_4muc_labels rewrites the lists it receives and returns the same
# object, so from here on `y2` and `y` both hold the collapsed labels.
y2 = get_4muc_labels(y)
labels_set2 = [label for sent_labels in y2 for label in set(sent_labels)]
print(set(labels_set2))


{'I-person', 'I-other', 'I-movie', 'I-musicartist', 'B-musicartist', 'B-company', 'B-facility', 'B-movie', 'B-sportsteam', 'I-company', 'I-facility', 'I-product', 'I-sportsteam', 'I-tvshow', 'B-person', 'B-other', 'O', 'B-tvshow', 'B-geo-loc', 'B-product', 'I-geo-loc'}
{'O', 'I-person', 'I-other', 'I-company', 'B-geo-loc', 'B-person', 'B-other', 'B-company', 'I-geo-loc'}


In [50]:
# Cross-validate the hand-written features against the full label set.
# The gold labels are rebuilt here instead of reusing the notebook-level `y`:
# get_4muc_labels collapses `y` in place in an earlier cell, so under
# Restart & Run All `y` would no longer contain the full label inventory.
y_full = [sent2labels(sentence) for sentence in horus.sentences]
pred = cross_val_predict(estimator=_crf, X=X, y=y_full, cv=4)
report = flat_classification_report(y_pred=pred, y_true=y_full)
print(report)

               precision    recall  f1-score   support

    B-company       0.80      0.43      0.56       171
   B-facility       0.64      0.45      0.53       104
    B-geo-loc       0.64      0.48      0.55       276
      B-movie       0.64      0.21      0.31        34
B-musicartist       0.62      0.18      0.28        55
      B-other       0.48      0.29      0.36       225
     B-person       0.64      0.59      0.61       449
    B-product       0.77      0.21      0.33        97
 B-sportsteam       0.54      0.14      0.22        51
     B-tvshow       0.44      0.12      0.19        34
    I-company       0.00      0.00      0.00        36
   I-facility       0.68      0.47      0.55       105
    I-geo-loc       0.43      0.20      0.28        49
      I-movie       0.58      0.15      0.24        46
I-musicartist       0.54      0.23      0.32        61
      I-other       0.39      0.32      0.35       320
     I-person       0.62      0.65      0.63       215
    I-pro

In [105]:
# Cross-validate the dynamic features against the collapsed (4-class) labels.
# The report is scored against `y2`, the same labels the model is trained on,
# rather than relying on `y` happening to be the same object.
pred = cross_val_predict(estimator=_crf, X=Xd, y=y2, cv=4)
report = flat_classification_report(y_pred=pred, y_true=y2)
print(report)

              precision    recall  f1-score   support

   B-company       0.85      0.45      0.59       171
   B-geo-loc       0.67      0.47      0.55       276
     B-other       0.57      0.36      0.44       600
    B-person       0.65      0.57      0.60       449
   I-company       0.00      0.00      0.00        36
   I-geo-loc       0.53      0.18      0.27        49
     I-other       0.53      0.39      0.45       666
    I-person       0.65      0.61      0.63       215
           O       0.98      0.99      0.98     44007

    accuracy                           0.96     46469
   macro avg       0.60      0.45      0.50     46469
weighted avg       0.96      0.96      0.96     46469



In [None]:
# Fit the shared CRF on the hand-written features `X` and the current contents of `y`.
# NOTE(review): `y` is rewritten in place by get_4muc_labels in an earlier cell,
# so the label set used here depends on which cells ran before - confirm which is intended.
_crf.fit(X, y)
# Kept as the bare last expression so the notebook renders the eli5 weight view (limited via top=30).
eli5.show_weights(_crf, top=30)