## Training: NER:CRF

#### Parameter optimization

In [41]:
import os
import sys

# Root of the horus-ner checkout that gets added to the import path.
# Set the HORUS_NER_HOME environment variable to point somewhere else;
# the fallback is the path this notebook was originally written against.
HORUS_NER_HOME = os.environ.get('HORUS_NER_HOME', '/Users/diego.esteves/git/horus-ner')

# Only append when missing, so re-running the cell does not keep growing sys.path.
if HORUS_NER_HOME not in sys.path:
    sys.path.append(HORUS_NER_HOME)

#### Training the model

In [42]:
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.model_selection import cross_val_predict
import eli5
from src.horus_meta import HorusSentence, WordFeaturesInterface
from src.definitions import PRE_PROCESSING_STATUS

# CRF estimator shared by the cross-validation and fitting cells below.
_crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.088,
    c2=0.002,
    max_iterations=100,
    all_possible_transitions=True,
)

In [43]:
import numpy as np

def word2features_dynamic(sent: HorusSentence, i: int, pfeatures: dict):
    """Build the CRF feature dict for token ``i`` of ``sent`` from a selectable feature set.

    Args:
        sent: sentence whose ``tokens`` expose ``label_pos`` and
            ``features.lexical.values``.
        i: index of the token to describe.
        pfeatures: mapping ``feature id -> feature label``. Each id is used to index
            ``token.features.lexical.values`` and each label becomes the feature name.
            The label ``'postag'`` is special: it is read from ``token.label_pos``
            instead of the lexical array. The mapping is not modified.

    Returns:
        dict of feature name -> value for the token, including the selected features
        of the previous (``'-1:'`` prefix) and next (``'+1:'`` prefix) tokens, or the
        ``'BOS'`` / ``'EOS'`` flags at the sentence boundaries.
    """
    tokens = sent.tokens
    has_pos = 'postag' in pfeatures.values()
    has_word = 'word.lower' in pfeatures.values()

    # POS tag comes from the token itself instead of the lexical features array, so
    # it must not take part in the lexical lookups. Drop it by label (not by a
    # hard-coded key) so the entry is removed whatever id it is registered under.
    if has_pos:
        lexical = {f_id: f_label for f_id, f_label in pfeatures.items() if f_label != 'postag'}
    else:
        lexical = dict(pfeatures)

    def add_token_features(prefix, token):
        # POS features (when selected) first, then every selected lexical feature.
        if has_pos:
            crf_features[prefix + 'postag'] = token.label_pos
            crf_features[prefix + 'postag[:2]'] = token.label_pos[:2]
        for f_id, f_label in lexical.items():
            crf_features[prefix + f_label] = token.features.lexical.values[f_id]

    crf_features = {'bias': 1.0}

    if has_pos:
        crf_features['postag'] = tokens[i].label_pos
        crf_features['postag[:2]'] = tokens[i].label_pos[:2]

    if has_word:
        # The lexical feature table is only needed to locate 'word.lower'.
        feature_dict_lex, _ = WordFeaturesInterface.get_lexical()
        word_id = WordFeaturesInterface.get_key_by_value(dictionary=feature_dict_lex, feature_alias='word.lower')
        word_lower = tokens[i].features.lexical.values[word_id]
        # NOTE: the values are the last three / two characters (slices [-3:] and [-2:]).
        # The key spelling is kept unchanged because it is the feature name the model sees.
        crf_features['word[-3]:'] = word_lower[-3:]
        crf_features['word[-2]:'] = word_lower[-2:]

    for f_id, f_label in lexical.items():
        crf_features[f_label] = tokens[i].features.lexical.values[f_id]

    if i > 0:
        add_token_features('-1:', tokens[i - 1])
    else:
        crf_features['BOS'] = True

    if i < len(tokens) - 1:
        add_token_features('+1:', tokens[i + 1])
    else:
        crf_features['EOS'] = True

    return crf_features

In [44]:
def word2features(sent, i):
    """Return the fixed (hand-written) CRF feature dict for token ``i`` of ``sent``.

    The token's own surface/POS features are always present. The same features of
    the previous and next tokens are added under the ``'-1:'`` / ``'+1:'`` prefixes
    when those tokens exist; otherwise ``'BOS'`` / ``'EOS'`` is set to True.
    """
    def surface_and_pos(prefix, token):
        # Case/digit flags and POS information of one token, keyed with `prefix`.
        text = token.text
        tag = token.label_pos
        return {
            prefix + 'word.lower()': text.lower(),
            prefix + 'word.istitle()': text.istitle(),
            prefix + 'word.isupper()': text.isupper(),
            prefix + 'word.isdigit()': text.isdigit(),
            prefix + 'postag': tag,
            prefix + 'postag[:2]': tag[:2],
        }

    tokens = sent.tokens
    current = tokens[i]

    # Suffix features exist only for the current token.
    crf_features = {'bias': 1.0}
    crf_features['word[-3:]'] = current.text[-3:]
    crf_features['word[-2:]'] = current.text[-2:]
    crf_features.update(surface_and_pos('', current))

    if i <= 0:
        crf_features['BOS'] = True
    else:
        crf_features.update(surface_and_pos('-1:', tokens[i - 1]))

    if i >= len(tokens) - 1:
        crf_features['EOS'] = True
    else:
        crf_features.update(surface_and_pos('+1:', tokens[i + 1]))

    return crf_features

In [45]:
def sent2features_dynamic(sent, features: dict):
    """Return one ``word2features_dynamic`` feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent.tokens)):
        per_token.append(word2features_dynamic(sent, position, features))
    return per_token

In [46]:
def sent2features(sent):
    """Return one ``word2features`` feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent.tokens)):
        per_token.append(word2features(sent, position))
    return per_token

In [47]:
def sent2labels(sent):
    """Return the ``label_ner_gold`` value of every token in ``sent``, in token order."""
    gold_labels = []
    for token in sent.tokens:
        gold_labels.append(token.label_ner_gold)
    return gold_labels

In [48]:
from config import HorusConfig
from src.horus_meta import HorusDataLoader, Horus
from src import definitions

# define the dataset to process
dataset_label = 'ritter.train'

config = HorusConfig()

# Select the first registered dataset whose label matches. `dataset` is reset first
# so a value left over from a previous run of this cell can never be picked up.
dataset = None
for ds in definitions.NER_DATASETS:
    if ds[0] == dataset_label:
        dataset = ds
        break

if dataset is None:
    raise ValueError('dataset not found in definitions.NER_DATASETS: ' + dataset_label)

print(dataset)

# Read the path parts from the selected entry (not from the loop variable):
# index 0 is the label, index 1 + index 2 concatenate to the CoNLL file path.
conll_file = dataset[1] + dataset[2]
conll_label = dataset[0]
assert '.horusx' in conll_file
horus_file = conll_file.replace('.horusx', '.horus3.json')

# get data
horus = HorusDataLoader.load_metadata_from_file(file=horus_file)
print('ok.')

print(horus.processing_status)
# make sure the horus file has been processed and features are in place
assert (str(PRE_PROCESSING_STATUS["FEATURE_LEXICAL"]) in horus.processing_status or
        str(PRE_PROCESSING_STATUS["FEATURE_TEXT"]) in horus.processing_status or
        str(PRE_PROCESSING_STATUS["FEATURE_IMAGE"]) in horus.processing_status)

2020-04-30 19:53:31,142 [MainThread  ] [INFO ]  HORUS Framework
2020-04-30 19:53:31,144 [MainThread  ] [INFO ]  version: 1.0
2020-04-30 19:53:31,146 [MainThread  ] [INFO ]  http://horus-ner.org/


:: reading ok /Users/diego.esteves/git/horus-ner/horus.ini
039284589743ff3295825
['ritter.train', '/Volumes/dne5ssd/horus/resources/datasets/Ritter/', 'ner_short.txt.horusx']
ok.
123


In [49]:
# Labels of the features the dynamic extractor should use.
train_on_features_labels = ['word.lower', 'word.istitle', 'word.isupper', 'word.isdigit', 'postag']

# Lexical feature table (id -> label) and its reverse.
feature_dict_lex, feature_dict_lex_rev = WordFeaturesInterface.get_lexical()
print('features set - lexical')
print(feature_dict_lex)

# Resolve every selected label to its feature id.
train_on_features_ids = []
for feat in train_on_features_labels:
    train_on_features_ids.append(
        WordFeaturesInterface.get_key_by_value(dictionary=feature_dict_lex, feature_alias=feat)
    )

# id -> label mapping handed to the dynamic feature extractor
features = dict(zip(train_on_features_ids, train_on_features_labels))
print('training on features: ', features)

_separator = '------------------------------------------------------------------------------------------------------------'

# Dynamic (configurable) features, then the fixed hand-written ones; the first
# sentence of each is printed for a side-by-side comparison.
Xd = [sent2features_dynamic(s, features) for s in horus.sentences]
print(_separator)
print(Xd[0])
print(_separator)
X = [sent2features(s) for s in horus.sentences]
print(X[0])
print(_separator)

features set - lexical
{0: 'word.lower', 1: 'word.lemma', 2: 'word.stem', 3: 'word.len.1', 4: 'word.has.special', 5: 'word[0].isupper', 6: 'word.isupper', 7: 'word.istitle', 8: 'word.isdigit', 9: 'word.stop', 10: 'word.len.issmall', 11: 'word.has.minus', 12: 'word.shape', 13: 'brown_320.1', 14: 'brown_320.2', 15: 'brown_320.3', 16: 'brown_320.4', 17: 'brown_320.5', 18: 'brown_640.1', 19: 'brown_640.2', 20: 'brown_640.3', 21: 'brown_640.4', 22: 'brown_640.5', 23: 'brown_1000.1', 24: 'brown_1000.2', 25: 'brown_1000.3', 26: 'brown_1000.4', 27: 'brown_1000.5'}
training on features:  {0: 'word.lower', 7: 'word.istitle', 6: 'word.isupper', 8: 'word.isdigit', 'postag': 'postag'}
------------------------------------------------------------------------------------------------------------
[{'bias': 1.0, 'postag': 'USR', 'postag[:2]': 'US', 'word[-3]:': 'alk', 'word[-2]:': 'lk', 'word.lower': '@paulwalk', 'word.istitle': 0, 'word.isupper': 0, 'word.isdigit': 0, 'BOS': True, '+1:postag': 'PRP', '+

In [50]:
def get_4muc_labels(y):
    """Collapse every entity type other than person / geo-loc / company into 'other'.

    ``y`` is a list of per-sentence label lists. Labels equal to 'O' are left alone.
    For any other label, the text after the first two characters is taken as the
    entity type; if it is not one of the kept types, the label is replaced by its
    first character followed by '-other' (e.g. 'B-tvshow' -> 'B-other').

    The lists inside ``y`` are modified in place and ``y`` itself is returned.
    """
    kept_types = ('person', 'geo-loc', 'company', 'O')
    for sentence_labels in y:
        for position, label in enumerate(sentence_labels):
            if label == 'O' or label[2:] in kept_types:
                continue
            sentence_labels[position] = label[0] + '-other'
    return y
    

In [51]:
# Gold NER labels, one list per sentence.
y = list(map(sent2labels, horus.sentences))

# Distinct labels before the mapping (collected sentence by sentence).
labels_set = [label for sentence_labels in y for label in set(sentence_labels)]
print(set(labels_set))

# get_4muc_labels rewrites the label lists in place and returns the same object,
# so from here on `y2` and `y` refer to the same (mapped) labels.
y2 = get_4muc_labels(y)

# Distinct labels after the mapping.
labels_set2 = [label for sentence_labels in y2 for label in set(sentence_labels)]
print(set(labels_set2))


{'O', 'B-tvshow', 'I-facility', 'B-person', 'B-company', 'B-facility', 'I-other', 'B-other'}
{'B-other', 'B-person', 'B-company', 'I-other', 'O'}


In [52]:
# 4-fold cross-validated predictions using the fixed (hand-written) features X.
pred = cross_val_predict(
    estimator=_crf,
    X=X,
    y=y,
    cv=4,
)
report = flat_classification_report(y_true=y, y_pred=pred)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   B-company       0.00      0.00      0.00         2
     B-other       1.00      0.17      0.29         6
    B-person       0.00      0.00      0.00         1
     I-other       0.00      0.00      0.00         5
           O       0.95      1.00      0.97       247

    accuracy                           0.95       261
   macro avg       0.39      0.23      0.25       261
weighted avg       0.92      0.95      0.93       261



In [53]:
# 4-fold cross-validated predictions using the dynamic features Xd and the mapped labels y2.
pred = cross_val_predict(estimator=_crf, X=Xd, y=y2, cv=4)
# Score against the same labels the model was trained on (y2), not against `y`.
report = flat_classification_report(y_pred=pred, y_true=y2)
print(report)

              precision    recall  f1-score   support

   B-company       0.00      0.00      0.00         2
     B-other       1.00      0.17      0.29         6
    B-person       0.00      0.00      0.00         1
     I-other       0.00      0.00      0.00         5
           O       0.95      1.00      0.97       247

    accuracy                           0.95       261
   macro avg       0.39      0.23      0.25       261
weighted avg       0.92      0.95      0.93       261



In [54]:
# Fit on all sentences using the fixed features X, then display the 30 strongest
# weights. The show_weights call stays last so its result renders as the cell output.
_crf.fit(X=X, y=y)
eli5.show_weights(_crf, top=30)



From \ To,O,B-company,B-other,I-other,B-person
O,3.463,0.0,1.527,-0.256,0.0
B-company,0.361,0.0,0.0,0.0,0.0
B-other,-0.352,0.0,0.0,3.039,0.0
I-other,-0.052,0.0,0.0,2.847,0.0
B-person,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.551,bias,,,
+1.322,-1:word.istitle(),,,
+1.230,word[-2:]:ay,,,
+1.198,word[-3:]:day,,,
+1.057,-1:postag:NNP,,,
+0.548,-1:word.isupper(),,,
+0.432,EOS,,,
+0.326,BOS,,,
+0.270,-1:word.lower():of,,,
+0.201,+1:postag:CD,,,

Weight?,Feature
+4.551,bias
+1.322,-1:word.istitle()
+1.230,word[-2:]:ay
+1.198,word[-3:]:day
+1.057,-1:postag:NNP
+0.548,-1:word.isupper()
+0.432,EOS
+0.326,BOS
+0.270,-1:word.lower():of
+0.201,+1:postag:CD

Weight?,Feature
1.229,word[-3:]:ook
1.229,word[-2:]:ok
1.229,word.lower():facebook
1.229,+1:word.lower():.....
1.059,-1:word.lower():on
1.034,word.lower():pxleyes
1.034,+1:word.lower():top
1.034,word[-3:]:yes
1.026,word[-2:]:es
0.385,+1:postag[:2]::

Weight?,Feature
4.526,word.isupper()
2.8,-1:word.lower():for
1.055,postag:NNP
0.36,word[-3:]:ire
0.36,word.lower():empire
0.36,+1:word.lower():state
0.288,word[-2:]:re
0.272,-1:word.lower():.
0.218,-1:postag:.
0.218,-1:postag[:2]:.

Weight?,Feature
+1.467,-1:postag[:2]:NN
+1.108,-1:word.lower():fashion
+1.037,+1:word.lower():and
+0.640,+1:postag[:2]:CC
+0.640,+1:postag:CC
+0.588,postag:NN
+0.419,word.lower():week
+0.419,word[-2:]:ek
+0.419,word[-3:]:eek
+0.364,word[-2:]:rd

Weight?,Feature
1.547,word.lower():4dbling
1.516,+1:word.lower():'s
0.367,word[-3:]:ing
0.367,word[-2:]:ng
