## CRF model with cross validation

Modelo CRF para ato de aviso de licitação (com dados rotulados manualmente)

In [17]:
# !pip install sklearn_crfsuite
# !pip install nltk

import pandas as pd
import sklearn_crfsuite
import nltk
from nltk.tokenize import word_tokenize

In [18]:
# Read the labelled dataset from the parquet file sitting next to the notebook,
# then wrap it in a DataFrame under the name the following cells use.
df = pd.read_parquet(path='iob_aviso_licitacao.parquet')
result = pd.DataFrame(data=df)

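Tokenize the text of each row and split its IOB label string into one label per token (the data comes from the parquet file loaded above)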

In [19]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps runs of word characters (the regex r"\w+"), so
# punctuation and whitespace never become tokens.
TOKENIZER = nltk.RegexpTokenizer(r"\w+").tokenize

# x: one token list per row; y: the matching list of IOB labels per row.
# The columns are iterated by value instead of being looked up with
# result[col][row] for row in range(len(result)): that lookup is label-based
# and fails (or picks the wrong row) when the frame's index is not 0..n-1.
x = []
y = []
for text, iob_labels in zip(result['treated_text'], result['IOB']):
    # Rows without text are skipped entirely, so x and y stay aligned.
    if pd.notna(text):
        x.append(TOKENIZER(text))
        y.append(iob_labels.split())
len(x), len(y)

(234, 234)

Create a feature dictionary for each word in each sequence in x

In [20]:
def get_features(sentence):
    """Create a feature dictionary for each word of a tokenized act.

    Each word is described by its own surface form (lower-cased text,
    suffixes, capitalisation and digit checks), by the same checks applied to
    its left and right neighbours, and by begin/end-of-sequence flags.

    Args:
        sentence (list): List of words (str) in an act.

    Returns:
        A list with one dictionary of features per word, in the same order
        as ``sentence``. An empty ``sentence`` yields an empty list.
    """
    last_idx = len(sentence) - 1
    sent_features = []
    for i, word in enumerate(sentence):
        # Neighbours are None at the sequence boundaries; the neighbour
        # features then fall back to the empty string.
        prev_word = sentence[i - 1] if i > 0 else None
        next_word = sentence[i + 1] if i < last_idx else None
        word_feat = {
            'word': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            # Slice instead of indexing so that an empty token gives False
            # rather than raising IndexError.
            'capital_letter': word[:1].isupper(),
            'word_istitle': word.istitle(),
            'all_capital': word.isupper(),
            'word_isdigit': word.isdigit(),
            'word_before': '' if prev_word is None else prev_word.lower(),
            'word_before_isdigit': '' if prev_word is None else prev_word.isdigit(),
            'word_before_isupper': '' if prev_word is None else prev_word.isupper(),
            'word_before_istitle': '' if prev_word is None else prev_word.istitle(),
            # The trailing ':' in the 'word_after*' keys is part of the
            # existing feature names; renaming them would change the
            # features the model is trained on.
            'word_after:': '' if next_word is None else next_word.lower(),
            'word_after_isdigit:': '' if next_word is None else next_word.isdigit(),
            'word_after_isupper:': '' if next_word is None else next_word.isupper(),
            'word_after_istitle:': '' if next_word is None else next_word.istitle(),
            'BOS': i == 0,
            'EOS': i == last_idx
        }
        sent_features.append(word_feat)
    return sent_features
    
# Swap every token list in x for its list of per-token feature dicts.
# Slice assignment keeps the same list object, as the index loop did.
# NOTE: running this cell twice feeds feature dicts back into get_features.
x[:] = [get_features(tokens) for tokens in x]

Separate train and test splits (in order)

In [21]:
import math
# Ordered hold-out split (no shuffling): the first 80% of the sequences are
# used for training and the remaining ones for testing.
split_idx = math.floor(len(x) * 0.8)
x_train, x_test = x[:split_idx], x[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
len(x_train), len(x_test), len(y_train)


(187, 47, 187)

In [22]:
# !pip install scipy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV


# Base estimator; c1 and c2 are left unset here because the search below
# supplies them for every candidate.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# Candidate regularisation coefficients are drawn from exponential
# distributions (c1: scale 0.5, c2: scale 0.05).
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# random_state pins the sampled (c1, c2) candidates so that a
# "Restart & Run All" reproduces the same search and the same best model.
# No `scoring` is passed, so candidates are ranked with the estimator's own
# `score` method.
# NOTE(review): for sklearn_crfsuite that is presumably token-level accuracy
# with the 'O' label included - confirm this is the intended selection
# metric; `make_scorer` and `metrics` are imported in this cell but not used
# by the search.
rs = RandomizedSearchCV(crf, params_space,
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        random_state=42)
rs.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 10.5min finished


RandomizedSearchCV(cv=5,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3481040a60>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3460cd4c10>},
                   verbose=1)

In [23]:
# Summary of the search: winning hyper-parameters, their mean CV score and
# the size of the refit model expressed in millions.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1e6))

best params: {'c1': 0.8255380650353655, 'c2': 0.06517638358398285}
best CV score: 0.895343436287717
model size: 0.14M


In [24]:
# !pip install matplotlib
# import matplotlib.pyplot as plt
# from sklearn.model_selection import RandomizedSearchCV
# plt.style.use('ggplot')

# _x = rs.cv_results_['param_c1']
# _y = rs.cv_results_['param_c2']
# _c = rs.cv_results_['mean_test_score']

# fig = plt.figure()
# fig.set_size_inches(12, 12)
# ax = plt.gca()
# ax.set_yscale('log')
# ax.set_xscale('log')
# ax.set_xlabel('C1')
# ax.set_ylabel('C2')
# ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
#     min(_c), max(_c)
# ))

# ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

# print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

In [25]:
# From here on `crf` is the best model found by the search.
crf = rs.best_estimator_
y_pred = crf.predict(x_test)

# Report on the entity labels only: 'O' is dropped from the label list
# passed to the report.
classes = [*crf.classes_]
classes.remove('O')

print(metrics.flat_classification_report(
    y_test, y_pred, digits=3, labels=classes
))

                          precision    recall  f1-score   support

  B-MODALIDADE_LICITACAO      0.975     0.812     0.886        48
  I-MODALIDADE_LICITACAO      1.000     0.725     0.841        51
         B-NUM_LICITACAO      0.942     0.875     0.907        56
         I-NUM_LICITACAO      0.942     0.961     0.951        51
              B-PROCESSO      0.815     0.524     0.638        42
              I-PROCESSO      0.816     0.559     0.663       111
         B-OBJ_LICITACAO      0.950     0.844     0.894        45
         I-OBJ_LICITACAO      0.811     0.856     0.833      1442
        B-VALOR_ESTIMADO      0.800     0.471     0.593        17
        I-VALOR_ESTIMADO      0.960     0.649     0.774        37
B-CODIGO_SISTEMA_COMPRAS      0.400     0.333     0.364         6
       B-SISTEMA_COMPRAS      0.775     0.838     0.805        37
       I-SISTEMA_COMPRAS      0.693     0.906     0.785       117
         B-DATA_ABERTURA      0.850     0.654     0.739        52
         

In [26]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition feature.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)`` pairs.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        print("%-6s -> %-7s %0.6f" % (source_label, target_label, weight))
        

# Rank the learned transition weights once and reuse the ranking for both
# ends of the list.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
I-OBJ_LICITACAO -> I-OBJ_LICITACAO 5.454570
O      -> O       5.377731
B-NUM_LICITACAO -> I-NUM_LICITACAO 5.245663
I-SISTEMA_COMPRAS -> I-SISTEMA_COMPRAS 4.989615
B-TIPO_OBJ -> I-TIPO_OBJ 4.568041
I-ORGAO_LICITANTE -> I-ORGAO_LICITANTE 4.412276
B-VALOR_ESTIMADO -> I-VALOR_ESTIMADO 4.340435
I-TIPO_OBJ -> I-TIPO_OBJ 4.222748
B-ORGAO_LICITANTE -> I-ORGAO_LICITANTE 3.944919
B-MODALIDADE_LICITACAO -> I-MODALIDADE_LICITACAO 3.931801
I-VALOR_ESTIMADO -> I-VALOR_ESTIMADO 3.919917
I-PROCESSO -> I-PROCESSO 3.730050
I-DATA_ABERTURA -> I-DATA_ABERTURA 3.664360
B-CODIGO_SISTEMA_COMPRAS -> I-CODIGO_SISTEMA_COMPRAS 3.473041
B-PROCESSO -> I-PROCESSO 2.986749
B-SISTEMA_COMPRAS -> I-SISTEMA_COMPRAS 2.930768
I-MODALIDADE_LICITACAO -> I-MODALIDADE_LICITACAO 2.874957
O      -> B-SISTEMA_COMPRAS 2.606481
B-OBJ_LICITACAO -> I-OBJ_LICITACAO 2.523743
O      -> B-OBJ_LICITACAO 2.497686

Top unlikely transitions:
B-PROCESSO -> I-DATA_ABERTURA -1.066920
I-PROCESSO -> I-OBJ_LICITACAO -1.108

In [27]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    Args:
        state_features: Iterable of ``((attr, label), weight)`` pairs.
    """
    for feature_key, weight in state_features:
        attribute, tag = feature_key
        print("%0.6f %-8s %s" % (weight, tag, attribute))

# Rank the learned state-feature weights once and show the 30 highest and the
# 30 lowest entries of that ranking.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
4.735413 B-CODIGO_SISTEMA_COMPRAS word_before:uasg
3.929046 B-NUM_LICITACAO word_before:nº
3.881915 B-CODIGO_SISTEMA_COMPRAS word_before:id
3.582970 O        word_before:br
3.532931 B-MODALIDADE_LICITACAO word:concorrência
3.520654 B-DATA_ABERTURA word_before:abertura
3.423585 B-SISTEMA_COMPRAS word:wwwlicitacoes
3.339143 B-MODALIDADE_LICITACAO word:convite
3.308811 I-DATA_ABERTURA word_after::às
3.300599 B-MODALIDADE_LICITACAO word_before:licitação
3.289942 B-VALOR_ESTIMADO word_before:r
3.284088 B-OBJ_LICITACAO word_before:objeto
3.223029 B-TIPO_OBJ word:execução
3.187037 B-PROCESSO word_before:processo
3.160723 I-SISTEMA_COMPRAS word_after::codhab
2.946253 B-NUM_LICITACAO word_after::fonte
2.840146 B-OBJ_LICITACAO word_before:é
2.739770 I-CODIGO_SISTEMA_COMPRAS word_before:uasg
2.689305 B-MODALIDADE_LICITACAO word:tomada
2.525708 B-MODALIDADE_LICITACAO word:pregão
2.519824 B-ORGAO_LICITANTE word_before:a
2.497259 O        word:abertura
2.453163 O        word:valor
2.34