# CRF model

### Modelo CRF para ato de abertura de licitação

In [3]:
import pandas as pd

import nltk

import scipy.stats

import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## Load csv with text and IOB

In [4]:
DEFAULT_TOKENIZER = nltk.RegexpTokenizer(r"\w+").tokenize

data = pd.read_csv('CSVs/IOBabertura.csv')
x = []
y = []
for row in range(len(data)):
    if pd.notna(data['Texto'][row]) and pd.notna(data['IOB'][row]):
        x.append(DEFAULT_TOKENIZER(data['Texto'][row]))
        y.append(data['IOB'][row].split())
len(x), len(y)

(1243, 1243)

In [5]:
data

Unnamed: 0.1,Unnamed: 0,ATO,DATA,PROCESSO,TIPO,DISPUTA,V_ESTIMADO,REDATOR,Texto,IOB
0,0,AVISO DE ABERTURA DE LICITACAO,23 de janeiro de 2004,,,,,,AVISO DE ABERTURA DE LICITACAO TOMADA DE PRECO...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O O ...
1,1,AVISO DE ABERTURA DE LICITACAO,29 de janeiro de 2004,,,,,,AVISO DE ABERTURA DE LICITACAO TOMADA DE PRECO...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O O ...
2,2,AVISO DE ABERTURA DE LICITACAO,04-02-2004,1.018682e+09,,,,,AVISO DE ABERTURA DE LICITACAO PREGAO No 001/2...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O O ...
3,3,AVISO DE ABERTURA DE LICITACAO,25-02-2004,1.001312e+10,,,,,AVISO DE ABERTURA DE LICITACAO CONVITE No 001/...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O O ...
4,4,AVISO DE ABERTURA DE LICITACAO,25-02-2004,1.001312e+10,,,,,AVISO DE ABERTURA DE LICITACAO CONVITE No 001/...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O O ...
...,...,...,...,...,...,...,...,...,...,...
1238,1238,AVISO DE ABERTURA DE LICITACAO,24 de agosto de 2021,5.000035e+15,Menor Preco,Aberto,"R$ 3.864.574,44",AMILCAR UBIRATAN URACH VIEIRA,AVISO DE ABERTURA DE LICITACAO PREGAO ELETRONI...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O B-PR...
1239,1239,AVISO DE ABERTURA DE LICITACAO,25-08-2021,7.200001e+15,Menor Preco por item,,,JOAO DE DEUS ABREU SOARES,AVISO DE ABERTURA DE LICITACAO PREGAO ELETRONI...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O B-PR...
1240,1240,AVISO DE ABERTURA DE LICITACAO,26-08-2021,4.026000e+17,Menor Preco,,"R$ 19.940,00",ROSIMEIRE PAIVA DA SILVA,AVISO DE ABERTURA DE LICITACAO SRP PREGAO ELET...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O O B-...
1241,1241,AVISO DE ABERTURA DE LICITACAO,26-08-2021,7.200000e+15,Menor Preco por Grupo,,,JOAO DE DEUS ABREU SOARES,AVISO DE ABERTURA DE LICITACAO PREGAO ELETRONI...,B-ATO I-ATO I-ATO I-ATO I-ATO O O O O O O B-PR...


In [3]:
len(x[0])

146

In [4]:
len(y[0])

146

## Create dictionary feature for each word in each sequence in x

In [5]:
def get_features(sentence):
        """Create features for each word in act.
        Create a list of dict of words features to be used in the predictor module.
        Args:
            act (list): List of words in an act.
        Returns:
            A list with a dictionary of features for each of the words.
        """
        sent_features = []
        for i in range(len(sentence)):
            word_feat = {
                'word': sentence[i].lower(),
                'capital_letter': sentence[i][0].isupper(),
                'all_capital': sentence[i].isupper(),
                'isdigit': sentence[i].isdigit(),
                'word_before': sentence[i].lower() if i == 0 else sentence[i-1].lower(),
                'word_after:': sentence[i].lower() if i+1 >= len(sentence) else sentence[i+1].lower(),
                'BOS': i == 0,
                'EOS': i == len(sentence)-1
            }
            sent_features.append(word_feat)
        return sent_features
    
for i in range(len(x)):
    x[i] = get_features(x[i])

## Separate train and test splits (in order)

In [38]:
import math
split_idx = math.floor(0.8*len(x))
x_train = x[0:split_idx]
y_train = y[0:split_idx]
x_test = x[split_idx:]
y_test = y[split_idx:]
len(x_train),  len(x_test), len(x)


(994, 249, 1243)

In [39]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
    all_possible_states=True
)

crf.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=False,
    c1=10, c2=0.1, keep_tempfiles=None, max_iterations=100)

## Evaluation

In [8]:
classes = list(crf.classes_)
# classes.remove('O')

y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.9141661869699788

In [9]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=classes, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ATO      1.000     0.996     0.998       258
       I-ATO      0.995     0.999     0.997      1023
           O      0.932     0.997     0.964     33199
      B-DATA      0.937     0.563     0.704       158
      I-DATA      0.937     0.776     0.849       459
  B-PROCESSO      0.996     1.000     0.998       241
   B-REDATOR      0.807     0.673     0.734       217
   I-REDATOR      0.964     0.753     0.846       640
B-V_ESTIMADO      0.585     0.087     0.151       277
I-V_ESTIMADO      0.575     0.098     0.167       235
      B-TIPO      1.000     0.889     0.942       190
      I-TIPO      0.995     0.121     0.216      1697
   B-DISPUTA      1.000     0.886     0.939        35
   I-DISPUTA      0.000     0.000     0.000        63

    accuracy                          0.934     38692
   macro avg      0.837     0.631     0.679     38692
weighted avg      0.932     0.934     0.914     38692



In [10]:
classes = list(crf.classes_)
classes.remove('O')

y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.6151794685411471

In [11]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=classes, digits=3
))



              precision    recall  f1-score   support

       B-ATO      1.000     0.996     0.998       258
       I-ATO      0.995     0.999     0.997      1023
      B-DATA      0.937     0.563     0.704       158
      I-DATA      0.937     0.776     0.849       459
  B-PROCESSO      0.996     1.000     0.998       241
   B-REDATOR      0.807     0.673     0.734       217
   I-REDATOR      0.964     0.753     0.846       640
B-V_ESTIMADO      0.585     0.087     0.151       277
I-V_ESTIMADO      0.575     0.098     0.167       235
      B-TIPO      1.000     0.889     0.942       190
      I-TIPO      0.995     0.121     0.216      1697
   B-DISPUTA      1.000     0.886     0.939        35
   I-DISPUTA      0.000     0.000     0.000        63

   micro avg      0.961     0.555     0.703      5493
   macro avg      0.830     0.603     0.657      5493
weighted avg      0.928     0.555     0.615      5493



## Hyperparameter Optimization

In [12]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    all_possible_states=True
)
params_space = {
    'c1': scipy.stats.expon(scale=15.0),
    'c2': scipy.stats.expon(scale=1.0),
}


# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 89.5min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=True,
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F9839B6FD0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F9839B6278>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-ATO', 'I-ATO', 'B-DATA', 'I-DATA', 'B-PROCESSO', 'B-REDATOR', 'I-REDATOR', 'B-V_ESTIMADO', 'I-V_ESTIMADO', 'B-TIPO', 'I-TIPO', 'B-DISPUTA', 'I-DISPUTA']),
                   verbose=1)

In [18]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 3.827954989443119, 'c2': 0.09387710654084042}
best CV score: 0.7162594297020441
model size: 0.06M


## Check best estimator on our test data

In [19]:
sorted_classes = sorted(
    classes,
    key=lambda name: (name[1:], name[0])
)

In [31]:
classes = list(crf.classes_)
classes.remove('O')

y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.6474571946297808

In [30]:
crf = rs.best_estimator_
y_pred = crf.predict(x_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_classes, digits=3
))

              precision    recall  f1-score   support

       B-ATO      1.000     1.000     1.000       258
       I-ATO      0.997     0.999     0.998      1023
      B-DATA      0.941     0.601     0.734       158
      I-DATA      0.933     0.821     0.874       459
   B-DISPUTA      1.000     0.886     0.939        35
   I-DISPUTA      0.000     0.000     0.000        63
  B-PROCESSO      1.000     1.000     1.000       241
   B-REDATOR      0.852     0.770     0.809       217
   I-REDATOR      0.939     0.866     0.901       640
      B-TIPO      1.000     0.900     0.947       190
      I-TIPO      0.988     0.145     0.253      1697
B-V_ESTIMADO      0.578     0.094     0.161       277
I-V_ESTIMADO      0.726     0.226     0.344       235

   micro avg      0.958     0.590     0.730      5493
   macro avg      0.843     0.639     0.689      5493
weighted avg      0.931     0.590     0.647      5493



## Let’s check what classifier learned

In [21]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
O      -> O       6.951327
I-TIPO -> I-TIPO  6.672961
I-V_ESTIMADO -> I-V_ESTIMADO 5.540480
B-TIPO -> I-TIPO  4.248815
I-DATA -> I-DATA  4.246677
B-REDATOR -> I-REDATOR 4.216292
O      -> B-V_ESTIMADO 3.304356
I-REDATOR -> I-REDATOR 3.301610
O      -> B-REDATOR 2.932066
O      -> B-DATA  2.925795
I-ATO  -> I-ATO   2.871139
O      -> B-TIPO  2.616860
B-V_ESTIMADO -> I-V_ESTIMADO 2.309509
B-ATO  -> I-ATO   2.190152
B-V_ESTIMADO -> O       1.958944
I-ATO  -> O       1.577792
I-DATA -> O       1.521440
I-V_ESTIMADO -> O       1.274088
B-DATA -> O       1.259533
B-DATA -> I-DATA  1.239651

Top unlikely transitions:
B-V_ESTIMADO -> I-REDATOR -0.502562
B-V_ESTIMADO -> B-DATA  -0.528547
I-REDATOR -> B-ATO   -0.587319
B-PROCESSO -> I-V_ESTIMADO -0.721172
I-REDATOR -> I-ATO   -0.751840
O      -> I-TIPO  -0.836569
B-REDATOR -> O       -0.901386
B-PROCESSO -> I-TIPO  -1.054468
I-ATO  -> I-REDATOR -1.058103
I-REDATOR -> O       -1.131006
B-DATA -> I-V_ESTIMADO -1.184755
B-DA

In [22]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
8.288086 B-ATO    word:aviso
7.762159 B-TIPO   word:menor
7.692698 B-PROCESSO word_before:processo
5.726582 B-DISPUTA word_before:disputa
5.134986 B-V_ESTIMADO word:r
4.447545 B-PROCESSO isdigit
4.419830 B-DATA   word_before:brasilia
4.339073 I-V_ESTIMADO word_after::classificacao
4.226758 I-DATA   word_before:de
4.020117 B-DISPUTA word:aberto
3.725989 I-ATO    word:abertura
3.681526 B-DATA   isdigit
3.630837 O        word:processo
3.626531 B-V_ESTIMADO word:da
3.439164 I-ATO    word:licitacao
3.326734 O        word_after::aviso
3.265885 B-DATA   word:24
3.129679 B-TIPO   word:maior
3.115425 B-REDATOR word:departamento
3.107413 B-REDATOR word:ineditoriais
2.854801 B-TIPO   word_before:tipo
2.852048 B-REDATOR word:fundacao
2.790989 I-V_ESTIMADO word_after::data
2.778209 O        word_before:companhia
2.746854 B-DATA   word_before:df
2.640818 B-DATA   word:05
2.573038 B-DATA   word:15
2.540899 O        word:data
2.537661 B-TIPO   word_before:licitacao
2.512727 I-ATO    word

## Persistindo o melhor modelo em disco

In [34]:
import joblib

joblib.dump(crf, 'crf_model.pkl')

['crf_model.pkl']

In [35]:
model = joblib.load('crf_model.pkl')

In [36]:
model

CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=True,
    c1=3.827954989443119, c2=0.09387710654084042, keep_tempfiles=None,
    max_iterations=100)

In [37]:
model.classes_

['B-ATO',
 'I-ATO',
 'O',
 'B-DATA',
 'I-DATA',
 'B-PROCESSO',
 'B-REDATOR',
 'I-REDATOR',
 'B-V_ESTIMADO',
 'I-V_ESTIMADO',
 'B-TIPO',
 'I-TIPO',
 'B-DISPUTA',
 'I-DISPUTA']