## CRF model with cross validation

Modelo CRF para ato de aviso de licitação (com dados rotulados manualmente)

In [99]:
# !pip install sklearn_crfsuite
# !pip install nltk
# !pip install pyarrow

# !pip freeze

import pandas as pd
import sklearn_crfsuite
import nltk
from nltk.tokenize import word_tokenize

In [100]:
# Load the train and test sets from their parquet files.
path_data_train = pd.read_parquet('./result/train_new.parquet')
path_data_test = pd.read_parquet('./result/test_new.parquet')

# Re-wrap each loaded table in a DataFrame under the names the later cells use.
data_train, data_test = pd.DataFrame(path_data_train), pd.DataFrame(path_data_test)


Tokenize each text and split its IOB label string (the data was loaded from the parquet files above)

In [101]:
# Prepare the training data: tokenize each text and split its IOB label string.

from nltk.tokenize import RegexpTokenizer

x_train = []
y_train = []
# Walk the two columns positionally instead of looking rows up with
# data_train[col][row]: on an integer index that form is label-based, so it
# fails (or picks the wrong row) whenever the index is not exactly 0..n-1.
for train_text, train_iob in zip(data_train['treated_text'], data_train['IOB']):
    if pd.notna(train_text):  # rows without text are skipped
        x_train.append(word_tokenize(train_text))
        # Labels are whitespace-separated; they are expected to line up with
        # the tokens above — TODO confirm both lists always have equal length.
        y_train.append(train_iob.split())
len(x_train), len(y_train)

(304, 304)

In [102]:
# Prepare the test data: tokenize each text and split its IOB label string.

from nltk.tokenize import RegexpTokenizer


x_test = []
y_test = []
# Walk the two columns positionally instead of looking rows up with
# data_test[col][row]: on an integer index that form is label-based, so it
# fails (or picks the wrong row) whenever the index is not exactly 0..n-1.
for test_text, test_iob in zip(data_test['treated_text'], data_test['IOB']):
    if pd.notna(test_text):  # rows without text are skipped
        x_test.append(word_tokenize(test_text))
        # Labels are whitespace-separated; they are expected to line up with
        # the tokens above — TODO confirm both lists always have equal length.
        y_test.append(test_iob.split())
len(x_test), len(y_test)

(131, 131)

Create a feature dictionary for each word of each sequence in x_train and x_test

In [103]:
def get_features(sentence):
    """Build the CRF feature dictionaries for one tokenized sequence.

    Every token gets its own dict holding: the lower-cased token, its last
    three and last two characters, shape flags (title case, all upper case,
    digits only, alphabetic only), the lower-cased form and shape flags of
    the neighbours at offsets -1, -2, +1 and +2, and begin/end-of-sequence
    flags. When a neighbour falls outside the sequence, every feature for
    that neighbour is the empty string.

    Args:
        sentence (list): Tokens (strings) of one sequence, in order.

    Returns:
        list: One feature dict per token, in the same order as ``sentence``.
    """
    n_tokens = len(sentence)
    # (key prefix, key suffix, offset relative to the current token)
    context_windows = (
        ('word_before', '', -1),
        ('word_before', '2', -2),
        ('word_after', '', 1),
        ('word_after', '2', 2),
    )
    # str predicates recorded for every neighbour, in key order
    shape_checks = ('isdigit', 'isalpha', 'isupper', 'istitle')

    sent_features = []
    for position, token in enumerate(sentence):
        word_feat = {
            'word': token.lower(),
            'word[-3:]': token[-3:],
            'word[-2:]': token[-2:],
            'word_istitle': token.istitle(),
            'all_capital': token.isupper(),
            'word_isdigit': token.isdigit(),
            'word_isalpha': token.isalpha(),  # False when the token has any symbol
        }

        for prefix, suffix, offset in context_windows:
            neighbour_idx = position + offset
            if 0 <= neighbour_idx < n_tokens:
                neighbour = sentence[neighbour_idx]
                word_feat[prefix + suffix] = neighbour.lower()
                for check in shape_checks:
                    word_feat[prefix + '_' + check + suffix] = getattr(neighbour, check)()
            else:
                # Outside the sequence: mark every feature of this neighbour as ''.
                word_feat[prefix + suffix] = ''
                for check in shape_checks:
                    word_feat[prefix + '_' + check + suffix] = ''

        word_feat['BOS'] = position == 0
        word_feat['EOS'] = position == n_tokens - 1
        sent_features.append(word_feat)
    return sent_features

def concatenaPredicao(ato,  predicao):
    """Print each word of an act next to its predicted entity label.

    Writes a two-column table to stdout: a header row followed by one row
    per word, with the word left-aligned in 15 columns and the label
    left-aligned in 10.

    Args:
        ato (list): Words of the act, in order.
        predicao (list): Predicted labels, indexed in parallel with ``ato``.

    Raises:
        IndexError: If ``predicao`` has fewer items than ``ato``.
    """
    row_template = "{:<15} {:<10}"
    print(row_template.format('Entidade', 'Predição'))
    for position, palavra in enumerate(ato):
        # Index predicao explicitly so a short prediction list still raises.
        print(row_template.format(palavra, predicao[position]))

In [98]:
# Replace every token list with its list of per-token feature dicts, in place.
# NOTE: this cell is not idempotent. get_features calls str methods on each
# element, so running it a second time (on the dicts it produced) fails;
# re-run the tokenization cells before re-running this one.
x_train[:] = [get_features(tokens) for tokens in x_train]
x_test[:] = [get_features(tokens) for tokens in x_test]

A
D
L
P
E
N
0
U
U
9
A
f
d
a
a
d
d
I
B
A
(
I
)
e
d
d
o
p
d
S
d
R
d
P
d
G
d
D
F
,
a
S
d
C
G
S
S
o
l
n
s
C
,
c
o
e
o
R
d
P
v
a
e
a
d
m
d
i
(
c
d
t
e
d
t
)
,
c
e
e
c
c
d
A
I
d
E
.
V
e
:
R
$
1
(
c
e
s
m
,
s
e
q
r
e
v
e
s
c
)
.
T
d
L
:
M
P
.
E
d
d
:
3
.
A
d
p
d
0
,
a
0
h
.
P
n
:
0
.
O
e
p
s
r
n
e
e
w
.
I
p
t
:
(
6
)
3
.
B
,
1
d
d
d
2
K
R
D
S
R
P
P
E
N
0
U
U
9
A
D
L
O
M
,
p
m
d
P
,
t
p
a
r
d
l
p
r
d
p
,
o
a
c
d
e
e
p
o
f
d
d
e
d
m
d
1
C
S
S
d
C
p
p
a
R
d
A
d
M
d
V
(
A
'
)
n
V
P
d
M
,
c
p
n
0
.
A
p
d
c
c
o
v
e
s
,
d
a
c
o
A
.
3
d
L
n
1
e
A
.
4
d
R
d
L
e
C
d
M
.
D
e
h
l
p
o
c
d
p
:
a
a
1
d
d
7
d
j
d
2
.
O
r
E
p
s
r
n
e
e
w
e
w
,
m
p
r
n
u
p
p
d
l
.
A
L
S
C
A
D
A
D
L
S
S
P
E
N
3
P
S
:
0
.
T
:
M
P
.
O
:
R
d
P
p
f
a
d
e
p
a
a
U
B
d
S
d
S
P
d
D
F
,
a
f
d
c
c
a
a
m
,
p
e
s
d
m
c
d
D
F
,
p
a
d
d
S
d
E
d
S
P
d
D
F
.
V
E
:
S
p
a
e
e
s
a
a
o
e
d
e
d
l
.
P
:
D
e
:
3
(
t
)
d
c
d
r
d
N
d
E
o
d
a
d
c
.
V
d
a
:
1
m
.
D
C
:
1
(
c
e
v
)
d
a
p
d
d
d
a
.
D
D
S
P
:
0
a
9
h
n
w
.
U
4
.
E
e
d
n
e
a
e
n
h
t
t
p
:
/


Separate train and test splits (in order)

In [76]:
# Inspect the feature dicts built for the training sequence at position 5.
x_train[5]

[{'word': 'pregao',
  'word[-3:]': 'GAO',
  'word[-2:]': 'AO',
  'capital_letter': True,
  'word_istitle': False,
  'all_capital': True,
  'word_isdigit': False,
  'word_isalpha': True,
  'word_before': '',
  'word_before_isdigit': '',
  'word_before_isalpha': '',
  'word_before_isupper': '',
  'word_before_istitle': '',
  'word_before2': '',
  'word_before_isdigit2': '',
  'word_before_isalpha2': '',
  'word_before_isupper2': '',
  'word_before_istitle2': '',
  'word_after': 'eletronico',
  'word_after_isdigit': False,
  'word_after_isalpha': True,
  'word_after_isupper': True,
  'word_after_istitle': False,
  'word_after2': 'no',
  'word_after_isdigit2': False,
  'word_after_isalpha2': True,
  'word_after_isupper2': False,
  'word_after_istitle2': True,
  'BOS': True,
  'EOS': False},
 {'word': 'eletronico',
  'word[-3:]': 'ICO',
  'word[-2:]': 'CO',
  'capital_letter': True,
  'word_istitle': False,
  'all_capital': True,
  'word_isdigit': False,
  'word_isalpha': True,
  'word_befo

In [13]:
# import math

# split_idx = math.floor(0.8*len(x))
# x_train = x[0:split_idx]
# y_train = y[0:split_idx]
# x_test = x[split_idx:]
# y_test = y[split_idx:]
# len(x_train),  len(x_test), len(y_train)

(666, 167, 666)

In [21]:
# !pip install -U 'scikit-learn<0.24'

In [77]:
# !pip install scipy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV


# Base CRF estimator; c1 and c2 are left unset here and chosen by the search.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# Candidate c1/c2 values are drawn from exponential distributions.
params_space = {
    'c1': scipy.stats.expon(scale=0.38),
    'c2': scipy.stats.expon(scale=0.02),
}

# random_state pins the sampled candidates, so a Restart & Run All repeats
# the same search instead of drawing new c1/c2 values every time.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's own score() method — presumably token-level accuracy for
# sklearn_crfsuite.CRF; confirm this is the metric you want to optimise.
rs = RandomizedSearchCV(crf, params_space,
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=20,
                        random_state=42)
rs.fit(x_train, y_train)

# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer

# custom_scorer_f1 = make_scorer(metrics.flat_f1_score, average='macro')
# scores = cross_val_score(crf, x_train, y_train, cv=5, scoring=custom_scorer_f1)
# scores

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.3min finished


RandomizedSearchCV(cv=5,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb914be5c70>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb914c4a460>},
                   verbose=1)

In [10]:
# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

In [78]:
# Summarise the randomized search: the winning hyperparameters, their
# cross-validation score, and the best estimator's size_ divided by 1e6
# (presumably bytes, so the figure reads as megabytes — confirm).
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.5810809797851426, 'c2': 0.034040355813791626}
best CV score: 0.9178731377602617
model size: 0.27M


In [79]:
# Evaluate the best estimator found by the search on the test sequences.
# Note: this rebinds `crf`, which until now named the unfitted base estimator.
crf = rs.best_estimator_

# Report on every learned label except 'O'.
classes = list(crf.classes_)
classes.remove('O')  # raises ValueError if 'O' is not among the learned labels

y_pred = crf.predict(x_test)
# Per-label precision/recall/F1 over the flattened token sequences.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=classes, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

  B-MODALIDADE_LICITACAO      0.932     0.891     0.911       138
  I-MODALIDADE_LICITACAO      0.884     0.891     0.887       137
         B-NUM_LICITACAO      0.856     0.876     0.866       129
       B-ORGAO_LICITANTE      0.615     0.182     0.281        44
       I-ORGAO_LICITANTE      0.795     0.161     0.268       217
       B-SISTEMA_COMPRAS      0.875     0.706     0.782       109
         B-OBJ_LICITACAO      0.878     0.783     0.828       138
         I-OBJ_LICITACAO      0.797     0.893     0.843      3990
        B-VALOR_ESTIMADO      0.760     0.890     0.820        82
         B-DATA_ABERTURA      0.784     0.766     0.775       128
              B-PROCESSO      0.922     0.645     0.759       110
                   B-IOB      0.880     0.807     0.842       109
      B-NOME_RESPONSAVEL      0.316     0.900     0.468        20
      I-NOME_RESPONSAVEL      0.324     0.941     0.482        51
B-CODIGO_