In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
import glob
import xml.etree.ElementTree as ET
import nltk
import math
import numpy as np
import warnings
import mlflow
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline


warnings.filterwarnings("ignore")

---
# 1. Extraindo trechos anotados dos XMLs

### 1.1. Adiquirindo a raiz de cada XML

In [2]:
glob_path = 'xml_batch1/*.xml' # Caminho até os XMLs
roots = []
for xml in glob.glob(glob_path):
    tree = ET.parse(xml)
    roots.append(tree.getroot())

In [3]:
# Esse bloco deve demorar de alguns segundos a 1 minuto
atos_csv_dict = {}

for root in roots: # Intera sobre as raizes
    for relation in root.findall(".//relation"):                            # Intera sobre as relações.
        row_act = {}
        type_relation = relation.find('.//infon[@key="type"]').text
        for node in relation.findall('node'):                               # Intera sobre os nós(anotações) da relação.
            ref_id = node.get('refid')
            annotation = root.find(f'.//annotation[@id="{ref_id}"]')        # Encontra anotação.
            type_annotation = annotation.find('.//infon[@key="type"]').text # Encontra tipo da anotação.
            text_annotation = annotation.find('text').text                  # Encontra texto da anotação.
            row_act[type_annotation] = text_annotation
        
        if type_relation not in atos_csv_dict:                              # Checa se a tabela já existe, caso contrário, cria uma.
            atos_csv_dict[type_relation] = pd.DataFrame()
            
        atos_csv_dict[type_relation] = atos_csv_dict[type_relation].append(row_act, ignore_index=True)

In [4]:
nomeacao_df = atos_csv_dict['Ato_Nomeacao_Comissionado']

nomeacao_fields = ['nome', 'cargo_efetivo', 'matricula', 'matricula_SIAPE', 'simbolo', 'cargo_comissionado', 'hierarquia_lotacao', 'orgao', 'Ato_Nomeacao_Comissionado']

# Nessa linha todas as colunas que não pertencem a nomecao são extraidas para uma segunda lista. (Compreensão de listas).
nomeacao_non_fields = [column for column in nomeacao_df.columns if column not in nomeacao_fields]

# Exclusão de todas as linhas que possuam algum valor nos campos que não pertecem a nomeacao.
nomeacao_df = nomeacao_df[nomeacao_df[nomeacao_non_fields].isna().any(axis=1)]

# Exclusão de todas as colunas que não pertecem a nomeação.
nomeacao_df = nomeacao_df.drop(columns=nomeacao_non_fields)

# Exclusão das linhas que não possuem anotação de atos.
nomeacao_df = nomeacao_df.dropna(subset=['Ato_Nomeacao_Comissionado'])

atos_csv_dict['Ato_Nomeacao_Comissionado'] = nomeacao_df
atos_csv_dict['Ato_Nomeacao_Comissionado']

Unnamed: 0,Ato_Nomeacao_Comissionado,cargo_comissionado,hierarquia_lotacao,nome,orgao,simbolo,matricula,cargo_efetivo,matricula_SIAPE
0,NOMEAR JOSE RAIMUNDO PINTO para exercer o Carg...,Assistente,"Gerencia de Seguranca e Saude, da Diretoria de...",JOSE RAIMUNDO PINTO,\nGovernadoria do Distrito Federal,DFA-\n08,,,
1,NOMEAR JOSE RAIMUNDO PINTO para exercer o Carg...,Assistente,"Gerencia de Seguranca e Saude, da Diretoria de...",JOSE RAIMUNDO PINTO,\nGovernadoria do Distrito Federal,DFA-\n08,,,
2,NOMEAR INACIA GRACCIELLA COSTA BARROS para exe...,Chefe,"Assessoria de Comunicacao, do Gabinete, da Ad-...",INACIA GRACCIELLA COSTA BARROS,"Casa Civil, \nda Governadoria do Distrito Federal",CNE-07,,,
3,NOMEAR INACIA GRACCIELLA COSTA BARROS para exe...,Chefe,"Assessoria de Comunicacao, do Gabinete, da Ad-...",INACIA GRACCIELLA COSTA BARROS,"Casa Civil, \nda Governadoria do Distrito Federal",CNE-07,,,
4,NOMEAR JORDANA ZANFERARI para exercer o Cargo ...,Gerente,"Gerencia de Seguranca e Saude, da Diretoria de...",JORDANA ZANFERARI,Governa-\ndoria do Distrito Federal,DFG-14,,,
...,...,...,...,...,...,...,...,...,...
7682,NOMEAR o Tenente-Coronel QOBM/Comb. TITO VAZ D...,Chefe,"Secao de Ensino, Pesquisa, \nCiencia e Tecnolo...",Tenente-Coronel QOBM/Comb. TITO VAZ DE ABREU NETO,Corpo de Bombeiros \nMilitar do Distrito Federal,DFG-14,1399904,,
7683,NOMEAR DOLORES MARIA DE ALBUQUERQUE para exerc...,Agente Operacional,Diretoria de Operacoes e Defesa do Solo e da \...,DOLORES MARIA DE ALBUQUERQUE,Secretaria de Estado da \nOrdem Publica do Dis...,DFA-10,,,
7684,NOMEAR SANDRA MARIA FERREIRA para exercer o Ca...,,"Agente Operacional, da Diretoria de Operacoes ...",SANDRA MARIA FERREIRA,Secretaria de Estado da \nOrdem Publica do Dis...,DFA-10,,,
7685,NOMEAR BRUNO LUAN LIMA DOS SANTOS para exercer...,,"Agente Operacional, da Diretoria de Operacoes ...",BRUNO LUAN LIMA DOS SANTOS,Secretaria de Estado da \nOrdem Publica do Dis...,DFA-10,,,


In [5]:
for key in atos_csv_dict.keys():
    atos_csv_dict[key] = atos_csv_dict[key].dropna(subset=[key])

In [6]:
_tokenizer = nltk.RegexpTokenizer(r"\w+")
def tokenize(sentence):
    try:
        new_words = _tokenizer.tokenize(sentence)
        return new_words    
    except:
        print("SENTENCE:", sentence)
        input()
    return ''


def find_entity(row, token):
    """ ...
    
    Assumes `row` has the whole text on the first collumn
    and the remaining contain entities.
    """
    for column in row.keys()[1:]:
        if (row[column] is not np.nan 
            and token == tokenize(row[column])[0]):
            return column
    return None

# Atualizar no futuro para qualquer ato.
# Aparentemente esse algoritmo está O(n*m) onde n é a quantidade de tokens e m a quantidade de colunas do df.
def generate_IOB_labels(row):
    """Generates IOB-labeling for whole text and entities.

    Assumes `row` has the whole text on the first collumn
    and the remaining contain entities.
    """
    labels = []
    entity_started = False
    text = row.iloc[0]
    for token in tokenize(text):                         # Intera sobre cada token da anotação do ato.
        if not entity_started:                               # Caso uma entidade ainda n tenha sido identificada nos tokens.
            entity = find_entity(row, token)                 # Busca o token atual no primeiro token de todos os campos do df.
            if entity is not None:                           # Se foi encontrado o token no inicio de alguma entidade ele inicia a comparação token a token com a entidade.
                entity_started = True
                token_index = 1
                labels.append('B-' + entity)
            else:
                labels.append('O')
        else:                                                # Caso uma entidade já tenha sido identificada
            if token_index < len(tokenize(row[entity])) and token == tokenize(row[entity])[token_index]: # Checa se o próximo token pertence à entidade e se o tamanho da entidade chegou ao fim.
                labels.append('I-' + entity)                 # Se a entidade ainda possui tokens e a comparação foi bem sucedida adicione o label I.
                token_index += 1
                if token_index >= len(tokenize(row[entity])):
                    entity_started = False
            else:                                            # Se o token n for igual ou a entidade chegou ao fim.
                entity_started = False
                labels.append('O')
                
    return labels


def find_entity(row, token, ignore_idx=0):
    """Searches for named entities on columns, except by ignore_idx-columns.
    
    ignore_idx: int indicating which column has
                the TEXT where the named were extracted from
    """
    for idx, column in enumerate(row.keys()):
        if idx == ignore_idx:
            continue
        if row[column] is not np.nan and token == tokenize(row[column])[0]:
            return column
    
    return None

# Atualizar no futuro para qualquer ato.
# Complexidade: O(n*m)
# n: quantidade de tokens
# m: quantidade de colunas do df.
def generate_IOB_labels(row, idx=0):
    """Generate IOB-labels for idx-column."""
    labels = []
    entity_started = False
    text = row.iloc[idx]
    for token in tokenize(text):                         # Intera sobre cada token da anotação do ato.
        if not entity_started:                               # Caso uma entidade ainda n tenha sido identificada nos tokens.
            entity = find_entity(row, token)                 # Busca o token atual no primeiro token de todos os campos do df.
            if entity is not None:                           # Se foi encontrado o token no inicio de alguma entidade ele inicia a comparação token a token com a entidade.
                entity_started = True
                token_index = 1
                labels.append('B-' + entity)
            else:
                labels.append('O')
        else:                                                # Caso uma entidade já tenha sido identificada
            if token_index < len(tokenize(row[entity])) and token == tokenize(row[entity])[token_index]: # Checa se o próximo token pertence à entidade e se o tamanho da entidade chegou ao fim.
                labels.append('I-' + entity)                 # Se a entidade ainda possui tokens e a comparação foi bem sucedida adicione o label I.
                token_index += 1
                if token_index >= len(tokenize(row[entity])):
                    entity_started = False
            else:                                            # Se o token n for igual ou a entidade chegou ao fim.
                entity_started = False
                labels.append('O')
                
    return labels


### 3.2. Agora, com as funções de geração label prontas iremos criar uma lista de strings representando os labels para adicionar ao df de cada ato.

---
# 4. Criação das features e treinamento do CRF à moda José

In [7]:
# %%time
def extract_features(sentence):
    sentence_features = []
    for j, sent in enumerate(sentence):
        word_feat = {
                'word': sent.lower(),
                'capital_letter': sent[0].isupper(),
                'all_capital': sent.isupper(),
                'isdigit': sent.isdigit(),
                'word_before': sent.lower() if j==0 else sentence[j-1].lower(),
                'word_after:': sent.lower() if j+1>=len(sentence) else sentence[j+1].lower(),
                'BOS': j==0,
                'EOS': j==len(sentence)-1
        }
        sentence_features.append(word_feat)
    return sentence_features


def extract_rows_features(arq, idx=0):
    """
    Tokenizes then extract features from idx-column.
    """
    return arq.iloc[:, idx].map(tokenize).map(extract_features)



class IOB_Transformer:
    def __init__(self, idx=''):
#         self.idx = idx
        pass
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, df):
        labels_row = []
        for index, row in df.iterrows():
            try:
                labels_row.append(' '.join(generate_IOB_labels(row)))
            except Exception as e:
                print(row)
                raise e

        return pd.Series(labels_row).str.split()


    
class FeatureTransformer:
    def __init__(self, key=''):
        self.key = key
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, df):        
        return extract_rows_features(df)


In [8]:
atos_csv_dict['Ato_Cessao'].columns

Index(['Ato_Cessao', 'cargo_efetivo', 'cargo_orgao_cessionario',
       'fundamento_legal', 'matricula', 'nome', 'onus', 'orgao_cedente',
       'orgao_cessionario', 'processo_SEI', 'simbolo', 'vigencia',
       'hierarquia_lotacao'],
      dtype='object')

In [11]:
ss=atos_csv_dict['Ato_Cessao'].iloc[0]

In [13]:
open('cessao_exemplo.csv', 'w').write(ss.to_csv())

1524

In [23]:
len(ss.values), len(ss.index)

(13, 13)

In [27]:
df = pd.DataFrame(
    data=[ss.values], columns=ss.index
)
df.to_csv('cessao_exemplo.csv')

In [39]:
import json
for v in df.iloc[0]:
    print(json.dumps(v), end=',\n')

"Processo:060.011.183/2014. Interessado: FERNANDES BARNABE DA SILVA. Assunto: \nCESSAO DE SERVIDOR.\nAUTORIZO, com base no Decreto no 35.403 7/5/2014, combinado com o art. 152 \nda Lei Complementar no 840 de 23/12/2011, a cessao de FERNANDES BARNABE \nDA SILVA, Tecnico Administrativo, matricula 122.711-4, da Secretaria de Estado de \nSaude do Distrito Federal a Secretaria de Estado de Educacao do Distrito Federal, \npara exercer o cargo de Assessor da Unidade de Administracao Geral da Fundacao \nUniversidade Aberta do Distrito Federal  UAG/FUNAB, simbolo DFG-14, com onus \npara o orgao de origem.\nEm conformidade com a Lei Complementar no 840, de 23 de Dezembro de 2011, art.153 inci-\nsos I e II, a cessao termina com a exoneracao do cargo para o qual o servidor foi cedido ou a \nrevogacao pela autoridade cedente.\nPublique-se e encaminhe-se a Secretaria de Estado de Saude do Distrito Federal, para \nas providencias pertinentes.",
"Tecnico Administrativo",
"Assessor da Unidade de Admini

In [36]:
v.shape

(13,)

In [14]:
pd.read_csv('cessao_exemplo.csv')

Unnamed: 0.1,Unnamed: 0,0
0,Ato_Cessao,Processo:060.011.183/2014. Interessado: FERNAN...
1,cargo_efetivo,Tecnico Administrativo
2,cargo_orgao_cessionario,Assessor da Unidade de Administracao Geral da ...
3,fundamento_legal,"Decreto no 35.403 7/5/2014, combinado com o ar..."
4,matricula,122.711-4
5,nome,FERNANDES BARNABE \nDA SILVA
6,onus,para o orgao de origem
7,orgao_cedente,Secretaria de Estado de \nSaude do Distrito Fe...
8,orgao_cessionario,Secretaria de Estado de Educacao do Distrito F...
9,processo_SEI,060.011.183/2014


In [11]:
for k, v in atos_csv_dict.items():
    if 'labels_IOB' in v.columns:
        v.pop('labels_IOB')


In [12]:
atos_csv_dict['Ato_Cessao'].columns

Index(['Ato_Cessao', 'cargo_efetivo', 'cargo_orgao_cessionario',
       'fundamento_legal', 'matricula', 'nome', 'onus', 'orgao_cedente',
       'orgao_cessionario', 'processo_SEI', 'simbolo', 'vigencia',
       'hierarquia_lotacao'],
      dtype='object')

In [18]:
mlflow.end_run()

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


In [19]:
# %%time
# mlflow.set_registry_uri("")
mlflow.set_tracking_uri("sqlite:///localhost:5000")
training_ratio = 0.7
try:
    mlflow.end_run()
except:
    pass
run = mlflow.start_run()
print("RUN:", run.info.run_id)

model = sklearn_crfsuite.CRF(
    algorithm = 'l2sgd', 
    c2=1,
    max_iterations=10, 
    all_possible_transitions=True,
    verbose=False
)

pipe = Pipeline([
    ('featurizer', FeatureTransformer()),
    ('model', model)
])

for key in atos_csv_dict.keys():
    sz = len(atos_csv_dict[key])
    limiar = math.floor(training_ratio*sz)
    print("------------------------------------------------------------------------------------")
    print("Ato:" + key)
    print("Tamanho: " + str(sz))
    df = atos_csv_dict[key].copy()
    
    if sz < 10:
        df_train, df_test = df.iloc[:limiar, :], df.iloc[limiar:, :]
        pipe.fit(
            df_train,
            IOB_Transformer().transform(df_train),
        );
        y_pred = pipe.predict(df_test)
        test_y = IOB_Transformer().transform(df_test)

        labels = list(pipe.classes_)
        labels.remove('O')

        f1 = metrics.flat_f1_score(test_y, y_pred, 
                              average='weighted', labels=labels)
    

        print(f1)
        print(metrics.flat_classification_report(
            test_y, y_pred, labels=labels, digits=3
        ))
        model = pipe

    else:
        X, y = df, IOB_Transformer().transform(df)
        res = cross_validate(
            pipe, X, y,
            cv = 3, return_estimator=True
        )
        model = res['estimator'][np.argmax(res['test_score'])]
        print(res)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=f"model-{key}",
        registered_model_name=key,
    )

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


RUN: 6fd0b1b454c24bbd9cb87082d398d904
------------------------------------------------------------------------------------
Ato:Ato_Exoneracao_Comissionado
Tamanho: 755


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Exoneracao_Comissionado'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


{'fit_time': array([1.73572803, 1.71416402, 1.8003242 ]), 'score_time': array([0.1381259 , 0.12973213, 0.08950257]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83e9c70>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83e9ac0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83e9670>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.88073691, 0.91179742, 0.91

2021/04/09 08:49:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Exoneracao_Comissionado, version 1
Created version '1' of model 'Ato_Exoneracao_Comissionado'.


------------------------------------------------------------------------------------
Ato:Ato_Nomeacao_Comissionado
Tamanho: 619


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Nomeacao_Comissionado'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Nomeacao_Comissionado, version 1


{'fit_time': array([0.85650086, 0.88541746, 0.8450098 ]), 'score_time': array([0.08039355, 0.0732193 , 0.07158875]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc31e2400>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc31e2310>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc31e2070>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.90603883, 0.90594945, 0.90

Created version '1' of model 'Ato_Nomeacao_Comissionado'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Tornado_Sem_Efeito_Exo_Nom'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Tornado_Sem_Efeito_Exo_Nom, version 1


{'fit_time': array([0.18725252, 0.22993183, 0.25142217]), 'score_time': array([0.01307011, 0.01447725, 0.01560259]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83609a0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8360490>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8360430>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.82083662, 0.8160834 , 0.91

Created version '1' of model 'Ato_Tornado_Sem_Efeito_Exo_Nom'.


------------------------------------------------------------------------------------
Ato:Ato_Exoneracao_Efetivo
Tamanho: 79


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Exoneracao_Efetivo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Exoneracao_Efetivo, version 1


{'fit_time': array([0.3453145 , 0.35548115, 0.33597589]), 'score_time': array([0.03133154, 0.02397823, 0.02232933]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8568ac0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8568df0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8568a00>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.96309963, 0.98827335, 0.87

Created version '1' of model 'Ato_Exoneracao_Efetivo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Retificacao_Comissionado'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Retificacao_Comissionado, version 1


{'fit_time': array([0.14762449, 0.16361856, 0.1792922 ]), 'score_time': array([0.01174927, 0.0138483 , 0.01304817]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc3fc8a00>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc3fc8220>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc8b758e0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.72805508, 0.66064415, 0.78

Created version '1' of model 'Ato_Retificacao_Comissionado'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Substituicao'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Substituicao, version 1


{'fit_time': array([1.3461411 , 1.34763193, 1.54343343]), 'score_time': array([0.0775249 , 0.07454276, 0.07257032]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83d1b80>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc329e760>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc2f26bb0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.84641594, 0.8557037 , 0.74

Created version '1' of model 'Ato_Substituicao'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Cessao'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Cessao, version 1


{'fit_time': array([0.48203325, 0.59248304, 0.57789063]), 'score_time': array([0.03817201, 0.03195262, 0.03377843]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc83a16a0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc98f8d30>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc867ff70>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.75240595, 0.6290886 , 0.72

Created version '1' of model 'Ato_Cessao'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Retificacao_Efetivo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Retificacao_Efetivo, version 1


{'fit_time': array([1.01865315, 0.8147068 , 0.46663809]), 'score_time': array([0.04298353, 0.04095507, 0.04090142]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc84f7e50>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc4169a90>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc4169dc0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.69959812, 0.58222491, 0.58

Created version '1' of model 'Ato_Retificacao_Efetivo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Nomeacao_Efetivo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Nomeacao_Efetivo, version 1


0.0
                                precision    recall  f1-score   support

            B-edital_normativo      0.000     0.000     0.000         2
            I-edital_normativo      0.000     0.000     0.000         4
B-numero_dodf_edital_normativo      0.000     0.000     0.000         3
  B-data_dodf_edital_normativo      0.000     0.000     0.000         3
  I-data_dodf_edital_normativo      0.000     0.000     0.000        12
 B-numero_dodf_resultado_final      0.000     0.000     0.000         3
   B-data_dodf_resultado_final      0.000     0.000     0.000         3
   I-data_dodf_resultado_final      0.000     0.000     0.000        12
                       B-cargo      0.000     0.000     0.000         3
                    B-carreira      0.000     0.000     0.000         3
                       B-orgao      0.000     0.000     0.000         0
                       I-orgao      0.000     0.000     0.000         0
                   B-candidato      0.000     0.000     0.0

Created version '1' of model 'Ato_Nomeacao_Efetivo'.


------------------------------------------------------------------------------------
Ato:Ato_Abono_Permanencia
Tamanho: 33


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Abono_Permanencia'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


{'fit_time': array([0.17945194, 0.12030792, 0.23139238]), 'score_time': array([0.01082611, 0.03179932, 0.01176691]), 'estimator': (Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc4190fa0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc4190e80>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))]), Pipeline(steps=[('featurizer',
                 <__main__.FeatureTransformer object at 0x7f4bc4190eb0>),
                ('model',
                 CRF(algorithm='l2sgd', all_possible_transitions=True, c2=1,
                     keep_tempfiles=None, max_iterations=10))])), 'test_score': array([0.70177074, 0.5326138 , 0.61

2021/04/09 08:49:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Abono_Permanencia, version 1
Created version '1' of model 'Ato_Abono_Permanencia'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Tornado_Sem_Efeito_Apo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


------------------------------------------------------------------------------------
Ato:Ato_Tornado_Sem_Efeito_Apo
Tamanho: 2
0.0
                    precision    recall  f1-score   support

  B-tipo_documento      0.000     0.000     0.000         1
  I-tipo_documento      0.000     0.000     0.000         0
  B-data_documento      0.000     0.000     0.000         1
  I-data_documento      0.000     0.000     0.000         2
     B-numero_dodf      0.000     0.000     0.000         1
       B-data_dodf      0.000     0.000     0.000         1
       I-data_dodf      0.000     0.000     0.000         2
     B-pagina_dodf      0.000     0.000     0.000         1
            B-nome      0.000     0.000     0.000         1
            I-nome      0.000     0.000     0.000         3
       B-matricula      0.000     0.000     0.000         1
       I-matricula      0.000     0.000     0.000         2
   B-cargo_efetivo      0.000     0.000     0.000         0
   I-cargo_efetivo      0.00

2021/04/09 08:49:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Tornado_Sem_Efeito_Apo, version 1
Created version '1' of model 'Ato_Tornado_Sem_Efeito_Apo'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


------------------------------------------------------------------------------------
Ato:Ato_Reversao
Tamanho: 8
0.9139072847682119
                    precision    recall  f1-score   support

          B-quadro      0.000     0.000     0.000         0
        B-vigencia      1.000     1.000     1.000         3
        I-vigencia      1.000     1.000     1.000        12
   B-cargo_efetivo      1.000     1.000     1.000        12
       B-matricula      0.000     0.000     0.000         3
       I-matricula      0.000     0.000     0.000         0
          I-quadro      0.000     0.000     0.000         0
B-fundamento_legal      1.000     1.000     1.000         3
I-fundamento_legal      1.000     1.000     1.000        60
          B-motivo      1.000     1.000     1.000         3
          I-motivo      1.000     1.000     1.000        21
   I-cargo_efetivo      1.000     1.000     1.000        24
            B-nome      0.000     0.000     0.000         3
            I-nome      0.0

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Ato_Reversao'.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2021/04/09 08:49:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ato_Reversao, version 1


CPU times: user 39.9 s, sys: 161 ms, total: 40 s
Wall time: 40.4 s


Created version '1' of model 'Ato_Reversao'.


In [21]:
mlflow.end_run()