### Carregando os atos de Licitação


In [1]:
import pandas as pd
import sklearn_crfsuite
import nltk
from nltk.tokenize import word_tokenize

path_data = pd.read_csv('/home/thais/Downloads/licitacao_dodf_watcher.csv')
data_train = pd.DataFrame(path_data)

### Carregando modelo

In [2]:
import pickle
import nltk

with open('licitacao.pkl' , 'rb') as f:
    lr = pickle.load(f)

In [3]:
def get_features(sentence):
    """Create features for each word in act.
    Create a list of dict of words features to be used in the predictor module.
    Args:
        act (list): List of words in an act.
    Returns:
        A list with a dictionary of features for each of the words.
    """
    sent_features = []
    for i in range(len(sentence)):
        word_feat = {
            'word': sentence[i].lower(),
            'word[-3:]': sentence[i][-3:],
            'word[-2:]': sentence[i][-2:],
            'capital_letter': sentence[i][0].isupper(),
            'word_istitle': sentence[i].istitle(),
            'all_capital': sentence[i].isupper(),
            'word_isdigit': sentence[i].isdigit(),
            # Uma palavra antes
            'word_before': '' if i == 0 else sentence[i-1].lower(),
            'word_before_isdigit': '' if i == 0 else sentence[i-1].isdigit(),
            'word_before_isupper': '' if i == 0 else sentence[i-1].isupper(),
            'word_before_istitle': '' if i == 0 else sentence[i-1].istitle(),
            # Duas palavras antes
            'word_before2': '' if i in [0, 1] else sentence[i-2].lower(),
            'word_before_isdigit2': '' if i in [0, 1] else sentence[i-1].isdigit(),
            'word_before_isupper2': '' if i in [0, 1] else sentence[i-1].isupper(),
            'word_before_istitle2': '' if i in [0, 1] else sentence[i-1].istitle(),
            # Uma palavra depois
            'word_after': '' if i+1 >= len(sentence) else sentence[i+1].lower(),
            'word_after_isdigit': '' if i+1 >= len(sentence) else sentence[i+1].isdigit(),
            'word_after_isupper': '' if i+1 >= len(sentence) else sentence[i+1].isupper(),
            'word_after_istitle': '' if i+1 >= len(sentence) else sentence[i+1].istitle(),
            # Duas palavras depois
            'word_after2': '' if i+2 >= len(sentence) else sentence[i+2].lower(),
            'word_after_isdigit2': '' if i+2 >= len(sentence) else sentence[i+2].isdigit(),
            'word_after_isupper2': '' if i+2 >= len(sentence) else sentence[i+2].isupper(),
            'word_after_istitle2': '' if i+2 >= len(sentence) else sentence[i+2].istitle(),
            
            'BOS': i == 0,
            'EOS': i == len(sentence)-1
        }
        sent_features.append(word_feat)
    return sent_features

# Concatena cada palavra do texto do ato com sua respectiva anotação de entidade

def concatenaPredicao(ato,  predicao):
    print ("{:<15} {:<10}".format('Entidade','Predição'))
    for i in range(len(ato)):
        print ("{:<15} {:<10}".format( ato[i], predicao[i]))
        # print(ato[i] + '------' + predicao[i])
        # print(predicao[i]) + '[' + predicao[i] + ']' , end=" ")

In [4]:
import numpy as np

def predictions_dict(model, act, prediction):
    """Create dictionary of proprieties.
    Create dictionary of tags to save predicted entities.
    Args:
        sentence (list): List of words and tokens in the act.
        prediction ([type]): The correspondent predicitons for each
                             word in the sentence.
    Returns:
        A dictionary of the proprieties found.
    """


    dict_ato = {}
    for klass in model.classes_:
        if klass == 'O':
            continue
        dict_ato[klass[2:]] = []

    current = ''
    count = 0
    pred_start = 0

    for i,_ in enumerate(prediction):

        if prediction[i][0] == 'I':
            count += 1

        elif prediction[i][0] == 'B':
            pred_start = i

        elif prediction[i][0] == 'O' and pred_start:
            current = prediction[pred_start][2:]
            entidade = ' '.join(act[pred_start:i])
            dict_ato[current] = entidade
            pred_start = 0
            count = 0
        else:
            continue
           
    for key, val in dict_ato.items():
        if len(val) == 0:
            dict_ato[key] = np.nan
        elif len(val) == 1:
            dict_ato[key] = val[0]
    return dict_ato

def add_standard_props(act, capitalize=False):
    standard_props = _standard_props()

    if capitalize:
        standard_props = {(key.capitalize()):val for key, val in standard_props.items()}

    act = {**act, **(standard_props)}
    return act

In [5]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd


tokens = [str(data_train['texto'][i]) for i in range(len(data_train['texto']))]

atos_tokens_lista = [word_tokenize(tokens[i]) for i in range(len(tokens))]

# atos_tokens_lista = [word_tokenize(data_train['texto'][i]) for i in range(len(data_train['texto']))]
atos_features_lista = [get_features(atos_tokens_lista[i]) for i in range(len(atos_tokens_lista))]

predicao = lr.predict(atos_features_lista)

In [6]:
_acts = []

for i in range(len(predicao)):
    predicted_dict = predictions_dict(lr, atos_tokens_lista[i], predicao[i])
    predicted_dict['TEXTO'] = tokens[i] 
    _acts.append(predicted_dict)

data_frame = pd.DataFrame.from_dict(_acts)

# data_frame.to_csv('licitacoes_previstas.csv')


In [15]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2076 entries, 0 to 2075
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   MODALIDADE_LICITACAO    293 non-null    object
 1   NUM_LICITACAO           1187 non-null   object
 2   ORGAO_LICITANTE         244 non-null    object
 3   SISTEMA_COMPRAS         1083 non-null   object
 4   OBJ_LICITACAO           1898 non-null   object
 5   VALOR_ESTIMADO          1233 non-null   object
 6   DATA_ABERTURA           1318 non-null   object
 7   PROCESSO                606 non-null    object
 8   IOB                     1583 non-null   object
 9   NOME_RESPONSAVEL        10 non-null     object
 10  CODIGO_SISTEMA_COMPRAS  886 non-null    object
 11  TEXTO                   2076 non-null   object
dtypes: object(12)
memory usage: 194.8+ KB
