In [4]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
# nltk.download('punkt')

In [13]:
# Load the combined acts table from the local parquet file into `df`.
df = pd.read_parquet('parquet/all_acts_200x_2018_2020.parquet')

In [14]:
# Show the distinct values of the 'ato' column; the cells below split `df` on them.
df['ato'].unique()

array(['AVISO_LICITACAO', 'EXTRATO_CONTRATO_CONVENIO',
       'EXTRATO_ADITAMENTO_CONTRATUAL', 'AVISO_SUSPENSAO_LICITACAO',
       'AVISO_ANUL_REV_LICITACAO', 'EXTRATO_CONTRATO', 'EXTRATO_CONVENIO'],
      dtype=object)

In [15]:
# Split the full table into one frame per act family. Each frame gets a fresh
# 0..n-1 index so that later cells can address rows by position.
lic = df.loc[df['ato'] == 'AVISO_LICITACAO'].reset_index(drop=True)

sus = df.loc[df['ato'] == 'AVISO_SUSPENSAO_LICITACAO'].reset_index(drop=True)

anr = df.loc[df['ato'] == 'AVISO_ANUL_REV_LICITACAO'].reset_index(drop=True)

adi = df.loc[df['ato'] == 'EXTRATO_ADITAMENTO_CONTRATUAL'].reset_index(drop=True)

# The three contract / covenant act values are grouped into a single frame.
con = df.loc[
    df['ato'].isin([
        'EXTRATO_CONTRATO',
        'EXTRATO_CONVENIO',
        'EXTRATO_CONTRATO_CONVENIO',
    ])
].reset_index(drop=True)


### Licitação

In [19]:
# Columns that are always kept.
# NOTE(review): the scan below starts at position 5, so this assumes these five
# names are the first five columns of `lic` -- TODO confirm.
ent = ['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text']

# Count the nulls once for the whole frame instead of once per loop iteration.
null_counts = lic.isnull().sum()
n_rows = len(lic)

# From position 5 on, keep only the columns that are not entirely null.
ent += [
    column
    for column, n_null in zip(lic.columns[5:], null_counts.iloc[5:])
    if n_null != n_rows
]

lic = lic[ent]

### Suspensão

In [20]:
# Columns that are always kept.
# NOTE(review): the scan below starts at position 5, so this assumes these five
# names are the first five columns of `sus` -- TODO confirm.
ent = ['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text']

# Count the nulls once for the whole frame instead of once per loop iteration.
null_counts = sus.isnull().sum()
n_rows = len(sus)

# From position 5 on, keep only the columns that are not entirely null.
ent += [
    column
    for column, n_null in zip(sus.columns[5:], null_counts.iloc[5:])
    if n_null != n_rows
]

sus = sus[ent]

### Anulação e Revogação

In [21]:
# Columns that are always kept.
# NOTE(review): the scan below starts at position 5, so this assumes these five
# names are the first five columns of `anr` -- TODO confirm.
ent = ['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text']

# Count the nulls once for the whole frame instead of once per loop iteration.
null_counts = anr.isnull().sum()
n_rows = len(anr)

# From position 5 on, keep only the columns that are not entirely null.
ent += [
    column
    for column, n_null in zip(anr.columns[5:], null_counts.iloc[5:])
    if n_null != n_rows
]

anr = anr[ent]

### Aditamento

In [22]:
# Columns that are always kept.
# NOTE(review): the scan below starts at position 5, so this assumes these five
# names are the first five columns of `adi` -- TODO confirm.
ent = ['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text']

# Count the nulls once for the whole frame instead of once per loop iteration.
null_counts = adi.isnull().sum()
n_rows = len(adi)

# From position 5 on, keep only the columns that are not entirely null.
ent += [
    column
    for column, n_null in zip(adi.columns[5:], null_counts.iloc[5:])
    if n_null != n_rows
]

adi = adi[ent]

### Contrato e Convênio

In [23]:
# Columns that are always kept.
# NOTE(review): the scan below starts at position 5, so this assumes these five
# names are the first five columns of `con` -- TODO confirm.
ent = ['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text']

# Count the nulls once for the whole frame instead of once per loop iteration.
null_counts = con.isnull().sum()
n_rows = len(con)

# From position 5 on, keep only the columns that are not entirely null.
ent += [
    column
    for column, n_null in zip(con.columns[5:], null_counts.iloc[5:])
    if n_null != n_rows
]

con = con[ent]

In [26]:
# Add (or overwrite) a NOME_RESPONSAVEL column on `con`, filled entirely with NaN.
# NOTE(review): presumably so `con` exposes this column like other act tables do
# -- TODO confirm the intent.
con['NOME_RESPONSAVEL'] = np.nan

In [27]:
# Display the column labels of `con` after the steps above.
con.columns

Index(['arquivo_rast', 'text', 'ato', 'dodf', 'treated_text', 'PROCESSO',
       'CONTRATANTE', 'CONTRATADA', 'OBJ_AJUSTE', 'VIGENCIA', 'VALOR', 'PT',
       'DATA_ASSINATURA', 'CODIGO_UO', 'ND', 'NE', 'FUND_DISPENSA',
       'ORGAO_LICITANTE', 'NUM_LICITACAO', 'NUM_CONTRATO', 'IDENT_DISPENSA',
       'CONVENENTE', 'OG_ATA', 'FONTE_RECURSO', 'CNPJ_CONTRATADA',
       'CONCEDENTE', 'CNPJ_CONVENENTE', 'CNPJ_CONTRATANTE', 'CNPJ_CONCEDENTE',
       'NOME_RESPONSAVEL'],
      dtype='object')

## IOB

In [24]:
class Tokenizer(TransformerMixin, BaseEstimator):
    """Transformer that applies a tokenizer callable to every element of a Series.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(x, **kw_params)`` for each element. The default
        ``''`` is only a placeholder kept for backward compatibility; it is not
        callable, so a real tokenizer must be supplied before the instance is
        used.
    """

    def __init__(self, tokenizer=''):
        self.tokenizer = tokenizer


    def __call__(self, X, **kw_params):
        """Tokenize a single element `X` with the configured tokenizer.

        Raises
        ------
        TypeError
            If the configured tokenizer is not callable (e.g. the default
            placeholder was never replaced).
        """
        if not callable(self.tokenizer):
            # Same exception type as calling the placeholder directly, but
            # with a message that points at the actual problem.
            raise TypeError(
                "Tokenizer requires a callable `tokenizer`; "
                f"got {type(self.tokenizer).__name__}"
            )
        return self.tokenizer(X, **kw_params)


    def fit(self, X, y=None, **fit_params):
        """No-op: nothing is learned from the data. Returns `self`."""
        return self


    def transform(self, X, **kw_params):
        """Tokenize every element of `X`.

        `X` is wrapped in a ``pd.Series`` when it is not one already. Extra
        keyword arguments are forwarded to the tokenizer for each element.
        Returns a ``pd.Series`` holding the tokenizer's result per element.
        """
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        if kw_params:
            return X.map(lambda element: self(element, **kw_params))
        return X.map(self)


from sklearn.base import BaseEstimator, TransformerMixin

class IOBifyer(TransformerMixin, BaseEstimator):
    """Derive IOB labels for a text column from the other columns of each row.

    For every row, the text in the target column is tokenized with nltk's
    ``word_tokenize`` and each token receives ``'B-<column>'``,
    ``'I-<column>'`` or ``'O'`` depending on whether it matches the tokens of
    a string value stored in one of the row's other columns.

    Parameters
    ----------
    column : str or int
        Label or integer position of the text column in the frame passed to
        `transform`.
    tokenizer : object
        Stored and passed along for interface compatibility. It is currently
        not used: tokenization is always done with ``word_tokenize``.

    Attributes
    ----------
    dbg : dict
        Debug information filled by `generate_IOB_labels`; key ``'l'`` holds
        ``(row, idx)`` pairs for rows whose first label is not ``'O'``.

    NOTE(review): known limitations of the matching, kept as-is so existing
    labels do not change:
      * an entity is located only by equality of its first token, with no
        character offsets;
      * the token right after a single-token entity, or right after a
        partially matched entity, is labelled 'O' without being tested as
        the start of another entity.
    """

    @staticmethod
    def _entity_tokens(row, ignore_idx=0):
        """Tokenize the string-valued cells of `row` once.

        Returns a dict mapping column label -> list of tokens, in column
        order. The column at position `ignore_idx`, non-string cells and
        cells that produce no tokens are left out.
        """
        tokens_by_column = {}
        for position, column in enumerate(row.keys()):
            if position == ignore_idx:
                continue
            value = row[column]
            if not isinstance(value, str):
                continue
            tokens = word_tokenize(value)
            # An empty or whitespace-only value has no first token to match.
            if tokens:
                tokens_by_column[column] = tokens
        return tokens_by_column


    @staticmethod
    def find_entity(row, token, ignore_idx=0,
        tokenizer=''):
        """Return the first column whose value starts with `token`, or None.

        Columns are scanned in order; the column at position `ignore_idx` is
        skipped. `tokenizer` is accepted for compatibility and ignored.
        """
        # TODO: accept an offset option so entities are located unambiguously.
        for column, tokens in IOBifyer._entity_tokens(row, ignore_idx).items():
            if tokens[0] == token:
                return column

        return None


    @staticmethod
    def generate_IOB_labels(row, idx, tokenizer, dbg=None):
        """Build the list of IOB labels for the text at position `idx` of `row`.

        Parameters
        ----------
        row : pd.Series
            One row: the text cell plus the candidate entity cells.
        idx : int
            Position of the text cell inside `row`.
        tokenizer : object
            Accepted for compatibility and ignored.
        dbg : dict, optional
            When given, ``(row, idx)`` is appended to ``dbg['l']`` if the
            first label is not ``'O'``.

        Returns
        -------
        list of str
            One label per token of the text.
        """
        # A fresh dict per call; a shared mutable default would leak state
        # between unrelated calls.
        if dbg is None:
            dbg = {}

        # Tokenize the entity values once per row instead of once per token.
        entity_tokens = IOBifyer._entity_tokens(row, idx)

        labels = []
        entity = None          # column label of the entity being matched
        current = None         # its token list while a match is in progress
        token_index = 0        # next position expected inside `current`
        for token in word_tokenize(row.iloc[idx]):
            if current is None:
                entity = next(
                    (column for column, tokens in entity_tokens.items()
                     if tokens[0] == token),
                    None,
                )
                if entity is None:
                    labels.append('O')
                else:
                    labels.append('B-' + entity)
                    current = entity_tokens[entity]
                    token_index = 1
            elif token_index < len(current) and token == current[token_index]:
                labels.append('I-' + entity)
                token_index += 1
                if token_index >= len(current):
                    current = None
            else:
                # Mismatch (or entity exhausted): close it and emit 'O'.
                current = None
                labels.append('O')

        # `labels` is empty when the text has no tokens.
        if labels and labels[0] != 'O':
            dbg.setdefault('l', []).append((row, idx))

        return labels


    @staticmethod
    def dump_iob(tokens_mat, labels_mat, path='dump.txt',
                            sep=' X X ', sent_sep='\n',):
        """Write token/label pairs to `path`, one pair per line.

        Parameters
        ----------
        tokens_mat, labels_mat : iterable of iterables
            Parallel sequences of sentences (tokens and their labels).
        path : str or os.PathLike
            Output file; its parent directory is created when missing.
        sep : str
            Text written between a token and its label.
        sent_sep : str
            Text written after the last pair of each sentence.

        Returns
        -------
        list of list of tuple
            The ``(token, label)`` pairs that were written, per sentence.
        """
        # os.fspath handles both str and pathlib.Path without needing the
        # `Path` name, which this file never imports.
        path = os.fspath(path)
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        dbg_mat = []
        # Explicit UTF-8 so accented tokens do not depend on the locale.
        with open(path, 'w', encoding='utf-8') as fp:
            for tokens_lis, labels_lis in zip(tokens_mat, labels_mat):
                pairs = list(zip(tokens_lis, labels_lis))
                dbg_mat.append(pairs)
                for token, label in pairs:
                    fp.write(f"{token}{sep}{label}\n")
                fp.write(sent_sep)
        return dbg_mat


    def __init__(self, column='act_column',
        tokenizer=''):
        self.column = column
        self.tokenizer = tokenizer
        self.dbg = {}


    def fit(self, X=None, y=None, **fit_params):
        """No-op: nothing is learned from the data. Returns `self`."""
        return self


    def transform(self, df):
        """Return a Series with one list of IOB labels per row of `df`.

        The result has a default 0..n-1 index regardless of `df`'s index.

        Raises
        ------
        TypeError
            If `df` is not a ``pd.DataFrame``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"`df` expected to be a pd.DataFrame. Got {type(df)}")
        if df.empty:
            print("[core.preprocess]Warning: empty DataFrame. There won't be ioblabels.")
            # Explicit dtype: a bare pd.Series() warns on older pandas.
            return pd.Series(dtype=object)

        if isinstance(self.column, int):
            idx = self.column
        else:
            idx = df.columns.get_loc(self.column)

        labels_row = []
        for _, row in df.iterrows():
            try:
                labels_row.append(
                    IOBifyer.generate_IOB_labels(
                        row, idx, self.tokenizer, self.dbg
                    )
                )
            except Exception:
                # Show the offending row, then let the original error propagate.
                print("problem iobifyin row:", row)
                raise
        return pd.Series(labels_row)


In [28]:
# Bidding notices (AVISO_LICITACAO)
# NOTE(review): the row labelled 166 is dropped without a recorded reason, and
# this cell is not idempotent -- after the reset_index a re-run drops whichever
# row is labelled 166 by then. TODO: document why and select the row by content.
lic = lic.drop(166).reset_index(drop=True)

# Columns from position 4 on are handed to the IOBifyer; 'treated_text' must be
# among them because it is the text column named below.
lic_iob = lic[list(lic.columns[4:])]

iob_lic = IOBifyer(column='treated_text')
r_lic = iob_lic.transform(lic_iob)

# Store each row's labels as one space-separated string. Assigning the whole
# column at once (positionally) avoids creating a float NaN column and then
# writing strings into it row by row.
lic["IOB"] = r_lic.map(' '.join).to_numpy()

In [29]:
# Suspension notices (AVISO_SUSPENSAO_LICITACAO)
# Columns from position 4 on are handed to the IOBifyer; 'treated_text' must be
# among them because it is the text column named below.
sus_iob = sus[list(sus.columns[4:])]

iob_sus = IOBifyer(column='treated_text')
r_sus = iob_sus.transform(sus_iob)

# Store each row's labels as one space-separated string. Assigning the whole
# column at once (positionally) avoids creating a float NaN column and then
# writing strings into it row by row.
sus["IOB"] = r_sus.map(' '.join).to_numpy()

In [30]:
# Annulment and revocation notices (AVISO_ANUL_REV_LICITACAO)
# Columns from position 4 on are handed to the IOBifyer; 'treated_text' must be
# among them because it is the text column named below.
anr_iob = anr[list(anr.columns[4:])]

iob_anr = IOBifyer(column='treated_text')
r_anr = iob_anr.transform(anr_iob)

# Store each row's labels as one space-separated string. Assigning the whole
# column at once (positionally) avoids creating a float NaN column and then
# writing strings into it row by row.
anr["IOB"] = r_anr.map(' '.join).to_numpy()

In [31]:
# Contract amendments (EXTRATO_ADITAMENTO_CONTRATUAL)
# Columns from position 4 on are handed to the IOBifyer; 'treated_text' must be
# among them because it is the text column named below.
adi_iob = adi[list(adi.columns[4:])]

iob_adi = IOBifyer(column='treated_text')
r_adi = iob_adi.transform(adi_iob)

# Store each row's labels as one space-separated string. Assigning the whole
# column at once (positionally) avoids creating a float NaN column and then
# writing strings into it row by row.
adi["IOB"] = r_adi.map(' '.join).to_numpy()

In [32]:
# Contracts and covenants (EXTRATO_CONTRATO / EXTRATO_CONVENIO / EXTRATO_CONTRATO_CONVENIO)
# Columns from position 4 on are handed to the IOBifyer; 'treated_text' must be
# among them because it is the text column named below.
con_iob = con[list(con.columns[4:])]

iob_con = IOBifyer(column='treated_text')
r_con = iob_con.transform(con_iob)

# Store each row's labels as one space-separated string. Assigning the whole
# column at once (positionally) avoids creating a float NaN column and then
# writing strings into it row by row.
con["IOB"] = r_con.map(' '.join).to_numpy()

In [34]:
# Spot-check: labels and source text of the row labelled 0 in `anr`.
anr.loc[0, 'IOB'], anr.loc[0, 'treated_text']

('B-IDENT_REVOGACAO_ANULACAO I-IDENT_REVOGACAO_ANULACAO I-IDENT_REVOGACAO_ANULACAO B-MODALIDADE_LICITACAO I-MODALIDADE_LICITACAO O B-NUM_LICITACAO O O O O B-ORGAO_LICITANTE O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'AVISO DE ANULAÇÃO PREGÃO PRESENCIAL Nº 03/2014. O PREGOEIRO da CPL comunica aos interessados a anulação da licitação em epígrafe, processo 001-000.601/2013, que tem por objeto a aquisição de canetas esferográficas para o Projeto Jovem Cidadão, da Câmara Legislativa do Distrito Federal, por reavaliação dos interesses da Administração (Escola do Legislativo ELEGIS). Maiores informações no local, pelo telefone (61) 3348.8650 ou 3348.8651 ou 3348.8652. Brasília/DF, 27 de janeiro de 2014. GUILHERME TAPAJÓS TÁVORA Pregoeiro')

In [35]:
# Persist every per-act-type table to its own parquet file.
# NOTE(review): the number embedded in each file name is hard-coded; verify it
# still equals the frame's row count whenever the upstream data changes.
for _frame, _target in (
    (lic, 'parquet/licitacao_435_acts_200x_2018_2020.parquet'),
    (sus, 'parquet/suspensao_25_acts_200x_2018_2020.parquet'),
    (anr, 'parquet/anulacao-revogacao_34_acts_200x_2018_2020.parquet'),
    (adi, 'parquet/aditamento_833_acts_200x_2018_2020.parquet'),
    (con, 'parquet/contrato-convenio_566_acts_200x_2018_2020.parquet'),
):
    _frame.to_parquet(_target)

In [36]:
# Row counts of the five per-act-type tables, in the order lic, sus, anr, adi, con.
tuple(len(frame) for frame in (lic, sus, anr, adi, con))

(435, 25, 34, 833, 566)