# Pré-processamento do DODF para segmentação

Separa o texto em sentenças e as rotula no padrão IOB (B = sentença que inicia um ato, I = sentença de continuação do ato, O = sentença que não faz parte de nenhum ato).

In [1]:
import json
import pandas as pd
import glob
import re

## Parte 1 - Identificando os DODFs com anotação de aposentadoria

In [2]:
# Load the annotation table; the bare last expression displays its column names.
labeled_csv = "labeled.csv"
df = pd.read_csv(labeled_csv)
df.columns

Index(['DATA_DODF', 'NUM_DODF', 'ATO', 'EMPRESA_ATO', 'COD_MATRICULA_ATO',
       'NOME_ATO', 'CARGO', 'CLASSE', 'PADRAO', 'QUADRO', 'PROCESSO',
       'FUND_LEGAL', 'text', 'labels'],
      dtype='object')

In [3]:
# Folder name of each month inside a year directory (position = month number - 1).
month = ["01_Janeiro/", "02_Fevereiro/", "03_Março/", "04_Abril/", "05_Maio/", "06_Junho/", "07_Julho/", "08_Agosto/", "09_Setembro/", "10_Outubro/", "11_Novembro/", "12_Dezembro/"]
dodfs = []  # paths of the JSON files matched to an annotated DODF
dic = {}    # (NUM_DODF, DATA_DODF) -> matched path, or -1 while nothing matched

for num_raw, data_dodf, text in zip(df['NUM_DODF'], df['DATA_DODF'], df['text']):
    num_dodf = int(num_raw)
    key = (num_dodf, data_dodf)
    # Only the first row of each (number, date) pair triggers a file search.
    if not text or key in dic:
        continue
    dic[key] = -1
    _, mes, ano = data_dodf.split('/')
    path = "../dodfs/results/json/" + ano + "/" + month[int(mes)-1]
    date_in_name = data_dodf.replace('/', '-')
    for json_path in glob.glob(path+'*.json'):
        # File names are whitespace separated: field 1 is the number, field 2 the date.
        name_fields = json_path.split('/')[-1].split()
        if int(name_fields[1]) != num_dodf:
            continue
        if name_fields[2] != date_in_name:
            continue
        # Supplement editions are not picked up by this search.
        if re.search("SUPLEMENTO", ' '.join(name_fields)):
            continue
        dic[key] = json_path
        dodfs.append(json_path)

In [4]:
# Matched JSON paths vs. distinct (number, date) pairs registered from the annotations.
(len(dodfs), len(dic))

(226, 227)

In [5]:
# Report every (number, date) pair whose stored value is not one of the matched paths.
for dodf_key, matched_path in dic.items():
    if matched_path not in dodfs:
        print("Missing DODF:", dodf_key)

Missing DODF: (76, '20/04/2018')


In [6]:
# Manually queue the supplement edition of DODF 076 (20/04/2018); presumably this is the
# file for the pair reported as missing, since the search cell ignores SUPLEMENTO files.
suplemento_path = "../dodfs/results/json/2018/04_Abril/DODF 076 20-04-2018 SUPLEMENTO.json"
# Guard so that re-running this cell does not add the same file twice.
if suplemento_path not in dodfs:
    dodfs.append(suplemento_path)

## Parte 2 - Identificando trechos de texto contendo ato de aposentadoria nos blocos do DODF

In [7]:
# Load a single DODF JSON file for inspection.
path = "../dodfs/results/json/2018/04_Abril/DODF 019 04-04-2018 EDICAO EXTRA.json"
# The context manager closes the file handle as soon as the JSON is parsed.
with open(path) as f:
    data = json.load(f)

In [8]:
full_base = []   # one entry per DODF: its list of blocks, each extended with a span list
not_found = []   # (dodf path, annotated text, row index) for annotations matched in no block
cnt = 0          # number of annotation rows visited
temp = -1        # holds the most recent match object; not read anywhere in this cell
for dodf in dodfs:
    # Context manager so the file handle is closed right after loading.
    with open(dodf) as f:
        data = json.load(f)
    # File names are whitespace separated: field 1 is the DODF number, field 2 the date (dd-mm-yyyy).
    name_fields = dodf.split('/')[-1].split()
    aux = df.loc[(df['NUM_DODF'] == int(name_fields[1])) & (df['DATA_DODF'] == name_fields[2].replace('-', '/'))]
    for j in range(len(data)):
        # The appended list is later read as data[j][5]
        # (assumes every block record has exactly five items - TODO confirm).
        data[j].append([])
        data[j][4] = data[j][4].replace('\n', ' ')
    for i in aux.index:
        cnt += 1
        text = df.loc[i, 'text']
        if not text or pd.isna(text):
            continue
        # Only parentheses are escaped, exactly as before; raw strings avoid the invalid
        # escape sequences '\(' and '\)'.
        # NOTE(review): other regex metacharacters in the annotation stay active; switching to
        # re.escape could change which spans are found, so it is not done here.
        pattern = text.replace('(', r'\(').replace(')', r'\)')
        find = False
        for j in range(len(data)):
            entity = re.search(pattern, data[j][4])
            if entity:
                temp = entity
                find = True
                # Store each (start, end) span at most once per block.
                if entity.span() not in data[j][5]:
                    data[j][5].append(entity.span())
        if not find:
            not_found.append((dodf, text, i))
    full_base.append(data)

In [9]:
print(f"Total de anotacoes de atos:{cnt}\nAtos nao identificados:{len(not_found)}")

Total de anotacoes de atos:5516
Atos nao identificados:132


In [10]:
print("Identificando trechos de atos com sobreposicao")
for dodf_idx, dodf_blocks in enumerate(full_base):
    for block_idx, block in enumerate(dodf_blocks):
        spans = block[5]
        spans.sort()  # in place: later cells see the sorted order
        # After sorting, a span that ends at or after the start of its successor is flagged;
        # each block is reported at most once.
        if any(prev[1] >= nxt[0] for prev, nxt in zip(spans, spans[1:])):
            print(f"ERRO em DODF:{dodf_idx}, Bloco:{block_idx}")

Identificando trechos de atos com sobreposicao
ERRO em DODF:71, Bloco:286
ERRO em DODF:88, Bloco:1770
ERRO em DODF:133, Bloco:427
ERRO em DODF:165, Bloco:469
ERRO em DODF:165, Bloco:470
ERRO em DODF:165, Bloco:484


### Correção dos atos que se sobrepõem

In [11]:
# Hand-made corrections for the blocks flagged by the overlap check.
# Span positions index the span lists as left by that check's in-place sort.

# (DODF index, block index, span position, corrected span)
single_span_fixes = [
    (71, 286, 1, (778, 1285)),
    (88, 1770, 8, (3378, 3724)),
    (133, 427, 1, (664, 1272)),
]
for dodf_idx, block_idx, span_pos, fixed_span in single_span_fixes:
    full_base[dodf_idx][block_idx][5][span_pos] = fixed_span

# (DODF index, block index) -> complete replacement span list
whole_list_fixes = {
    (165, 469): [(181, 799), (801, 1319), (1321, 1847)],
    (165, 470): [(119, 623), (625, 1001), (1003, 1328), (1330, 1823), (1825, 2313), (2315, 2820), (2822, 3322), (3324, 3819), (3821, 4175), (4177, 4675)],
    (165, 484): [(224, 822), (823, 1341), (1343, 1869), (1871, 2364), (2366, 2872), (2873, 3243), (3244, 3569), (3571, 4066), (4067, 4555), (5063, 5563), (5565, 6063), (6064, 6418), (6420, 6918)],
}
for (dodf_idx, block_idx), fixed_spans in whole_list_fixes.items():
    full_base[dodf_idx][block_idx][5] = fixed_spans



In [12]:
# Same overlap scan, run again after the manual corrections.
print("Identificando trechos de atos com sobreposicao")
for dodf_idx, dodf_blocks in enumerate(full_base):
    for block_idx, block in enumerate(dodf_blocks):
        spans = block[5]
        spans.sort()
        # Flag the block once if any sorted span ends at or after the start of the next one.
        if any(prev[1] >= nxt[0] for prev, nxt in zip(spans, spans[1:])):
            print(f"ERRO em DODF:{dodf_idx}, Bloco:{block_idx}")

Identificando trechos de atos com sobreposicao


## Parte 3 - Rotulando dados no padrão MUC

In [13]:
# # Fix - full_base[137][316][5] tinha um trecho de texto que cobria dois atos, corrigido para nao dar overlap
# # no segundo ato de aposentadoria
# for i in range(len(full_base[137][316][5])):
#     if full_base[137][316][5][i] == (5637, 6496):
#         print("trocou")
#         full_base[137][316][5][i] = (6147, 6496)

In [14]:
def MUCify_block(block, i, j):
    """Surround each annotated span of a block with MUC-style tags.

    Args:
        block: sequence whose item 4 is the block text and whose item 5 is a
            list of (start, end) character spans. The span list is sorted in
            place, in descending order.
        i: DODF index, used only in the diagnostic message.
        j: block index, used only in the diagnostic message.

    Returns:
        The block text with newlines replaced by spaces and every processed
        span wrapped in " <aposentadoria> " ... " </aposentadoria> ". If a
        span slice already contains a tag, a message is printed and the
        remaining spans are left untagged.
    """
    tagged = block[4].replace('\n', ' ')
    spans = block[5]
    if not spans:
        return tagged
    # Insert from right to left so offsets of the spans still to come stay valid.
    spans.sort(reverse=True)
    for start, end in spans:
        # NOTE(review): the slice runs one character past `end`; spans produced by
        # re.Match.span() are end-exclusive, so confirm the extra character is intended.
        stop = end + 1
        segment = tagged[start:stop]
        if re.search("</*aposentadoria>", segment):
            print(f"DODF:{i}, bloco:{j}")
            print("Sobreposicao de atos")
            break
        tagged = tagged[:start] + " <aposentadoria> " + segment + " </aposentadoria> " + tagged[stop:]
    return tagged

# aux = full_base[0][364]
# aux_r = MUCify_sentences(aux)

In [15]:
# One MUC-tagged text per block, visiting DODFs in order and blocks in order within each DODF.
muc_base = [
    MUCify_block(block, dodf_idx, block_idx)
    for dodf_idx, dodf_blocks in enumerate(full_base)
    for block_idx, block in enumerate(dodf_blocks)
]

## Parte 4 - Convertendo do padrão MUC para IOB

In [16]:
def split_sentences(sent_to_split, is_aposentadoria):
    """Split a text on '.' and produce one label per resulting piece.

    Args:
        sent_to_split: text to split; pieces are returned exactly as produced
            by ``str.split('.')`` (empty pieces included).
        is_aposentadoria: when true the first piece is labelled 'B' and the
            others 'I'; otherwise every piece is labelled 'O'.

    Returns:
        A ``(pieces, labels)`` tuple of two lists with the same length.
    """
    text = sent_to_split.split('.')
    if is_aposentadoria:
        # str.split with a separator always yields at least one piece.
        labels = ['B'] + ['I'] * (len(text) - 1)
    else:
        labels = ['O'] * len(text)
    return text, labels

def MUCtoIOB_block(block):
    """Convert one MUC-tagged block text into sentences with IOB labels.

    The text is tokenized on whitespace. Tokens before an "<aposentadoria>"
    tag (and after the last closing tag) are split into sentences labelled
    'O'; tokens accumulated up to an "</aposentadoria>" tag are split into
    sentences labelled 'B' (first) and 'I' (rest). Empty pieces are dropped.

    Args:
        block: block text in which the tags appear as whitespace-delimited
            tokens.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists.
    """
    block_sentences = []
    block_labels = []

    def _flush(pending, is_aposentadoria):
        # Split the pending text and keep every non-empty sentence with its label.
        sentences, labels = split_sentences(pending, is_aposentadoria=is_aposentadoria)
        for s, l in zip(sentences, labels):
            if s != '':
                block_sentences.append(s)
                block_labels.append(l)

    sent = ""
    for token in block.split():
        if token == "<aposentadoria>":
            # Everything gathered before an opening tag lies outside any act.
            _flush(sent, is_aposentadoria=False)
            sent = ""
        elif token == "</aposentadoria>":
            # Everything gathered up to a closing tag is flushed as one act.
            _flush(sent, is_aposentadoria=True)
            sent = ""
        else:
            sent += ' ' + token
    # Text left after the last tag is outside any act.
    if sent:
        _flush(sent, is_aposentadoria=False)
    return block_sentences, block_labels

In [17]:
# Convert every MUC-tagged block, then separate sentences and labels into parallel lists.
iob_pairs = [MUCtoIOB_block(muc_text) for muc_text in muc_base]
iob_sentence = [sentences for sentences, _ in iob_pairs]
iob_label = [labels for _, labels in iob_pairs]

## Parte 5 - Salvando os dados no formato CoNLL

In [18]:
# Write one "<label> <sentence>" line per sentence and a blank line after each block.
# The context manager guarantees the file is flushed and closed when writing finishes.
with open('seg_data.txt', 'w') as f:
    for sent_seq, label_seq in zip(iob_sentence, iob_label):
        for sent, label in zip(sent_seq, label_seq):
            f.write(f"{label} {sent}\n")
        f.write("\n")