In [1]:
from labelbox import Client
from getpass import getpass
import requests
from collections import Counter
import os
import pandas as pd
import sys

In [2]:
def show_progress(i, n):
    """Redraw a single-line text progress bar on standard output.

    Args:
        i: Zero-based index of the item that has just been processed.
        n: Total number of items. It is used as a divisor, so it must be
            non-zero.
    """
    fraction_done = (i + 1) / n
    filled = '=' * int(20 * fraction_done)  # the bar is 20 characters wide
    percent = 100 * fraction_done

    stream = sys.stdout
    # The carriage return sends the cursor back to the start of the line so
    # each call overwrites the previous bar instead of printing a new line.
    stream.write('\r')
    stream.write("Estado da extração: [%-20s] %d%%" % (filled, percent))
    stream.flush()

In [3]:
def connect_to_api(api_key, project_id):
    """Open a Labelbox client and fetch one project through it.

    Args:
        api_key: Value passed to ``Client`` as ``api_key``.
        project_id: Identifier handed to ``client.get_project``.

    Returns:
        A ``(client, project)`` tuple.
    """
    client = Client(api_key=api_key, endpoint="https://api.labelbox.com/graphql")
    return client, client.get_project(project_id)

In [4]:
def _download_utf8_text(url):
    """Download ``url`` and return its body decoded as UTF-8."""
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of slicing labels out of an error page.
    response.raise_for_status()
    # Decode the raw bytes directly. Re-encoding ``response.text`` as latin-1
    # only recovers the original bytes when requests happened to decode them
    # as latin-1; with any other detected charset it garbles the text or
    # raises UnicodeEncodeError.
    return response.content.decode('utf-8')


def extract_labels(project):
    """Export a project's labels and flatten the approved entity annotations.

    The export is downloaded from the URL returned by
    ``project.export_labels()``. An exported item is used only when the sum of
    its review scores is at least 1 and its ``Label`` contains ``objects``;
    for such items the labelled document is downloaded and every entity is
    sliced out of its text.

    Args:
        project: Object exposing ``export_labels()``, which returns the URL of
            the JSON export.

    Returns:
        A tuple of seven parallel lists with one position per entity:
        ``(labels, title, value, file, texts, loc_begin, loc_end)``.
        ``labels`` holds the annotated substring, ``file`` the item's
        ``External ID``, ``texts`` the full document text, ``loc_begin`` the
        start offset and ``loc_end`` the exclusive end offset (``end + 1``).

    Raises:
        RuntimeError: If the export yields no download URL.
        requests.HTTPError: If the export or a document download returns an
            HTTP error status.
    """
    export_url = project.export_labels()
    if not export_url:
        # Make the failure explicit rather than letting requests reject an
        # empty URL with an unrelated-looking error.
        raise RuntimeError("A exportação do Labelbox não retornou uma URL de download.")

    export_response = requests.get(export_url)
    export_response.raise_for_status()
    exports = export_response.json()

    labels, title, value, file, texts = [], [], [], [], []
    loc_begin, loc_end = [], []

    total = len(exports)
    for i, labeled in enumerate(exports):
        approved = sum(r["score"] for r in labeled["Reviews"]) >= 1

        # Only download the document when there is something to slice from it.
        if approved and "objects" in labeled["Label"]:
            text = _download_utf8_text(labeled['Labeled Data'])

            for entity in labeled['Label']['objects']:
                location = entity['data']['location']
                # ``end`` is used as an inclusive offset, hence the + 1.
                start, stop = location['start'], location['end'] + 1

                labels.append(text[start:stop])
                title.append(entity['title'])
                value.append(entity['value'])
                file.append(labeled['External ID'])
                texts.append(text)
                loc_begin.append(start)
                loc_end.append(stop)

        show_progress(i, total)

    return labels, title, value, file, texts, loc_begin, loc_end

In [5]:
def _derive_ato(file_names):
    """Derive the act-type slug from a Series of annotated file names.

    Keeps what precedes ``_DODF``, drops the first ``_``-separated token and
    then discards everything up to and including ``rinst_`` when present.
    """
    ato = file_names.str.split("_DODF").str[0]
    ato = ato.str.split("_").str[1:].str.join("_")
    return ato.str.split("rinst_").str[-1]


def parse_extracted_labels(labels, title, value, file, texts, loc_begin, loc_end):
    """Build the entity table and the per-file text table from parallel lists.

    Args:
        labels: Annotated substrings, one per entity.
        title: Entity titles, one per entity.
        value: Entity values, one per entity.
        file: File name each entity came from, one per entity.
        texts: Full document text each entity came from, one per entity.
        loc_begin: Start offsets, one per entity.
        loc_end: End offsets, one per entity.

    Returns:
        A ``(data, data_text)`` tuple of DataFrames. ``data`` has one row per
        entity with columns ``titulo_entidade``, ``valor_entidade``,
        ``entidade``, ``arquivo_rast``, ``loc_begin``, ``loc_end`` and ``ato``.
        ``data_text`` has one row per distinct text with columns
        ``arquivo_rast``, ``text``, ``ato`` and ``dodf``.

    Raises:
        AssertionError: If the de-duplicated texts do not line up one-to-one
            with the file names present in ``data``.
    """
    data = pd.DataFrame(
        list(zip(title, value, labels, file, loc_begin, loc_end)),
        columns=['titulo_entidade', 'valor_entidade', 'entidade', 'arquivo_rast', 'loc_begin', 'loc_end'],
    )
    data["ato"] = _derive_ato(data["arquivo_rast"])

    data_text = pd.DataFrame(list(zip(file, texts)), columns=["arquivo_rast", "text"])
    data_text["ato"] = _derive_ato(data_text["arquivo_rast"])

    # ``texts`` repeats the document once per entity; keep one row per text.
    data_text = data_text[~data_text["text"].duplicated()].reset_index(drop=True)

    # The split pattern is interpreted as a regular expression, so the dot is
    # escaped: an unescaped ".txt" would also split on e.g. "_txt".
    data_text["dodf"] = "DODF" + data_text["arquivo_rast"].str.split("_DODF").str[-1].str.split(r"\.txt").str[0]

    assert len(data_text) == len(data["arquivo_rast"].unique()), "Filtragem por texto repetido falhou."
    assert data["arquivo_rast"].isin(data_text["arquivo_rast"]).all() and data_text["arquivo_rast"].isin(data["arquivo_rast"]).all(), "Correspondência entre os dfs não garantida."

    return data, data_text

In [8]:
# Directory prefix for the per-annotator parquet files written by this notebook.
path_data = "./atos_revisados/geral/"

In [9]:
# Per-annotator Labelbox configuration. The three lists are positional: entry
# ``n`` of each one describes the same annotator.
api_keys = {
    "responsavel": ["gabriel", "vitor", "manuela", "thiago"],
    "project_id": ["ckynt28qw0u5n0z7s1idj22x0", "ckyoimw4d9dkg0z7s3mvhhsll", "ckyofsou26tl80z6g5y6l9fdj", "ckyot7qqr0xmh0z8r2o9t3ljq"],
}

# SECURITY: API tokens are secrets and must not be stored in the notebook.
# Each token is read from the environment variable
# LABELBOX_API_KEY_<RESPONSAVEL> (e.g. LABELBOX_API_KEY_GABRIEL); when that
# variable is unset or empty, the user is prompted without echoing the input.
# NOTE(review): the tokens that used to be hard-coded here were exposed in
# the file history and should be revoked.
api_keys["api_key"] = [
    os.environ.get("LABELBOX_API_KEY_" + nome.upper())
    or getpass("Chave de API do Labelbox (" + nome + "): ")
    for nome in api_keys["responsavel"]
]

In [10]:
# The lists in ``api_keys`` are read position by position, so they must have
# the same length; ``zip`` would otherwise silently drop the extra entries.
assert len(api_keys["responsavel"]) == len(api_keys["project_id"]) == len(api_keys["api_key"]), "As listas em api_keys devem ter o mesmo tamanho."

# Create the output directory up front so the parquet writes below do not
# fail on a missing folder.
os.makedirs(path_data, exist_ok=True)

# For each annotator: connect, export and flatten the labels, then persist
# the entity table and the text table as parquet files.
for n, (responsavel, project_id, api_key) in enumerate(
    zip(api_keys["responsavel"], api_keys["project_id"], api_keys["api_key"])
):
    client, project = connect_to_api(api_key, project_id)
    labels, title, value, file, texts, loc_begin, loc_end = extract_labels(project)
    data, data_text = parse_extracted_labels(labels, title, value, file, texts, loc_begin, loc_end)

    data.to_parquet(os.path.join(path_data, responsavel + "_atos_data.parquet"), index=False)
    data_text.to_parquet(os.path.join(path_data, responsavel + "_atos_data_text.parquet"), index=False)

    # The blank lines keep the index apart from the in-place progress bar.
    print()
    print(n)
    print()

0

1

2

3

