In [1]:
import argparse
import json
from glob import glob
import pandas as pd
import pickle
import numpy as np
import re
import os
import operator as op
import warnings 
from owlready2 import * #
import random
import unicodedata
warnings.filterwarnings('ignore')



## Carregar ontologia

In [2]:
onto_name = "OntoGeoLogicaInstanciasRelacoes"
onto = get_ontology("../../KnowledgeGraph/OntoGeoLogicaInstanciasRelacoes.owl")
onto.load()

get_ontology("http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#")

## Funções para procurar na ontologia

In [3]:
def get_relations_between_uris(uri_1, uri_2): 
    #funcao que acessa a ontologia e procura relacao entre URIs
    dict_relation_uris = {}
    #Pega as relacoes que a URI1 tem
    relation_query_results = list(default_world.sparql("""
            SELECT DISTINCT ?rel
            WHERE{?uri ?rel ?obj
                 FILTER(contains(str(?rel), "http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#"))
                 FILTER (contains(str(?uri), """ + '"' + uri_1 + '"' + """))
                 }
            """))
    
    relations_str = []
    for relation_uris in relation_query_results:
        relations_str.append(str(relation_uris[0]).rsplit(".",1)[-1])
        
    # Para cada tipo de relação procura se existe match entre URI1 e URI2
    for relation in relations_str:
        relation_between_words = list(default_world.sparql("""
                PREFIX prefix: <http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#>
                SELECT distinct ?y ?x2
                WHERE{?y prefix:""" +  relation  +  """ ?x1

                      FILTER (contains(str(?y), """ + '"' + uri_1  + '"' + """))        

                      ?x2 rdf:type ?j                                   
                      FILTER (contains(str(?x2), """ + '"' + uri_2  + '"' + """))

                      FILTER ( ?x2 = ?x1 )
                    }
                """))
        dict_relation_uris[relation] = relation_between_words
    return dict_relation_uris

def go_through_relations(uri1,uri2):
    relation_uris = get_relations_between_uris(uri1, uri2)            
    if relation_uris != {}: #talvez exista relacao entre URIs, dicionario pode vir vazio -> []
        for x, y in relation_uris.items():#procurar por relacao
            if y != []: #existe alguma relacao
#                 print(x)
                return x
    return None

### Funções para printar informações

In [4]:
def print_sentence_text(sentence):
    #printa e retorna o texto original da sentenca
    size_sentence = int(sentence.iloc[-1]["end"])
    text = " "*size_sentence
    for index, row in sentence.iterrows():
        text = text[:int(row["start"])] + row["form"] +text[int(row["end"]):]
    print(text)
    print("-------------")
    return text

def print_relation_entities(word1,word2,ent1,ent2,URI_1,URI_2,relation_type,text):
    #printa as entidades e relacao entre elas
    print('Token 1 = ', word1, '--- Class 1 = ', ent1, '--- URI 1 = ', URI_1)
    print('Token 2 = ', word2, '--- Class 2 = ', ent2,'--- URI 2 = ',URI_2)
    print('Relation Type = ', relation_type)
    print(text)
    print("-------------")
    pass

## Funções para gerar Jsons a serem lidos no labelstudio

In [5]:
class ResultRelationJson(object):
    def __init__(self, from_id, to_id, relations, direction = "right"):
        self.dict = {
            "from_id": str(from_id),
            "to_id": str(to_id),
            "type": "relation",
            "direction": direction,
            "labels": relations
        }
    def get_dict(self):
        return self.dict
class ResultNERJson(object):
    def __init__(self, row):     
        self.result_dict = {
            "value": {
            "start": row["start_word"],
            "end": row["end_word"],
            "text": row["word_join"],
            "labels": [
              row["label_word"]
            ],
            "URI": row["URI"]
            },
            
            "id": row["index_e"],
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "origin": "prediction"
        }
    def get_dict(self):
        return self.result_dict  
class CreateOutput(object):
    def __init__(self, text, filtred_sentence, entity_name_new):
        self.filtred_sentence = filtred_sentence
        self.entity_name_new = entity_name_new
        self.main_dict = {
            "id": 1,
            "data": {
              "text": text #sentenca inteira
            },
            "annotations": []
        }
        self._add_annotations()      
    def _add_annotations(self):
        results = []
        count = 0        
        for index, row in self.entity_name_new.iterrows(): 
            results.append(ResultNERJson(row).get_dict())        
        item = [{
              "id": 1,
              "created_username": " null, 0",
              "created_ago": "",
              "result": results
            }]
        self.main_dict["annotations"] = item
    def get_output(self):
        return self.main_dict
    def add_relationship(self, from_id, to_id, relations, direction):
        results = self.main_dict.get("annotations")[0].get("result")
        relation = ResultRelationJson(from_id, to_id, [relations], direction).get_dict()
        results.append(relation)
        self.main_dict["annotations"][0]["result"] = results   
        
def combine_itens_from_lists_add_in_json(from_id_vec, to_id_vec, relation_from_vec, output):
    for idxRelation in range(0,len(from_id_vec)):
        direction = "right"
        output.add_relationship(from_id=from_id_vec[idxRelation], to_id=to_id_vec[idxRelation], relations = relation_from_vec[idxRelation], direction=direction)
    return output
def saveJsonFiles(df,from_id,to_id, lista_relaoces_sentence,sentence,SentenceNum,path):
    #cria e salva o arquivo Json para labelstudio
    text = sentence.iloc[0]['text']
    print('Saved Json ->', True)
    output = CreateOutput(text,sentence, df)
    combine_itens_from_lists_add_in_json(from_id, to_id, lista_relacoes_sentence, output)
    print("-------------")
    with open(os.path.join(path,f"{SentenceNum}.json"), "w") as outfile: 
        json.dump(output.get_output(), outfile) 
        
def get_df_forJsons(sentence,idxTokens):
    #retorna um dataframe com as informações das entidades e uma string contendo o nome completo da entidade
    df_save_words = pd.DataFrame(columns=['index_e', "LABEL", "START", "END",\
                                      "TEXT", "word_join", "start_word", "end_word", "label_word","URI"])

    index_e = sentence.iloc[idxTokens]['index_e']
    label = sentence.iloc[idxTokens]['deps']
    start = sentence.iloc[idxTokens]['word_join_start']
    end = start + len(sentence.iloc[idxTokens]['form'])
    text_ent = sentence.iloc[idxTokens]['form']
    word_join = sentence.iloc[idxTokens]['word_join']
    start_word = sentence.iloc[idxTokens]['word_join_start']
    end_word = sentence.iloc[idxTokens]['word_join_end']
    label_word = label.replace("B=","")
    URI = sentence.iloc[idxTokens]['grafo']

    df_save_words.loc[len(df_save_words.index)] = [index_e, label, start, end, text_ent, word_join,
                                                   start_word,
                                                   end_word,
                                                   label_word,
                                                   URI]

    return df_save_words, word_join
        
def create_df_JsonFiles(df_entity,x,token,token2,URI_1,URI_2,idxTokens,idxTokens2,from_id,to_id,sentence):
    #retorna o dataframe utilizado para criacao dos arquivos Json para labelstudio
    entity_name_new_token1,wordjoin_1 = get_df_forJsons(sentence,idxTokens)
    entity_name_new_token2,wordjoin_2 = get_df_forJsons(sentence,idxTokens2)
    if idxTokens not in from_id and idxTokens not in to_id:
        df_entity = pd.concat([df_entity, entity_name_new_token1])
    if idxTokens2 not in from_id and idxTokens2 not in to_id:
        df_entity = pd.concat([df_entity, entity_name_new_token2])

#     print_relation_entities(wordjoin_1,wordjoin_2,token.replace('B=',''),token2.replace('B=',''),URI_1,URI_2,x)
    return df_entity  

### Funções para criar dataframe para modelo BERT

In [6]:
def create_moddedText_BERT(text,start1,end1,start2,end2,Ent1_inic,Ent1_end,Ent2_inic,Ent2_end):
    new_end_ent1 = end1 + len(Ent1_inic)
    new_start_ent2 = start2 + len(Ent1_inic) + len(Ent1_end)
    new_end_ent2 = end2 + len(Ent1_inic) + len(Ent1_end) + len(Ent2_inic)      
    #adicionando [E1] e [/E1]
    text_new = text[:start1] + Ent1_inic + text[start1:]
    text_new = text_new[:new_end_ent1] + Ent1_end + text_new[new_end_ent1:]
    #adicionando [E2] e [/E2]
    text_new2 = text_new[:new_start_ent2] + Ent2_inic + text_new[new_start_ent2:]
    text_new2 = text_new2[:new_end_ent2] + Ent2_end + text_new2[new_end_ent2:]
    
    return text_new2

def createText_sentence_BERT(text,start_1,end_1,start_2,end_2):
    #funcao que retorna um novo texto para sentenca com [E1] e [E2] adicionados junto de cada entidade
    start_ent1, start_ent2 = start_1, start_2
    end_ent1, end_ent2 = end_1, end_2
    Ent1_inic, Ent1_end = '[E1] ', ' [/E1]'
    Ent2_inic, Ent2_end = '[E2] ', ' [/E2]'
    
    if start_ent1 < start_ent2: #[E1] vem antes de [E2]
        text_new = create_moddedText_BERT(text,start_ent1,end_ent1,start_ent2,end_ent2,\
                                  Ent1_inic,Ent1_end,Ent2_inic,Ent2_end)
    else: #[E2] vem antes de [E1]      
        text_new = create_moddedText_BERT(text,start_ent2,end_ent2,start_ent1,end_ent1,\
                                      Ent2_inic,Ent2_end,Ent1_inic,Ent1_end)  
    return text_new

def create_bert_dataframe(df_bert,idxTokens,idxTokens2,sentence,URI_1,URI_2,has_relation,relation_type,SentenceNumber):
    #retorna o dataframe com as informacoes de cada sentenca para utilizar no modelo BERT
    df_bert_temp = pd.DataFrame(columns=['#Sentence','sentence','Ent1','Ent2','URI_1','URI_2','has_relation','relation'])
    text = sentence.iloc[0]['text']
    wordjoin_1, wordjoin_2 = sentence.iloc[idxTokens]['word_join'], sentence.iloc[idxTokens2]['word_join']
    ent1, ent2 = sentence.iloc[idxTokens]['deps'], sentence.iloc[idxTokens2]['deps']
    ent1, ent2 = ent1.replace("B=",""), ent2.replace("B=","")
    start_1, start_2 = sentence.iloc[idxTokens]['word_join_start'], sentence.iloc[idxTokens2]['word_join_start']
    end_1, end_2 = sentence.iloc[idxTokens]['word_join_end'], sentence.iloc[idxTokens2]['word_join_end']
    text_bert_ents = createText_sentence_BERT(text,start_1,end_1,start_2,end_2)
    df_bert_temp.loc[0] = [SentenceNumber,
                           text_bert_ents,
                           ent1,
                           ent2,
                           URI_1,
                           URI_2,
                           has_relation,
                           relation_type]
    df_bert = pd.concat([df_bert, df_bert_temp])
    if relation_type!='no_relation':
        print_relation_entities(wordjoin_1,wordjoin_2,ent1,ent2,URI_1,URI_2,relation_type,text_bert_ents)
    return df_bert

## Funções para processar as sentenças

In [7]:
#utils
def create_relations_dataframe(df_relation,token,token2,URI_1,URI_2,x,originalSentenceNumber):
    #retorna dataframe das entidades e suas relações em cada linha
    #importante para contabilizar os tipos de relação
    df_relation_new = pd.DataFrame(columns=['Relation','Ent1','Ent2','URI_1','URI_2','#Sentence'])
    df_relation_new.loc[0] = [x,
                            token.replace('B=',''),
                            token2.replace('B=',''),
                            URI_1,
                            URI_2,
                            originalSentenceNumber]
    df_relation = pd.concat([df_relation, df_relation_new])
    return df_relation

def verifica_pares_entidade_interesse(ENT_1, ENT_2,relation_type):
    #verifica se a relacao encontrada vai ser do tipo temporal_relation CRONO->CRONO
    #funcao talvez precise ser atualizada no futuro conforme a ontologia for povoada
    lista_from = ['POÇO','UNIDADE_LITO','UNIDADE_LITO','CAMPO','POÇO','POÇO','UNIDADE_LITO','UNIDADE_LITO']
    lista_to = ['UNIDADE_LITO','NÂOCONSOLID','ROCHA','BACIA','BACIA','CAMPO','BACIA','UNIDADE_CRONO']        
    for idx in range(0,len(lista_to)):
        if lista_from[idx] == ENT_1 and lista_to[idx] == ENT_2:
            return relation_type
    return 'temporal_relation'

def go_through_sentence(sentence_df,df_relation,df_bert,sent_numb):
    #percorre a sentenca em busca de relacoes entre entidades anotadas com URIs
    df_entity = pd.DataFrame()
    from_id, to_id = [], []
    relation_from, relation_to = [], []
    lista_relacoes_sentence = []
    is_to_save = False
#     df_bert.to_csv(save_csv_name, encoding='utf-8',index=False)
#     df_relation.to_csv('df_relation.csv', encoding='utf-8',index=False)
    for idxTokens in range(len(sentence_df)):
        token, URI_1 = sentence_df.iloc[idxTokens]['deps'], sentence_df.iloc[idxTokens]['grafo']
        for idxTokens2 in range(len(sentence_df)):
            if idxTokens != idxTokens2:
                token2, URI_2 = sentence_df.iloc[idxTokens2]['deps'], sentence_df.iloc[idxTokens2]['grafo']
                has_relation = False
                relation_type = go_through_relations(URI_1,URI_2)
                if relation_type: 
                    print("-------------")
                    print('sentence =', sent_numb)
                    is_to_save = True
                    has_relation = True
                    Ent1, Ent2 = token.replace("B=",""), token2.replace("B=","")
                    relation_type = verifica_pares_entidade_interesse(Ent1,Ent2,relation_type)
                    lista_relacoes_sentence.append(relation_type)

                    #criar df_bert para BERT RE com codigo do Fabio
                    df_bert = create_bert_dataframe(df_bert,idxTokens,idxTokens2,sentence_df,
                                                    URI_1,URI_2,
                                                    has_relation,relation_type,originalSentenceNumber)

                    #para contabilizar os pares de entidade por relacao
                    df_relation = create_relations_dataframe(df_relation,token,token2,
                                                             URI_1,URI_2,relation_type,originalSentenceNumber)
                    #listas para contabilizar relacoes, uris e classes
                    lista_relacoes.append(relation_type)
                    lista_uris.append(URI_1)
                    lista_uris.append(URI_2)         
                    lista_classes.append(Ent1)
                    lista_classes.append(Ent2)

                    if is_to_createJsons: #se quiser criar Jsons para LabelStudio
                        df_entity = create_df_JsonFiles(df_entity,relation_type,token,token2,URI_1,URI_2,
                                                        idxTokens,idxTokens2,from_id,to_id,sentence_df)
                        from_id.append(idxTokens)
                        to_id.append(idxTokens2) 

                else: #nao achou relacao
                    relation_type = 'no_relation'

                    df_bert = create_bert_dataframe(df_bert,idxTokens,idxTokens2,sentence_df,
                                                    URI_1,URI_2,has_relation,relation_type,originalSentenceNumber)
                        
    return lista_relacoes,lista_uris,lista_classes,\
            df_bert, df_relation, df_entity, \
            lista_relacoes_sentence, from_id, to_id, is_to_save

### Ler arquivo csv (ou pkl) com as sentenças pós filtragem

In [8]:
#df_filtred_sentences = pickle.load(open('df_filtred_petroner_uri_2023_04_05.conllu.pkl', 'rb'))
#df_filtred_sentences = pd.read_csv('df_filtred_petroner_uri_2023_04_05_conllu.csv')

df_filtred_sentences = pickle.load(open('df_filtred_petroner_uri_valid.conllu.pkl', 'rb'))
df_filtred_sentences = pd.read_csv('df_filtred_petroner_uri_valid_conllu.csv')

df_group = df_filtred_sentences.groupby('sentence')
print('Numero total de sentenças pos-filtragem -> ',len(df_group))

Numero total de sentenças pos-filtragem ->  446


### Escolher se deseja criar Jsons para labelstudio

In [9]:
is_to_createJsons = True
# is_to_createJsons = False

### Folder outputs

In [10]:
#save_folder_path = "./JSONs_04_05" #local onde são salvos os Jsons para labelstudio
save_folder_path = "./JSONs_valid"
save_csv_name = 'df_bert_sentences_valid.csv'

## Rotina para processar as sentenças já filtradas 

In [None]:
%%time

numberSentences = df_filtred_sentences.iloc[-1]['sentence'] #numero de sentencas diferentes no arquivo ja filtrado
lista_relacoes, lista_uris, lista_classes, list_sentences_dict = [], [], [], []
df_relation, df_bert = pd.DataFrame(), pd.DataFrame()

countJsons=0
for idx in range(1,len(df_group)):
    filtred_sentence = df_group.get_group(idx)#aqui filtred_sentence é um dataframe da sentenca
    originalSentenceNumber = filtred_sentence.iloc[0]['#sentence_original']
    text = filtred_sentence.iloc[0]['text']
    lista_relacoes,lista_uris,lista_classes,df_bert,\
    df_relation,df_entity,lista_relacoes_sentence,\
    from_id,to_id, is_to_save = go_through_sentence(filtred_sentence,df_relation,df_bert,originalSentenceNumber) 
    df_bert.to_csv(save_csv_name, encoding='utf-8',index=False)
    df_relation.to_csv('df_relation_valid.csv', encoding='utf-8',index=False)
    if is_to_save and is_to_createJsons:#Jsons somente criados para sentencas com relacao
        countJsons+=1
        saveJsonFiles(df_entity,from_id,to_id, 
                      lista_relacoes_sentence,filtred_sentence,originalSentenceNumber,save_folder_path)
#     raise SystemExit("Stop right there!")    
print("-------------")
print("Number of Jsons saved = ", countJsons)

# pickle.dump(df_relation, open('df_relation.pkl','wb'), protocol=pickle.HIGHEST_PROTOCOL)
df_relation.to_csv('df_relation_valid.csv', encoding='utf-8',index=False)
df_bert.to_csv(save_csv_name, encoding='utf-8',index=False)

relacoes, numb_rel = np.unique(lista_relacoes, return_counts = True)
pickle.dump(lista_relacoes,open('lista_relacoes_valid.pkl','wb'),protocol = pickle.HIGHEST_PROTOCOL)

uris, numb_uris = np.unique(lista_uris, return_counts = True)
pickle.dump(lista_uris,open('lista_uris_valid.pkl','wb'),protocol = pickle.HIGHEST_PROTOCOL)

classes, numb_classes = np.unique(lista_classes, return_counts = True)
pickle.dump(lista_classes,open('lista_classes_valid.pkl','wb'),protocol = pickle.HIGHEST_PROTOCOL)

-------------
sentence = 34
Token 1 =   poço 7-BRG-12-SE --- Class 1 =  POÇO --- URI 1 =  #POCO_CD_POCO_007553
Token 2 =   Bacia de Sergipe/Alagoas --- Class 2 =  BACIA --- URI 2 =  #BASE_CD_BACIA_116
Relation Type =  located_in
Observações:  1.a biozona foi originalmente definida em depósitos    [E2] Bacia de Sergipe/Alagoas [/E2] (testemunhos    [E1] poço 7-BRG-12-SE [/E1]), por Beurlen et al (1987)..
-------------
Saved Json -> True
-------------
-------------
sentence = 47
Token 1 =   Albiano --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Albian
Token 2 =   Aptiano --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Aptian
Relation Type =  temporal_relation
Cronoestratigrafia: [E1] Albiano [/E1], podendo, entretanto, englobar parte    [E2] Aptiano [/E2]..
-------------
-------------
sentence = 47
Token 1 =   Aptiano --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Aptian
Token 2 =   Albiano --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Albian
Relation Type =  temporal_relation
Cronoestratigrafia: [E2] A

### Contabilização das relações encontradas

In [13]:
relacoes.tolist()

['constituted_by',
 'crosses',
 'has_age',
 'interval_finishes',
 'located_in',
 'temporal_relation']

In [14]:
numb_rel.tolist()

[402, 29, 191, 1, 526, 1143]

### Contabilização das classes encontradas

In [15]:
classes.tolist()

['BACIA',
 'CAMPO',
 'NÃOCONSOLID',
 'POÇO',
 'ROCHA',
 'UNIDADE_CRONO',
 'UNIDADE_LITO']

In [16]:
numb_classes.tolist()

[492, 162, 7, 118, 402, 2246, 1157]

### Contabilização das URIs encontradas 

In [17]:
uris.tolist()

['#Alagoas_Age',
 '#Albian',
 '#Aptian',
 '#Aratu_Age',
 '#Archean',
 '#Atokan_Age',
 '#BASE_CD_BACIA_020',
 '#BASE_CD_BACIA_030',
 '#BASE_CD_BACIA_051',
 '#BASE_CD_BACIA_076',
 '#BASE_CD_BACIA_080',
 '#BASE_CD_BACIA_090',
 '#BASE_CD_BACIA_096',
 '#BASE_CD_BACIA_100',
 '#BASE_CD_BACIA_106',
 '#BASE_CD_BACIA_116',
 '#BASE_CD_BACIA_210',
 '#BASE_CD_BACIA_215',
 '#BASE_CD_BACIA_230',
 '#BASE_CD_BACIA_240',
 '#BASE_CD_BACIA_250',
 '#BASE_CD_BACIA_256',
 '#BASE_CD_BACIA_260',
 '#BASE_CD_BACIA_266',
 '#BASE_CD_BACIA_270',
 '#BASE_CD_BACIA_281',
 '#BASE_CD_BACIA_300',
 '#BASE_CD_BACIA_316',
 '#BASE_CD_BACIA_381',
 '#Barremian',
 '#Bartonian',
 '#Bashkirian',
 '#Berriasian',
 '#Buracica_Age',
 '#Burdigalian',
 '#CAMP_CD_CAMPO_0003',
 '#CAMP_CD_CAMPO_0004',
 '#CAMP_CD_CAMPO_0012',
 '#CAMP_CD_CAMPO_0017',
 '#CAMP_CD_CAMPO_0027',
 '#CAMP_CD_CAMPO_0065',
 '#CAMP_CD_CAMPO_0077',
 '#CAMP_CD_CAMPO_0082',
 '#CAMP_CD_CAMPO_0093',
 '#CAMP_CD_CAMPO_0118',
 '#CAMP_CD_CAMPO_0174',
 '#CAMP_CD_CAMPO_0179',
 

In [18]:
numb_uris.tolist()

[64,
 129,
 112,
 31,
 6,
 40,
 2,
 12,
 3,
 2,
 1,
 2,
 35,
 49,
 4,
 111,
 1,
 4,
 11,
 55,
 8,
 1,
 9,
 26,
 27,
 75,
 28,
 22,
 4,
 19,
 2,
 1,
 2,
 45,
 4,
 1,
 1,
 1,
 5,
 1,
 4,
 4,
 6,
 1,
 2,
 1,
 1,
 1,
 4,
 4,
 1,
 2,
 1,
 3,
 2,
 1,
 1,
 1,
 13,
 2,
 2,
 2,
 4,
 1,
 1,
 1,
 4,
 1,
 4,
 3,
 3,
 2,
 2,
 1,
 1,
 1,
 8,
 2,
 2,
 1,
 1,
 6,
 1,
 1,
 12,
 1,
 1,
 1,
 1,
 6,
 4,
 1,
 1,
 1,
 3,
 1,
 1,
 3,
 3,
 3,
 46,
 135,
 51,
 47,
 36,
 18,
 36,
 98,
 10,
 16,
 35,
 29,
 18,
 28,
 2,
 2,
 14,
 52,
 46,
 30,
 94,
 2,
 2,
 17,
 11,
 1,
 1,
 1,
 6,
 120,
 2,
 36,
 2,
 5,
 13,
 4,
 8,
 24,
 26,
 8,
 2,
 3,
 20,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 4,
 1,
 6,
 4,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 4,
 2,
 1,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 4,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 48,
 16,
 4,
 56,
 4,
 58,
 2,
 54,
 6,
 3,
 12,
 14,
 8,
 14,
 59,
 6,
 2,
 2,

### Verificação de pares de entidades por tipo de relação

In [19]:
# df_filtred = pickle.load(open('df_relation.pkl', 'rb'))
df_relations = pd.read_csv('df_relation.csv')
df_grp = df_relations.groupby('Relation')
relations_groups = df_grp.groups
relations = list(relations_groups)
lista_pares = []
for relation in relations:
    df_rel = df_grp.get_group(relation)
    list_rel = []
    for idx_rel in range(0,len(df_rel)):
        par = df_rel.iloc[idx_rel]['Ent1'] + ' + ' + df_rel.iloc[idx_rel]['Ent2']
        list_rel.append(par)
    lista_pares.append(list_rel)
print('Number of types of relations ->', len(lista_pares))

Number of types of relations -> 6


### Avaliar idx_pair de 0 ao tamanho apresentado acima para verificar os pares de entidades

In [20]:
idx = 0
pares, numb_pares = np.unique(lista_pares[idx], return_counts = True)
print('Relation -> ',relations[idx])
print('Entities pair -> ',pares.tolist())
print('Number of ocorrences -> ',numb_pares.tolist())

Relation ->  constituted_by
Entities pair ->  ['UNIDADE_LITO + ROCHA']
Number of ocorrences ->  [402]
