In [26]:
import argparse
import json
from glob import glob
import pandas as pd
import pickle
import numpy as np
import re
import os
import operator as op
import warnings 
from owlready2 import * #
import random
import unicodedata
warnings.filterwarnings('ignore')

## Carregar ontologia

In [27]:
# Name kept for reference; it is not used anywhere else in the visible notebook.
onto_name = "OntoGeoLogicaPovoadaInstanciasRelacoes"
# Open the OWL file from the sibling KnowledgeGraph folder
# (get_ontology presumably comes from the owlready2 star import - TODO confirm).
onto = get_ontology("../../KnowledgeGraph/OntoGeoLogicaInstanciasRelacoes.owl")
# Load the ontology; as the last expression of the cell, its return value is the cell output.
onto.load()

get_ontology("http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#")

## Folder outputs

In [29]:
# Folder handed to saveJsonFiles as the destination of the per-sentence JSON files.
save_folder_path = "./JSONs"
# File name that df_bert is written to with DataFrame.to_csv.
save_csv_name = 'df_bert_sentences.csv'

## Funções para gerar Jsons a serem lidos no labelstudio

In [30]:
class ResultRelationJson(object):
    """Dictionary wrapper describing one relation between two annotated regions.

    Args:
        from_id: Identifier of the source region; stored as ``str(from_id)``.
        to_id: Identifier of the target region; stored as ``str(to_id)``.
        relations: Value stored unchanged under the ``"labels"`` key.
        direction: Value stored under the ``"direction"`` key (default ``"right"``).
    """

    def __init__(self, from_id, to_id, relations, direction = "right"):
        self.dict = dict(
            from_id=str(from_id),
            to_id=str(to_id),
            type="relation",
            direction=direction,
            labels=relations,
        )

    def get_dict(self):
        """Return the relation dictionary built in the constructor."""
        return self.dict
    
class ResultNERJson(object):
    """Dictionary wrapper describing one labelled entity span.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``start_word``, ``end_word``, ``word_join``, ``label_word``,
            ``URI`` and ``index_e``.
    """

    def __init__(self, row):
        # Span-level payload: character offsets, surface text, label and graph URI.
        span_value = {
            "start": row["start_word"],
            "end": row["end_word"],
            "text": row["word_join"],
            "labels": [row["label_word"]],
            "URI": row["URI"],
        }
        self.result_dict = {
            "value": span_value,
            "id": row["index_e"],
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "origin": "prediction",
        }

    def get_dict(self):
        """Return the entity dictionary built in the constructor."""
        return self.result_dict
    
    
class CreateOutput(object):
    """Assemble the task dictionary (text plus one annotation) for a sentence.

    Args:
        text: Full sentence text, stored under ``data.text``.
        filtred_sentence: Kept on the instance; not otherwise used by this class.
        entity_name_new: DataFrame whose rows are turned into entity results
            through ``ResultNERJson``.
    """

    def __init__(self, text, filtred_sentence, entity_name_new):
        self.filtred_sentence = filtred_sentence
        self.entity_name_new = entity_name_new
        self.main_dict = {
            "id": 1,
            "data": {"text": text},  # whole sentence
            "annotations": [],
        }
        self._add_annotations()

    def _add_annotations(self):
        """Create the single annotation entry holding one result per entity row."""
        entity_results = [
            ResultNERJson(entity_row).get_dict()
            for _, entity_row in self.entity_name_new.iterrows()
        ]
        self.main_dict["annotations"] = [{
            "id": 1,
            "created_username": " null, 0",
            "created_ago": "",
            "result": entity_results,
        }]

    def get_output(self):
        """Return the task dictionary."""
        return self.main_dict

    def add_relationship(self, from_id, to_id, relations, direction):
        """Append a relation result to the first annotation and print it.

        ``relations`` is wrapped in a one-element list before being stored.
        """
        first_annotation = self.main_dict.get("annotations")[0]
        current_results = first_annotation.get("result")
        relation = ResultRelationJson(from_id, to_id, [relations], direction).get_dict()
        current_results.append(relation)
        print('-----------')
        print("relation")
        print(relation)
        self.main_dict["annotations"][0]["result"] = current_results
        
def combine_itens_from_lists_add_in_json(from_id_vec, to_id_vec, relation_from_vec, output):
    """Register one "right"-directed relation per position of the parallel lists.

    Args:
        from_id_vec: Source ids; its length drives the iteration.
        to_id_vec: Target ids, read at the same positions.
        relation_from_vec: Relation labels, read at the same positions.
        output: Object exposing ``add_relationship(from_id, to_id, relations, direction)``.

    Returns:
        The same ``output`` object, after all relations were added.
    """
    total = len(from_id_vec)
    position = 0
    while position < total:
        output.add_relationship(
            from_id=from_id_vec[position],
            to_id=to_id_vec[position],
            relations=relation_from_vec[position],
            direction="right",
        )
        position += 1
    return output

## Funções para processar as sentenças

In [31]:
#utils
def check_I_entities(df_get_start_end, i,entity):
    """Tell whether row ``i`` continues the entity tagged ``entity``.

    A row continues the entity when its ``deps`` value equals ``entity``, or
    when its ``deps`` is ``None`` and the following row's ``deps`` equals
    ``entity``.

    Args:
        df_get_start_end: DataFrame with a ``deps`` column.
        i: Positional index of the row to test.
        entity: Continuation tag to compare against (callers pass an ``I=...`` tag).

    Returns:
        bool: True when the row belongs to the entity, False otherwise.
    """
    current_dep = df_get_start_end.iloc[i]["deps"]
    if current_dep == entity:
        return True
    if current_dep is not None:
        return False
    # A None tag on the last row has no successor to look at; previously this
    # fell through to iloc[i+1] and raised IndexError.
    if i + 1 >= len(df_get_start_end):
        return False
    return df_get_start_end.iloc[i + 1]["deps"] == entity

def get_words_by_entities(indexes, df_get_start_end):
    """Collect the full multi-token span of each entity that starts at ``indexes``.

    For every positional index (a row whose ``deps`` holds a ``B=`` tag) the
    following rows are appended while they carry the matching ``I=`` tag, or
    while ``check_I_entities`` reports that they still belong to the entity.

    Args:
        indexes: Iterable of positional row indexes where an entity begins.
        df_get_start_end: Token DataFrame with the columns ``deps``, ``form``,
            ``start``, ``end`` and ``misc`` (``misc`` must offer ``.get``).

    Returns:
        tuple: ``(df_save_words, word_join)``. ``df_save_words`` has one row per
        index; ``word_join`` is the space-joined text of the LAST processed
        entity, including its leading space, or ``""`` when ``indexes`` is empty.
    """
    df_save_words = pd.DataFrame(columns=['index_e', "LABEL", "START", "END", "TEXT", "word_join", "start_word", "end_word", "label_word","URI"])
    # Bound before the loop so an empty ``indexes`` no longer raises
    # UnboundLocalError at the return statement.
    word_join = ""
    total_rows = len(df_get_start_end)

    for index in indexes:
        row_main = df_get_start_end.iloc[index]
        entity = row_main['deps']
        entity_I = entity.replace("B=","I=")
        # Joining with an empty first element keeps the leading space callers see.
        word_join = " ".join(["", row_main['form']])
        start_word = row_main['start']
        end_word = row_main['end']
        label_word = entity.replace("B=", "")
        URI = row_main['misc'].get('grafo')

        count = 1
        while index+count < total_rows and (df_get_start_end.iloc[index+count]["deps"] == entity_I or check_I_entities(df_get_start_end, index+count,entity_I)):
            row = df_get_start_end.iloc[index+count]
            word_join = " ".join([word_join, row['form']])
            end_word = row['end']
            count+=1

        df_save_words.loc[len(df_save_words.index)] = [index,
                                                       entity,
                                                       row_main['start'],  # first token only
                                                       row_main['end'],  # first token only
                                                       row_main['form'],
                                                       word_join.strip(),
                                                       start_word,
                                                       end_word,
                                                       label_word,
                                                       URI]
    return df_save_words, word_join

def get_relations_between_uris(uri_1, uri_2):
    """Query the default owlready2 world for relations linking two URIs.

    First, every distinct predicate whose IRI contains the ontology namespace
    and whose subject IRI contains ``uri_1`` is collected. Then, for each such
    predicate, a second query looks for (subject, object) pairs whose subject
    contains ``uri_1`` and whose object contains ``uri_2``.

    Args:
        uri_1: Text matched (via SPARQL ``contains``) against the subject IRI.
        uri_2: Text matched (via SPARQL ``contains``) against the object IRI.

    Returns:
        dict: relation name -> list of query results; the list is empty when
        the relation does not connect the two URIs.

    NOTE(review): both queries are built by string concatenation and match by
    substring, so a URI that is a prefix of another one may also match -
    confirm this is acceptable for the URIs in use.
    """
    dict_relation_uris = {}
    
    # Fetch the relations that URI 1 has (as subject)
    relation_query_results = list(default_world.sparql("""
            SELECT DISTINCT ?rel
            WHERE{?uri ?rel ?obj
                 FILTER(contains(str(?rel), "http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#"))
                 FILTER (contains(str(?uri), """ + '"' + uri_1 + '"' + """))
                 }
            """))
    
    relations_str = []
    for relation_uris in relation_query_results:
        # Keep only the text after the last "." of the result's string form
        relations_str.append(str(relation_uris[0]).rsplit(".",1)[-1])
        
    # For each relation type, look for a match between URI 1 and URI 2
    for relation in relations_str:
        relation_between_words = list(default_world.sparql("""
                PREFIX prefix: <http://www.semanticweb.org/bg40/ontologies/2022/5/untitled-ontology-2#>
                SELECT distinct ?y ?x2
                WHERE{?y prefix:""" +  relation  +  """ ?x1

                      FILTER (contains(str(?y), """ + '"' + uri_1  + '"' + """))        

                      ?x2 rdf:type ?j                                   
                      FILTER (contains(str(?x2), """ + '"' + uri_2  + '"' + """))

                      FILTER ( ?x2 = ?x1 )
                    }
                """))
        # Stored even when empty, so callers must check for a non-empty list
        dict_relation_uris[relation] = relation_between_words
    return dict_relation_uris

def create_relations_dataframe(df_relation,token,token2,URI_1,URI_2,x,originalSentenceNumber):
    """Append one (relation, entity pair, URIs, sentence) record to ``df_relation``.

    Args:
        df_relation: DataFrame accumulated so far (may be empty).
        token: Tag of the first entity; any ``B=`` is stripped.
        token2: Tag of the second entity; any ``B=`` is stripped.
        URI_1: URI of the first entity.
        URI_2: URI of the second entity.
        x: Relation name.
        originalSentenceNumber: Sentence identifier stored under ``#Sentence``.

    Returns:
        A new DataFrame: ``df_relation`` followed by the new single-row frame.
    """
    record = {
        'Relation': x,
        'Ent1': token.replace('B=',''),
        'Ent2': token2.replace('B=',''),
        'URI_1': URI_1,
        'URI_2': URI_2,
        '#Sentence': originalSentenceNumber,
    }
    single_row = pd.DataFrame(columns=list(record))
    single_row.loc[0] = list(record.values())
    return pd.concat([df_relation, single_row])

def verifica_pares_entidade_interesse(ENT_1, ENT_2):
    """Tell whether the ordered pair of entity classes is a pair of interest.

    Args:
        ENT_1: Class of the first (source) entity, without the ``B=`` prefix.
        ENT_2: Class of the second (target) entity, without the ``B=`` prefix.

    Returns:
        bool: True when ``(ENT_1, ENT_2)`` is one of the listed pairs.
    """
    pares_interesse = (
        ('POÇO', 'UNIDADE_LITO'),
        # Fixed spelling: the label found in the data is 'NÃOCONSOLID' (tilde);
        # the former 'NÂOCONSOLID' (circumflex) could never match it.
        ('UNIDADE_LITO', 'NÃOCONSOLID'),
        # Legacy spelling kept so any input that matched before still matches.
        ('UNIDADE_LITO', 'NÂOCONSOLID'),
        ('UNIDADE_LITO', 'ROCHA'),
        ('CAMPO', 'BACIA'),
        ('POÇO', 'BACIA'),
        ('POÇO', 'CAMPO'),
        ('UNIDADE_LITO', 'BACIA'),
        ('UNIDADE_LITO', 'UNIDADE_CRONO'),
    )
    return (ENT_1, ENT_2) in pares_interesse

def createText_added_entities(text,df_1,df_2):
    """Wrap two entity spans of ``text`` in ``[E1]``/``[E2]`` marker pairs.

    The character span of each entity is read from the LAST row of ``df_1``
    (entity 1) and ``df_2`` (entity 2), columns ``start_word``/``end_word``.
    The entity that starts first is wrapped first; every later insertion
    position is shifted by the length of the markers already inserted.

    Args:
        text: Original sentence text.
        df_1: DataFrame describing entity 1.
        df_2: DataFrame describing entity 2.

    Returns:
        str: The sentence with both marker pairs inserted.
    """
    span_e1 = (int(df_1.iloc[-1]['start_word']), int(df_1.iloc[-1]['end_word']))
    span_e2 = (int(df_2.iloc[-1]['start_word']), int(df_2.iloc[-1]['end_word']))

    tags_e1 = ('[E1] ', ' [/E1]')
    tags_e2 = ('[E2] ', ' [/E2]')

    # Order the two entities by where they start in the text.
    if span_e1[0] < span_e2[0]:
        leading, trailing = (span_e1, tags_e1), (span_e2, tags_e2)
    else:
        leading, trailing = (span_e2, tags_e2), (span_e1, tags_e1)

    (lead_start, lead_end), (lead_open, lead_close) = leading
    (trail_start, trail_end), (trail_open, trail_close) = trailing

    # (position, marker) pairs, applied in order; offsets account for the
    # markers inserted by the previous steps.
    shift_open = len(lead_open)
    shift_pair = shift_open + len(lead_close)
    insertions = (
        (lead_start, lead_open),
        (lead_end + shift_open, lead_close),
        (trail_start + shift_pair, trail_open),
        (trail_end + shift_pair + len(trail_open), trail_close),
    )

    marked_text = text
    for position, marker in insertions:
        marked_text = marked_text[:position] + marker + marked_text[position:]
    return marked_text

def print_sentence_text(sentence):
    """Rebuild the sentence text from its tokens, print it and return it.

    A blank string as long as the ``end`` offset of the last token is created,
    then every token's ``form`` is written over the ``start``..``end`` range.

    Args:
        sentence: Token DataFrame with the columns ``start``, ``end`` and ``form``.

    Returns:
        str: The reconstructed text (also printed, followed by a separator line).
    """
    total_width = int(sentence.iloc[-1]["end"])
    rebuilt = " " * total_width
    for begin, token_form, finish in zip(sentence["start"], sentence["form"], sentence["end"]):
        rebuilt = rebuilt[:int(begin)] + token_form + rebuilt[int(finish):]
    print(rebuilt)
    print("-------------")
    return rebuilt

def create_bert_dataframe(df_bert,idxTokens,idxTokens2,sentence,text,has_relation,relation_type,SentenceNumber):
    """Append one marked-sentence training row for an entity pair to ``df_bert``.

    Args:
        df_bert: DataFrame accumulated so far (may be empty).
        idxTokens: Positional index of the first entity's starting token.
        idxTokens2: Positional index of the second entity's starting token.
        sentence: Token DataFrame of the sentence.
        text: Sentence text in which the entity markers are inserted.
        has_relation: Flag stored in the ``has_relation`` column.
        relation_type: Label stored in the ``relation`` column.
        SentenceNumber: Sentence identifier stored in the ``index_e`` column.

    Returns:
        A new DataFrame: ``df_bert`` followed by the new single-row frame.
    """
    # Only the span DataFrames are needed; the joined-word strings are discarded.
    first_span, _ = get_words_by_entities([idxTokens],sentence)
    second_span, _ = get_words_by_entities([idxTokens2],sentence)

    first_label = first_span.iloc[-1]['LABEL']
    second_label = second_span.iloc[-1]['LABEL']
    first_label = first_label.replace('B=','')
    second_label = second_label.replace('B=','')

    marked_sentence = createText_added_entities(text,first_span,second_span)

    record = {
        'index_e': SentenceNumber,
        'sentence': marked_sentence,
        'Ent1': first_label,
        'Ent2': second_label,
        'has_relation': has_relation,
        'relation': relation_type,
    }
    single_row = pd.DataFrame(columns=list(record))
    single_row.loc[0] = list(record.values())
    return pd.concat([df_bert, single_row])

def saveJsonFiles(df,text,from_id,to_id, lista_relaoces_sentence,sentence,SentenceNum,path):
    """Print a summary of the sentence and write its annotation JSON file.

    Args:
        df: Entity DataFrame used to build the entity results.
        text: Sentence text.
        from_id: Source entity ids, parallel to ``to_id`` and the relation list.
        to_id: Target entity ids.
        lista_relaoces_sentence: Relation labels, one per (from_id, to_id) pair.
            The misspelled name is kept so existing keyword callers keep working.
        sentence: Token DataFrame of the sentence, forwarded to ``CreateOutput``.
        SentenceNum: Sentence identifier; the file is named ``<SentenceNum>.json``.
        path: Destination folder; created when it does not exist.
    """
    print("-------------")
    print(df.head(50))
    print("-------------")
    print("sentence-> ",SentenceNum)
    print("-------------")
    print(text)
    print("-------------")
    print('Saved Json ->', True)
    output = CreateOutput(text,sentence, df)
    # Use the argument itself: the body previously read ``lista_relacoes_sentence``,
    # a different name that only resolved through a notebook-level global.
    combine_itens_from_lists_add_in_json(from_id, to_id, lista_relaoces_sentence, output)
    print("-------------")
    if path:
        # Avoid FileNotFoundError when the output folder has not been created yet.
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path,f"{SentenceNum}.json"), "w") as outfile:
        json.dump(output.get_output(), outfile)
        
def create_df_JsonFiles(df_entity,x,token,token2,URI_1,URI_2,idxTokens,idxTokens2,from_id,to_id,sentence):
    """Add the two entities of a relation to ``df_entity`` and print a summary.

    An entity is appended only when its starting index is not yet present in
    ``from_id`` nor in ``to_id``.

    Args:
        df_entity: Entity DataFrame accumulated for the sentence.
        x: Relation name (printed only).
        token: Tag of the first entity (printed without ``B=``).
        token2: Tag of the second entity (printed without ``B=``).
        URI_1: URI of the first entity (printed only).
        URI_2: URI of the second entity (printed only).
        idxTokens: Positional index of the first entity's starting token.
        idxTokens2: Positional index of the second entity's starting token.
        from_id: Source ids already registered for the sentence.
        to_id: Target ids already registered for the sentence.
        sentence: Token DataFrame of the sentence.

    Returns:
        The (possibly extended) entity DataFrame.
    """
    joined_words = []
    for token_index in (idxTokens, idxTokens2):
        entity_rows, joined = get_words_by_entities([token_index],sentence)
        already_registered = token_index in from_id or token_index in to_id
        if not already_registered:
            df_entity = pd.concat([df_entity, entity_rows])
        joined_words.append(joined)

    wordjoin_1, wordjoin_2 = joined_words
    print('Token 1 = ', wordjoin_1, '--- Class 1 = ', token.replace('B=',''), '--- URI 1 = ', URI_1)
    print('Token 2 = ', wordjoin_2, '--- Class 2 = ', token2.replace('B=',''),'--- URI 2 = ',URI_2)
    print('Relation Type = ', x)
    print("-------------")
    return df_entity
    
def getDictBert(df,text,lista_relacoes_sentence,from_id,to_id,list_sentences_dict):
    """Append a document/tokens/relations record for one sentence and dump the list.

    Args:
        df: Entity DataFrame with the columns ``word_join``, ``start_word``,
            ``end_word``, ``index_e`` and ``LABEL``.
        text: Sentence text, stored under ``document``.
        lista_relacoes_sentence: Relation labels; its length drives the relation list.
        from_id: Ids stored under ``child``, read at the same positions.
        to_id: Ids stored under ``head``, read at the same positions.
        list_sentences_dict: List that receives the new record (mutated in place).

    Returns:
        The same ``list_sentences_dict``, after it was rewritten in full to
        ``file_sentencasBERT.json`` in the current working directory.
    """
    token_entries = []
    for row_position in range(len(df)):
        entity_row = df.iloc[row_position]
        joined_text = entity_row['word_join']
        first_token = entity_row['index_e']
        token_entries.append({
            'text': joined_text,
            'start': int(entity_row['start_word']),
            'end': int(entity_row['end_word']),
            'token_start': first_token,
            # One extra token per space inside the joined entity text.
            'token_end': first_token + op.countOf(joined_text," "),
            'entity_label': entity_row['LABEL'].replace('B=',''),
        })

    relation_entries = [
        {'child': from_id[position],
         'head': to_id[position],
         'relationLabel': lista_relacoes_sentence[position]}
        for position in range(len(lista_relacoes_sentence))
    ]

    list_sentences_dict.append({
        'document': text,
        'tokens': token_entries,
        'relations': relation_entries,
    })
    with open('file_sentencasBERT.json', 'w') as fout:
        json.dump(list_sentences_dict , fout)

    return list_sentences_dict


### Ler arquivo csv (ou pkl) com as sentenças pós filtragem

In [32]:
# Load the filtered sentences. The context manager closes the file handle,
# which the former pickle.load(open(...)) one-liner left open.
# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open('df_filtred_petroner_uri_2023_04_05.conllu.pkl', 'rb') as pkl_file:
    df_filtred = pickle.load(pkl_file)
# Alternative source (CSV export of the same data):
#df_filtred = pd.read_csv('df_filtred_petroner_uri_2023_04_05_conllu.csv')
# One group per value of the 'sentence' column.
df_group = df_filtred.groupby('sentence')
print('Numero total de sentenças pos-filtragem -> ',len(df_group))

Numero total de sentenças pos-filtragem ->  4542


## Rotina para processar as sentenças já filtradas 

In [33]:
%%time

# Value of the 'sentence' column in the last row of the filtered file
# (the original author treats it as the sentence count); not used below.
numberSentences = df_filtred.iloc[-1]['sentence']

# Notebook-wide tallies, counted with np.unique once the loop has finished.
lista_relacoes, lista_uris, lista_classes, list_sentences_dict = [], [], [], []

df_relation, df_bert = pd.DataFrame(), pd.DataFrame()

countJsons=0
# Iterate over the groups themselves. The former ``range(1, len(df_group))``
# with ``get_group(idx)`` left one sentence unprocessed and raised KeyError
# whenever the sentence ids were not contiguous.
for idx, filtred_sentence in df_group:
    # filtred_sentence is the token DataFrame of a single sentence
    originalSentenceNumber = filtred_sentence.iloc[0]['#sentence_original']
    print("-------------")
    print('sentence =', originalSentenceNumber)
    print('idx in filtred file =', idx)
    text = print_sentence_text(filtred_sentence)

    df_entity = pd.DataFrame()
    from_id, to_id = [], []
    lista_relacoes_sentence = []
    is_to_save = False  # whether this sentence gets a JSON file once processed
    # Checkpoint: persist what has been collected so far before each sentence,
    # so an interrupted run still leaves a usable CSV.
    df_bert.to_csv(save_csv_name, encoding='utf-8',index=False)

    for idxTokens in range(0,len(filtred_sentence)):
        token = filtred_sentence.iloc[idxTokens]['deps']
        # Non-string tags (e.g. None) cannot start an entity; testing them with
        # ``in`` would raise TypeError.
        if not (isinstance(token, str) and "B=" in token):
            continue
        URI_1 = filtred_sentence.iloc[idxTokens]['misc'].get('grafo')
        if not URI_1:
            continue
        # From here on: idxTokens is the start of an entity that has a URI.

        for idxTokens2 in range(0,len(filtred_sentence)):
            if idxTokens == idxTokens2:
                continue
            token2 = filtred_sentence.iloc[idxTokens2]['deps']
            if not (isinstance(token2, str) and "B=" in token2):
                continue
            URI_2 = filtred_sentence.iloc[idxTokens2]['misc'].get('grafo')
            if not URI_2:
                continue
            # From here on: idxTokens2 is the start of another entity with a URI.

            has_relation = False
            relation_URIs = get_relations_between_uris(URI_1, URI_2)
            # The dictionary may be empty, and each relation may map to an empty list.
            for x, y in relation_URIs.items():
                if y == []:
                    continue
                # Some relation exists between the two URIs.
                is_to_save = True
                has_relation = True
                relation_type = x
                Ent1 = token.replace("B=","")
                Ent2 = token2.replace("B=","")
                is_rel_interesse = verifica_pares_entidade_interesse(Ent1,Ent2)
                if is_rel_interesse == False:
                    relation_type = 'temporal_relation'
                lista_relacoes_sentence.append(relation_type)

                # Recorded with the raw relation name, to count entity pairs per relation later.
                df_relation = create_relations_dataframe(df_relation,token,token2,URI_1,URI_2,x,originalSentenceNumber)

                # Row for the BERT relation-extraction dataset.
                df_bert = create_bert_dataframe(df_bert,idxTokens,idxTokens2,filtred_sentence,
                                                text,has_relation,relation_type,originalSentenceNumber)

                df_entity = create_df_JsonFiles(df_entity,x,token,token2,URI_1,URI_2,
                                                idxTokens,idxTokens2,from_id,to_id,filtred_sentence)

                from_id.append(idxTokens)
                to_id.append(idxTokens2)

                # Lists used to count relations, URIs and classes.
                lista_relacoes.append(relation_type)
                lista_uris.append(URI_1)
                lista_uris.append(URI_2)
                lista_classes.append(Ent1)
                lista_classes.append(Ent2)

            if not has_relation:  # no relation found for this ordered pair
                relation_type = 'no_relation'
                df_bert = create_bert_dataframe(df_bert,idxTokens,idxTokens2,filtred_sentence,
                                                text,has_relation,relation_type,originalSentenceNumber)

    if is_to_save:
        countJsons+=1
        # Write the annotation JSON file for this sentence.
        saveJsonFiles(df_entity,text,from_id,to_id,
                      lista_relacoes_sentence,filtred_sentence,originalSentenceNumber,save_folder_path)
        # The getDictBert export is intentionally not called here (that model was not used).

print("-------------")
print("Number of Jsons saved = ", countJsons )

df_relation.to_csv('df_relation.csv', encoding='utf-8',index=False)
df_bert.to_csv(save_csv_name, encoding='utf-8',index=False)

relacoes, numb_rel = np.unique(lista_relacoes, return_counts = True)
uris, numb_uris = np.unique(lista_uris, return_counts = True)
classes, numb_classes = np.unique(lista_classes, return_counts = True)

-------------
sentence = 6
idx in filtred file = 1
Membro Mucuri, Eocretáceo    Bacia    Espirito Santo..
-------------
Token 1 =   Membro Mucuri --- Class 1 =  UNIDADE_LITO --- URI 1 =  #membro_010
Token 2 =   Bacia Espirito Santo --- Class 2 =  BACIA --- URI 2 =  #BASE_CD_BACIA_270
Relation Type =  located_in
-------------
-------------
  index_e           LABEL START END    TEXT             word_join start_word  \
0       0  B=UNIDADE_LITO     0   6  Membro         Membro Mucuri          0   
0       4         B=BACIA    29  34   Bacia  Bacia Espirito Santo         29   

  end_word    label_word                 URI  
0       13  UNIDADE_LITO         #membro_010  
0       52         BACIA  #BASE_CD_BACIA_270  
-------------
sentence->  6
-------------
Membro Mucuri, Eocretáceo    Bacia    Espirito Santo..
-------------
Saved Json -> True
-----------
relation
{'from_id': '0', 'to_id': '4', 'type': 'relation', 'direction': 'right', 'labels': ['located_in']}
-------------
-------------




Token 1 =   aptianos --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Aptian
Token 2 =   aptianos --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Aptian
Relation Type =  participates_in
-------------
Token 1 =   aptianos --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Aptian
Token 2 =   intra-albiano --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Albian
Relation Type =  interval_meets
-------------
Token 1 =   aptianos --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Aptian
Token 2 =   aptianos --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Aptian
Relation Type =  participates_in
-------------
-------------
  index_e            LABEL START   END           TEXT      word_join  \
0      23  B=UNIDADE_CRONO   162   175  intra-albiano  intra-albiano   
0      99  B=UNIDADE_CRONO   676   684       aptianos       aptianos   
0     185  B=UNIDADE_CRONO  1229  1237       aptianos       aptianos   

  start_word end_word     label_word      URI  
0        162      175  UNIDADE_CRONO  #Albian  
0        676      684  UNIDADE




Token 1 =   Albiano --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Albian
Token 2 =   intra-albiano --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Albian
Relation Type =  participates_in
-------------
Token 1 =   intra-albiano --- Class 1 =  UNIDADE_CRONO --- URI 1 =  #Albian
Token 2 =   Albiano --- Class 2 =  UNIDADE_CRONO --- URI 2 =  #Albian
Relation Type =  participates_in
-------------
-------------
  index_e            LABEL START  END           TEXT      word_join  \
0      19  B=UNIDADE_CRONO   127  134        Albiano        Albiano   
0      41  B=UNIDADE_CRONO   253  266  intra-albiano  intra-albiano   

  start_word end_word     label_word      URI  
0        127      134  UNIDADE_CRONO  #Albian  
0        253      266  UNIDADE_CRONO  #Albian  
-------------
sentence->  415
-------------
De acordo com os arcabouços bioestratigráficos elaborados    partir de outros grupos fósseis (palinomorfos, principalmente), o Albiano,    margem sudeste, é dividido em duas ou mais biozonas e as in

KeyboardInterrupt: 

In [35]:
# Distinct values and their occurrence counts for relations, URIs and classes
# collected by the processing loop (in that order).
(relacoes, numb_rel), (uris, numb_uris), (classes, numb_classes) = (
    np.unique(collected, return_counts = True)
    for collected in (lista_relacoes, lista_uris, lista_classes)
)

### Contabilização das relações encontradas

In [36]:
relacoes.tolist()

['constituted_by', 'located_in', 'temporal_relation']

In [37]:
numb_rel.tolist()

[4, 16, 18]

### Contabilização das classes encontradas

In [38]:
classes.tolist()

['BACIA', 'POÇO', 'ROCHA', 'UNIDADE_CRONO', 'UNIDADE_LITO']

In [39]:
numb_classes.tolist()

[16, 8, 4, 36, 12]

### Contabilização das URIs encontradas 

In [40]:
uris.tolist()

['#Albian',
 '#Aptian',
 '#BASE_CD_BACIA_096',
 '#BASE_CD_BACIA_100',
 '#BASE_CD_BACIA_116',
 '#BASE_CD_BACIA_270',
 '#BASE_CD_BACIA_281',
 '#Cenomanian',
 '#POCO_CD_POCO_007553',
 '#POCO_CD_POCO_010471',
 '#POCO_CD_POCO_012996',
 '#Turonian',
 '#formacao_163',
 '#formacao_166',
 '#formacao_319',
 '#grupo_000',
 '#marlstone',
 '#membro_010',
 '#shale']

In [41]:
numb_uris.tolist()

[16, 14, 2, 6, 6, 1, 1, 4, 2, 4, 2, 2, 2, 2, 6, 1, 2, 1, 2]

### Verificação de pares de entidades por tipo de relação

In [None]:
# Read back the relation records written by the processing loop and group them
# by relation name.
df_relations = pd.read_csv('df_relation.csv')
df_grp = df_relations.groupby('Relation')
relations_groups = df_grp.groups
relations = list(relations_groups)

# lista_pares[k] holds one "Ent1 + Ent2" string per record of relations[k].
lista_pares = []
for relation in relations:
    df_rel = df_grp.get_group(relation)
    list_rel = [
        first_class + ' + ' + second_class
        for first_class, second_class in zip(df_rel['Ent1'], df_rel['Ent2'])
    ]
    lista_pares.append(list_rel)
print('Number of types of relations ->', len(lista_pares))

### Avaliar `idx` de 0 até (número de tipos de relação apresentado acima − 1) para verificar os pares de entidades

In [None]:
# Position, within ``relations`` / ``lista_pares``, of the relation to inspect.
idx = 0
# Restored call: the previous line ("np.unique())[idx], ...") was a syntax error.
# Counts how often each "Ent1 + Ent2" pair occurs for the chosen relation.
pares, numb_pares = np.unique(lista_pares[idx], return_counts = True)
print('Relation -> ',relations[idx])
print('Entities pair -> ',pares.tolist())
print('Number of ocorrences -> ',numb_pares.tolist())

In [13]:
# Show the accumulated BERT rows (sentence with entity markers, entity classes,
# relation flag and label) as the cell output.
df_bert

Unnamed: 0,index_e,sentence,Ent1,Ent2,has_relation,relation
0,6,"[E1] Membro Mucuri [/E1], [E2] Eocretáceo [/E2...",UNIDADE_LITO,UNIDADE_CRONO,False,no_relation
0,6,"[E1] Membro Mucuri [/E1], Eocretáceo [E2] B...",UNIDADE_LITO,BACIA,True,located_in
0,6,"[E2] Membro Mucuri [/E2], [E1] Eocretáceo [/E1...",UNIDADE_CRONO,UNIDADE_LITO,False,no_relation
0,6,"Membro Mucuri, [E1] Eocretáceo [/E1] [E2] B...",UNIDADE_CRONO,BACIA,False,no_relation
0,6,"[E2] Membro Mucuri [/E2], Eocretáceo [E1] B...",BACIA,UNIDADE_LITO,False,no_relation
...,...,...,...,...,...,...
0,119,"modelo proposto autor, são discutidos con...",NÃOCONSOLID,BACIA,False,no_relation
0,119,"modelo proposto autor, são discutidos con...",NÃOCONSOLID,UNIDADE_CRONO,False,no_relation
0,119,"modelo proposto autor, são discutidos con...",NÃOCONSOLID,BACIA,False,no_relation
0,119,"modelo proposto autor, são discutidos con...",BACIA,BACIA,False,no_relation
