In [1]:
import warnings
warnings.filterwarnings("ignore")
import Entities_Mapper as em
import json
import PredicateMapper as pm
import re

In [2]:
class Mapping:
    def __init__(self, input_path, verbs_map_path, embd_verbs_path, output_path):
        self.input_path = input_path
        self.output_path = output_path
        self.verbs_map_path = verbs_map_path
        self.embd_verbs_path = embd_verbs_path
        self.mapping_result = [] ## store the result



    def is_passive(self, s , p , o ):
        # Unpack the triple into subject, predicate, and object
        subject = s
        predicate = p
        obj = o
        
        # Define patterns for identifying passive voice
        passive_patterns = [
            # re.compile(r"\b(?:am|is|are|was|were|been|being)\b", re.IGNORECASE),
            re.compile(r"\b(?:-by)\b", re.IGNORECASE),
        ]
        
        # Check if any passive voice pattern is present in the predicate
        is_passive = any(pattern.search(predicate) for pattern in passive_patterns)
        
        # Determine the voice based on the analysis
        if is_passive:
            return True
        else:
            return False

    def convert_to_active(self,s,p,o):
      # Unpack the triple into subject, predicate, and object
        subject = s
        predicate = p
        obj = o
        
        # subject, predicate, obj = passive_triple
        predicate_without_by = re.sub(r"\b(?:-by)\b", "", predicate, flags=re.IGNORECASE).strip()
        
        active_triple = (obj,predicate_without_by, subject)
        return active_triple

    def apply(self):
        entities_mapper = em.EntitiesMapper(self.input_path, self.output_path )
        entities_mapper.run()
        predicate_mapper = pm.PredicateMapper(self.input_path, self.verbs_map_path,  self.embd_verbs_path)
        # predicate_mapper.predicate_mapping(0.7)
        for element in entities_mapper.input_triples:
            subject = element["subject"]
            predicate = element["predicate"]
            object = element["object"]
            # if predicate in predicate_mapper.mapped_predicate.keys():
            predicate = predicate_mapper.direct_mapping(predicate)
            if subject in entities_mapper.label2cskg_entity.keys():
                subject = entities_mapper.label2cskg_entity[subject]
            if object in entities_mapper.label2cskg_entity.keys():
                 object = entities_mapper.label2cskg_entity[object]

            ## verify if active or passive and convert to active
            if self.is_passive(subject, predicate, object):
                subject, predicate, object = self.convert_to_active(subject, predicate, object)
            ## lemma of predicates
            # predicate = self.lemmatize_predicate(predicate)
            
            self.mapping_result.append(
                {
                        'sentence': element['sentence'],
                        'subject': subject,
                        'predicate': predicate,
                        'object': object,
                        'confidence': element['confidence'],
                        'first_validation': element['first_validation']
                    }
                
            )
            for element in self.mapping_result:
                element["subject"] = self.supprimer_ponctuation(element["subject"])
                element["object"] = self.supprimer_ponctuation(element["object"])
                element['first_validation'] = self.supprimer_ponctuation(element["first_validation"])
            self.handle_duplicates()
            # self.handle_duplicates()

    
    
    ### avec la semilarité des predicats !!!
    def apply_sim_pred(self):
        entities_mapper = em.EntitiesMapper(self.input_path, self.output_path )
        entities_mapper.run()
        predicate_mapper = pm.PredicateMapper(self.input_path, self.verbs_map_path,  self.embd_verbs_path)
        predicate_mapper.similarity_predicate_mapping(0.7)
        for element in entities_mapper.input_triples:
            subject = element["subject"]
            predicate = element["predicate"]
            object = element["object"]
            if predicate in predicate_mapper.mapped_predicate.keys():
                predicate = predicate_mapper.mapped_predicate[predicate]
            if subject in entities_mapper.label2cskg_entity.keys():
                subject = entities_mapper.label2cskg_entity[subject]
            if object in entities_mapper.label2cskg_entity.keys():
                 object = entities_mapper.label2cskg_entity[object]

            ## verify if active or passive and convert to active
            if self.is_passive(subject, predicate, object):
                subject, predicate, object = self.convert_to_active(subject, predicate, object)
            ## lemma of predicates
            # predicate = self.lemmatize_predicate(predicate)
            
            self.mapping_result.append(
                {
                        'sentence': element['sentence'],
                        'subject': subject,
                        'predicate': predicate,
                        'object': object,
                        'confidence': element['confidence'],
                        'first_validation': element["first_validation"]
                        
                    }
                
            )
        for element in self.mapping_result:
            element["subject"] = self.supprimer_ponctuation(element["subject"])
            element["object"] = self.supprimer_ponctuation(element["object"])
            element['first_validation'] = self.supprimer_ponctuation(element["first_validation"])
        self.handle_duplicates()
            
        
    def supprimer_ponctuation(self, phrase):
        # Définition des caractères de ponctuation à supprimer
        ponctuations = '!"#$%&\()*+,-./:;<=>?@[\\]^_{|}~'
        # Filtrer les caractères pour ne garder que ceux qui ne sont pas des ponctuations
        phrase_sans_ponctuation = ''.join(caractere for caractere in phrase if caractere not in ponctuations)
        return phrase_sans_ponctuation 
        
    def handle_duplicates(self):
        # Create a dictionary to store unique sentences with their confidence
        unique_entries = {}
        for entry in self.mapping_result:
            key = (entry['subject'], entry['predicate'], entry['object'])
            if key in unique_entries:
                # Vérifie si la phrase est déjà présente pour éviter la duplication
                if entry['sentence'] not in unique_entries[key]['sentence']:
                    unique_entries[key]['sentence'] += f" | {entry['sentence']}"
                if entry['first_validation'] == 'True':
                    unique_entries[key]['first_validation'] = 'True'
            else:
                unique_entries[key] = entry
        self.mapping_result = list(unique_entries.values())
        return list(unique_entries.values())
       
    def write_to_json(self):
        with open(self.output_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(self.mapping_result, jsonfile, ensure_ascii=False, indent=2)
    def save_results(self):
        pass

In [6]:
# input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/triplets_validator/Bench_merged_withoutGPT_v.json"

# ## Computer Science
# input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/triplets_validator/Bench_merged_withGPT_v.json"
# output_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/post_processing/mapping/Bench_mapped_withtGP_v.json"

## Music
input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/triplets_validator//MusicBench_withGPT_v.json"
output_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/post_processing/mapping/Music_mapped_withGP_v.json"

verbs_map_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/CSKG_VerbNet_verb_map.csv"
embd_path = 'C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/predicate_embeddings.json'


mapping = Mapping(input_path,verbs_map_path, embd_path ,output_path)
mapping.apply()
mapping.write_to_json()

get Entities and pairs
	>> Entities to be mapped: 412
- 	 >> Mapping with wikidata started
	 >> Wikidata Processed 100 entities in 55.64 secs.
	 >> Wikidata Processed 200 entities in 116.78 secs.
	 >> Wikidata Processed 300 entities in 188.22 secs.
	 >> Wikidata Processed 400 entities in 282.14 secs.
> Mapped to Wikidata: 122
- 	 >> Mapping with dbpedia started
-------------------------------------------


In [7]:
len(mapping.mapping_result)

372

In [None]:
predicate_mapper = pm.PredicateMapper(input_path, verbs_map_path,  embd_path)

In [None]:
predicate_mapper.direct_mapping("be use")

In [17]:
s = """
Constructing knowledge graphs (KGs) from textual data isessential for numerous applications, but presents significant challenges
due to the complexity of natural language. Traditional approaches often
rely on Open Information Extraction (OpenIE) for extracting domain-
independent binary relations. However, they are prone to generating
noisy data and incorrect triples. While Named Entity Recognition (NER)
is commonly used to filter entities and enhance data quality, it can
compromise the domain independence of these approaches and overlook
crucial information. To address these limitations, we introduce a novel
pipeline that aims to preserve the domain independence of KG construc-
tion while reducing the prevalence of incorrect triples, thus offering a
cost-effective solution without the need for domain-specific adaptations.
The pipeline utilizes state-of-the-art OpenIE techniques combined with
syntactic cleaning strategies focused on identifying noun phrases to mini-
mize noise and isolate entities. Furthermore, it leverages Large Language
Models (LLMs), specifically GPT-4, to verify the correctness and rele-
vance of extracted triples in relation to the source text. the pipeline’s
performance is evaluated using two gold standards in distinct domains
(i.e. computer science and music), to assess its domain independence.
Experimental results demonstrated its high recall compared to the re-
call one of the state-of-the-art approaches for computer science KG con-
struction, and its precision is notably enhanced through the incorpo-
ration of LLMs. This highlights the potential of LLMs for validating
extracted information. Moreover, experiments conducted on the music
domain corpus showed that our approach maintains good performance
across different fields, underscoring its versatility and effectiveness in
domain-independent KG construction.
"""
s1 = s.split()

In [18]:
s2 = """
Constructing knowledge graphs (KGs) from textual data is essential for numerous applications, but presents significant challenges due to the complexity of natural language. Traditional approaches often rely on Open Information Extraction (OpenIE) for extracting domain-independent binary relations. However, they are prone to generating noisy data and incorrect triples. While Named Entity Recognition (NER) is commonly used to filter entities and enhance data quality, it can compromise the domain independence of these approaches and overlook crucial information. To address these limitations, we introduce a novel pipeline that aims to preserve the domain independence of KG construction while reducing the prevalence of incorrect triples, thus offering a cost-effective solution without the need for domain-specific adaptations. 
The pipeline utilizes state-of-the-art OpenIE techniques combined with syntactic cleaning strategies focused on identifying noun phrases to minimize noise and isolate entities. Furthermore, it leverages Large Language Models (LLMs), specifically GPT-4, to verify the correctness and relevance of extracted triples in relation to the source text. 
the pipeline's performance is evaluated using two gold standards in distinct domains (i.e. computer science and music), to assess its domain independence. Experimental results demonstrated its high recall compared to the recall one of the state-of-the-art approaches for computer science KG construction, and its precision is notably enhanced through the incorporation of LLMs. This highlights the potential of LLMs for validating extracted information. Moreover, experiments conducted on the music domain corpus showed that our approach maintains good performance across different fields, underscoring its versatility and effectiveness in domain-independent KG construction.

"""
s2 = s2.split()

In [19]:
for i,j in zip(s1,s2):
    if i != j:
        print("ok")

ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok


In [33]:
def handle_duplicates(data):
        # Create a dictionary to store unique sentences with their confidence
        unique_sentences = {}
    
        # Iterate through each element in the JSON
        for item in data:
            sentence_key = (item['subject'], item['predicate'], item['object'])
            sentence_value = item['sentence']
    
            # Check if the sentence already exists in the dictionary
            if sentence_key in unique_sentences:
                # Check if the sentence is identical
                if unique_sentences[sentence_key]['sentence'] != sentence_value:
                    # Merge sentences by adding a semicolon
                    unique_sentences[sentence_key]['sentence'] += ";" + sentence_value
            else:
                unique_sentences[sentence_key] = {'sentence': sentence_value}
    
        # Convert the dictionary into a list of dictionaries for saving
        unique_data = [{'subject': key[0], 'predicate': key[1], 'object': key[2], 'sentence': value['sentence'], 'first_validation': value['first_validation']} for key, value in unique_sentences.items()]
        result = unique_data
        return result

In [34]:
import sys
sys.path.append('../../utilities')
import utilities as u
import json

In [36]:
data = u.read_json_file(output_path)
un = handle_duplicates(data)

KeyError: 'first_validation'

In [32]:
len(data)

161

In [37]:
data

[{'sentence': 'Cyber-physical attacks (CPAs) are classified as the major threatening of SGs security because Cyber-physical attacks (CPAs) may lead to severe consequences such as large blackout and destruction of infrastructures.',
  'subject': 'large blackout',
  'predicate': 'is-a',
  'object': 'severe consequence',
  'confidence': '-',
  'first_validation': 'True'},
 {'sentence': 'Cyber-physical attacks (CPAs) are classified as the major threatening of SGs security because Cyber-physical attacks (CPAs) may lead to severe consequences such as large blackout and destruction of infrastructures.',
  'subject': 'destruction',
  'predicate': 'is-a',
  'object': 'severe consequence',
  'confidence': '-',
  'first_validation': 'True'},
 {'sentence': 'Computer networks consist of several assets such as hardware, software, and data sources.',
  'subject': 'hardware,',
  'predicate': 'is-a',
  'object': 'asset',
  'confidence': '-',
  'first_validation': 'True'},
 {'sentence': 'Computer networ

In [38]:
def supprimer_ponctuation(phrase):
    # Définition des caractères de ponctuation à supprimer
    ponctuations = '!"#$%&\()*+,-./:;<=>?@[\\]^_{|}~'
    
    # Filtrer les caractères pour ne garder que ceux qui ne sont pas des ponctuations
    phrase_sans_ponctuation = ''.join(caractere for caractere in phrase if caractere not in ponctuations)
    
    return phrase_sans_ponctuation
def remove_duplicates_with_sentence_check(entries):
    unique_entries = {}
    for entry in entries:
        key = (entry['subject'], entry['predicate'], entry['object'])
        if key in unique_entries:
            # Vérifie si la phrase est déjà présente pour éviter la duplication
            if entry['sentence'] not in unique_entries[key]['sentence']:
                unique_entries[key]['sentence'] += f" | {entry['sentence']}"
            if entry['first_validation'] == 'True':
                unique_entries[key]['first_validation'] = 'True'
        else:
            unique_entries[key] = entry
    return list(unique_entries.values())

result = remove_duplicates_with_sentence_check(data)

In [40]:
len(result)

147

In [48]:
def supprimer_ponctuation(phrase):
    # Définition des caractères de ponctuation à supprimer
    ponctuations = '!"#$%&\()*+,-./:;<=>?@[\\]^_{|}~'
    
    # Filtrer les caractères pour ne garder que ceux qui ne sont pas des ponctuations
    phrase_sans_ponctuation = ''.join(caractere for caractere in phrase if caractere not in ponctuations)
    
    return phrase_sans_ponctuation


for i in data:
    i["subject"] = supprimer_ponctuation(i["subject"])
    i["object"] =supprimer_ponctuation(i["object"])
    i['first_validation'] = supprimer_ponctuation(i["first_validation"])

In [60]:
supprimer_ponctuation("relational database,")

'relational database'