In [2]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd


## read data -----> utilities
def read_json_file(file_path):
    """
    Read a JSON file and return its contents as a Python dictionary.

    :param file_path: The path to the JSON file.
    :type file_path: str
    :return: A dictionary representing the JSON data.
    :rtype: dict
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
    except Exception as e:
        print(f"An error occurred while reading the file {file_path}: {e}")
        
def loadVerbMap(verb_map_path):
    verb_info = pd.read_csv(verb_map_path, sep=',')
    verb_map = {}
    for i,r in verb_info.iterrows():
        for j in range(34):
            verb = r['v' + str(j)]
            if str(verb) != 'nan':
                verb_map[verb] = r['predicate']
    return verb_map

def encode_and_store(sentences, model, file_path):
    # Encode sentences
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Create a dictionary with sentences as keys and their embeddings as values
    embeddings_dict = {sentence: embedding.tolist() for sentence, embedding in zip(sentences, embeddings)}

    # Save the dictionary to a JSON file
    with open(file_path, 'w') as file:
        json.dump(embeddings_dict, file)

def load_embeddings(file_path):
    # Load the embeddings from the JSON file
    with open(file_path, 'r') as file:
        embeddings_dict = json.load(file)

    # Convert the embeddings from list to numpy array
    embeddings_dict = {sentence: [float(value) for value in embedding] for sentence, embedding in embeddings_dict.items()}
    return embeddings_dict

# # Example sentences
# sentences = list(verb_map.keys())

# # Model initialization
# model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# # File path to store and load embeddings

# embd_path = 'C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/predicate_embeddings.json'

# # Encode sentences, store in a file, and then load them
# encode_and_store(sentences, model, embd_path)
# loaded_embeddings = load_embeddings(file_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#!/usr/bin/env python
# coding: utf-8
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import PredicateMapper_utilities

class PredicateMapper:
    def __init__(self, triples_path,verbs_path, embd_path):
        self.predicate_embd = PredicateMapper_utilities.load_embeddings(embd_path)
        self.verb_map = PredicateMapper_utilities.loadVerbMap(verbs_path)
        self.input_triples = PredicateMapper_utilities.read_json_file(triples_path)
        self.mapped_predicate = {}
        
    def similarity_mapping(self, predicate):
       
        model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        embd_predicate = model.encode(predicate, convert_to_tensor=True)
        embd_predicate = np.array(embd_predicate).reshape(1, -1)
        
        max_similarity = float('-inf')
        max_element = None
    
        for k, v in self.predicate_embd.items():
            embd_verb = np.array(v).reshape(1, -1)
            similarity = cosine_similarity(embd_predicate, embd_verb)[0, 0]
    
            if similarity > max_similarity:
                max_similarity = similarity
                max_element = {k: similarity}
    
        # print(max_element)
        return max_element
    

    def get_last_pp(self, predicate):
        ## return a tuple the predicate with out last pp and the pp
        pass



    def check_last_word_preposition(self, predicate):
    # Liste des prépositions en anglais (à compléter si nécessaire)
        prepositions = ["aboard", "about", "above", "across", "after", "against", "along", "amid", "among", "around", 
                        "as", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", 
                        "concerning", "considering", "despite", "down", "during", "except", "for", "from", "in", 
                        "inside", "into", "like", "near", "of", "off", "on", "onto", "out", "outside", "over", 
                        "past", "regarding", "round", "since", "through", "throughout", "to", "toward", "under", 
                        "underneath", "until", "unto", "up", "upon", "with", "within", "without"]
    
        words = predicate.split()
        last_word = words[-1].lower()  # Convertir en minuscules pour la comparaison en anglais
    
        if last_word in prepositions:
            # Le dernier mot est une préposition
            sentence_without_preposition = ' '.join(words[:-1])
            return (sentence_without_preposition, last_word)
        else:
            # Le dernier mot n'est pas une préposition
            return (predicate,None)
    
    ## return the dic of mapped predicate
        # threshold: the predicate mapping threshold
        # option: keep or  delete for unmpapped predicate
            ## keep: keep the as them (not mention them in the dict mapping)
            ## delete: in the mapping dict add a message that indicate thist triple should be deleted ("invalid triple")
    def predicate_mapping(self, threshold, option = "keep"):
        for triple in tqdm(self.input_triples, desc="predicate mapping"):
            predicate = triple["predicate"]
            predicate_ = self.check_last_word_preposition(predicate)
            if predicate_:
                if predicate_[0] in self.verb_map.keys(): 
                    pp = ""
                    if predicate_[1]:
                        pp = " "+ predicate_[1]
                    self.mapped_predicate[predicate] = self.verb_map[predicate_[0]] + pp
                    # print("direct mapping:  ", predicate,"-->" ,self.verb_map[predicate_[0]] + pp )
                else:
                   
                    max_sim_verb = self.similarity_mapping(predicate_[0])
                    # print(max_sim_verb)
                    if list(max_sim_verb.values())[0] >= threshold:
                        pp = ""
                        if predicate_[1]:
                            pp = " "+ predicate_[1]
                        self.mapped_predicate[predicate] = self.verb_map[list(max_sim_verb.keys())[0]] + pp
                        # print("sim:",max_sim_verb,"\n")
                        # print("similarity mapping:  ", predicate,"-->", self.verb_map[list(max_sim_verb.keys())[0]] + pp )
                    else:
                        ### keep or delete ! 
                        if option == "delete":
                            self.mapped_predicate[predicate] = "invalid triple"
                        if option == "keep":
                            pass
                        else:
                            print("invalid option !")
                            break

In [4]:
input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/post_processing/syntactic_cleaning/cleaned_triples.json"
verbs_map_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/CSKG_VerbNet_verb_map.csv"
embd_path = 'C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/predicate_embeddings.json'
triples = PredicateMapper_utilities.read_json_file(input_path)

ok


In [8]:
pm = PredicateMapper(input_path,verbs_map_path,embd_path)
pm.predicate_mapping(0.7)


predicate mapping: 100%|██████████| 36/36 [01:13<00:00,  2.05s/it]


In [1]:
pm.mapped_predicate

NameError: name 'pm' is not defined

#### similarity based mapping (test)

In [5]:
import PredicateMapper_utilities
import numpy as np
def similarity_mapping(predicate, predicate_embd):
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    embd_predicate = model.encode(predicate, convert_to_tensor=True)
    embd_predicate = np.array(embd_predicate).reshape(1, -1)

    max_similarity = float('-inf')
    max_element = None

    for k, v in predicate_embd.items():
        embd_verb = np.array(v).reshape(1, -1)
        similarity = cosine_similarity(embd_predicate, embd_verb)[0, 0]

        if similarity > max_similarity:
            max_similarity = similarity
            max_element = {k: similarity}

    # print(max_element)
    return max_element
# p ='are required to build'
# p = 'have been repeated for'
# p = 'do need do need to'
# p = "is creating"
# p = "to dive deep"
# p = 'will use'
p = 'is created'
predicate_embd = PredicateMapper_utilities.load_embeddings(embd_path)
similarity_mapping(p, predicate_embd)

{'create': 0.8466176138921004}

In [7]:
verb_map = PredicateMapper_utilities.loadVerbMap(verbs_map_path)

In [8]:
verb_map['create']

'produces'

In [194]:
def validation(ent):
        validEntities = []
		# brown_ic = wordnet_ic.ic('ic-brown.dat')
        semcor_ic = wordnet_ic.ic('ic-semcor.dat')
        for e in ent:		
            valid = True
            for synset in wn.synsets(e):
                ic_value = semcor_ic['n'][synset.offset()]
                if ic_value <= 4 and ic_value > 0:
                    valid = False
                    #print(e, 'NOT', ic_value)
                    break
                if valid:
                    validEntities.append(e)
        return validEntities

In [196]:
ent = ["paper", "method", "computer", "human"]
validation(ent)

['paper',
 'paper',
 'paper',
 'method',
 'method',
 'computer',
 'computer',
 'human',
 'human',
 'human',
 'human']

In [159]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

In [167]:
# nltk.download('wordnet_ic')
nltk.download('wordnet')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

[nltk_data] Downloading package wordnet to C:\Users\admin-
[nltk_data]     user\AppData\Roaming\nltk_data...


In [186]:
synsets = wn.synsets('machine learning')
valid = True
for synset in synsets:
        ic_value = semcor_ic['n'][synset.offset()]
        # if ic_value <= 4 and ic_value > 0:
        #         valid = False
        #         #print(e, 'NOT', ic_value)
        #         break
        # if valid:
            

In [188]:
synsets

[]

In [174]:
def validation(self):
		brown_ic = wordnet_ic.ic('ic-brown.dat')
		semcor_ic = wordnet_ic.ic('ic-semcor.dat')
		for e in self.inputEntities:
			if e in self.blacklist or len(e) <= 2 or e.isdigit() or e[0].isdigit() or len(nltk.word_tokenize(e)) >= 7:# # no blacklist, no 1-character entities, no only numbers, no entities that start with a number, no entities with more than 7 tokens
				continue			
  
			if e in self.csoTopics:
				self.validEntities.add(e)
			elif e in self.magTopics:
				self.validEntities.add(e)
			else:
				valid = True
				for synset in wn.synsets(e):
					ic_value = semcor_ic['n'][synset.offset()]
					if ic_value <= 4 and ic_value > 0:
						valid = False
						#print(e, 'NOT', ic_value)
						break
				if valid:
					self.validEntities.add(e)

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     ---------------------------------------- 0.0/250.0 kB ? eta -:--:--
     ------ ------------------------------ 41.0/250.0 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 250.0/250.0 kB 3.8 MB/s eta 0:00:00
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [29]:
import re
from nltk.stem import WordNetLemmatizer

def is_passive(s,p,o):
    # Unpack the triple into subject, predicate, and object
    subject = s
    predicate = p
    obj = o
    
    # Define patterns for identifying passive voice
    passive_patterns = [
        re.compile(r"\b(?:by)\b", re.IGNORECASE),
    ]
    
    # Check if any passive voice pattern is present in the predicate
    is_passive = any(pattern.search(predicate) for pattern in passive_patterns)
    
    # Determine the voice based on the analysis
    if is_passive:
        return True
    else:
        return False

def convert_to_active(s,p,o):
  # Unpack the triple into subject, predicate, and object
    subject = s
    predicate = p
    obj = o
    
    # subject, predicate, obj = passive_triple
    predicate_without_by = re.sub(r"\b(?:by)\b", "", predicate, flags=re.IGNORECASE).strip()
    lemmatizer = WordNetLemmatizer()
    lemmatized_verb = lemmatizer.lemmatize(predicate_without_by, pos='v')
    active_triple = (obj,lemmatized_verb, subject)
    return active_triple


In [27]:
convert_to_active(triple_passive)

('attacker', 'exploit', 'vulnirability')

In [1]:
def detecter_patron(texte):
    # Séparation du texte par "-"
    parties = texte.split("-")
    
    # Vérification si le texte a exactement deux parties et si les deux parties sont du texte
    if len(parties) == 2 and all(partie.strip().isalpha() for partie in parties):
        return True
    else:
        return False

True