In [120]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd


## read data -----> utilities
def read_json_file(file_path):
    """
    Read a JSON file and return its contents as a Python dictionary.

    :param file_path: The path to the JSON file.
    :type file_path: str
    :return: A dictionary representing the JSON data.
    :rtype: dict
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
    except Exception as e:
        print(f"An error occurred while reading the file {file_path}: {e}")
        
def loadVerbMap(verb_map_path):
    verb_info = pd.read_csv(verb_map_path, sep=',')
    verb_map = {}
    for i,r in verb_info.iterrows():
        for j in range(34):
            verb = r['v' + str(j)]
            if str(verb) != 'nan':
                verb_map[verb] = r['predicate']
    return verb_map

def encode_and_store(sentences, model, file_path):
    # Encode sentences
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Create a dictionary with sentences as keys and their embeddings as values
    embeddings_dict = {sentence: embedding.tolist() for sentence, embedding in zip(sentences, embeddings)}

    # Save the dictionary to a JSON file
    with open(file_path, 'w') as file:
        json.dump(embeddings_dict, file)

def load_embeddings(file_path):
    # Load the embeddings from the JSON file
    with open(file_path, 'r') as file:
        embeddings_dict = json.load(file)

    # Convert the embeddings from list to numpy array
    embeddings_dict = {sentence: [float(value) for value in embedding] for sentence, embedding in embeddings_dict.items()}
    return embeddings_dict

# Example sentences
sentences = list(verb_map.keys())

# Model initialization
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# File path to store and load embeddings

embd_path = 'C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/predicate_embeddings.json'

# Encode sentences, store in a file, and then load them
encode_and_store(sentences, model, embd_path)
loaded_embeddings = load_embeddings(file_path)

In [286]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import PredicateMapper_utilities

class PredicateMapper:
    def __init__(self, verbs_path, triples, embd_path):
        self.predicate_embd = PredicateMapper_utilities.load_embeddings(embd_path)
        self.verb_map = PredicateMapper_utilities.loadVerbMap(verbs_path)
        self.input_triples = triples
        self.mapped_predicate = {}
        
    ##this function will be used in predicate mapping return the max of sim
    # def similarity_mapping(self, predicate):
    #     print(predicate)
    #     model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    #     embd_predicate = model.encode(predicate, convert_to_tensor=True)
    #     embd_predicate = np.array(embd_predicate).reshape(1, -1)
    #     results = []
    #     for k,v in self.predicate_embd.items():
    #         embd_verb = np.array(v).reshape(1, -1)
    #         results.append(
    #             {
    #                 k: cosine_similarity(embd_predicate, embd_verb)[0, 0]
    #             }
    #         )
    #     max_element = max(results, key=lambda x: list(x.values())[0])
    #     print(max_element)
    #     return max_element


    def similarity_mapping(self, predicate):
       
        model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        embd_predicate = model.encode(predicate, convert_to_tensor=True)
        embd_predicate = np.array(embd_predicate).reshape(1, -1)
        
        max_similarity = float('-inf')
        max_element = None
    
        for k, v in self.predicate_embd.items():
            embd_verb = np.array(v).reshape(1, -1)
            similarity = cosine_similarity(embd_predicate, embd_verb)[0, 0]
    
            if similarity > max_similarity:
                max_similarity = similarity
                max_element = {k: similarity}
    
        # print(max_element)
        return max_element
    

    def get_last_pp(self, predicate):
        ## return a tuple the predicate with out last pp and the pp
        pass



    def check_last_word_preposition(self, predicate):
    # Liste des prépositions en anglais (à compléter si nécessaire)
        prepositions = ["aboard", "about", "above", "across", "after", "against", "along", "amid", "among", "around", 
                        "as", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "by", 
                        "concerning", "considering", "despite", "down", "during", "except", "for", "from", "in", 
                        "inside", "into", "like", "near", "of", "off", "on", "onto", "out", "outside", "over", 
                        "past", "regarding", "round", "since", "through", "throughout", "to", "toward", "under", 
                        "underneath", "until", "unto", "up", "upon", "with", "within", "without"]
    
        words = predicate.split()
        last_word = words[-1].lower()  # Convertir en minuscules pour la comparaison en anglais
    
        if last_word in prepositions:
            # Le dernier mot est une préposition
            sentence_without_preposition = ' '.join(words[:-1])
            return (sentence_without_preposition, last_word)
        else:
            # Le dernier mot n'est pas une préposition
            return (predicate,None)
    
    ## return the dic of mapped predicate
        # threshold: the predicate mapping threshold
        # option: keep or  delete for unmpapped predicate
            ## keep: keep the as them (not mention them in the dict mapping)
            ## delete: in the mapping dict add a message that indicate thist triple should be deleted ("invalid triple")
    def predicate_mapping(self, threshold, option = "keep"):
        for triple in tqdm(self.input_triples, desc="predicate mapping"):
            predicate = triple["predicate"]
            predicate_ = self.check_last_word_preposition(predicate)
            if predicate_:
                if predicate_[0] in self.verb_map.keys(): 
                    pp = ""
                    if predicate_[1]:
                        pp = " "+ predicate_[1]
                    self.mapped_predicate[predicate] = self.verb_map[predicate_[0]] + pp
                    # print("direct mapping:  ", predicate,"-->" ,self.verb_map[predicate_[0]] + pp )
                else:
                   
                    max_sim_verb = self.similarity_mapping(predicate_[0])
                    # print(max_sim_verb)
                    if list(max_sim_verb.values())[0] >= threshold:
                        pp = ""
                        if predicate_[1]:
                            pp = " "+ predicate_[1]
                        self.mapped_predicate[predicate] = self.verb_map[list(max_sim_verb.keys())[0]] + pp
                        # print("sim:",max_sim_verb,"\n")
                        # print("similarity mapping:  ", predicate,"-->", self.verb_map[list(max_sim_verb.keys())[0]] + pp )
                    else:
                        ### keep or delete ! 
                        if option == "delete":
                            self.mapped_predicate[predicate] = "invalid triple"
                        if option == "keep":
                            pass
                        else:
                            print("invalid option !")
                            break

In [287]:
input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/post_processing/syntactic_cleaning/cleaned_triples.json"
verbs_map_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/CSKG_VerbNet_verb_map.csv"
embd_path = 'C:/Users/admin-user/Desktop/my_phd/implementations_KG/resources/predicate_embeddings.json'
triples = PredicateMapper_utilities.read_json_file(input_path)

In [288]:
pm = PredicateMapper(verbs_map_path, triples,embd_path)
pm.predicate_mapping(0.7)

predicate mapping: 100%|██████████| 57/57 [01:59<00:00,  2.10s/it]


In [289]:
pm.mapped_predicate

{'consist of': 'skos:broader/is/hyponym-of of',
 'have': 'includes',
 'can be exploited by': 'uses by',
 'are offered to analyze': 'analyzes',
 'to analyze': 'analyzes',
 'may be selected for': 'acquires for',
 'is': 'skos:broader/is/hyponym-of',
 'are to analyze more accurately': 'analyzes',
 'is considered in': 'includes in',
 'is applied on': 'uses on',
 'are compared with': 'matches with',
 'to precisely predict': 'predicts',
 'affects': 'causes',
 'records': 'acquires',
 'is used to identify': 'identifies',
 'to identify': 'identifies',
 'may lead to': 'guides to',
 'are required to build': 'based-on',
 'to build': 'based-on',
 'assigns': 'provides',
 'is to present': 'provides',
 'have been introduced to classify': 'classifies',
 'provided by': 'provides by',
 'is used in': 'uses in',
 'using': 'uses',
 'are': 'skos:broader/is/hyponym-of',
 'achieved respectively': 'acquires',
 'suggests': 'proposes',
 'be': 'skos:broader/is/hyponym-of',
 'plays in': 'executes in',
 'allows': 'su

#### similarity based mapping (test)

In [290]:
def similarity_mapping(predicate, predicate_embd):
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    embd_predicate = model.encode(predicate, convert_to_tensor=True)
    embd_predicate = np.array(embd_predicate).reshape(1, -1)

    max_similarity = float('-inf')
    max_element = None

    for k, v in predicate_embd.items():
        embd_verb = np.array(v).reshape(1, -1)
        similarity = cosine_similarity(embd_predicate, embd_verb)[0, 0]

        if similarity > max_similarity:
            max_similarity = similarity
            max_element = {k: similarity}

    # print(max_element)
    return max_element
# p ='are required to build'
# p = 'have been repeated for'
# p = 'do need do need to'
# p = "is creating"
# p = "to dive deep"
# p = 'will use'
p = 'violate'
predicate_embd = PredicateMapper_utilities.load_embeddings(embd_path)
similarity_mapping(p, predicate_embd)

{'kill': 0.4583008035607263}

In [291]:
verb_map = PredicateMapper_utilities.loadVerbMap(verbs_map_path)

In [292]:
verb_map['kill']

'affects'