In [33]:
import json
import sys
sys.path.append('../utilities')
import utilities as u
from openai import OpenAI
import os
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize

class GPT_KG:
   
    def __init__(self, data_path, output_path, gpt_key):
        ## input_path:  triplets file path
        ## output_path: directory for outputs
        ##gpt_key: openai key
        self.data = u.read_json_file(data_path)
        self.output_path = output_path
        self.openai_key = gpt_key
        self.client = OpenAI(api_key= self.openai_key)
        self.results = []

    def chat_gpt(self, prompt):
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature = 0
        )
        
        return response.choices[0].message.content.strip()
    
    
    ## retourne le prompt pour la validation avec la phrase
    def Triple_sentence_prompt(self,paragraph, sentence):
        prompt = ( 
    """
From the given sentence, extract triplets for the construction of a knowledge graph. Ensure you follow the specified output format for each identified triplet. The triplets should be structured in a way that clearly identifies the subject entity, predicate (relation), and object entity. 
Output Format: 
For each extracted triplet, please structure the information in a dictionary format as follows: 
{"subject": "The entity or concept that is the starting point of the relation (usually a noun phrase).",
"predicate": "The action or relation connecting the subject to the object.", 
"object": "The entity or concept that is affected or linked to the subject by the predicate (also a noun phrase)."} 

Examples: 
Exemple 1: 
sentence: Computer Programming competence is a good research field in which students of Computer Science can be assisted by an Intelligent Tutoring System (ITS).
extracted triplets: 
{“subject”: “Computer Programming competence”, “predicate” : is-a, “object”: research field}
{“subject”: “Intelligent Tutoring System”, “predicate” : assist, “object”: students of Computer Science}
 Exemple 2: 
sentence: A series of feature extractors learned from CNN have been used in other computer vision tasks. However, CNN features of different layers aim to encode different-level information. 
extracted triplets: 
{“subject”: “serie of feature extractor”, “predicate” : “learned from”, “object”: “CNN ”} {“subject”: “serie of feature extractor”, “predicate” : “used-in”, “object”: “computer vision task”} {“subject”: “CNN features of different layers”, “predicate” : “encode”, “object”: “different-level information ”}
Exemple 3: 
sentence: Existing deep learning algorithms are widely used on RGB images or video data. extracted triplets: 
{“subject”: “Existing deep learning algorithms”, “predicate” : “used on”, “object”: “RGB image data”} 
{“subject”: “Existing deep learning algorithms”, “predicate” : “used on”, “object”: “RGB video data”} 
Exemple 4: 
sentence: Meanwhile, with the development of low-cost RGB-D sensors (such as Microsoft Kinect and Xtion Pro-Live), high-quality RGB-D data can be easily acquired and used to enhance computer vision algorithms. 
extracted triplets: {“subject”: “Microsoft Kinect”, “predicate” : “is-a”, “object”: “low-cost RGB-D sensors”} 
{“subject”: “Xtion Pro-Live”, “predicate” : “is-a”, “object”: “low-cost RGB-D sensors”} {“subject”: “high-quality RGB-D data”, “predicate” : “enhance”, “object”: “computer vision algorithm”}

    """
    "\n###\n"
    '\n"paragraph context: "\n'
    f'"{paragraph} "\n'
    '\n"Test sentence: : "\n' 
     f'"{sentence} "\n'
    '\nResponse:'
)
                    
        return prompt
    
    def get_triplets(self,sentence,paragraph):
        prompt = self.Triple_sentence_prompt(sentence,paragraph)
        triplets = self.chat_gpt(prompt)
        return triplets
        
        
        
    def get_all_triples(self):
        for element in tqdm(self.data, desc="triplets extraction", unit="abstract"):
            
            abstract = self.data[element]["abstract"]
            id = self.data[element]["id"]
            sentences = sent_tokenize(abstract)
            for sentence in sentences:
                triplets = self.get_triplets(abstract, sentence)
                if triplets:
                    self.results.append(
                        {"sentence": sentence,
                        "id": id,
                        "triplets": triplets}
                    )
        u.save_to_json(output_path, self.results)
        return self.results
    

In [34]:
input_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/data/ziwei_cs_data/CS_bench.json"
output_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/informations_extraction/gpt_kg.json"
api_key = "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"

In [35]:
gpt_kg = GPT_KG(input_path, output_path, api_key )

In [36]:
results = gpt_kg.get_all_triples()

triplets extraction: 100%|██████████| 12/12 [09:46<00:00, 48.91s/abstract]


In [30]:
triples = gpt_kg.get_triplets(paragraph, sentence)

In [55]:
result_dict = json.loads(results[10]["triplets"].split("\n")[0])

{'subject': 'TCBR',
 'predicate': 'can enable us to stabilize',
 'object': 'the target generator',
 'sentence': 'ok'}

In [75]:
def triplets_formating(data):
    res = []
    for element in data:
        triplets = element["triplets"].split("\n")
        for triplet in triplets:
            try:
                if triplet[-1] != '}':
                    triplet = triplet[:-1]
                    if triplet[-1] == ',':
                        triplet = triplet[:-1]
                triplet_dict = json.loads(triplet)
                triplet_dict["sentence"] = element["sentence"]
                triplet_dict["id"] = element["id"]
                res.append(triplet_dict)
            except json.JSONDecodeError as e:
                # Return an error message if the string cannot be converted
                print(triplet)
    return res
res = triplets_formating(gpt_kg.results)      

The given sentence does not contain any triplets
This sentence does not contain any triplets
No triplets can be extracted from the test sentence


In [78]:
output_path_format = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/src/informations_extraction/gpt_kg_vf.json"
u.save_to_json(output_path_format, res)

In [28]:
for element in gpt_kg.data:
    abstract = gpt_kg.data[element]["abstract"]
    id = gpt_kg.data[element]["id"]
    sentences = sent_tokenize(abstract)
    for sentence in sentences:
        triplets = gpt_kg.get_triplets(abstract, sentence)
        if triplets:
            self.results.append(
                "sentence": sentence,
                "id": id,
                "triplets": triplets,
            )
    return results


str

In [7]:
import nltk
from nltk.tokenize import sent_tokenize

paragraph = "Computer networks consist of several assets such as hardware, software, and data sources. These assets have often some vulnerabilities which can be exploited by attackers that violate security policies in the network. Considering the limited budget, the network administrator should analyze and prioritize these vulnerabilities to be able to efficiently protect a network by mitigating the most risky ones. So far, several security parameters are offered to analyze security risks from the network security administrator's perspective. The major drawback of these methods is that they do not consider attacker's motivation. Depending on the motivation of potential attackers, different attack path may be selected for network security compromise. So, attacker's motivation is a key factor in predicting the attacker's behavior. In this paper, the attacker's motivation is considered in the process of security risk analysis, so network administrators are able to analyze security risks more accurately. The proposed method is applied on a network and the results are compared with novel works in this area. The experimental results show that network administrator will be able to precisely predict the behavior of attackers and apply countermeasures more efficiently."

sentences = sent_tokenize(paragraph)

for i in sentences:
    print("\n ",i)


  Computer networks consist of several assets such as hardware, software, and data sources.

  These assets have often some vulnerabilities which can be exploited by attackers that violate security policies in the network.

  Considering the limited budget, the network administrator should analyze and prioritize these vulnerabilities to be able to efficiently protect a network by mitigating the most risky ones.

  So far, several security parameters are offered to analyze security risks from the network security administrator's perspective.

  The major drawback of these methods is that they do not consider attacker's motivation.

  Depending on the motivation of potential attackers, different attack path may be selected for network security compromise.

  So, attacker's motivation is a key factor in predicting the attacker's behavior.

  In this paper, the attacker's motivation is considered in the process of security risk analysis, so network administrators are able to analyze secur