Split input data

In [None]:
import pandas as pd
# Load the article table; the splitting cell below reads its 'content' column
# and copies every other column onto the generated parts.
Input_Data_Frame = pd.read_csv(
    "1000_articles_of_100CVEs_in_2024.csv"
)

In [None]:
Split_length = 700
from semantic_text_splitter import TextSplitter # type: ignore
import pandas as pd # type: ignore

# Splitter built for the "gpt-4" tiktoken model with Split_length as its
# capacity argument (presumably the per-chunk token limit; confirm against
# the semantic_text_splitter documentation).
splitter = TextSplitter.from_tiktoken_model("gpt-4", Split_length)

# Per-article frames are collected here and concatenated once at the end.
# Concatenating inside the loop re-copies the growing frame on every
# iteration, which is quadratic in the number of articles.
part_frames = []

# Number of articles whose content was split into more than one part.
total_overlong = 0

for _, row in Input_Data_Frame.iterrows():
    content_parts = splitter.chunks(row['content'])
    part_count = len(content_parts)

    # One output row per part; the full article text is repeated on each row.
    temp_df = pd.DataFrame({
        'content part order': range(part_count),
        'content part': content_parts,
        'content': row['content'],
    })

    # Carry every other input column over to each part of this article.
    for col in Input_Data_Frame.columns:
        if col != 'content':
            temp_df[col] = [row[col]] * part_count

    if part_count > 1:
        total_overlong += 1

    part_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input has no rows (the same result the incremental version produced).
if part_frames:
    Core_DataFrame = pd.concat(part_frames, ignore_index=True)
else:
    Core_DataFrame = pd.DataFrame()


In [None]:
import time
from datetime import datetime, timedelta
from IPython.display import clear_output, display, HTML
import json
from openai import OpenAI


def download_jsonl(batchid, printstatus=True):
    """Retrieve an OpenAI batch job and, when it has finished, download its results.

    Args:
        batchid: Identifier of the batch, passed to ``client.batches.retrieve``.
        printstatus: When true, print the batch status and the notices about
            output that is not available yet.

    Returns:
        dict with the keys:
            "status": status string reported for the batch.
            "answer": list of message contents ordered by integer
                ``custom_id``, or None when no output file was downloaded.
            "fullstatus": the batch object returned by the API.
            "responsedf": DataFrame parsed from the output file, or None.
            "fullresponse": raw text of the output file, or None.
            "errordf": DataFrame parsed from the error file, or None.
            "errorid": list of integer ``custom_id`` values found in the
                error file, or None.
    """
    import os

    # Respect a key that is already exported in the environment; only fall
    # back to the in-file value when none is set. Unconditionally assigning
    # here would replace a real key with the placeholder.
    os.environ.setdefault("OPENAI_API_KEY", "<APIKEYHERE>")
    api_key = os.getenv("OPENAI_API_KEY")
    api_base = 'https://api.openai.com/v1'
    client = OpenAI(api_key=api_key, base_url=api_base)

    # Retrieve batch status
    full_status = client.batches.retrieve(batchid)
    status = full_status.status
    # request_counts may be absent on the batch object; treat that as "no failures yet".
    request_counts = full_status.request_counts
    failed_count = request_counts.failed if request_counts is not None else 0
    if failed_count > 0:
        print(f"{failed_count} requests failed.")
        print("Full status:", full_status)

    # Initialize return dictionary
    result = {
        "status": status,
        "answer": None,
        "fullstatus": full_status,
        "responsedf": None,
        "fullresponse": None,
        "errordf": None,
        "errorid": None
    }

    # Print batch status
    if printstatus:
        print(f"Batch status: {status}")

    def jsonl_to_dataframe(jsonl_data):
        """Parse JSONL text into a flattened DataFrame, skipping blank or invalid lines."""
        import pandas as pd
        json_objects = []
        for line in jsonl_data.split('\n'):
            if not line.strip():  # Ignore empty lines
                continue
            try:
                json_objects.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e} for line: {line}")
        return pd.json_normalize(json_objects)

    # If the status is 'completed', retrieve the file content
    if status == 'completed':
        outputid = full_status.output_file_id
        if outputid is None:
            # No output file to download (e.g. every request failed);
            # "answer" stays None instead of calling the API with a None id.
            if printstatus:
                print("Batch is 'completed' but has no output file; nothing to download.")
        else:
            file_response = client.files.content(outputid)
            responsedf = jsonl_to_dataframe(file_response.text)
            # Output lines are ordered by integer custom_id before answers are extracted.
            responsedf['custom_id'] = responsedf['custom_id'].astype(int)
            responsedf = responsedf.sort_values(by="custom_id").reset_index(drop=True)
            responses = [
                choices[0]['message']['content']
                for choices in responsedf['response.body.choices']
            ]
            result["answer"] = responses
            result["responsedf"] = responsedf
            result["fullresponse"] = file_response.text
            print("File content retrieved. Please check the 'answer' key in the dictionary.")
    else:
        if printstatus:
            print("Batch status is not 'completed', not attempting to retrieve file content.")

    if failed_count > 0:
        print(f"Since {failed_count} requests failed, additionally outputting errordf and errorid.")

        # Retrieve the error file ID and download the content
        errorid = full_status.error_file_id
        if errorid is None:
            # Failures can be counted before an error file id is attached to the batch.
            print("Error file is not available yet; errordf and errorid stay None.")
        else:
            file_error = client.files.content(errorid)

            # Print error file content
            print("Error details:", file_error.text)

            # Parse the error file content and extract the custom_id list
            errordf = jsonl_to_dataframe(file_error.text)
            result['errorid'] = errordf['custom_id'].astype(int).tolist()

            # Store the error data in the result dictionary
            result["errordf"] = errordf

    return result

def auto_down_ans(id, max_wait_time=60, jobtype="chat"):
    """Poll an OpenAI batch job until its results can be returned or a timeout elapses.

    A small status panel is redrawn in the notebook output on every poll.
    Polling happens every 3 seconds during the first 600 seconds and every
    60 seconds afterwards.

    Args:
        id: Batch identifier to poll.
        max_wait_time: Maximum time to wait, in minutes.
        jobtype: "chat" returns the answer list produced by ``download_jsonl``;
            "embedding" returns the list of embedding vectors from the batch
            output file.

    Returns:
        The answers or embeddings once available, or None when the wait time
        is exceeded. Any other ``jobtype`` value is never polled and ends in
        the timeout path.
    """
    max_wait_time_seconds = max_wait_time * 60  # Convert minutes to seconds
    start_time = time.time()

    def show_final(shown_time, seconds, message_html):
        # Replace the progress panel with the closing summary.
        clear_output(wait=True)
        total_time_str = str(timedelta(seconds=int(seconds)))
        display(HTML(f"<p>Current time: <b>{shown_time}</b></p>"))
        display(HTML(f"<p>Total running time: <b>{total_time_str}</b></p>"))
        display(HTML(message_html))

    while True:
        clear_output(wait=True)  # Clear previous output
        current_time = datetime.now().strftime("%H:%M:%S")
        elapsed_time = time.time() - start_time
        elapsed_str = str(timedelta(seconds=int(elapsed_time)))  # HH:MM:SS

        display(HTML(f"<p>Current time: <b>{current_time}</b></p>"))
        display(HTML(f"<p>Elapsed time: <b>{elapsed_str}</b></p>"))
        display(HTML('<p style="color:red;">Processing...</p>'))  # Display "Processing" in red

        if jobtype == "chat":
            try:
                download = download_jsonl(id)
                ans = download['answer']
                if ans is not None:
                    show_final(current_time, time.time() - start_time,
                               '<p style="color:green;">Completed!</p>')
                    return ans

                # Not finished yet: show the request counters.
                fullstatus = download['fullstatus']
                completed_count = fullstatus.request_counts.completed
                total_count = fullstatus.request_counts.total
                failed_count = fullstatus.request_counts.failed
                html_content = f"""
                    <p>
                        Completed: <b style='color:green'>{completed_count}</b> &nbsp;&nbsp;
                        Total: <b style='color:blue'>{total_count}</b> &nbsp;&nbsp;
                        Failed: <b style='color:red'>{failed_count}</b>
                    </p>
                    """

                display(HTML(html_content))
            except Exception as e:
                # Best-effort polling: report the problem and try again on the next cycle.
                print(f'Download failed, error: {e}')

        if jobtype == "embedding":
            try:
                import os
                # Keep a key already present in the environment instead of
                # overwriting it with the in-file placeholder.
                os.environ.setdefault("OPENAI_API_KEY", "<APIKEYHERE>")
                api_key = os.getenv("OPENAI_API_KEY")
                api_base = 'https://api.openai.com/v1'
                client = OpenAI(api_key=api_key, base_url=api_base)
                # Retrieve batch status
                full_status = client.batches.retrieve(id)
                outputid = full_status.output_file_id
                # Only download once an output file exists; requesting a None
                # id would just raise on every poll until the batch is done.
                if outputid is not None:
                    file_response = client.files.content(outputid)
                    records = [json.loads(line)
                               for line in file_response.text.split("\n") if line.strip()]
                    # Batch output lines are not guaranteed to follow input
                    # order, so restore it from custom_id (as the chat path
                    # does). If the ids are missing or not integers, keep
                    # the file order.
                    try:
                        records = sorted(records, key=lambda rec: int(rec['custom_id']))
                    except (KeyError, TypeError, ValueError):
                        pass
                    list_embedding = [rec['response']['body']['data'][0]['embedding']
                                      for rec in records]

                    show_final(current_time, time.time() - start_time,
                               '<p style="color:green;">Completed!</p>')
                    return list_embedding

            except Exception as e:
                print(f'Embedding failed: {e}')

        if elapsed_time > max_wait_time_seconds:
            show_final(current_time, elapsed_time,
                       '<p style="color:red;">Exceeded maximum wait time, download failed.</p>')
            return None

        # Query every 3 seconds for the first 10 minutes, then once a minute.
        wait_time = 3 if elapsed_time < 600 else 60

        time.sleep(wait_time)


Extract Security-related names

In [None]:
import tools

def name_extraction_prompt_maker(single_value):
    """Build the few-shot chat prompt used to extract security-related names from a text.

    The returned conversation consists of nine messages: the task
    instructions (identify names, exclude human/country/city names, answer
    as JSON with the keys "Step 1", "Step 2" and "Final Step"), a short
    assistant acknowledgement, three worked user/assistant example pairs,
    and a final user message that embeds ``single_value``.

    Args:
        single_value: Text to analyse. It is interpolated into the last
            message between double quotes without any escaping.

    Returns:
        list of dicts, each with a "role" and a "content" key.
    """
    prompts = [
        {
            "role": "user",
            "content": '''Please execute the following steps to extract all proper nouns from the input text:

Step 1: Identification

    Identify all proper nouns that are specific to the field of computer science.

Inclusion Criteria (Rule1):

    Only include proper nouns that are directly mentioned in the text.
    Focus on specific names related to:
        Malware
        Unique vulnerabilities (specifically referenced by CVE identifiers)
        Distinct network identifiers (IP addresses, domain names, URLs)
        Detailed Indicators of Compromise (IOCs) such as file paths, hash values, port numbers, registry key changes, MAC addresses, and unique signatures.
        Software names (e.g., antivirus programs, hacking tools, operating systems)
        Hardware names (e.g., specific models of routers, servers, or IoT devices)
        Specific names of hacker groups or threat actors
        Specific names of security protocols or encryption algorithms
        Specific names of programming languages, libraries, or frameworks
        Specific names of computer science concepts or technologies

Step 2: Exclusion
    After listing all names according to Rule1, evaluate each against Rule2.
    For each name you exclude based on Rule2, provide a concise justification. Rule2 states that you should exclude the following types of proper nouns:
        Human names
        Country names
        City names

Final Step: Final Output

    After the evaluation, present the final, curated list that adheres to Rule1 and Rule2.

    Format request:
    You should use the JSON format to present the names in each step.
    An example JSON output should look like this. Make sure the keys are the same as "Step 1", "Step 2", "Final Step", and the values are lists of names with double quotes.
    {
        "Step 1": ["ExampleMalware1", "ExampleMalware2", "Country1", "human1"],
        "Step 2": ["ExampleMalware1:Include", "ExampleMalware2:Include", "Country1:Exclude,country name", "human1:Exclude, human name"],
        "Final Step": ["ExampleMalware1", "ExampleMalware2"]
    }
            '''
        },
        {
            "role": "assistant",
            "content": "Understood. I will now proceed to extract the names."
        },
        {
            "role": "user",
            "content": "The malware named DarkComet was used by the hacker group APT29. It exploited the CVE-2012-0158 vulnerability to infiltrate systems using IP address 192.168.0.10. The attack was identified through the SHA256 hash: a3d5f8e7e6b8f2348d7b9d8a6f9a0f2a2c1e1f3f8a9f4f8d6c7f8e8f1f1a3d6b. The affected systems were running the Windows 7 operating system and utilized the D-Link DIR-615 router. The malware communicated using the SSL/TLS protocol and was written in C++. This incident was reported in New York, USA, and highlighted vulnerabilities in network security architecture. The person who discovered it was named John Doe."
        },
        {
            "role": "assistant",
            "content": '''```json{
                "Step 1": [
                    "DarkComet",
                    "APT29",
                    "CVE-2012-0158",
                    "192.168.0.10",
                    "SHA256:a3d5f8e7e6b8f2348d7b9d8a6f9a0f2a2c1e1f3f8a9f4f8d6c7f8e8f1f1a3d6b",
                    "Windows 7",
                    "D-Link DIR-615",
                    "SSL/TLS",
                    "C++",
                    "New York",
                    "USA",
                    "John Doe",
                    "network security architecture"
                ],
                "Step 2": [
                    "DarkComet:Include",
                    "APT29:Include",
                    "CVE-2012-0158:Include",
                    "192.168.0.10:Include",
                    "SHA256:a3d5f8e7e6b8f2348d7b9d8a6f9a0f2a2c1e1f3f8a9f4f8d6c7f8e8f1f1a3d6b:Include",
                    "Windows 7:Include",
                    "D-Link DIR-615:Include",
                    "SSL/TLS:Include",
                    "C++:Include",
                    "New York:Exclude, city name",
                    "USA:Exclude, country name",
                    "John Doe:Exclude, human name",
                    "network security architecture:Include"
                ],
                "Final Step": [
                    "DarkComet",
                    "APT29",
                    "CVE-2012-0158",
                    "192.168.0.10",
                    "SHA256:a3d5f8e7e6b8f2348d7b9d8a6f9a0f2a2c1e1f3f8a9f4f8d6c7f8e8f1f1a3d6b",
                    "Windows 7",
                    "D-Link DIR-615",
                    "SSL/TLS",
                    "C++",
                    "network security architecture"
                ]
            }```'''
        },
        {
            "role": "user",
            "content": "The hacker group known as Fancy Bear used the malware Zeus to exploit the CVE-2015-2426 vulnerability. The attack was carried out via the domain name malicious-site.com and was tracked using the IOC MD5 hash: 5d41402abc4b2a76b9719d911017c592. The targeted systems were protected by the McAfee antivirus and operated on the Ubuntu Linux OS, connected through a Netgear Nighthawk X6 router. Communications were secured by the SSH protocol, and the attack code was written in Python. The incident was detected in Tokyo, Japan, and the initial analysis was done by Jane Smith."
        },
        {
            "role": "assistant",
            "content": '''```json{
                "Step 1": [
                    "Fancy Bear",
                    "Zeus",
                    "CVE-2015-2426",
                    "malicious-site.com",
                    "MD5:5d41402abc4b2a76b9719d911017c592",
                    "McAfee",
                    "Ubuntu Linux",
                    "Netgear Nighthawk X6",
                    "SSH",
                    "Python",
                    "Tokyo",
                    "Japan",
                    "Jane Smith"
                ],
                "Step 2": [
                    "Fancy Bear:Include",
                    "Zeus:Include",
                    "CVE-2015-2426:Include",
                    "malicious-site.com:Include",
                    "MD5:5d41402abc4b2a76b9719d911017c592:Include",
                    "McAfee:Include",
                    "Ubuntu Linux:Include",
                    "Netgear Nighthawk X6:Include",
                    "SSH:Include",
                    "Python:Include",
                    "Tokyo:Exclude, city name",
                    "Japan:Exclude, country name",
                    "Jane Smith:Exclude, human name"
                ],
                "Final Step": [
                    "Fancy Bear",
                    "Zeus",
                    "CVE-2015-2426",
                    "malicious-site.com",
                    "MD5:5d41402abc4b2a76b9719d911017c592",
                    "McAfee",
                    "Ubuntu Linux",
                    "Netgear Nighthawk X6",
                    "SSH",
                    "Python"
                ]
            }```'''
        },
        {
            "role": "user",
            "content": "A notorious hacker group called Anonymous deployed the Stuxnet malware by exploiting CVE-2010-2568. The malware was spread through the IP address 10.0.0.1 and was identified by the SHA1 hash: da39a3ee5e6b4b0d3255bfef95601890afd80709. The infected systems were running the Debian OS and were protected by Symantec Antivirus, connected through a Cisco ASA 5505 firewall. The malware communication was encrypted using the RSA algorithm, and the attack script was developed using the Java programming language. This attack was reported in Berlin, Germany, by a cybersecurity analyst named Michael Johnson."
        },
        {
            "role": "assistant",
            "content": '''```json{
                "Step 1": [
                    "Anonymous",
                    "Stuxnet",
                    "CVE-2010-2568",
                    "10.0.0.1",
                    "SHA1:da39a3ee5e6b4b0d3255bfef95601890afd80709",
                    "Debian",
                    "Symantec Antivirus",
                    "Cisco ASA 5505",
                    "RSA",
                    "Java",
                    "Berlin",
                    "Germany",
                    "Michael Johnson"
                ],
                "Step 2": [
                    "Anonymous:Include",
                    "Stuxnet:Include",
                    "CVE-2010-2568:Include",
                    "10.0.0.1:Include",
                    "SHA1:da39a3ee5e6b4b0d3255bfef95601890afd80709:Include",
                    "Debian:Include",
                    "Symantec Antivirus:Include",
                    "Cisco ASA 5505:Include",
                    "RSA:Include",
                    "Java:Include",
                    "Berlin:Exclude, city name",
                    "Germany:Exclude, country name",
                    "Michael Johnson:Exclude, human name"
                ],
                "Final Step": [
                    "Anonymous",
                    "Stuxnet",
                    "CVE-2010-2568",
                    "10.0.0.1",
                    "SHA1:da39a3ee5e6b4b0d3255bfef95601890afd80709",
                    "Debian",
                    "Symantec Antivirus",
                    "Cisco ASA 5505",
                    "RSA",
                    "Java"
                ]
            }```'''
        },
        {
            "role": "user",
            "content": f"Good job! Now, I will give you another new sentence. Follow the same steps to extract the names and provide the JSON format output. Don't use the example names from the previous example. Here is the sentence: \"{single_value}\""
        }
    ]
    return prompts

# Build one extraction prompt per content part.
input_list = Core_DataFrame["content part"].tolist()
single_pass_prompts = [name_extraction_prompt_maker(part) for part in input_list]

# Submit every prompt three times: the list is repeated back-to-back, so in
# the submitted list the k-th copy of part i sits at i + k * len(input_list).
input_prompts_batch = single_pass_prompts * 3

code_name = "SECCG CVE website process name extraction"

# Write the batch file, upload and start it, then wait for the answers.
jsonl_file = tools.create_jsonl(
    input_prompts_batch,
    model='gpt4',
    temp=1,
    token=16384,
    jsonlname=code_name,
    possible_output=300,
)
ids = tools.upload_RUN_PAY_jsonl(jsonl_file, code_name)
ans = tools.auto_down_ans(ids['batch_id'])

In [None]:
df = tools.ans_to_df(ans)

from collections import Counter
import numpy as np

# For every content part, keep the names that at least two of the three
# extraction runs agree on.
all_keep_items = []
total_correct = 0       # parts where all three runs produced a value
partial_nan_count = 0   # parts where one or two runs are NaN
all_nan_count = 0       # parts where all three runs are NaN

part_total = len(input_list)

for i in range(part_total):
    try:
        # The prompts were submitted as three back-to-back copies of the
        # input list, so run k of part i is read from row i + k * part_total.
        runs = [df.loc[i + k * part_total, 'Final Step'] for k in range(3)]

        nan_count = sum(isinstance(run, float) and np.isnan(run) for run in runs)

        if nan_count == 0:
            total_correct += 1
        elif nan_count == 3:
            all_nan_count += 1
        else:
            partial_nan_count += 1

        votes = Counter()
        for run in runs:
            if not isinstance(run, list):
                continue  # NaN or any other non-list value casts no votes
            # De-duplicate within a run (first-seen order is kept) so that a
            # name repeated inside one answer cannot reach two votes alone.
            votes.update(list(dict.fromkeys(run)))

        keep_items = [item for item, count in votes.items() if count >= 2]
        all_keep_items.append(keep_items)
    except Exception as e:
        # Best effort: a missing row/column or an unhashable item leaves this
        # part with no names instead of aborting the whole pass.
        print(f"Error at index {i}: {e}")
        all_keep_items.append([])

Core_DataFrame['specific_names'] = all_keep_items


Classify CTI sentence

In [None]:
from nltk.tokenize import sent_tokenize # type: ignore
import os  # needed for os.remove below; no other cell imports it at notebook level
import pickle

# Flatten every content part into its individual sentences.
all_sentences = [
    sentence
    for i in range(Core_DataFrame.shape[0])
    for sentence in sent_tokenize(Core_DataFrame.loc[i, 'content part'])
]

user_input = input('About to save pkl for colab usage, input "Y", "y" or "1" to continue: ').lower()

# The answer was lower-cased above, so "Y" is accepted through 'y'.
if user_input in ['y', '1']:
    # Remove the previous file
    try:
        os.remove("transferColab_sentences_list.pkl")
    except FileNotFoundError:
        pass

    # Save the new pkl file
    with open("transferColab_sentences_list.pkl", "wb") as file:
        pickle.dump(all_sentences, file)
    print("Saved as transferColab_sentences_list.pkl. The size is:", len(all_sentences))
else:
    print("Operation canceled.")

# Read back the sentence classification file ↑ >>>
print('Reading: download_back_sentence classification_simple.pkl file from colab')
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a pickle that was produced by your own classification run.
with open('download_back_sentence_classification_simple.pkl', 'rb') as file:
    sentence_label_dict = pickle.load(file)
print("Output length:", len(sentence_label_dict))

# Keep the sentences whose label contains the character '1'.
sentence_CTIlabel_dict = {key: value for key, value in sentence_label_dict.items() if '1' in str(value)}

sentence_CTIlabel_key = list(sentence_CTIlabel_dict.keys())

# For each content part, store the labelled sentences that occur in it
# (substring match), one per line.
Core_DataFrame['CTIlabelsentence_inpart'] = ''
for index in range(Core_DataFrame.shape[0]):
    content_part = Core_DataFrame.loc[index, 'content part']
    Core_DataFrame.loc[index, 'CTIlabelsentence_inpart'] = ''.join(
        single_sentence + '\n'
        for single_sentence in sentence_CTIlabel_key
        if single_sentence in content_part
    )
print('Added: CTIlabelsentence_inpart column, to indicate CTI sentences in the part')


Worker Agent

In [None]:
def generate_prompt(text,extra_prompt,extra_prompt_hightlight,extra_prompt_targetname):
    promptmessage = [
    {
    "role": "user",
    "content": 
    ''' 
    As an AI trained in entity extraction and relationship extraction. You're an advanced AI expert, so even if I give you a complex sentence, you'll still be able to perform the relationship extraction task. The ouput of the task is a list of list.
    
    Triple format explanation:
        The triple is a basic data structure used to represent knowledge in the form of a semantic graph. It consists of three elements:

        Subject: A noun that represents the main entity or concept.
        Relation: A verb or phrase that describes the relationship between the subject and the object.
        Object: A noun that represents the entity or concept related to the subject through the relation.
        
        Example of a triple: [Formbook, is, malware]. "Formbook" is the subject, "is" is the relation, and "malware" is the object. This structure helps in describing how entities are related within a specific context, such as computer science.
        
        As counterexample: [FinSpy malware, was the final payload] is not a valid triple, sicne it does not contain exactly 3 elements. Also, [FinSpy malware, was, the final payload, that will be used] is not a valid triple, since it contains 4 elements. The subject and the object should be Noun. The relation should be a relation words that connects the subject and the object, and expresses how they are related.
        
    Your output format requirement:
        "The expected output format is json format. It has key as "worker agent" and value as a list of list."
    
    You complete the task by following the rules below:
    
        Rule 1:
        You extract triples related to computer science, focusing on sentences explicitly involving concepts and entities related to the field, such as programming languages, software and hardware technologies, algorithms, network protocols, security vulnerabilities, operating systems, etc.

        Example for Rule 1:
        Input: "Formbook is malware, while the Sun is a star and Picasso is a painter."
        Output: {"worker agent": [["Formbook", "is", "malware"]]}
        Explanation: The triple ["Formbook", "is", "malware"] is related to computer science as it describes a relationship involving a malware. The other triples, ["Sun", "is", "star"] and ["Picasso", "is", "painter"], are excluded because they pertain to astronomy and art, not computer science.

        Rule 2:
        If the input sentences do not contain any computer science-related triples, you must output: [[Blank placeholders since the current text does not contain any computer science-related triples]].

        Example for Rule 2:
        Input: "The Sun is a star and Picasso is a painter."
        Output: {"worker agent": [["Blank placeholders since the current text does not contain any computer science-related triples"]]}
        Explanation: The input sentence does not contain any triples related to computer science, so the output is a blank placeholder.

        Rule 3:
        If you find that you are keeping output triples not related to computer science, stop as soon as possible and output: [[Blank placeholders since the current text does not contain any computer science-related triples]].

        Example for Rule 3:
        Input: "The Sun is a star and Picasso is a painter. Jack likes Picasso's painting."
        Output: {"worker agent": [["Blank placeholders since the current text does not contain any computer science-related triples"]]}
        Explanation: Although you might identify triples initially, none of them are related to computer science. Therefore, you should stop and output the blank placeholder, overriding any previously considered triples.

        Rule 4:
        The subject or object should be strictly singular. Remove all non-computer-related adjectives. Verbs must be in the infinitive form, excluding plural and any non-simple tense.

        Example for Rule 4:
        Input: "Formbook is a very dangerous malware, Formbook checked the system."
        Output: {"worker agent": [["Formbook", "be", "malware"], ["Formbook", "check", "system"]]}
        Explanation: The subject "Formbook" is singular. The adjective "very dangerous" is removed because it's not computer-related, and the verb "checked" is converted to its infinitive form "check".

        Rule 5:
        If the subject or object in a triple contains pronouns such as "it," "they," "malware," "Trojan," "attack," "ransomware," or other general terms, replace them with the specific name they refer to in the sentence.

        Example for Rule 5:
        Input: "The malware is Formbook. This malware is used to steal sensitive information. It is a banking malware."
        Output: {"worker agent": [["Formbook", "is", "malware"], ["Formbook", "is used to steal", "sensitive information"], ["Formbook", "is", "banking malware"]]}
        Explanation: The pronouns "this malware" and "it" refer to "Formbook," so they are replaced accordingly.

        Rule 6:
        If the subject or object in a triple contains multiple entities with parallel relationships to the same object or subject, split these entities into separate triples.

        Example for Rule 6:
        Input: "Formbook and Trickbot are malware. They are used to steal sensitive information."
        Output: {"worker agent": [["Formbook", "is", "malware"], ["Trickbot", "is", "malware"], ["Formbook", "is used to steal", "sensitive information"], ["Trickbot", "is used to steal", "sensitive information"]]}
        Explanation: The sentence contains two entities, "Formbook" and "Trickbot," that relate to "malware" and the action of "stealing sensitive information." These are split into separate triples to accurately represent their relationships.

        Rule 7:
        For entities described by relative pronouns, extract triples based on the relative pronouns.

        Example for Rule 7:
        Input: "The malware, which is known as Formbook, is used to steal sensitive information when it is downloaded from the internet."
        Output: {"worker agent": [["Formbook", "is", "malware"], ["Formbook", "is used to steal", "sensitive information"], ["Formbook", "downloaded from", "internet"]]}
        Explanation: The relative pronoun "which" refers to "Formbook," so the triples are extracted based on this relationship.

        Rule 8:
        For entities that have a probabilistic relationship (e.g., "could," "can," "may"), extract the triples as if the relationship is true.

        Example for Rule 8:
        Input: "Malware like Formbook could be used to steal sensitive information, and when downloaded from the internet, it may cause data loss."
        Output: {"worker agent": [["Formbook", "could be used to steal", "sensitive information"], ["Formbook", "downloaded from", "internet"], ["Formbook", "may cause", "data loss"]]}
        Explanation: The input sentence suggests a probabilistic relationship, so the triples are extracted as though these relationships are true.

        Rule 9:
        The relationship between the subject and object can be based on "of." In this case, "of" is the relationship word.

        Example for Rule 9:
        Input: "The analysis of the malware Formbook shows that it is used to steal sensitive information."
        Output: {"worker agent": [["analysis", "of", "malware Formbook"], ["analysis", "shows", "it is used to steal sensitive information"]]}
        Explanation: The word "of" is used as the relationship between "analysis" and "malware Formbook," creating the first triple.

        Rule 10:
        Consider gerunds as verbs in the relationship. Convert the gerund to the verb and output two triples: one where the gerund acts as the relation, and another where it acts as the object.

        Example for Rule 10:
        Input: "The most common way of Formbook stealing sensitive information is through phishing emails."
        Output: {"worker agent": [["Formbook", "steal", "sensitive information"], ["way of Formbook stealing sensitive information", "is through", "phishing emails"]]}
        Explanation: The gerund "stealing" is treated as a verb in the triple "Formbook steal sensitive information." Additionally, the phrase "way of Formbook stealing sensitive information" is used as a subject in another triple.

        Rule 11:
        The object in the triple should be a noun without any verb or adjective. If the object has a verb or adjective, split the object into a new triple.

        Example for Rule 11:
        Input: "Formbook is a malware designed to run as a deleter."
        Output: {"worker agent": [["Formbook", "is", "malware"], ["malware", "designed to run as", "deleter"]]}
        Explanation: The object "malware" has a descriptive verb phrase "designed to run as," which is split into a separate triple.

        Rule 12:
        If the relationship between the subject and object contains a descriptive noun, output two triples: one with the full relationship and another with the descriptive noun as the subject.

        Example for Rule 12:
        Input: "Formbook dumps information of low-level system settings to a text file."
        Output: {"worker agent": [["Formbook", "dumps", "information"], ["information", "of", "low-level system settings"]]}
        Explanation: The descriptive noun "information" can be used as the subject in a new triple, focusing on its relationship with "low-level system settings."
              
    \
    '''
    },
    {
    "role": "assistant",
    "content": "I got it."
    },
    {
    "role": "user",
    "content": "Here is one sentence from example article:\"Leafminer attempts to infiltrate target networks through various means of intrusion: watering hole websites, vulnerability scans of network services on the internet, and brute-force/dictionary login attempts.\""
    },

    {
    "role": "user",
    "content": "Here is one sentence from example article:\"Kismet is also a powerful tool for penetration testers that need to better understand their target and perform wireless LAN discovery.\""
    },
    {
    "role": "assistant",
    "content": 
        """```{
        "worker agent": [
            ["Kismet", "is", "tool"],
            ["tool", "for", "penetration testers"],
            ["penetration testers", "need to better understand", "target"],
            ["penetration testers", "perform", "wireless LAN discovery"]
        ]
    }```
    """
    },
    {
    "role": "user",
    "content": "Here is one sentence from example article:\"Legendary Pokémon , or Pokémon Illusions are extremely rare and often very powerful Pokémon that are often associated with legends of creation and/or destruction within their endemic regions. \""
    },
    {
    "role": "assistant",
    "content": "```{\"worker agent\": [[\"Blank placeholders since the current text does not contain any computer science related triples\"]]}```"
    },
    {
    "role": "user",
    "content": "Here is one sentence from example article:\"The Royal Knights  are a group of thirteen Mega-level[1] Holy Warrior Digimon[2] that are the Digital World's sacred guardians,[3] and are famed among Digimon as guardian deities of the Computer Network.[4][5] The group was founded by Imperialdramon Paladin Mode,\""
    },
    {
    "role": "assistant",
    "content": "```{\"worker agent\": [[\"The Royal Knights\", \"are\", \"a group of thirteen Mega-level Holy Warrior Digimon\"], [\"The Royal Knights\", \"are\", \"the Digital World's sacred guardians\"], [\"The Royal Knights\", \"are\", \"guardian deities of the Computer Network\"], [\"The group\", \"was founded by\", \"Imperialdramon Paladin Mode\"], [\"I find that the sentence does not contain any computer science related triples, and I am keeping output triples that are not related to computer science. So I stop here and output Blank placeholders in the end since the current text does not contain any computer science related triples\", [\"Blank placeholders since the current text does not contain any computer science related triples\"]]}```"
    },
    {"role": "user",
    "content": 
    """
    Here are my new sentence, extract all possible entity triples from it. Now, I start to give you sentence.\""
    """+text+
    """\"Now, my input text are over. You MUST follow the rules I told you. 
    """+extra_prompt+extra_prompt_hightlight+extra_prompt_targetname
    },
    ]
    return promptmessage

def _has_content(value):
    """Return True when *value* is a sized object with at least one element.

    Missing cells (``None``, or a float NaN placeholder) have no length and
    are reported as "no content" instead of raising ``TypeError``.
    """
    try:
        return len(value) > 0
    except TypeError:
        return False


# When False, the keyword / highlighted-sentence hints are left out of the prompt.
enhance = True

all_Worker_prompts = []

# One worker prompt per row; rows are addressed by label 0..n-1.
for index in range(Core_DataFrame.shape[0]):
    extra_highlight_word = ""
    extra_highlight_sentence = ""

    keywords = Core_DataFrame.loc[index, 'specific_names']
    if enhance and _has_content(keywords):
        extra_highlight_word = "\nSupport information for this task: When you find those keywords in above sentence, you should pay more attention to them and extract more triples about them:\""+str(keywords)+'\"'

    hightlight = Core_DataFrame.loc[index, 'CTIlabelsentence_inpart']
    if enhance and _has_content(hightlight):
        extra_highlight_sentence = "\nSupport information for this task: those sentences contain some important information, you should pay more attention to them and extract more triples from them.\""+str(hightlight)+'\"'

    text_thispart = Core_DataFrame.loc[index, 'content part']

    Target_name = Core_DataFrame.loc[index, 'CVE True Name']

    # Always appended: tells the model which CVE to use as the subject when the text omits it.
    extra_prompt_targetname = "\n Support information for this task: In my input, if you find some text are discussing about a specific CVE's information, but the CVE name is missing, you should consider the Key CVE name as the subject name, and output triple with that Key CVE name as the subject name. For example, if you find the example sentence like:\"Vulnerability's target: Formbook, Kismet, and Leafminer\" with the example Key CVE name as CVE-2000-1234, you should consider Key CVE name as the subject name, and output triple with that Key CVE name as the subject name, like this:```{worker agent: [[\"CVE-2000-1234\", \"target\", \"Formbook\"], [\"CVE-2000-1234\", \"target\", \"Kismet\"], [\"CVE-2000-1234\", \"target\", \"Leafminer\"]]}```. Now, the actual key CVE name of my input text is:\""+str(Target_name)+'\"'

    # NOTE: a baseline without any support information would be
    # generate_prompt(text_thispart, "", "", "").
    promptmessage = generate_prompt(text_thispart, extra_highlight_word, extra_highlight_sentence, extra_prompt_targetname)

    all_Worker_prompts.append(promptmessage)

# Each prompt is submitted three times; the three answers are later treated as
# "worker 1/2/3" for the same row.
all_prompts_Worker_3times = all_Worker_prompts * 3

# Label reused as the JSONL name and as the second argument of the upload helper.
code_name = "SECCG worker agent V2 mini"

# NOTE(review): the `tools` helpers are defined outside this chunk; the
# comments below only describe how their results are chained here.
jsonl_file = tools.create_jsonl(
    all_prompts_Worker_3times,
    model='gpt4mini',
    temp=1,
    token=16384,
    jsonlname=code_name,
    possible_output=300,
)

# The returned mapping must expose 'batch_id', which is used to fetch the answers.
ids = tools.upload_RUN_PAY_jsonl(jsonl_file, code_name)

ans = tools.auto_down_ans(ids['batch_id'])

# Tabular view of the answers; the 'worker agent' column is read from it afterwards.
df = tools.ans_to_df(ans)

# Names that must not leak from the prompt's few-shot examples into real results
# (e.g. Formbook, Kismet, Leafminer appear in the worker prompt examples).
_EXAMPLE_KEYWORDS = ['CVExxx', 'Formbook', 'XLoader', 'Malwaresavetextfile', 'Leafminer', 'FinSpy', 'Kismet', 'Specificnamesofa']

# Marker the worker is instructed to emit when a text has no relevant triples.
_BLANK_MARKER = "Blank placeholders"


def _screen_worker_result(result, source_text):
    """Return "ERROR" when *result* mentions an example keyword absent from *source_text*.

    Otherwise the result string is returned unchanged.
    """
    leaked = any(
        keyword in result and keyword not in source_text
        for keyword in _EXAMPLE_KEYWORDS
    )
    return "ERROR" if leaked else result


_prompt_count = len(all_Worker_prompts)

if len(ans) == 3 * _prompt_count:
    Core_DataFrame['worker1_result'] = ''
    Core_DataFrame['worker2_result'] = ''
    Core_DataFrame['worker3_result'] = ''
    Core_DataFrame['blank_content_flag'] = ''

    for index in range(_prompt_count):
        source_text = Core_DataFrame.loc[index, 'content part']

        # The prompt list was repeated three times, so run k's answer for row
        # `index` sits at offset k * _prompt_count.
        agent_1_result, agent_2_result, agent_3_result = (
            _screen_worker_result(
                str(df.loc[index + run * _prompt_count, 'worker agent']),
                source_text,
            )
            for run in range(3)
        )

        # Majority vote: the part is flagged blank when at least two runs say so.
        blank_flag_count = sum(
            _BLANK_MARKER in result
            for result in (agent_1_result, agent_2_result, agent_3_result)
        )
        Core_DataFrame.loc[index, 'blank_content_flag'] = blank_flag_count >= 2

        Core_DataFrame.loc[index, 'worker1_result'] = str(agent_1_result)
        Core_DataFrame.loc[index, 'worker2_result'] = str(agent_2_result)
        Core_DataFrame.loc[index, 'worker3_result'] = str(agent_3_result)
else:
    # Previously this case was skipped silently, leaving the worker columns
    # missing without any explanation.
    print(
        f"Worker results NOT stored: expected {3 * _prompt_count} answers "
        f"(3 x {_prompt_count} prompts) but received {len(ans)}."
    )


Integrator Agent

In [None]:
def generate_prompt_Integrator(inlist_A, inlist_B, inlist_C):
    """Build the chat messages asking a model to consolidate three triple lists.

    The conversation holds the task description, one worked example
    (user input plus assistant answer) and a final user message carrying the
    three new results.

    Args:
        inlist_A: Extraction result of assistant A (converted with ``str``).
        inlist_B: Extraction result of assistant B (converted with ``str``).
        inlist_C: Extraction result of assistant C (converted with ``str``).

    Returns:
        A list of five ``{"role": ..., "content": ...}`` dictionaries.
    """
    promptmessage = [
        {
            "role": "user",
            "content": """Your primary objective is to consolidate entity extraction results derived from sentence or sentences by different distinct assistants into a singular, coherent output. This process necessitates a careful analysis to identify and merge triples that, despite potentially differing in phrasing, communicate identical information. Additionally, it is imperative to filter out any triples that fail to meet the specified criteria for validity and relevance.

            Key Points: 
            1. Eliminate Invalid Triples: Any triples not conforming to the structure of having exactly one SUBJECT, one RELATION, and one OBJECT should be removed. 
            2. Exclude triples missing any of the three essential components [SUBJECT, RELATION, OBJECT]. 
            3. Discard any triples that introduce information not explicitly mentioned in the provided sentence. 
            4. Consolidation means merge triples that convey the same information but are expressed using different terminologies or structures. This involves combining synonyms or semantically similar phrases that refer to the same entities or actions.
            
            Your answer should be in JSON format. The keys are "My thoughts for this task" and "Final Output". You should first output your thoughts for this task in the "My thoughts for this task" key, and then output the final output in the "Final Output" key. The value of "My thoughts for this task" is a long string, and the value of "Final Output" is a list of lists."""
        },
        {"role": "assistant", "content": "Understood."},
        {
            "role": "user",
            "content": """My inputs are:
            Assistant A: [FinSpy, designed by, Gamma Group], [FinSpy, to spy on, digital communications], [FinSpy, is named by, Gamma Group]
            Assistant B: [FinSpy, is designed by, Gamma Group], [FinSpy, is designed from, Gamma Group], [Surveillance software, is used for, spying on digital communications]
            Assistant C: [FinSpy, developed by, Gamma Group], [FinSpy, be said that is designed by, Gamma Group], [FinSpy, monitors, communications],[FinSpy, has]
            """
        },
        {
            "role": "assistant",
            "content": """
            ```{
                "My thoughts for this task": "First, I will identify and remove any redundant triples across the inputs. For example, 'designed by' and 'developed by' are similar, but 'designed by' is more precise, so I will retain 'designed by' as the final relation. Next, I will consolidate triples that express the same idea in different words, such as 'spy on' and 'monitors,' prioritizing the more frequently used term, which in this case is 'spy on.' I will then discard any duplicate triples that convey the same information, such as '[FinSpy, is designed by, Gamma Group]' and '[FinSpy, is designed from, Gamma Group],' keeping only one instance. Next, I will remove any triples that appear in only one assistant's output, like '[FinSpy, is named by, Gamma Group].' Next, I will discard any triples that do not have exactly three elements, such as '[FinSpy, has],' which only has two. Finally, I will output the final result in the 'Final Output' key.",
                "Final Output": [
                    ["FinSpy", "designed by", "Gamma Group"],
                    ["FinSpy", "to spy on", "digital communications"]
                ]
            }```"""
        },
        {
            "role": "user",
            # One assistant per line, mirroring the worked example above; without
            # the separators one result ran straight into the next label. The
            # opening quote is also closed before the trailing sentence.
            "content": "Good, this time I will give you a new Original Sentence and three assistants' results. Follow the same steps to consolidate the results and provide the JSON format output. Here is the new three Assistant results:\"\n"
            + "Assistant A: " + str(inlist_A) + "\n"
            + "Assistant B: " + str(inlist_B) + "\n"
            + "Assistant C: " + str(inlist_C) + "\n"
            + "\"Now, my inputs are end."
        }
    ]
    return promptmessage

# One Integrator prompt per row, built from the three stored worker results.
all_prompts_Integrator = []
for index in range(Core_DataFrame.shape[0]):
    worker1_result = Core_DataFrame.loc[index, 'worker1_result']
    worker2_result = Core_DataFrame.loc[index, 'worker2_result']
    worker3_result = Core_DataFrame.loc[index, 'worker3_result']

    promptmessage = generate_prompt_Integrator(worker1_result, worker2_result, worker3_result)

    all_prompts_Integrator.append(promptmessage)

code_name = "SECCG Integrator agent"
jsonl_file = tools.create_jsonl(all_prompts_Integrator, model='gpt4', temp=1, token=16384, jsonlname=code_name, possible_output=300)

ids = tools.upload_RUN_PAY_jsonl(jsonl_file, code_name)

# Download the batch that was just submitted. A hard-coded id from an earlier
# run used to be passed here, which ignored the upload above and would pair
# stale answers with the current prompts.
ans = tools.auto_down_ans(ids['batch_id'])

# Human-readable summary of the three worker results, one string per row.
total_results = [
    f"WORKER 1 result: {row['worker1_result']}\n"
    f"WORKER 2 result: {row['worker2_result']}\n"
    f"WORKER 3 result: {row['worker3_result']}"
    for _, row in Core_DataFrame.iterrows()
]

# NOTE(review): ans_and_inputs_to_df is defined elsewhere; only its
# 'Final Output' column is consumed here.
df_withIntegrator = tools.ans_and_inputs_to_df([total_results], ['SECCG 3 Worker Result'], all_prompts_Integrator, ans)

Core_DataFrame['Integrator_result'] = df_withIntegrator['Final Output']

Refiner Agent

In [None]:
def generate_prompt_postprocess(text):
    """Build the single-message Refiner prompt for one extraction result.

    The prompt lists seven rewriting rules with examples, specifies the JSON
    output format, and ends with the extraction result to refine.

    Args:
        text: The extraction result to refine (converted with ``str``).

    Returns:
        A list holding one ``{"role": "user", "content": ...}`` dictionary.
    """
    # The two ```json templates in the output-format section are valid JSON;
    # the "No input" template previously lacked its closing brace.
    promptmessage = [
        {
        "role": "user",
        "content": 
        ''' 
        You play the role of an entity extraction expert and modify/simplify/split the text (extracted multiple triples) in the entity extraction result I gave you (a python dictionary with key as the source sentence with ellipsis and value as the extracted triples) according to the following rules. A triple is a basic data structure used to represent knowledge graphs, which are structured semantic knowledge bases that describe concepts and their relationships in the physical world. A triple consists of three elements: [SUBJECT, RELATION,OBJECT]. The subject and the object are entities, which can be things, people, places, events, or abstract concepts. The relation is a relation that connects the subject and the object, and expresses how they are related. 
        
        For example, [Formbook, is, malware] is a triple that describes the relationship between the malware Formbook and the concept of malware.
        
        You should follow the rules below to modify the triples:
        RULE 1: Simplify the subject, object, and relation into a more concise, generic expression. When you encounter a plural or past tense form, convert it to singular or present tense.  
        Example for Rule 1: Input: "[Formbook, targets, Windows users],". Output:```json {"Thoughts": "The word "targets" is in plural form, so I converted it to singular form.", "Triple": "[Formbook, target, Windows user]"} ```
        
        RULE2: If a subject or object contains a proper noun that is a specific name of a malware, Trojan horse, CVE, or hacking organization, remove the proper noun and keep the generic term. When you encounter such subject or object that contains modifiers and adjectives, remove the modifiers and adjectives to keep the core term.
        Example for Rule 2: Input: "[Formbook malware, use, website bookmarks]". Output:```json {"Thoughts": "The word "Formbook" is a specific name of a malware, so I removed the additional suffix, which is "malware".", "Triple": "[Formbook, use, website bookmark]"} ```
        
        Rule3: If the object itself contains a new relation, create a new triple based on the object, and convert the original triple into a simple form.
        Example for Rule 3: Input: "[Formbook, save dump file to, a folder in desktop]". Output:```json {"Thoughts": "The object "a folder in desktop" contains a new relation "in desktop", so I created a new triple based on the object.", "Triple1": "[Formbook, save dump file, a folder]", "Triple2": "[a folder, in, desktop]"} ```
        
        Rule 4: Split a complex triple into multiple simpler forms. 
        Example for Rule 4: Input: "[Formbook and XLoader, are, malware]". Output:```json {"Thoughts": "The triple contains two subjects, so I split it into two triples.", "Triple1": "[Formbook, be, malware]", "Triple2": "[XLoader, be, malware]"} ```
        
        Rule 5: If the [subject,relation] in a triple can be formed into a new [subject,relation,object] triple because relation itself has a new object in it, create a new triple while keeping the original one. 
        Example: Input:"[Formbook, save XLoader to, desktop]". Output:```json {"Thoughts": "The relation "save XLoader to" can be split into a new triple.", "Triple1": "[Formbook, save, XLoader]", "Triple2": "[Formbook, save XLoader to, desktop]"} ```
        
        Rule 6: If Subject or Object contains an MD5, registry, path, or other identifier that contains prefixes, remove their prefixes to generate a new triple.
        Example for Rule 6: Input: "[Formbook's hash, is, md5:xxxxx]". Output:```json {"Thoughts": "The object contains an MD5 identifier, so I removed the prefix.", "Triple": "[Formbook's hash, is, xxxxx]"} ```
        
       Rule 7: If  Subject or Object contain version information or OS information or CVE number or other specific information, keep them and apply Rule 1-6 to the rest of the triple.
        Example for Rule 7: Input: "[Formbook v1.0, is, malware]". Output:```json {"Thoughts": "The subject contains version information, so I kept it and applied Rule 1-6 to the rest of the triple.", "Triple": "[Formbook v1.0, be, malware]"} ```
        
        Output format requirement:
        Your output MUST be in JSON format. The keys are \"Thoughts\" and \"Triple\". The value of \"Thoughts\" is a string describing your thought process for modifying the triple, and the value of \"Triple\" is the modified triple. Don't use any other format, if my input has nothing, you don't need to write anything, only output this JSON:
        ```json
        [
        {
            "Thoughts": "No input",
            "Triple": ""
        }
        ]```
        
        A example of the output format is shown below:
        ```json
        [
        {
            "Thoughts": "Example of thoughts",
            "Triple1": "Example of triple1 because the original triple can split into multiple triples.", 
            "Triple2": "Example of triple2 because the original triple can split into multiple triples."
        },
        {
            "Thoughts": "Example of thoughts",
            "Triple": "Example of triple after the modification based on rules."
        }
    ]```
        '''
        "Here is my entity extraction result:\""+str(text)+"""\". Now, you apply the rules above to modify the triples. 
        
        """
        },
    ]
    return promptmessage

# Collect one Refiner prompt per row, fed with that row's Integrator output.
all_prompts_Refiner = []
for index in range(len(Core_DataFrame.index)):
    text_thispart = Core_DataFrame.loc[index, 'Integrator_result']
    promptmessage = generate_prompt_postprocess(text_thispart)
    all_prompts_Refiner += [promptmessage]

# Same submit / download chain as the other agents, under its own label.
code_name = "SECCG Refiner agent"
jsonl_file = tools.create_jsonl(
    all_prompts_Refiner,
    model='gpt4',
    temp=1,
    token=16384,
    jsonlname=code_name,
    possible_output=300,
)

ids = tools.upload_RUN_PAY_jsonl(jsonl_file, code_name)

ans = tools.auto_down_ans(ids['batch_id'])


In [None]:
# Tabular view of the Refiner answers.
# NOTE(review): `df` is not read again in the visible remainder of this cell — confirm it is still needed.
df = tools.ans_to_df(ans)

def capture_triples(text):
    """Extract the bracketed triple strings from one Refiner answer.

    Picks the value of every ``"Triple"``, ``"Triple1"``, ``"Triple2"``, ...
    key whose value is a quoted ``[...]`` string.

    Args:
        text: The raw answer. ``None`` yields an empty list; any other
            non-string value is converted with ``str`` first.

    Returns:
        A list of ``"[subject, relation, object]"`` strings in order of
        appearance (empty when nothing matches).
    """
    import re  # local import keeps the function usable on its own

    if text is None:
        # A missing answer simply contributes no triples.
        return []
    return re.findall(r'"Triple\d*":\s*"(\[.*?\])"', str(text))

# One list of bracketed triple strings per Refiner answer, in answer order.
triples_str = list(map(capture_triples, ans))

Core_DataFrame['Refiner_result'] = triples_str


Merger agent

In [None]:
def generate_prompt(longmem, shortmem):
    """Build the chat messages for the Merger (memory consolidation) step.

    The conversation holds the task description with six rules, one worked
    example (user input plus assistant answer) and a final user message that
    carries the real long-term and short-term memory.

    Args:
        longmem: Triples gathered from the article parts already processed
            (converted with ``str``).
        shortmem: Triples extracted from the current article part
            (converted with ``str``).

    Returns:
        A list of five ``{"role": ..., "content": ...}`` dictionaries.
    """
    promptmessage = [
        {
        "role": "user",
        # The long-term memory template below is closed by its own
        # "end of the long-term memory area" marker (it used to be closed by
        # the short-term marker).
        "content": 
        '''You are a triples integration assistant. Triple is a basic data structure, which describes concepts and their relationships. A triple in long-term and short-term memory MUST has THREE elements: [Subject, Relation, Object]. You are now reading a whole article and extract all triples from it. But you can only see part of the article at a time. In order to record all the triples from a article, you have the following long-term memory area to record the triples from the entire article. long-term memory stores information on the aricle parts you have already read.
        -The start of the long-term memory area-
        #Triples will be added here
        -The end of the long-term memory area-
        Second, you now see a part of this article. Based on this part, you already extract such triples and place them in your short-term memory: 
        -The start of the short-term memory area-
        #Triples will be added here
       -The end of the short-term memory area-
        Third, now review your long-term memory and short-term memory. Modify the short-term memory into a new short-term memory. You should follow following rules to modify triples in short-term memory to make them consistent with triples in long-term memory. You should write down how you use the rule to modify the triples in short-term memory. 
        
        Rule 1. You notice that in these triples, some triples have subjects and objects that contain partially identical terms and refer to the same specific nouns, but these specific nouns have prefixes/suffixes/modifiers that make them not identical. You should delete the prefixes/suffixes/modifiers and unify them into the same specific nouns.
        
        Before rule: [the Formbook, is designed to run as, a deleter] [Formbook sample, is designed to run as, one-time encryptor]

        After rule: [Formbook, is designed to run as, a deleter] [Formbook, is designed to run as, one-time encryptor]

        Explanation: The words "the Formbook" and "Formbook sample" refer to the same entity, so they are unified to use the exact same subject "Formbook" for consistency.
        
        Rule 2. Be especially careful that when you meet specific names of malware,CVE, Trojans, hacker organizations, etc., always use their specific names and remove the prefixes/suffixes/modifiers.
        
        Before rule: [Malware Formbook, is, malware] 
        
        After rule: [Formbook, is, malware]
        
        Explanation: The word "Formbook" is a specific name of malware, so it should be used as the subject of the triple and the prefix "Malware" should be removed.
        
        Rule 3. Don't add unexisting triples to your new short-term memory. 
    
        Suppose you find in long-term memory: [the malware, download, Leafminer] and in short-term memory: [Formbook, is, malware]. You cannot add a new triple in new short term memory: [Formbook, download, Leafminer]. Because you don't have evidence that "the malware" in the long-term memory specifically refers to "Formbook".
        
        Rule 4. Don't add unexisting triples that don't exsit in long-term memory or short-term memory to your new short-term memory. You should add triples from long-term memory or short-term memory to your new short-term memory, not from your imagination and selfcreation
        
        Rule 5. Don't add any example word like 'Formbook','XLoader','Leafminer', 'FinSpy', 'Kismet' in your new short-term memory area, they are just example words not the real triples in the long term memory area or short term memory area.
        
        Rule 6. new short-term memory area must be started with \'-The start of new short-term memory area-\' and ended with \'-The end of new short-term memory area-\'. A triple in new short-term memory MUST has THREE elements: [Subject, Relation, Object]. 
        '''
        },
        {"role": "assistant",'content': 'Yes, I understand and totally follow the rules.'},
        {"role": "user",'content': '''
         -The start of the long-term memory area-  
         [SUBJECT:AgumonHacker, RELATION:disrupts, OBJECT:network operations], [SUBJECT:HackGroup Gabumon, RELATION:targets, OBJECT:financial institutions], [SUBJECT:Malware Biyomon, RELATION:steals, OBJECT:personal data], [SUBJECT:CyberAttack 2042, RELATION:compromised, OBJECT:database systems],  [SUBJECT:A sample of Biyomon, RELATION:save, OBJECT:a payload]
         -The end of the long-term memory area-

        -The start of the short-term memory area-  
         [SUBJECT:The AgumonHacker Trojan, RELATION:created by, OBJECT:a sophisticated hacker group], [SUBJECT:Gabumon hackers, RELATION:launched, OBJECT:series of phishing campaigns], [SUBJECT:Biyomon malware program, RELATION:encrypts, OBJECT:files for ransom], [SUBJECT:Attack in 2042, RELATION:led to, OBJECT:sensitive leaks],[SUBJECT:Biyomon, RELATION:run, OBJECT:payload]
        -The end of the short-term memory area-
         '''
         },
        {"role": "assistant",'content': 
         '''
         Modifications required based on the rules:
        Apply Rule 1 and Rule 2: Unify subjects and objects with partially identical terms and remove any prefixes/suffixes/modifiers, particularly for specific names like malware and hacker groups.
        Modified short-term memory:

        [SUBJECT:The AgumonHacker Trojan, RELATION:created by, OBJECT:a sophisticated hacker group] → [SUBJECT:AgumonHacker, RELATION:created by, OBJECT:sophisticated hacker group]
        [SUBJECT:Gabumon hackers, RELATION:launched, OBJECT:series of phishing campaigns] → [SUBJECT:HackGroup Gabumon, RELATION:launched, OBJECT:phishing campaigns]
        [SUBJECT:Biyomon malware program, RELATION:encrypts, OBJECT:files for ransom] → [SUBJECT:Malware Biyomon, RELATION:encrypts, OBJECT:files for ransom]
        [SUBJECT:Attack in 2042, RELATION:led to, OBJECT:sensitive leaks] → [SUBJECT:CyberAttack 2042, RELATION:led to, OBJECT:sensitive data leaks]

        I also find a triple in long-term memory that needs to be modified based on Rule 1 and Rule 2:

        [SUBJECT:A sample of Biyomon, RELATION:save, OBJECT:a payload] → [SUBJECT:Biyomon, RELATION:save, OBJECT:payload]

        Apply Rule 3 and 4 and 5: We must ensure we do not create unexisting triples when merging the memories.
        After applying the modifications, we now create a new short-term memory area:

        -The start of new short-term memory area- 
        [SUBJECT:AgumonHacker, RELATION:created by, OBJECT:sophisticated hacker group], 
        [SUBJECT:HackGroup Gabumon, RELATION:launched, OBJECT:phishing campaigns],
        [SUBJECT:Biyomon, RELATION:encrypts, OBJECT:files for ransom],
        [SUBJECT:CyberAttack 2042, RELATION:led to, OBJECT:sensitive data leaks],
        [SUBJECT:Biyomon, RELATION:run, OBJECT:payload],
        [SUBJECT:Biyomon, RELATION:save, OBJECT:payload] 
        -The end of new short-term memory area-
         '''   },
        {"role": "user",'content': 
        '''
        Good. Now, let's swtich to another article. 
        -The start of the long-term memory area-
        '''+str(longmem)+'''
        -The end of the long-term memory area-
    
        -The start of the short-term memory area-
        '''+str(shortmem)+'''
        -The end of the short-term memory area-
        
        Now, follow the rules. Write down how you use the rule to modify the triples in short-term memory. If there is no any triple in my input short-term memory, you still need to write down \'-The start of new short-term memory area-\' and ended with \'-The end of new short-term memory area-\'. with only one blank line between them. If my input short-term memory is already perfect, you still need to write down \'-The start of new short-term memory area-\' and content of that perfect short-term memory and ended with \'-The end of new short-term memory area-\'.
        '''
        },
    ]
    return promptmessage

def check_brackets(my_string):
    """Tell whether *my_string*, ignoring surrounding whitespace, is wrapped in ``[`` ... ``]``.

    Args:
        my_string: Candidate text. ``None``, non-string values (such as a
            float NaN), empty strings and whitespace-only strings all yield
            ``False``.

    Returns:
        ``True`` when the stripped text starts with ``[`` and ends with ``]``,
        otherwise ``False``.
    """
    if not isinstance(my_string, str):
        return False
    trimmed = my_string.strip()
    # startswith/endswith are safe on an empty string, unlike indexing [0] / [-1].
    return trimmed.startswith('[') and trimmed.endswith(']')
  
def checker(my_string):
    """Ask a chat model whether *my_string* reads like a refusal.

    The model is instructed to answer "ERROR" when the text contains a
    refusal-style sentence and "OK" otherwise; the reply is streamed and
    returned exactly as received.

    Args:
        my_string: Output of another assistant (converted with ``str``).

    Returns:
        The concatenated text of the streamed reply.
    """
    from openai import OpenAI

    promptmessage = [{
        "role": "user",
        "content":'You are a result checker. You are responsible for checking the result from other AI assistants. The AI assistant may say that \" I am sorry, but I am Chat AI model and I am not able to do the task \" or \" You should do it by yourself\" or \"I am sorry, but I am not able to do the task\". If you found those words or words with simlar meaning, you must reply me \"ERROR\", other wise, you should reply me \"OK\". Here is the result from other AI assistant: '+str(my_string)}]
    # Placeholders: fill in the model name, key and endpoint before running.
    setmodel='<GPTNAME>'
    api_key = "<APIKEY>"
    api_base = "<APIBASE>"

    client = OpenAI(api_key=api_key, base_url=api_base)
    stream = client.chat.completions.create(
        model=setmodel,
        messages=promptmessage,
        stream=True,
        max_tokens=128,
        temperature=1,
    )
    pieces = []
    for chunk in stream:
        # A streamed chunk may carry an empty `choices` list; indexing [0]
        # on it would raise IndexError, so such chunks are skipped.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece is not None:
            pieces.append(piece)
    return "".join(pieces)

# Rows sharing the same full article text ('content') form one group.
grouped = Core_DataFrame.groupby('content')

# Keep only the per-article sub-frames; the group key itself is not needed.
list_of_dfs = [article_rows for _article_text, article_rows in grouped]

import pandas as pd
import os
from nltk.tokenize import sent_tokenize
import pickle

# Assuming merge_extracted_triples(), check_brackets(), and checker() are already defined

triple_cache = []
text_cache = []
longmem = None

# Iterate over each sub DataFrame
for sub_df in list_of_dfs:
    # Iterate through each row in the sub DataFrame
    for i, row in sub_df.iterrows():
        clean_triple_forMEM = row['content part']  # Assuming this is the column for content parts
        this_time_test = row['some_other_column']  # Replace with actual column name

        if i == 0:
            if check_brackets(clean_triple_forMEM):
                longmem = clean_triple_forMEM
            else:
                longmem = 'No longterm memory'
            triple_cache.append(clean_triple_forMEM)
            text_cache.append(this_time_test)
            print('First thinking completed')

        if i >= 1:
            print('Past long-term memory is:')
            print(longmem)
            original_longmem = longmem
            if len(longmem) >= 1500:
                longmem = longmem[-1000:]
                if '[' in longmem:
                    longmem = longmem[longmem.index('['):]

            if check_brackets(clean_triple_forMEM):
                max_retries = 3
                retry_count = 0
                while retry_count < max_retries:
                    print('Retry ' + str(retry_count) + ' times')
                    newlongmem = generate_prompt(longmem, clean_triple_forMEM, this_time_test)
                    print('Thinking process:')
                    print(newlongmem)
                    newlongmem = newlongmem.replace('-The start of the new short-term memory area-', '-The start of new short-term memory area-')
                    newlongmem = newlongmem.replace('-The end of the new short-term memory area-', '-The end of new short-term memory area-')

                    if '-The start of new short-term memory area-' in newlongmem and '-The end of new short-term memory area-' in newlongmem and checker(newlongmem) != 'ERROR':
                        newlongmem = newlongmem[newlongmem.rindex('-The start of new short-term memory area-') + len('-The start of new short-term memory area-'):newlongmem.rindex('-The end of new short-term memory area-')]
                        if not any(keyword in newlongmem for keyword in ['Formbook', 'XLoader', 'savetextfile', 'Leafminer', 'FinSpy', 'Kismet', 'Agumon', 'Gabumon', 'Biyomon', '2042']):
                            longmem = str(original_longmem) + ', ' + str(newlongmem)
                            retry_count = 9999
                        else:
                            retry_count += 1
                    else:
                        retry_count += 1
            else:
                longmem = original_longmem
                print('Short-term memory is not a triple')
            print('After merging: The new long-term memory is:')
            print(longmem)

            # Create a new DataFrame for saving
            new_data = pd.DataFrame({'single_article': [str(row['content'])], 'longmem': [str(longmem)]})

            try:
                # Read the existing Excel file
                longmem_cache = pd.read_excel('Knowledge Graph result cache backup.xlsx')
                # Add new data to the end of existing data
                longmem_cache = pd.concat([longmem_cache, new_data], ignore_index=True)
            except FileNotFoundError:
                # If the file does not exist, use the new data directly
                longmem_cache = new_data

            # Save the updated data to the Excel file
            longmem_cache.to_excel('Knowledge Graph result cache backup.xlsx', index=False)
            # Add the result to the cache