In [68]:
import os
from os import walk, stat

import import_ipynb
import pandas as pd
from rdflib import Graph

from codes.prep_onto import prep_onto
from codes.prompt_API import create_news, create_newsdict, generate_onto
from UnifiedOntologyPipeline.pipeline.appendOnto import append_file, process_lines
from UnifiedOntologyPipeline.pipeline.connectIndividuals import getNamespace, connectIndividuals, clear_serialize

### **Global Variables**

In [44]:
# Load variables from the project's .env file when python-dotenv is available;
# without it the notebook relies on whatever the process environment holds.
try:
    from dotenv import load_dotenv
except ImportError:
    print("dotenv not installed, skipping...")
else:
    # Kept out of the try body so that an error raised while loading the file
    # is not misreported as "dotenv not installed".
    load_dotenv("./codes/.env")

In [45]:
# API key, endpoint and model name for the DeepSeek API.
# The key comes from the environment so it never appears in the notebook;
# TOKEN is None when APIKEY is not set.
# NOTE(review): none of these three names is referenced again in the cells
# shown here - confirm how generate_onto obtains them.
TOKEN = os.getenv("APIKEY")
WEBUI_URL = "https://api.deepseek.com"
MODEL = "deepseek-chat"

### **News Preparation**

In [55]:
# Load only the 'Article' column of the Hacker News spreadsheet.
# The spreadsheet gets its own name so it is not confused with `news_path`,
# which a later cell binds to the directory of per-article text files.
dataset_path = "./codes/TheHackerNews_Dataset.xlsx"
df = pd.read_excel(dataset_path)['Article']  # single column -> 1-D Series
df.shape

(3742,)

In [57]:
%%capture
# Directory handed to create_news as its output location; the following cell
# lists the ".txt" files found in it.
news_path = "./codes/news/"
# NOTE(review): create_news is a project helper - presumably it writes one text
# file per article in `df`; confirm in codes/prompt_API.
create_news(df, output_dir=news_path)

In [58]:
# Sorted names of every entry in news_path whose name ends with ".txt".
news_list = sorted(
    entry for entry in os.listdir(news_path) if entry.endswith(".txt")
)

In [59]:
# Work on a small batch only: the slice [5:10] picks the 6th to 10th entries of
# the sorted file list.
news_dict = create_newsdict(news_list=news_list[5:10], news_path=news_path)
# NOTE(review): this displays the whole dict, article text included; the stored
# output starts with the 'news006.txt' entry. Consider showing only the keys.
news_dict

{'news006.txt': 'Click Studios, the Australian software firm which confirmed a supply chain attack affecting its Passwordstate password management application, has warned customers of an ongoing phishing attack by an unknown threat actor.\n"We have been advised a bad actor has commenced a phishing attack with a small number of customers having received emails requesting urgent action," the company said in an updated advisory released on Wednesday. "These emails are not sent by Click Studios."\nLast week, Click Studios said attackers had employed sophisticated techniques to compromise Passwordstate\'s update mechanism, using it to drop malware on user computers. Only customers who performed In-Place Upgrades between April 20, 8:33 PM UTC, and April 22, 0:30 AM UTC are said to be affected.\nWhile Passwordstate serves about 29,000 customers, the Adelaide-based firm maintained that the total number of impacted customers is very low. It\'s also urging users to refrain from posting correspon

### **Input Ontology**

In [60]:
# Optional seed ontology: `initial_ontology` holds the text of the filtered
# ontology file, or None when no unified ontology exists yet.
uni_onto_path = "./codes/uni_onto/RefinedUnifiedOntology.ttl"
filtered_onto_path = "./codes/filtered_onto/FilteredUnifiedOntology.ttl"

if os.path.exists(uni_onto_path):
    # prep_onto receives the unified ontology path and the filtered path; the
    # file at the filtered path is then read back as plain text.
    prep_onto(uni_onto_path, filtered_onto_path)
    # Turtle documents are UTF-8, so the encoding is stated explicitly instead
    # of relying on the platform default. The with-block closes the handle, so
    # no explicit close() is needed.
    with open(filtered_onto_path, 'r', encoding='utf-8') as onto_file:
        initial_ontology = onto_file.read()
else:
    initial_ontology = None

### **Prompts Preparation**

In [61]:
# Instruction text asking for a brand-new Turtle ontology built from one article.
# The string ends with "Here is the news content: " and a newline, so the
# article body is evidently meant to be appended after it.
# NOTE(review): presumably generate_onto uses this when there is no existing
# ontology to extend - confirm in codes/prompt_API.
initial_prompt_text = """I will provide you with a news article, and I want to generate an ontology from it. 
Please extract key concepts, relationships, and categories from the article and structure them into an ontology. 
The ontology should be in a structured format of Turtle (.ttl). 
Here is the news content: 
"""

In [62]:
# Instruction text asking the model to extend an existing ontology with the
# concepts of one more article, without duplicating what is already there.
# NOTE(review): the text refers to "the existing ontology" but does not contain
# it - presumably generate_onto supplies it alongside this prompt; confirm in
# codes/prompt_API.
prompt_text = """I will provide you with a news article, and I want to expand upon an existing ontology. 
Please analyze the new article, extract key concepts, relationships, and categories, and integrate them into the existing ontology while maintaining consistency and avoiding redundancy. 
Ensure that new concepts complement the previous ontology rather than duplicating existing ones.
Only include the instances and properties essential for this specific news. 
Here is the news content: 
"""

### **Prompt to LLM**

In [None]:
# Send the prompts and the selected articles to the LLM helper.
# generate_onto returns three values, unpacked here in the order
# (ontoList, responseList, promptList).
# NOTE(review): going by the names these are the generated ontologies, the raw
# responses and the prompts sent - confirm in codes/prompt_API.
ontoList, responseList, promptList = generate_onto(
    initial_prompt_text=initial_prompt_text,
    prompt_text=prompt_text,
    news_dict=news_dict,
    initial_ontology=initial_ontology,
)

### **Ontology Pipeline**

In [66]:
def getFilenames(input_path, extension=''):
    """Return the names of the files directly inside ``input_path`` that end with ``extension``.

    Args:
        input_path: Directory to list. Sub-directories are not descended into.
        extension: Suffix a file name must end with, e.g. ``'.ttl'``. An empty
            value falls back to ``'.ttl'``, which is what this function always
            filtered on before the parameter was honoured.

    Returns:
        list[str]: Bare file names (no directory part), in the order reported
        by ``os.walk``; ``[]`` when the directory does not exist or contains no
        matching file.
    """
    suffix = extension or '.ttl'
    # The first tuple yielded by walk() describes input_path itself; the default
    # covers the case where walk() yields nothing at all.
    _, _, filenames = next(walk(input_path), (None, None, []))
    return [name for name in filenames if name.endswith(suffix)]

In [None]:
# Working directories of the ontology pipeline. Each value ends with "/"
# because later cells build file paths by plain string concatenation.
input_path = "./1_ontologiesInput/"
connect_output_DIR = "./2_connectOutput/"
merge_output_DIR = "./3_mergeOutput/"
refine_output_DIR = "./4_refineOutput/"  # not referenced in the cells shown here
# NOTE(review): `input` shadows the built-in input(). The name is read by the
# connection loop in the next cell, so a rename has to touch both cells.
input = getFilenames(input_path, '.ttl')

In [80]:
"""
Every non-empty file in {input_path} is updated with a new class
and an individual of that class.
The class is named "CybersecurityNewsArticle" (the value of `news_class`), and the individual is named after the file's name without its extension.
Every other individual in the ontology gets a "relatedTo" object property pointing to that individual.
"""

# Paths of the "<name>_connected.ttl" files written by this cell, in processing
# order; the merge step further down reads this list.
connected = []
news_class = "CybersecurityNewsArticle"

# Ensure the destination directory exists before serializing into it
# (no effect when it is already there).
os.makedirs(connect_output_DIR, exist_ok=True)

for file in input:
    source_path = f"{input_path}{file}"
    if stat(source_path).st_size == 0:  # skip empty files
        continue

    # process_lines gets the same path as source and target, i.e. the input
    # file is rewritten in place before it is parsed.
    process_lines(source_path, source_path)
    g = Graph()
    g.parse(source_path, format="ttl")

    prefix, uri = getNamespace(g)
    # Strip only the final extension: "a.b.ttl" becomes "a.b", not "a", so two
    # inputs that share a first dot-segment do not overwrite each other's output.
    news_name = file.rsplit('.', 1)[0]
    connectIndividuals(g, prefix, uri, news_name, news_class)

    OutputDes = f"{connect_output_DIR}{news_name}_connected.ttl"  # output file for this input
    connected.append(OutputDes)
    clear_serialize(g, OutputDes)

In [81]:
# merge_output_DIR already starts with "./" and ends with "/", so it is joined
# directly; prefixing another "./" produced "././3_mergeOutput/..." and would
# break if the directory were ever given as an absolute path.
merged_file = f"{merge_output_DIR}UnifiedOntology.ttl"

In [82]:
# Add every connected ontology to the merged file, in the order they were
# produced. The loop index was never used, so enumerate() is dropped.
# NOTE(review): if append_file appends to existing content, re-running this cell
# adds the same files again - confirm, or remove merged_file before a re-run.
for connected_file in connected:
    append_file(source_file=connected_file, target_file=merged_file)

In [83]:
# Parse the merged Turtle file into one graph and write it back to the same
# path through clear_serialize.
# NOTE(review): presumably this normalizes the appended text (repeated prefix
# declarations, duplicate triples) - confirm what clear_serialize does.
g = Graph()
g.parse(merged_file, format="ttl")
clear_serialize(g, merged_file)