In [None]:
import import_ipynb
import os
from shutil import copy
import pandas as pd
from rdflib import Graph

In [None]:
# Absolute path of the pipeline root, resolved relative to the directory the
# kernel was started from.
project_dir = os.path.abspath("../../deepseek_pipeline")

In [None]:
# Switch into the project root: the relative paths used throughout the notebook
# are resolved against the current working directory.
# A missing directory is only reported, not raised, so execution continues.
try:
    os.chdir(project_dir)
    print("Changed working directory to:", os.getcwd())
except FileNotFoundError:
    print(f"Error: The directory {project_dir} does not exist.")

In [None]:
from codes.prompt_API import create_news, create_newsdict, chat, extract_turtle, save_ontologies
from UnifiedOntologyPipeline.pipeline.appendOnto import append_file, process_lines
from UnifiedOntologyPipeline.pipeline.connectIndividuals import getNamespace, connectIndividuals, clear_serialize

### **Global Variables**

In [None]:
# Populate environment variables from ./codes/.env when python-dotenv is
# available; otherwise fall back to whatever the process environment holds.
try:
    from dotenv import load_dotenv
    load_dotenv("./codes/.env")
except ImportError:
    print("dotenv not installed, skipping...")

In [None]:
# LLM endpoint configuration. The API key is read from the environment so it is
# never hard-coded in the notebook.
TOKEN = os.environ.get("APIKEY")
if not TOKEN:
    print("Warning: APIKEY is not set in the environment.")
URL = "https://api.deepseek.com"
MODEL = "deepseek-chat"

# Slice bounds for selecting news files. When NEWS_START / NEWS_END are unset or
# empty, fall back to 1 and 5 (the same bounds as the hard-coded news_list[1:5]
# selection) instead of failing inside int().
START_IDX = int(os.environ.get("NEWS_START") or 1)
END_IDX = int(os.environ.get("NEWS_END") or 5)

### **News Preparation**

In [None]:
# Read the dataset spreadsheet and keep only its 'Article' column.
news_path = "./codes/TheHackerNews_Dataset.xlsx"
df = pd.read_excel(news_path)['Article']
# Displayed as the cell output for a quick sanity check.
df.shape

In [None]:
%%capture
# NOTE: news_path is rebound here from the spreadsheet path to the directory
# that create_news receives as output_dir.
news_path = "./0_newsInput/"
create_news(df, output_dir=news_path)

In [None]:
# Alphabetically ordered names of the .txt files found in news_path.
news_list = sorted(
    entry for entry in os.listdir(news_path) if entry.endswith(".txt")
)

In [None]:
# Hard-coded selection: entries at positions 1-4 of the sorted list
# (position 0 is skipped, position 5 is excluded).
chosen_news_list = news_list[1:5]

In [None]:
# chosen_news_list = news_list[START_IDX:END_IDX]

In [None]:
# Build the collection of selected news; it is iterated below with .items()
# as (news name, news content) pairs.
news_dict = create_newsdict(news_list=chosen_news_list, news_path=news_path)
# NOTE(review): this displays the whole structure as cell output.
news_dict

### **Prompts Preparation**

In [None]:
# Instruction text placed in front of every article when prompting the LLM.
# It is sent verbatim, so any edit to the wording changes the prompt.
initial_prompt_text = """I will provide you with a news article, and I want to generate an ontology from it. 
Please extract key concepts, relationships, and categories from the article and structure them into an ontology. 
The ontology should be in a structured format of Turtle (.ttl). Use "@prefix ex: <http://example.org/ontology#>". 
Here is the news content: 
"""

### **Prompt to LLM**

In [None]:
# Directory where the Turtle ontologies extracted from LLM responses are saved
# and from which the merge pipeline later reads them.
input_onto_DIR = "./baseline/1_ontologiesInputBL/"

In [None]:
# Results of the prompting stage.
ontoDict = {}       # news name -> extracted Turtle text (successful runs only)
failedNews = {}     # news name -> reason no ontology was produced
# Debug variables
promptList = []
responseList = []

for news, news_content in news_dict.items():
    prompt = f"""
    {initial_prompt_text}
    {news_content}
    """
    # Record the prompt before calling the LLM so that a failed request can
    # still be traced back to what was sent.
    promptList.append(prompt)

    try:
        # Send Prompt to LLM
        response = chat(prompt, URL, TOKEN, MODEL)
        responseList.append(response)

        ttl_content = extract_turtle(response['choices'][0]['message']['content'])
        if ttl_content:
            ontoDict[news] = ttl_content
            save_ontologies(news, ttl_content, output_dir=input_onto_DIR)
        else:
            # Previously an empty extraction was dropped without any trace.
            failedNews[news] = "no Turtle content extracted from the response"
            print(f"Warning: no Turtle content extracted for {news}")
    except Exception as e:
        # Keep going with the remaining news (best effort), but report the
        # failure and keep error text out of ontoDict so it only ever holds
        # ontologies.
        failedNews[news] = str(e)
        print(f"Warning: failed to generate ontology for {news}: {e}")

### **Ontology Merge Pipeline**

In [None]:
def getFilenames(input_path, extension=''):
    """Return the names of the files directly inside ``input_path``.

    Only the top level of the directory is inspected; sub-directories are
    neither listed nor descended into.

    Args:
        input_path: Directory to list.
        extension: Suffix a file name must end with to be returned (for
            example ``'.ttl'``). The default empty string matches every file.

    Returns:
        A list of file names without the directory part. The list is empty
        when ``input_path`` does not exist or contains no matching file.
    """
    # next(..., default) supplies an empty file list when os.walk yields nothing.
    _, _, filenames = next(os.walk(input_path), (None, None, []))
    # Honour the requested suffix instead of a hard-coded '.ttl'.
    return [f for f in filenames if f.endswith(extension)]

In [None]:
# Output directories of the baseline pipeline stages.
connect_output_DIR = "./baseline/2_connectOutputBL/"
merge_output_DIR = "./baseline/3_mergeOutputBL/"
# NOTE(review): refine_output_DIR is not referenced again in this notebook; the
# refined output path is written out literally further down.
refine_output_DIR = "./baseline/4_refineOutputBL/"
# NOTE(review): this name shadows the built-in input() for the rest of the session.
input = getFilenames(input_onto_DIR, '.ttl')

In [None]:
"""
For every file in `input_onto_DIR`, a connected copy is produced that contains a new class
and an individual of that class.
The class is named by `news_class` ("CybersecurityNewsArticle"), and the individual is named after the file.
Every other individual in the ontology gets a "relatedTo" object property pointing to that individual.
"""

connected = []  # paths of the connected ontologies, in processing order
news_class = "CybersecurityNewsArticle"

for file in input:
    source_path = os.path.join(input_onto_DIR, file)
    if os.stat(source_path).st_size == 0:  # skip empty files
        continue

    # process_lines rewrites the file in place (same source and target path)
    # before it is parsed.
    process_lines(source_path, source_path)
    g = Graph()
    g.parse(source_path, format="ttl")

    prefix, uri = getNamespace(g)
    # Strip only the final extension: split('.')[0] would cut a dotted name
    # such as "news.v2.ttl" down to "news" and make outputs collide.
    news_name = os.path.splitext(file)[0]
    connectIndividuals(g, prefix, uri, news_name, news_class)

    OutputDes = f"{connect_output_DIR}{news_name}_connected.ttl"  # Get new files' names
    connected.append(OutputDes)
    clear_serialize(g, OutputDes)

In [None]:
# Target file of the merge stage. merge_output_DIR already starts with "./", so
# the resulting string begins with "././"; it still points inside merge_output_DIR.
merged_file = f"./{merge_output_DIR}UnifiedOntology.ttl"

In [None]:
# Append every connected ontology to the unified ontology file, in the order
# they were produced. The enumerate index was never used, so it is dropped.
for connected_path in connected:
    append_file(source_file=connected_path, target_file=merged_file)

In [None]:
# Parse the merged Turtle file with rdflib and write it back to the same path
# through clear_serialize.
g = Graph()
g.parse(merged_file, format="ttl")
clear_serialize(g, merged_file)

### **Ontology Refinement Pipeline**

In [None]:
from UnifiedOntologyPipeline.tool.extractClass import *

In [None]:
%%capture
# Provided by the star import from UnifiedOntologyPipeline.tool.extractClass;
# presumably authenticates against Hugging Face so the model can be fetched - TODO confirm.
loginHuggingFace()

In [None]:
# Refine the ontology this notebook has just merged. The former literal
# "./3_mergeOutput/UnifiedOntology.ttl" pointed outside the baseline output
# tree, so a different file than the one written by the merge stage was refined.
path_to_onto = merged_file
# NOTE(review): df is rebound here; it no longer refers to the news articles.
df = getOntoClasses(path_to_onto)

In [None]:
# Identifiers handed to getClassifier: tokenizer name, model name, and pipeline task.
name_tokenizer = "bert-base-uncased"
name_model = "OhWayTee/bert-taxonomy"
name_pipeline = "text-classification"

classifier = getClassifier(name_tokenizer, name_model, name_pipeline)

In [None]:
# Run the classifier over the ontology classes extracted above.
df_pred = getPredictions(classifier, df)

In [None]:
# The result is filtered below on its 'label' and 'score' columns.
df_top = sortPredictions(df_pred)

In [None]:
# Minimum prediction score a class pair needs in order to be kept as a
# subclass relation.
confidence_threshold = 0.9

In [None]:
# Rows whose prediction score reaches the configured threshold.
is_confident = df_top['score'] >= confidence_threshold

# LABEL_1: classB is SUBCLASS of classA
df_superclass = df_top[(df_top['label'] == "LABEL_1") & is_confident]
# LABEL_2: classA is SUBCLASS of classB
df_subclass = df_top[(df_top['label'] == "LABEL_2") & is_confident]

[df_superclass.shape, df_subclass.shape]

In [None]:
# Keep only the highest-scoring prediction for each subclass candidate, so no
# subclass appears twice within one frame.
df_superclass_filtered = (
    df_superclass
    .sort_values(by='score', ascending=False)
    .drop_duplicates(subset='classB')  # classB is the subclass in this frame
)
df_subclass_filtered = (
    df_subclass
    .sort_values(by='score', ascending=False)
    .drop_duplicates(subset='classA')  # classA is the subclass in this frame
)
[df_superclass_filtered.shape, df_subclass_filtered.shape]

In [None]:
# Load the ontology to refine into a fresh graph (g is rebound here) and read
# the prefix/URI pair that the SPARQL updates below declare.
g = Graph()
g.parse(path_to_onto, format='turtle')
prefix, uri = getNamespace(g)
print(prefix, uri)

In [None]:
# LABEL_1 rows: classB is the subclass, classA the superclass.
for _, row in df_superclass_filtered.iterrows():
    # The labels are spliced into double-quoted SPARQL literals, so backslashes,
    # quotes and line breaks must be escaped or the update fails to parse.
    sub_label, super_label = (
        str(row[col])
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
        for col in ('classB', 'classA')
    )
    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX {prefix}: <{uri}>
    INSERT{{
        ?subclass rdfs:subClassOf ?superclass .
    }}
    WHERE {{
        ?subclass rdfs:label "{sub_label}" .
        ?superclass rdfs:label "{super_label}" .
    }}
    """
    g.update(query)

In [None]:
# LABEL_2 rows: classA is the subclass, classB the superclass.
for _, row in df_subclass_filtered.iterrows():
    # The labels are spliced into double-quoted SPARQL literals, so backslashes,
    # quotes and line breaks must be escaped or the update fails to parse.
    sub_label, super_label = (
        str(row[col])
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
        for col in ('classA', 'classB')
    )
    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX {prefix}: <{uri}>
    INSERT{{
        ?subclass rdfs:subClassOf ?superclass .
    }}
    WHERE {{
        ?subclass rdfs:label "{sub_label}" .
        ?superclass rdfs:label "{super_label}" .
    }}
    """
    g.update(query)

In [None]:
# Build the path from refine_output_DIR rather than repeating the directory
# literal, so the refine stage follows the same configuration as the others.
# The resulting value is unchanged.
refine_onto_path = f"{refine_output_DIR}RefinedUnifiedOntology.ttl"

In [None]:
# Write the graph, including the inserted rdfs:subClassOf triples, to the
# refined ontology file.
clear_serialize(g, refine_onto_path)