In [28]:
import requests
import os
import json
import pandas as pd
import re

In [29]:
# Load variables from a local .env file when python-dotenv is available;
# otherwise carry on with whatever is already in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    print("dotenv not installed, skipping...")
else:
    load_dotenv(".env")

In [None]:
# API configuration: the key is read from the APIKEY environment variable
# (TOKEN is None when the variable is not set).
TOKEN = os.getenv("APIKEY")
WEBUI_URL = "https://api.deepseek.com"
MODEL = "deepseek-chat"

In [31]:
def create_news(df, limit=20, output_dir="./news/", encoding="utf-8"):
    """Write the first ``limit`` articles to numbered text files.

    Each item of ``df`` is written to ``news001.txt``, ``news002.txt``, ...
    inside ``output_dir``.

    Args:
        df: Iterable of article texts (each item must be a ``str``).
        limit: Maximum number of articles to write. ``None`` writes every
            article; ``0`` or a negative value writes none.
        output_dir: Destination directory; created if it does not exist.
        encoding: Text encoding used for the written files.

    Returns:
        list[str]: The file names (not full paths) that were written, in order.
    """
    # Without this, open() fails with FileNotFoundError on a fresh checkout
    os.makedirs(output_dir, exist_ok=True)

    news_list = []
    for idx, content in enumerate(df):
        # Check the cap *before* writing so that limit=0 really writes nothing
        if limit is not None and idx >= limit:
            break

        filename = f"news{str(idx + 1).zfill(3)}.txt"
        file_path = os.path.join(output_dir, filename)

        # Explicit encoding: articles contain non-ASCII punctuation that the
        # platform default codec may not be able to encode
        with open(file_path, "w", encoding=encoding) as file:
            file.write(content)

        news_list.append(filename)

    return news_list

In [32]:
def chat(query, timeout=None):
    """Send a single-turn user message to the chat-completions endpoint.

    Args:
        query: Text sent as the content of one ``user`` message.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) sets no timeout, matching the previous
            behaviour.

    Returns:
        dict: The decoded JSON body on success, or ``{'error': <message>}``
        when the request fails, the server answers with an error status, or
        the body is not valid JSON.
    """
    url = f'{WEBUI_URL}/chat/completions'
    headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json'
    }

    payload = {
        'model': MODEL,
        'messages': [{'role': 'user', 'content': query}],
        'stream' : False,
    }

    # Pre-bind so the error branch can tell "no reply at all" (connection
    # error, timeout) apart from "reply with an error status".
    response = None
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return json.loads(response.text)
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers json.JSONDecodeError for a non-JSON body.
        # Compare against None explicitly: a Response with an error status is falsy.
        if response is not None:
            return {'error': f"{str(e)} {response}"}
        return {'error': str(e)}

In [34]:
def extract_turtle(output):
    """Return the contents of the first fenced Turtle block in ``output``.

    A block counts when its opening fence is tagged ``ttl`` or ``turtle``
    (matched case-insensitively, so ``Turtle`` / ``TTL`` are accepted too).

    Args:
        output: Text of the model reply.

    Returns:
        str | None: The block contents with surrounding whitespace stripped,
        or ``None`` when ``output`` is not a string or has no such block.
    """
    # A reply without text content (None) must not crash the regex search
    if not isinstance(output, str):
        return None

    match = re.search(
        r'```(?:ttl|turtle)(.*?)```',
        output,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if match is None:
        return None
    return match.group(1).strip()

In [35]:
def create_newsdict(news_list, news_path="./news/", encoding="utf-8"):
    """Read the listed article files into a ``{file name: text}`` dictionary.

    Args:
        news_list: File names to read, relative to ``news_path``.
        news_path: Directory containing the files; a trailing separator is
            optional.
        encoding: Text encoding used to decode the files.

    Returns:
        dict[str, str]: File name mapped to its content, in ``news_list``
        order. Files that are missing or cannot be read are reported on
        stdout and left out.
    """
    news_dict = {}

    for news in news_list:
        # os.path.join instead of string concatenation, so "./news" and
        # "./news/" both resolve to the same file
        file_path = os.path.join(news_path, news)
        try:
            with open(file_path, "r", encoding=encoding) as f:
                news_dict[news] = f.read()
        except FileNotFoundError:
            print(f"Warning: File {news} not found in path {news_path}. Skipping.")
        except Exception as e:
            print(f"Error reading {news}: {e}")

    return news_dict

In [36]:
def save_ontologies(filename, content, output_dir="./news_onto"):
    """Store one ontology as a UTF-8 text file inside ``output_dir``.

    The target name is ``filename`` with every ``.txt`` occurrence replaced
    by ``.ttl``. ``output_dir`` is created when missing, and the saved path
    is printed afterwards.

    Args:
        filename: Name of the source article file (e.g. ``news001.txt``).
        content: Ontology text to write.
        output_dir: Directory that receives the ``.ttl`` file.
    """
    os.makedirs(output_dir, exist_ok=True)

    target_path = os.path.join(output_dir, filename.replace(".txt", ".ttl"))
    with open(target_path, "w+", encoding="utf-8") as handle:
        handle.write(content)

    print(f"Saved: {target_path}")

In [78]:
def _build_expansion_prompt(prompt_text, news_content, ontology):
    """Assemble the prompt asking the model to extend ``ontology`` with one article."""
    return f"""
{prompt_text}\n
{news_content}\n
Here is the ontology:\n
```ttl
{ontology}
```
"""


def generate_onto(initial_prompt_text, prompt_text, news_dict, initial_ontology=None):
    """Build an ontology incrementally, one news article at a time.

    The first article is sent with ``initial_prompt_text`` (or, when
    ``initial_ontology`` is given, with ``prompt_text`` plus that ontology).
    Every later article is sent with ``prompt_text`` plus the most recent
    ontology that was successfully extracted. Each extracted ontology is
    also written to disk through ``save_ontologies``.

    Args:
        initial_prompt_text: Prompt prefix used while no ontology exists yet.
        prompt_text: Prompt prefix used to expand an existing ontology.
        news_dict: Mapping of news file name to article text, processed in
            iteration order.
        initial_ontology: Optional Turtle text to start from.

    Returns:
        tuple[dict, list, list]: ``(ontoList, responseList, promptList)``,
        i.e. extracted Turtle keyed by news file name, the raw responses, and
        the prompts sent. All three are empty when a required argument is
        empty.
    """
    ontoList = {}       # {'newsXYZ' : 'RDF_ONTOLOGY'}
    responseList = []   # Saving response for Debugging
    promptList = []     # Saving prompt for Debugging

    # Always return the same 3-tuple shape so callers can unpack the result
    if not (initial_prompt_text and prompt_text and news_dict):
        return ontoList, responseList, promptList

    # Most recent usable ontology. Binding it up front means a failed
    # extraction on the first article can no longer leave it undefined.
    previous_ttl_content = initial_ontology

    for news, news_content in news_dict.items():
        if previous_ttl_content:
            prompt = _build_expansion_prompt(prompt_text, news_content, previous_ttl_content)
        else:
            # Nothing to expand yet: ask for a fresh ontology
            prompt = initial_prompt_text + news_content

        # Send Prompt to LLM
        response = chat(query=prompt)

        # Debug variables
        promptList.append(prompt)
        responseList.append(response)

        # chat() returns {'error': ...} on failure, which has no 'choices'
        try:
            message_text = response['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            print(f"Warning: Unexpected response for {news}: {response}")
            continue

        # Get the generated ontology (content may be absent / non-text)
        ttl_content = extract_turtle(message_text) if isinstance(message_text, str) else None

        if ttl_content:
            ontoList[news] = ttl_content
            save_ontologies(news, ttl_content)
            previous_ttl_content = ttl_content
        else:
            print(f"Warning: No ontology extracted for {news}")

    return ontoList, responseList, promptList

In [79]:
# Keep only the article text column, so `df` is a Series rather than a DataFrame
df = (
    pd.read_excel("./TheHackerNews_Dataset.xlsx")
    .loc[:, 'Article']
)
df.shape

(3742,)

In [80]:
%%capture
# Dump the first articles (create_news defaults: limit=20, output_dir="./news/")
# to text files; %%capture hides the returned list of file names.
create_news(df)

In [81]:
# Prompt prefix used while no ontology exists yet; the article text is appended directly after it.
initial_prompt_text = """I will provide you with a news article, and I want to generate an ontology from it. 
Please extract key concepts, relationships, and categories from the article and structure them into an ontology. 
The ontology should be in a structured format of Turtle (.ttl). 
Here is the news content: 
"""

In [82]:
# Prompt prefix for expanding an existing ontology; generate_onto follows it with the article and the current ontology.
prompt_text = """I will provide you with a news article, and I want to expand upon an existing ontology. 
Please analyze the new article, extract key concepts, relationships, and categories, and integrate them into the existing ontology while maintaining consistency and avoiding redundancy. 
Ensure that new concepts complement the previous ontology rather than duplicating existing ones.
Only include the instances and properties essential for this specific news. 
Here is the news content: 
"""

In [83]:
# Directory holding the article .txt files (same as the default output_dir of create_news)
news_path = "./news/"

In [84]:
# Collect the article file names from news_path in sorted order
news_list = sorted(
    entry for entry in os.listdir(news_path) if entry.endswith(".txt")
)

In [85]:
# Load every listed article into memory as {file name: article text}
news_dict = create_newsdict(news_list=news_list, news_path=news_path)

In [86]:
# Three-article subset (first three entries of news_dict) for a small trial run
news_dict_test = {name: news_dict[name] for name in list(news_dict)[:3]}
news_dict_test

{'news001.txt': 'Cloud infrastructure security company Wiz on Thursday revealed details of a now-fixed Azure Cosmos database vulnerability that could have been potentially exploited to grant any Azure user full admin access to other customers\' database instances without any authorization.\nThe flaw, which grants read, write, and delete privileges, has been dubbed "ChaosDB," with Wiz researchers noting that "the vulnerability has a trivial exploit that doesn\'t require any previous access to the target environment, and impacts thousands of organizations, including numerous Fortune 500 companies."\nCosmos DB is Microsoft\'s proprietary NoSQL database that\'s advertised as "a fully managed service" that "takes database administration off your hands with automatic management, updates and patching."\nThe Wiz Research Team reported the issue to Microsoft on August 12, after which the Windows maker took steps to mitigate the issue within 48 hours of responsible disclosure, in addition to awa

In [87]:
# Run the incremental ontology generation on the trial subset only
ontoList, responseList, promptList = generate_onto(
    initial_prompt_text=initial_prompt_text,
    prompt_text=prompt_text,
    news_dict=news_dict_test,
)

Saved: ./news_onto/news001.ttl
Saved: ./news_onto/news002.ttl
Saved: ./news_onto/news003.ttl


In [91]:
# Extracted Turtle ontologies, keyed by news file name
ontoList

{'news001.txt': '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n@prefix ex: <http://example.org/ontology#> .\n\n# Classes\nex:Vulnerability a rdfs:Class .\nex:Company a rdfs:Class .\nex:Database a rdfs:Class .\nex:SecurityFlaw a rdfs:Class .\nex:Feature a rdfs:Class .\nex:Researcher a rdfs:Class .\nex:Customer a rdfs:Class .\nex:Key a rdfs:Class .\nex:Statement a rdfs:Class .\nex:Notification a rdfs:Class .\n\n# Instances\nex:Wiz a ex:Company ;\n    rdfs:label "Wiz" .\n\nex:Microsoft a ex:Company ;\n    rdfs:label "Microsoft" .\n\nex:AzureCosmosDB a ex:Database ;\n    rdfs:label "Azure Cosmos DB" ;\n    ex:hasFeature ex:JupyterNotebook .\n\nex:JupyterNotebook a ex:Feature ;\n    rdfs:label "Jupyter Notebook" .\n\nex:ChaosDB a ex:Vulnerability ;\n    rdfs:label "ChaosDB" ;\n    ex:discoveredBy ex:WizResearchTeam ;\n 

In [92]:
# Raw responses returned by chat(), kept for debugging (one per prompt sent)
responseList

[{'id': '23a7e593-7380-4339-acf5-963a2d1337de',
  'object': 'chat.completion',
  'created': 1741672887,
  'model': 'deepseek-chat',
  'choices': [{'index': 0,
    'message': {'role': 'assistant',
     'content': 'Here is the ontology in Turtle (`.ttl`) format, structured based on the key concepts, relationships, and categories extracted from the news article:\n\n```turtle\n@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n@prefix ex: <http://example.org/ontology#> .\n\n# Classes\nex:Vulnerability a rdfs:Class .\nex:Company a rdfs:Class .\nex:Database a rdfs:Class .\nex:SecurityFlaw a rdfs:Class .\nex:Feature a rdfs:Class .\nex:Researcher a rdfs:Class .\nex:Customer a rdfs:Class .\nex:Key a rdfs:Class .\nex:Statement a rdfs:Class .\nex:Notification a rdfs:Class .\n\n# Instances\nex:Wiz a ex:Company ;\n    rdfs:label "Wi

In [93]:
# Prompts sent to the model, in the order they were issued
promptList

['I will provide you with a news article, and I want to generate an ontology from it. \nPlease extract key concepts, relationships, and categories from the article and structure them into an ontology. \nThe ontology should be in a structured format of Turtle (.ttl). \nHere is the news content: \nCloud infrastructure security company Wiz on Thursday revealed details of a now-fixed Azure Cosmos database vulnerability that could have been potentially exploited to grant any Azure user full admin access to other customers\' database instances without any authorization.\nThe flaw, which grants read, write, and delete privileges, has been dubbed "ChaosDB," with Wiz researchers noting that "the vulnerability has a trivial exploit that doesn\'t require any previous access to the target environment, and impacts thousands of organizations, including numerous Fortune 500 companies."\nCosmos DB is Microsoft\'s proprietary NoSQL database that\'s advertised as "a fully managed service" that "takes da