In [None]:
import requests
import os
import json
import pandas as pd
import re
import import_ipynb
from prep_onto import prep_onto

In [None]:
# Optionally load environment variables from a local ".env" file.
# python-dotenv is treated as an optional dependency: if it cannot be imported
# the notebook only prints a notice and relies on the variables already present
# in the process environment (a later cell reads APIKEY from os.environ).
try:
    from dotenv import load_dotenv

    load_dotenv(".env")
except ImportError:
    print("dotenv not installed, skipping...")

In [None]:
# --- LLM endpoint configuration -------------------------------------------
# Model name and base URL of the chat-completions API used by chat().
MODEL = "deepseek-chat"
WEBUI_URL = "https://api.deepseek.com"
# Bearer token taken from the environment; None when APIKEY is not set.
TOKEN = os.getenv("APIKEY")

In [None]:
def create_news(df, limit=20, output_dir="./news/"):
    """Write the first ``limit`` articles to numbered text files.

    Each article is written to ``<output_dir>/newsNNN.txt`` where ``NNN`` is the
    1-based position of the article, zero-padded to three digits. Files are
    written with the platform's default text encoding.

    Args:
        df: Iterable of article texts (for example a pandas Series of strings).
        limit: Maximum number of articles to write. ``None`` writes every
            article; ``0`` or a negative value writes none.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        list[str]: Names (not full paths) of the files written, in order.
    """
    # On a fresh checkout the directory does not exist yet and open() would
    # raise FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    news_list = []
    for idx, content in enumerate(df):
        # Test the cap *before* writing so that limit=0 yields no files
        # instead of silently writing the whole dataset.
        if limit is not None and idx >= limit:
            break

        filename = f"news{str(idx + 1).zfill(3)}.txt"
        file_path = os.path.join(output_dir, filename)

        with open(file_path, 'w') as file:
            file.write(content)

        news_list.append(filename)

    return news_list

In [None]:
def chat(query):
    """Send a single-turn user message to the chat-completions endpoint.

    The request goes to ``{WEBUI_URL}/chat/completions`` with ``TOKEN`` as the
    bearer token and ``MODEL`` as the model name, with streaming disabled.

    Args:
        query: Text sent as the content of the only ``user`` message.

    Returns:
        dict: The decoded JSON response body on success, or
        ``{'error': <message>}`` when the request fails, the server answers
        with an HTTP error status, or the body is not valid JSON.
    """
    url = f'{WEBUI_URL}/chat/completions'
    headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json'
    }

    payload = {
        'model': MODEL,
        'messages': [{'role': 'user', 'content': query}],
        'stream' : False,
    }

    # Bound before the try block: requests.post() itself may raise (for example
    # on a connection failure), in which case there is no response object and
    # referencing it in the handler would raise UnboundLocalError.
    response = None
    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return json.loads(response.text)
    except requests.exceptions.RequestException as e:
        detail = f"{str(e)} {response}" if response is not None else str(e)
        return {'error': detail}
    except ValueError as e:
        # json.JSONDecodeError is a ValueError: the server replied with a
        # successful status but a body that is not JSON.
        return {'error': f"Invalid JSON in response: {e}"}

In [None]:
def extract_turtle(output):
    """Pull the Turtle source out of a Markdown-style fenced code block.

    Only the first fence opened with the language tag ``ttl`` or ``turtle`` is
    considered; the tag match is case-sensitive.

    Args:
        output: Text to search (typically the LLM's reply).

    Returns:
        str | None: The fenced content with surrounding whitespace stripped,
        or ``None`` when no such fence is present.
    """
    fence = re.search(r'```(?:ttl|turtle)(.*?)```', output, flags=re.DOTALL)
    return None if fence is None else fence.group(1).strip()

In [None]:
def create_newsdict(news_list, news_path="./news/"):
    """Read article files into a dictionary keyed by file name.

    Files are read with the platform's default text encoding. A file that is
    missing or cannot be read is reported on stdout and skipped rather than
    aborting the whole load.

    Args:
        news_list: Iterable of file names located inside ``news_path``.
        news_path: Directory containing the files, with or without a trailing
            path separator.

    Returns:
        dict[str, str]: Mapping of file name to file content for every file
        that was read successfully, in ``news_list`` order.
    """
    news_dict = {}

    for news in news_list:
        # os.path.join inserts the separator when needed; plain string
        # concatenation produced "<dir><name>" for a path without a trailing
        # slash, so every file was reported as missing.
        file_path = os.path.join(news_path, news)
        try:
            with open(file_path, "r") as f:
                news_dict[news] = f.read()
        except FileNotFoundError:
            print(f"Warning: File {news} not found in path {news_path}. Skipping.")
        except Exception as e:
            print(f"Error reading {news}: {e}")

    return news_dict

In [None]:
def save_ontologies(filename, content, output_dir="./news_onto"):
    """Write one generated ontology to ``output_dir`` as a UTF-8 text file.

    The target name is ``filename`` with a trailing ``.txt`` extension swapped
    for ``.ttl``; a name that does not end in ``.txt`` is used unchanged. The
    path of the written file is printed.

    Args:
        filename: Name of the source article file (e.g. ``news001.txt``).
        content: Ontology text to write.
        output_dir: Destination directory; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Replace only the extension. str.replace() would also rewrite ".txt"
    # occurring in the middle of the name (e.g. "a.txt.b.txt").
    if filename.endswith(".txt"):
        new_filename = filename[:-len(".txt")] + ".ttl"
    else:
        new_filename = filename
    file_path = os.path.join(output_dir, new_filename)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Saved: {file_path}")

In [None]:
def _build_expansion_prompt(prompt_text, news_content, ontology):
    """Compose the prompt asking the LLM to extend ``ontology`` with one article."""
    return f"""
{prompt_text}\n
{news_content}\n
Here is the ontology:\n
```ttl
{ontology}
```
"""


def generate_onto(initial_prompt_text, prompt_text, news_dict, initial_ontology=None):
    """Build an ontology incrementally, one LLM request per article.

    For each article the most recent ontology (the seed ``initial_ontology`` or
    the last successfully extracted result) is sent along with ``prompt_text``
    so the model can extend it. While no ontology is available yet, the article
    is sent with ``initial_prompt_text`` instead. Every extracted ontology is
    saved through ``save_ontologies``.

    An article whose request fails or whose reply contains no Turtle block is
    reported on stdout and skipped; the next article continues from the last
    ontology that was obtained.

    Args:
        initial_prompt_text: Prompt prefix used when there is no ontology yet.
        prompt_text: Prompt prefix used to extend an existing ontology.
        news_dict: Mapping of article file name to article text.
        initial_ontology: Optional Turtle text used as the starting ontology.

    Returns:
        tuple[dict, list, list]: ``(ontoList, responseList, promptList)`` where
        ``ontoList`` maps file name to extracted Turtle, and the two lists hold
        the raw responses and the prompts in request order (for debugging).
        All three are empty when a required argument is empty.
    """
    if not (initial_prompt_text and prompt_text and news_dict):
        # Same 3-tuple shape as the normal return so callers can always unpack.
        return {}, [], []

    ontoList = {}       # {'newsXYZ' : 'RDF_ONTOLOGY'}
    responseList = []   # Saving response for Debugging
    promptList = []     # Saving prompt for Debugging

    # Ontology handed to the next request. Tracking it in one variable (rather
    # than a first-iteration flag) keeps it defined even when an extraction
    # fails before any ontology has been produced.
    current_ontology = initial_ontology

    for news, news_content in news_dict.items():
        if current_ontology:
            prompt = _build_expansion_prompt(prompt_text, news_content, current_ontology)
        else:
            prompt = initial_prompt_text + news_content

        # Send Prompt to LLM
        response = chat(query=prompt)

        # Debug variables
        promptList.append(prompt)
        responseList.append(response)

        # chat() returns {'error': ...} on failure, so the completion keys may
        # be absent; skip this article instead of aborting the whole batch.
        try:
            answer = response['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            print(f"Warning: LLM request failed for {news}: {response}")
            continue

        # Get the generated ontology
        ttl_content = extract_turtle(answer) if isinstance(answer, str) else None
        if ttl_content:
            ontoList[news] = ttl_content
            save_ontologies(news, ttl_content)
            current_ontology = ttl_content
        else:
            print(f"Warning: No ontology extracted for {news}")

    return ontoList, responseList, promptList

In [None]:
# Load the dataset and keep only its 'Article' column, so `df` holds that single
# column rather than the whole sheet (presumably one article text per row —
# verify against the spreadsheet). The bare `df.shape` displays its size.
df = pd.read_excel("./TheHackerNews_Dataset.xlsx")['Article']
df.shape

In [None]:
%%capture
# Export articles to ./news/ as newsNNN.txt using create_news's defaults
# (limit=20). %%capture silences the cell output, i.e. the returned name list.
create_news(df)

In [None]:
# Prompt used when no ontology exists yet: generate_onto appends the article
# text directly after the final line of this string.
initial_prompt_text = """I will provide you with a news article, and I want to generate an ontology from it. 
Please extract key concepts, relationships, and categories from the article and structure them into an ontology. 
The ontology should be in a structured format of Turtle (.ttl). 
Here is the news content: 
"""

In [None]:
# Prompt used to extend an existing ontology: generate_onto follows it with the
# article text and then the current ontology inside a ```ttl fenced block.
prompt_text = """I will provide you with a news article, and I want to expand upon an existing ontology. 
Please analyze the new article, extract key concepts, relationships, and categories, and integrate them into the existing ontology while maintaining consistency and avoiding redundancy. 
Ensure that new concepts complement the previous ontology rather than duplicating existing ones.
Only include the instances and properties essential for this specific news. 
Here is the news content: 
"""

In [None]:
# Directory scanned for article .txt files; same value as create_news's default
# output_dir.
news_path = "./news/"

In [None]:
# Names of every .txt file found in news_path, in ascending name order.
news_list = sorted(
    entry for entry in os.listdir(news_path) if entry.endswith(".txt")
)

In [None]:
# Load only the 6th through 10th files of the sorted list (slice 5:10).
# NOTE(review): the 5:10 window looks like a manually chosen batch — confirm
# and adjust the slice to process a different subset.
news_dict = create_newsdict(news_list=news_list[5:10], news_path=news_path)

In [None]:
# Inspect the loaded articles (file name -> article text).
news_dict

In [None]:
# Optional seed ontology: when the file exists its text becomes the starting
# ontology for generate_onto; otherwise None is used and generation starts
# from the first article alone.
ini_onto_path = "./uni_onto/RefinedUnifiedOntology.ttl"
if os.path.exists(ini_onto_path):
    # Turtle documents are UTF-8 encoded, and save_ontologies writes .ttl files
    # as UTF-8, so read with the same encoding instead of the platform default.
    # The with-statement closes the file; no explicit close() is needed.
    with open(ini_onto_path, 'r', encoding="utf-8") as file:
        initial_ontology = file.read()
else:
    initial_ontology = None

In [None]:
# Run the incremental generation over the selected articles. This issues one
# chat() request per article and writes each extracted ontology to disk via
# save_ontologies; the prompts and raw responses are kept for inspection below.
ontoList, responseList, promptList = generate_onto(initial_prompt_text=initial_prompt_text, 
                                                   prompt_text=prompt_text, 
                                                   news_dict=news_dict,
                                                   initial_ontology=initial_ontology)

In [None]:
# Extracted ontologies keyed by article file name.
ontoList

In [None]:
# Raw chat() return values in request order, kept for debugging.
responseList

In [None]:
# Prompts sent to the LLM in request order, kept for debugging.
promptList