In [None]:
import requests
import os
import json
import pandas as pd
import re

In [None]:
# Load environment variables (such as the APIKEY read further down) from a
# local .env file when python-dotenv is available; otherwise fall back to
# whatever is already set in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    print("dotenv not installed, skipping...")
else:
    # Kept out of the try body so that only a missing package is reported as
    # "not installed"; an error raised while loading is no longer masked.
    load_dotenv(".env")

In [None]:
# Connection settings shared by every API helper in this notebook.
WEBUI_URL = "http://localhost:8080"  # base URL that each helper prefixes to its endpoint path
TOKEN = os.getenv("APIKEY")  # bearer token; None when APIKEY is not set in the environment
MODEL = "deepseek-r1:1.5b"  # model name sent with each chat request

# Identifiers kept for manual use; COLLECTION_ID appears only in the
# commented-out add_file_to_knowledge cell, TEST_ID in no visible cell.
COLLECTION_ID = "3e0b8bde-f5f9-4fb6-97f0-99ef34565d56"
TEST_ID = "cfb8ddce-cba9-4bd4-ad47-60107c85a80c"

In [None]:
# Read the spreadsheet, then keep only the 'Article' column as `df`.
hackernews_raw = pd.read_excel("./TheHackerNews_Dataset.xlsx")
df = hackernews_raw['Article']
df.shape

In [None]:
def create_news(df, limit=20):
    """Build the file names ``news1.txt``, ``news2.txt``, ... for the items of ``df``.

    Only the number of items matters; their values are not read.

    Args:
        df: Any iterable (here the 'Article' column); one name is produced
            per item, in order.
        limit: Maximum number of names to produce. ``0`` or a negative value
            yields an empty list; ``None`` removes the cap.

    Returns:
        list[str]: At most ``limit`` names of the form ``news<N>.txt``,
        numbered from 1.
    """
    news_list = []
    for position, _ in enumerate(df, start=1):
        # Check the cap before appending so limit <= 0 returns nothing
        # instead of silently returning every name.
        if limit is not None and position > limit:
            break
        news_list.append(f"news{position}.txt")
    return news_list

In [None]:
def chat(query, file_id=None, collection_id=None, timeout=None):
    """Send a single-turn, non-streaming chat completion request.

    Args:
        query: Text sent as the content of the only (user) message.
        file_id: Optional id, or iterable of ids, attached as ``file`` entries.
        collection_id: Optional id, or iterable of ids, attached as
            ``collection`` entries.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) keeps the previous behaviour of waiting
            without a limit.

    Returns:
        dict: The decoded JSON response, or ``{'error': <message>}`` when the
        request fails, the server answers with an error status, or the body
        is not valid JSON.
    """
    def _as_id_list(ids):
        # A bare string would otherwise be iterated character by character,
        # producing one bogus entry per character.
        return [ids] if isinstance(ids, str) else list(ids)

    url = f'{WEBUI_URL}/api/chat/completions'
    headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json'
    }

    files = []
    if file_id:
        files.extend({'type': 'file', 'id': item_id} for item_id in _as_id_list(file_id))
    if collection_id:
        files.extend({'type': 'collection', 'id': item_id} for item_id in _as_id_list(collection_id))

    payload = {
        'model': MODEL,
        'messages': [{'role': 'user', 'content': query}],
        'stream': False,
    }
    # Only send the key when something is attached.
    if files:
        payload['files'] = files

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return json.loads(response.text)
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON, so callers always
        # receive a dict rather than an exception.
        return {'error': str(e)}

In [None]:
# Smoke test: send a plain prompt with no files or collections attached.
res = chat("hello")

In [None]:
# Show the decoded response (or the {'error': ...} dict chat returns on failure).
res

In [None]:
def upload_file(file_path):
    """POST one local file to the ``/api/v1/files/`` endpoint.

    Args:
        file_path: Path of the file to send; it is opened in binary mode.

    Returns:
        The response body decoded from JSON.
    """
    endpoint = f'{WEBUI_URL}/api/v1/files/'
    auth_headers = dict(Authorization=f'Bearer {TOKEN}', Accept='application/json')
    with open(file_path, 'rb') as handle:
        # The request is issued inside the ``with`` so the handle is still
        # open while the body is being read.
        reply = requests.post(endpoint, headers=auth_headers, files={'file': handle})
    return json.loads(reply.text)

In [None]:
def get_uploaded_files():
    """Fetch the file listing and map each filename to its id.

    Returns:
        dict: ``filename`` -> ``id`` for every entry of the response. When
        several entries share a filename, the last one wins.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
    """
    url = f'{WEBUI_URL}/api/v1/files/'
    headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Accept': 'application/json'
    }
    response = requests.get(url, headers=headers)
    # Fail here on an error status: an error payload is not a list of file
    # records and would otherwise break the comprehension below with an
    # unrelated TypeError/KeyError.
    response.raise_for_status()
    return {file["filename"]: file["id"] for file in json.loads(response.text)}

In [None]:
def add_file_to_knowledge(knowledge_id, file_id):
    """POST ``file_id`` to the ``file/add`` endpoint of one knowledge entry.

    Args:
        knowledge_id: Identifier interpolated into the endpoint path.
        file_id: Identifier sent in the JSON body under ``file_id``.

    Returns:
        The response body decoded from JSON.
    """
    endpoint = f'{WEBUI_URL}/api/v1/knowledge/{knowledge_id}/file/add'
    request_headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json',
    }
    reply = requests.post(endpoint, headers=request_headers, json={'file_id': file_id})
    return json.loads(reply.text)

In [None]:
def remove_file_from_knowledge(knowledge_id, file_id):
    """POST ``file_id`` to the ``file/remove`` endpoint of one knowledge entry.

    Args:
        knowledge_id: Identifier interpolated into the endpoint path.
        file_id: Identifier sent in the JSON body under ``file_id``.

    Returns:
        The response body decoded from JSON.
    """
    body = {'file_id': file_id}
    reply = requests.post(
        f'{WEBUI_URL}/api/v1/knowledge/{knowledge_id}/file/remove',
        headers={
            'Authorization': f'Bearer {TOKEN}',
            'Content-Type': 'application/json',
        },
        json=body,
    )
    return json.loads(reply.text)

In [None]:
def remove_file_from_upload(file_id):
    """Send a DELETE request for a single file id.

    Args:
        file_id: Identifier interpolated into the ``/api/v1/files/`` path.

    Returns:
        The response body decoded from JSON.
    """
    target = f'{WEBUI_URL}/api/v1/files/{file_id}'
    request_headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json',
    }
    reply = requests.delete(target, headers=request_headers)
    return json.loads(reply.text)

In [None]:
def remove_all_files():
    """Send a DELETE request to the ``/api/v1/files/all`` endpoint.

    Returns:
        The response body decoded from JSON.
    """
    reply = requests.delete(
        f'{WEBUI_URL}/api/v1/files/all',
        headers={
            'Authorization': f'Bearer {TOKEN}',
            'Content-Type': 'application/json',
        },
    )
    return json.loads(reply.text)

In [None]:
def extract_output(output):
    """Remove every ``<think>...</think>`` section and trim the remainder.

    Args:
        output: Text to clean.

    Returns:
        str: ``output`` without its think sections and without leading or
        trailing whitespace.
    """
    # DOTALL lets a think section span several lines; the lazy quantifier
    # stops at the nearest closing tag so separate sections stay separate.
    think_section = re.compile(r'<think>.*?</think>', re.DOTALL)
    return think_section.sub('', output).strip()

In [None]:
def extract_turtle(output):
    """Return the body of the first fenced block tagged ``ttl`` or ``turtle``.

    Args:
        output: Text to search.

    Returns:
        str | None: The stripped text between the opening fence tag and the
        next closing fence, or ``None`` when no such block is present.
    """
    fenced_block = re.compile(r'```(?:ttl|turtle)(.*?)```', re.DOTALL)
    found = fenced_block.search(output)
    if found is None:
        return None
    return found.group(1).strip()

In [None]:
def generate_onto(initial_prompt_text, prompt_text):
    """Ask the model for a Turtle ontology for each uploaded file, chaining results.

    The first request uses ``initial_prompt_text``. After every successful
    extraction the next request uses ``prompt_text`` followed by the Turtle
    text just extracted, so each file extends the previous result.

    Args:
        initial_prompt_text: Prompt used until the first ontology is extracted.
        prompt_text: Prompt prefix used once an ontology exists; the latest
            Turtle text is appended to it in a ``ttl`` fence.

    Returns:
        dict: Uploaded filename -> extracted Turtle text. Files whose request
        failed or whose answer held no Turtle block are left out.
    """
    ontoList = {}
    prompt = initial_prompt_text
    for news, news_id in get_uploaded_files().items():
        response = chat(query=prompt, file_id=[news_id])

        # chat() returns {'error': ...} on failure; skip the file instead of
        # aborting the whole run with a KeyError.
        try:
            content = response['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            print(f"Warning: Chat request failed for {news}: {response}")
            continue

        # Drop <think> sections first so a Turtle fence drafted there cannot
        # be picked up in place of the one in the final answer.
        ttl_content = extract_turtle(extract_output(content or ""))

        if ttl_content:
            ontoList[news] = ttl_content
            prompt = f"{prompt_text}\n\n```ttl\n{ttl_content}\n```"
        else:
            print(f"Warning: No ontology extracted for {news}")

    return ontoList

In [None]:
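# Optional cleanup (destructive): uncomment to call remove_all_files(),
# which sends a DELETE request to the files/all endpoint.
# res = remove_all_files()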

In [None]:
# Build the expected file names (news1.txt, news2.txt, ...) for the articles,
# using create_news' default limit.
news_list = create_news(df)

In [None]:
# Upload only the files the server does not list yet. The listing is fetched
# once up front rather than once per file; names in news_list are compared
# against that snapshot.
already_uploaded = get_uploaded_files()
for news in news_list:
    if news not in already_uploaded:
        file_path = f"./news/{news}"
        uploaded = upload_file(file_path)
        print(uploaded)


In [None]:
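# Optional: uncomment to add every uploaded file to the knowledge entry
# identified by COLLECTION_ID.
# for filename, file_id in get_uploaded_files().items():
#     knowledge = add_file_to_knowledge(COLLECTION_ID, file_id)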

In [None]:
# Repeat the plain chat request (no files attached) after the upload step.
res = chat("hello")

In [None]:
# Show the decoded response (or the {'error': ...} dict chat returns on failure).
res

In [None]:
# Prompt passed to generate_onto as the starting prompt, used while no
# ontology has been extracted yet.
initial_prompt_text = """
I have provided you with a news article, and I want to generate an ontology from it. Please extract key concepts, relationships, and categories from the article and structure them into an ontology. The ontology should be in a structured format of Turtle (.ttl).
"""

In [None]:
# Follow-up prompt prefix: generate_onto appends the most recently extracted
# Turtle text to it in a ttl fence.
prompt_text = """
I have provided you with a news article, and I want to expand upon an existing ontology. Please analyze the new article, extract key concepts, relationships, and categories, and integrate them into the existing ontology while maintaining consistency and avoiding redundancy. Ensure that new concepts complement the previous ontology rather than duplicating existing ones. Here is the ontology: 
"""

In [None]:
# Run the chained generation: one chat request per uploaded file.
ontoList = generate_onto(initial_prompt_text=initial_prompt_text, prompt_text=prompt_text)

In [None]:
# Show the extracted Turtle text keyed by uploaded filename.
ontoList