# Imports

In [1]:
!pip install transformers datasets sentencepiece

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.4


In [3]:
from openai import OpenAI
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from typing import Dict, Tuple, Optional, List
import requests
import warnings
from urllib3.exceptions import InsecureRequestWarning

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Тестирование Spotlight

In [4]:
import requests
import json
from collections import defaultdict

# Конфигурация
api_key = "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"
API_BASE_URL = "https://api.dbpedia-spotlight.org"
DATASET_URL = "https://raw.githubusercontent.com/Perevalov/QALD_9_plus/refs/heads/main/data/qald_9_plus_train_dbpedia.json"
client = OpenAI(api_key=api_key)

# Загрузка датасета QALD-9
def load_dataset():
    response = requests.get(DATASET_URL)
    response.raise_for_status()
    return json.loads(response.text)

# Сравнение результатов с датасетом
def compare_with_dataset(api_uris, dataset_uris):
    api_uris = set(api_uris)
    dataset_uris = set(dataset_uris)

    correct = api_uris & dataset_uris
    incorrect = api_uris - dataset_uris
    missing = dataset_uris - api_uris

    precision = len(correct) / len(api_uris) if api_uris else 0
    recall = len(correct) / len(dataset_uris) if dataset_uris else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "correct": len(correct),
        "incorrect": len(incorrect),
        "missing": len(missing),
        "api_uris": api_uris,
        "dataset_uris": dataset_uris
    }


def translate_to_english(text):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are a precise translation engine. Translate the input to English exactly, preserving: "
                            "1. All named entities (names, places, titles) "
                            "2. Technical terms "
                            "3. Question structure "
                            "Output ONLY the translation without commentary."
            },
            {
                "role": "user",
                "content": f"Translate this to English exactly:\n\n{text}"
            }
        ],
        temperature=0.1,
        max_tokens=500
    )
    return response.choices[0].message.content.strip()

def get_dbpedia(text: str, language: str = "en") -> Optional[Dict]:
    """Get entities from DBpedia Spotlight API"""

    # Отключаем предупреждения о небезопасных запросах
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    endpoint = f"https://api.dbpedia-spotlight.org/{language}/annotate"
    headers = {"Accept": "application/json"}
    params = {"text": text, "confidence": 0.5}

    try:
        response = requests.post(
            endpoint,
            headers=headers,
            data=params,
            verify=False  # Disable SSL verification
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"DBpedia Spotlight API error: {e}")
        return None

def uris(rare_question: str) -> Tuple[Optional[str], Dict[str, str]]:
    """New entity extraction using DBpedia Spotlight"""

    #print('Question: ', rare_question)
    en_question = translate_to_english(rare_question)
    #print('Translated question:', en_question)

    spotlight_result = get_dbpedia(en_question)

    if spotlight_result and "Resources" in spotlight_result:
        entities = {}
        tagged_parts = []
        remaining_text = en_question

        # Process each found entity
        for resource in sorted(spotlight_result["Resources"], key=lambda x: -int(x["@offset"])):
            entity_text = resource["@surfaceForm"]
            entity_type = resource["@types"].split(",")[0].split(":")[-1] if resource["@types"] else "thing"
            entity_uri = resource['@URI']

            # Replace in remaining text
            if entity_text in remaining_text:
                entities[entity_text] = entity_uri
                tagged_parts.append((remaining_text.index(entity_text),
                                    f"<{entity_text}>"))
                remaining_text = remaining_text.replace(entity_text, "", 1)

        # Reconstruct tagged question
        if tagged_parts:
            # Sort by original position
            tagged_parts.sort()
            tagged_question = ""
            last_pos = 0

            for pos, tag in tagged_parts:
                tagged_question += remaining_text[last_pos:pos] + tag
                last_pos = pos

            tagged_question += remaining_text[last_pos:]

            return tagged_question, entities
    else:
      print('ERROR')
      # original extracting + generating uris
      return '', {}



# Обработка одного вопроса
def process_question(question):
    results = {}

    # Получаем ожидаемые URI из датасета
    dataset_uris = [binding["uri"]["value"] for binding in question["answers"][0]["results"]["bindings"]]

    tagged_question, entitiy_uris = uris(question['question'][1]['string'])
    print('Tagged question: ', tagged_question)
    print('dataset uris: ', dataset_uris)
    print('Entity uris: ', entitiy_uris)

    # 4. Сравниваем с датасетом
    comparison = compare_with_dataset(entitiy_uris.keys(), dataset_uris)
    results["comparison"] = comparison

    return {
        "id": question["id"],
        "question": tagged_question,
        "results": results
    }

# Основная функция тестирования
def test_spotlight_api(max_questions=None):
    # Загружаем датасет
    dataset = load_dataset()
    questions = dataset["questions"]

    if max_questions:
        questions = questions[:max_questions]

    total_stats = defaultdict(float)
    question_count = 0

    for question in questions:
        question_result = process_question(question)
        if not question_result:
            continue

        question_count += 1
        qid = question_result["id"]
        comparison = question_result["results"]["comparison"]

        # Выводим результаты для каждого вопроса
        print(f"\nQuestion ID: {qid}")
        print(f"Precision: {comparison['precision']:.2f}, Recall: {comparison['recall']:.2f}, F1: {comparison['f1']:.2f}")
        print(f"Correct: {comparison['correct']}, Incorrect: {comparison['incorrect']}, Missing: {comparison['missing']}")

        # Суммируем статистику
        for metric in ["precision", "recall", "f1", "correct", "incorrect", "missing"]:
            total_stats[metric] += comparison[metric]

        print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

    # Выводим итоговую статистику
    if question_count > 0:
        print("\n=== FINAL STATISTICS ===")
        print(f"Total questions processed: {question_count}")
        for metric in ["precision", "recall", "f1"]:
            avg = total_stats[metric] / question_count
            print(f"Average {metric}: {avg:.2f}")

        print(f"\nTotal correct entities: {int(total_stats['correct'])}")
        print(f"Total incorrect entities: {int(total_stats['incorrect'])}")
        print(f"Total missing entities: {int(total_stats['missing'])}")

# Запуск тестирования
if __name__ == "__main__":
    test_spotlight_api(max_questions=10)  # Тестируем первые 10 вопросов

Tagged question:  List the board games from <GMT>.
dataset uris:  ['http://dbpedia.org/resource/Chandragupta_(board_game)', 'http://dbpedia.org/resource/Fields_of_Fire_(game)', 'http://dbpedia.org/resource/Sword_of_Rome', 'http://dbpedia.org/resource/Paths_of_Glory_(board_game)', 'http://dbpedia.org/resource/Commands_&_Colors:_Ancients', 'http://dbpedia.org/resource/Labyrinth:_The_War_on_Terror,_2001_–_%3F', 'http://dbpedia.org/resource/Twilight_Struggle', "http://dbpedia.org/resource/Washington's_War"]
Entity uris:  {'GMT': 'http://dbpedia.org/resource/Greenwich_Mean_Time'}

Question ID: 1
Precision: 0.00, Recall: 0.00, F1: 0.00
Correct: 0, Incorrect: 1, Missing: 8
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Tagged question:  Who developed <Skype>?
dataset uris:  ['http://dbpedia.org/resource/Skype_Technologies']
Entity uris:  {'Skype': 'http://dbpedia.org/resource/Skype'}

Question ID: 2
Precision: 0.00, Recall: 0.00, F1: 0.00
Correct: 0, Incorrect: 1, Missing: 1
+++

KeyboardInterrupt: 

In [24]:
question = 'Which rivers flow into the North Sea?'

In [7]:
get_dbpedia('What types of animals are on the verge of extinction?')

{'@text': 'What types of animals are on the verge of extinction?',
 '@confidence': '0.5',
 '@support': '0',
 '@types': '',
 '@sparql': '',
 '@policy': 'whitelist'}

In [8]:
uris('What types of animals are on the verge of extinction?')

ERROR


('', {})

In [32]:
print(uris(question)

('Which rivers flow into the <North Sea>?',
 {'North Sea': 'http://dbpedia.org/resource/North_Sea'})

# Preparing for DIN SPARQL

In [4]:
NER_PROMPT = """
Examples:

1. Input: "Who are relatives of Ozzy Osbourne and Kelly Osbourne?"
Output:
Let's think step by step. In the question "Who are relatives of Ozzy Osbourne and Kelly Osbourne?", we are asked: "find people related to both Ozzy Osbourne and Kelly Osbourne".
so we need to identify: relatives, persons.
The entities are: Ozzy Osbourne, Kelly Osbourne.
So the intermediary_question is: Whose <relativess> are <Ozzy Osbourne> and <Kelly Osbourne>?

2. Input: "What is the television show whose previous work is The Spirit of Christmas (short film)?"
Output:
Let's think step by step. In the question "What is the television show whose previous work is The Spirit of Christmas (short film)?", we are asked: "find TV shows connected to The Spirit of Christmas short film".
so we need to identify: television show, previous work, film.
The entities are: The Spirit of Christmas (short film).
So the intermediary_question is: What is the <television show> whose <previous work> is <The Spirit of Christmas (short film)>?

3. Input: "Which office holder's governor is Charles Willing Byrd and has final resting place in North Bend, Ohio?"
Output:
Let's think step by step. In the question "Which office holder's governor is Charles Willing Byrd and has final resting place in North Bend, Ohio?", we are asked: "find political figures governed by Charles Willing Byrd who are buried in North Bend".
so we need to identify: office holder, governor, resting place, person, location.
The entities are: Charles Willing Byrd, North Bend, Ohio.
So the intermediary_question is: What is the <office holder> whose <governor> is <Charles Willing Byrd> and <restingplace> is <North Bend, Ohio>?

4. Input: "What is the allegiance of John Kotelawala?"
Output:
Let's think step by step. In the question "What is the allegiance of John Kotelawala?", we are asked: "find political allegiance of John Kotelawala".
so we need to identify: allegiance, person.
The entities are: John Kotelawala.
So the intermediary_question is: What is the <allegiance> of John Kotelawala?

5. Input: "Where is the headquarters of the public transit system which owns the American Boulevard (Metro Transit station)?"
Output:
Let's think step by step. In the question "Where is the headquarters of the public transit system which owns the American Boulevard (Metro Transit station)?", we are asked: "find location of transit agency managing American Boulevard station".
so we need to identify: headquarters, public transit system, owning organization, station.
The entities are: American Boulevard (Metro Transit station).
So the intermediary_question is: What is the <headquarters> of the <public transit system> which is the <owning organisation> of <American Boulevard (Metro Transit station)>?

Key Requirements:
- Maintain EXACT output format including punctuation and spacing
- Use same entity tags as examples (<relativess>, <restingplace> etc.)
- Keep original capitalization in entity names
- Never add additional explanations outside the template
"""

In [5]:
URI_GENERATION_PROMPT = """
Examples:

1. Input: "Who are relatives of Ozzy Osbourne and Kelly Osbourne?"
Output:
Let's think step by step. In the question "Who are relatives of Ozzy Osbourne and Kelly Osbourne?", we are asked: "find people related to both Ozzy Osbourne and Kelly Osbourne".
so we need to identify: relatives, persons.
The entities are: Ozzy Osbourne, Kelly Osbourne.
So the intermediary_question is: Whose <relativess> are <Ozzy Osbourne> and <Kelly Osbourne>?

2. Input: "What is the television show whose previous work is The Spirit of Christmas (short film)?"
Output:
Let's think step by step. In the question "What is the television show whose previous work is The Spirit of Christmas (short film)?", we are asked: "find TV shows connected to The Spirit of Christmas short film".
so we need to identify: television show, previous work, film.
The entities are: The Spirit of Christmas (short film).
So the intermediary_question is: What is the <television show> whose <previous work> is <The Spirit of Christmas (short film)>?

3. Input: "Which office holder's governor is Charles Willing Byrd and has final resting place in North Bend, Ohio?"
Output:
Let's think step by step. In the question "Which office holder's governor is Charles Willing Byrd and has final resting place in North Bend, Ohio?", we are asked: "find political figures governed by Charles Willing Byrd who are buried in North Bend".
so we need to identify: office holder, governor, resting place, person, location.
The entities are: Charles Willing Byrd, North Bend, Ohio.
So the intermediary_question is: What is the <office holder> whose <governor> is <Charles Willing Byrd> and <restingplace> is <North Bend, Ohio>?

4. Input: "What is the allegiance of John Kotelawala?"
Output:
Let's think step by step. In the question "What is the allegiance of John Kotelawala?", we are asked: "find political allegiance of John Kotelawala".
so we need to identify: allegiance, person.
The entities are: John Kotelawala.
So the intermediary_question is: What is the <allegiance> of John Kotelawala?

5. Input: "Where is the headquarters of the public transit system which owns the American Boulevard (Metro Transit station)?"
Output:
Let's think step by step. In the question "Where is the headquarters of the public transit system which owns the American Boulevard (Metro Transit station)?", we are asked: "find location of transit agency managing American Boulevard station".
so we need to identify: headquarters, public transit system, owning organization, station.
The entities are: American Boulevard (Metro Transit station).
So the intermediary_question is: What is the <headquarters> of the <public transit system> which is the <owning organisation> of <American Boulevard (Metro Transit station)>?

Key Requirements:
- Maintain EXACT output format including punctuation and spacing
- Use same entity tags as examples (<relativess>, <restingplace> etc.)
- Keep original capitalization in entity names
- Never add additional explanations outside the template
"""

In [6]:
SPARQL_GENERATION_PROMPT = """
Input:
Original Question: "List the resting place of the people who served in Norwalk Trainband"
Question with Entities: "List the <restingplace> of the <politicians> whose <military unit> is <Norwalk Trainband>"
DBpedia URIs:
- <Norwalk Trainband> : http://dbpedia.org/resource/Norwalk_Trainband
- <militaryUnit> : http://dbpedia.org/ontology/militaryUnit
- <restingplace> : http://dbpedia.org/property/restingplace
- <Person> : http://dbpedia.org/ontology/Person

Thought Process:
Let's think step by step. In the question "List the resting place of the people who served in Norwalk Trainband", we are asked:
1. "the resting place" → we need property = [http://dbpedia.org/property/restingplace]
2. "people who served in Norwalk Trainband" → we need:
   - class restriction = [http://dbpedia.org/ontology/Person] (to identify people)
   - military service property = [http://dbpedia.org/ontology/militaryUnit]
   - specific military unit = [http://dbpedia.org/resource/Norwalk_Trainband]
3. We need to connect these through a variable (?x) representing the people
4. Final output should be the resting places (?uri)

SPARQL:
SELECT DISTINCT ?uri WHERE {
  ?x <http://dbpedia.org/ontology/militaryUnit> <http://dbpedia.org/resource/Norwalk_Trainband> .
  ?x <http://dbpedia.org/property/restingplace> ?uri .
  ?x a <http://dbpedia.org/ontology/Person> .
}

Input:
Original Question: "What are the broadcast areas of Mauritius Broadcasting Corporation"
Question with Entities: "What are the <broadcast area> of <Mauritius Broadcasting Corporation>"
DBpedia URIs:
- <Mauritius Broadcasting Corporation> : http://dbpedia.org/resource/Mauritius_Broadcasting_Corporation
- <broadcastArea> : http://dbpedia.org/property/broadcastArea

Thought Process:
Let's think step by step. In the question "What are the broadcast areas of Mauritius Broadcasting Corporation", we are asked:
1. "broadcast areas" → we need property = [http://dbpedia.org/property/broadcastArea]
2. "of Mauritius Broadcasting Corporation" → we need specific entity = [http://dbpedia.org/resource/Mauritius_Broadcasting_Corporation]
3. This is a direct property lookup without need for variables or class restrictions
4. Final output should be the broadcast areas (?uri)

SPARQL:
SELECT DISTINCT ?uri WHERE {
  <http://dbpedia.org/resource/Mauritius_Broadcasting_Corporation> <http://dbpedia.org/property/broadcastArea> ?uri .
}

Input:
Original Question: "Which company released the software RenderMan"
Question with Entities: "What is the <company> whose <products> is <RenderMan (software)>"
DBpedia URIs:
- <RenderMan (software)> : http://dbpedia.org/resource/RenderMan_(software)
- <products> : http://dbpedia.org/property/products
- <Company> : http://dbpedia.org/ontology/Company

Thought Process:
Let's think step by step. In the question "Which company released the software RenderMan", we are asked:
1. "company" → we need class restriction = [http://dbpedia.org/ontology/Company]
2. "released the software RenderMan" → we need:
   - release relationship property = [http://dbpedia.org/property/products]
   - specific product = [http://dbpedia.org/resource/RenderMan_(software)]
3. We need to find entities (?uri) that are Companies and have RenderMan as product
4. Final output should be the company/companies (?uri)

SPARQL:
SELECT DISTINCT ?uri WHERE {
  ?uri <http://dbpedia.org/property/products> <http://dbpedia.org/resource/RenderMan_(software)> .
  ?uri a <http://dbpedia.org/ontology/Company> .
}

Input:
Original Question: "Does Mumbai manage the railway line going to the Daund railway junction"
Question with Entities: "Is <Mumbai> the <serving railway line> of <Daund Junction railway station>"
DBpedia URIs:
- <Daund Junction railway station> : http://dbpedia.org/resource/Daund_Junction_railway_station
- <servingRailwayLine> : http://dbpedia.org/ontology/servingRailwayLine
- <Mumbai> : http://dbpedia.org/resource/Mumbai

Thought Process:
Let's think step by step. In the question "Does Mumbai manage the railway line going to the Daund railway junction", we are asked:
1. This is a yes/no question → we need ASK query
2. We need to check if:
   - subject = [http://dbpedia.org/resource/Daund_Junction_railway_station]
   - property = [http://dbpedia.org/ontology/servingRailwayLine]
   - object = [http://dbpedia.org/resource/Mumbai]
3. No variables needed, just a direct triple check
4. Query should return true/false whether this exact triple exists

SPARQL:
ASK WHERE {
  <http://dbpedia.org/resource/Daund_Junction_railway_station> <http://dbpedia.org/ontology/servingRailwayLine> <http://dbpedia.org/resource/Mumbai> .
}
"""

# DIN SPARQL

In [87]:
class DBpediaPipeline:
    def __init__(self, api_key: str, dbpedia_endpoint: str = "http://dbpedia.org/sparql", NER_PROMPT: str = NER_PROMPT, URI_GENERATION_PROMPT: str = URI_GENERATION_PROMPT, SPARQL_GENERATION_PROMPT: str = SPARQL_GENERATION_PROMPT):
        self.client = OpenAI(api_key=api_key)
        self.sparql_endpoint = SPARQLWrapper(dbpedia_endpoint)
        self.sparql_endpoint.setReturnFormat(JSON)

        self.NER_PROMPT = NER_PROMPT
        self.URI_GENERATION_PROMPT = URI_GENERATION_PROMPT
        self.SPARQL_GENERATION_PROMPT = SPARQL_GENERATION_PROMPT
        self.QUERY_REPAIR_PROMPT = """Fix this SPARQL query based on the execution error."""

    def translate_to_english(self, text):
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are a precise translation engine. Translate the input to English exactly, preserving: "
                               "1. All named entities (names, places, titles) "
                               "2. Technical terms "
                               "3. Question structure "
                               "Output ONLY the translation without commentary."
                },
                {
                    "role": "user",
                    "content": f"Translate this to English exactly:\n\n{text}"
                }
            ],
            temperature=0.1,
            max_tokens=500
        )
        return response.choices[0].message.content.strip()

    def _original_extract_entities(self, question: str) -> Tuple[Optional[str], Dict[str, str]]:
        """Original GPT-4 based entity extraction (kept as fallback)"""
        prompt = f"""{self.NER_PROMPT}\n\nQuestion: {question}\nProvide output in the exact required format:"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": """
                                    You are an expert named entity recognizer for DBpedia questions. For each input question, follow this exact thinking process:

                                    1. ANALYZE the question structure:
                                      "In the question [quote question], we are asked: [paraphrase]"

                                    2. IDENTIFY components:
                                      - Main subject (class/resource)
                                      - Properties/relationships
                                      - Concrete entities (people/places/works)
                                      - Constraints/conditions

                                    3. EXTRACT entities:
                                      "so we need to identify: [list entity types]"
                                      "The entities are: [list specific entities]"

                                    4. GENERATE intermediary question:
                                      "So the intermediary_question is: [exact format as examples]"

                                    Output MUST follow this exact template for every question:

                                    Let's think step by step. In the question "[original question]", we are asked: "[paraphrased question]".
                                    so we need to identify: [entity types].
                                    The entities are: [specific entities].
                                    So the intermediary_question is: [exact match to example format]
                                    """

                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                max_tokens=400
            )

            full_response = response.choices[0].message.content.strip()

            intermediary_match = re.search(
                r'So the intermediary_question is:\s*(.*?)$',
                full_response,
                re.MULTILINE
            )

            if not intermediary_match:
                print("Error: Couldn't extract intermediary question from response")
                return None, {}

            tagged_question = intermediary_match.group(1).strip()

            entities = {}
            for match in re.finditer(r'<([^>]+)>([^<]+)</\1>', tagged_question):
                entity_type, entity_value = match.groups()
                entities[entity_value] = entity_type


            if not re.match(r'^Let\'s think step by step\.', full_response):
                print("Error: Response doesn't follow DINSQL format")
                return None, {}

            return tagged_question, entities

        except Exception as e:
            print(f"Error in entity extraction: {str(e)}")
            return None, {}

    def _original_generate_uris(self, tagged_question: str, entities: Dict[str, str]) -> Dict[str, str]:
        entity_list = "\n".join([f"- {value} ({type})" for value, type in entities.items()])
        prompt = f"{self.URI_GENERATION_PROMPT}\n\nTagged question: {tagged_question}\nEntities:\n{entity_list}\n\nDBpedia URIs:"

        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": """
                    DBpedia URI Conversion Expert - Entity to URI Mapping

                      Input Requirements:
                      1. Original question (for context)
                      2. Intermediary question with marked entities (<entity_type>)

                      Output Format:
                      - <tagged_entity> : full_DBpedia_URI (one per line)

                      URI Selection Rules:

                      1. Determining Entity Type:

                        a) RESOURCES (Specific named entities):
                            - Indicators: Proper nouns, specific instances
                            - Examples: <Blanche DuBois>, <Mid Wales>, <Python_(language)>
                            - Format: http://dbpedia.org/resource/Exact_Name
                              - Preserve original capitalization
                              - Replace spaces with underscores
                              - Keep special characters (parentheses, commas)
                              - Use official DBpedia names (check redirects)

                        b) CLASSES (Categories/Types):
                            - Indicators: Generic categories, answers "what kind?"
                            - Examples: <play>, <company>, <city>
                            - Format: http://dbpedia.org/ontology/ProperCase
                              - Always singular form
                              - Capitalize first letter
                              - Use most specific available class

                        c) PROPERTIES (Relationships):
                            - Indicators: Connects entities, shows relationships
                            - Examples: <founded by>, <alma mater>, <developer>
                            - Selection Priority:
                              1. Ontology properties (preferred):
                                  - Format: http://dbpedia.org/ontology/lowercase_property
                                  - More stable, better defined semantics
                              2. Generic properties (fallback):
                                  - Format: http://dbpedia.org/property/lowercase_property
                                  - Used when no ontology property exists
                            - Transformation rules:
                              - Convert to lowercase
                              - Replace spaces with underscores
                              - Use natural property names (e.g., 'alma mater' : 'almaMater')
                """},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0,
                max_tokens=500
            )
            uri_text = response.choices[0].message.content

            uris = {}
            for line in uri_text.split('\n'):
                if line.strip() and ':' in line:
                    parts = line.split(':', 1)
                    entity = parts[0].strip().strip('-').strip()
                    uri = parts[1].strip()
                    uris[entity] = uri

            return uris
        except Exception as e:
            print(f"Error in URI generation: {str(e)}")
            return {}

    def get_dbpedia(self, text: str, language: str = "en") -> Optional[Dict]:
        """Get entities from DBpedia Spotlight API"""

        # Отключаем предупреждения о небезопасных запросах
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

        endpoint = f"https://api.dbpedia-spotlight.org/{language}/annotate"
        headers = {"Accept": "application/json"}
        params = {"text": text, "confidence": 0.5}

        try:
            response = requests.post(
                endpoint,
                headers=headers,
                data=params,
                verify=False  # Disable SSL verification
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"DBpedia Spotlight API error: {e}")
            return None

    def uris(self, new_question: str) -> Tuple[Optional[str], Dict[str, str]]:
        """New entity extraction using DBpedia Spotlight"""

        spotlight_result = self.get_dbpedia(new_question)

        if spotlight_result and "Resources" in spotlight_result:
            entities = {}
            tagged_parts = []
            remaining_text = new_question

            # Process each found entity
            for resource in sorted(spotlight_result["Resources"], key=lambda x: -int(x["@offset"])):
                entity_text = resource["@surfaceForm"]
                entity_type = resource["@types"].split(",")[0].split(":")[-1] if resource["@types"] else "thing"
                entity_uri = resource['@URI']

                # Replace in remaining text
                if entity_text in remaining_text:
                    entities[entity_text] = entity_uri
                    tagged_parts.append((remaining_text.index(entity_text),
                                        f"<{entity_text}>"))
                    remaining_text = remaining_text.replace(entity_text, "", 1)

            # Reconstruct tagged question
            if tagged_parts:
                # Sort by original position
                tagged_parts.sort()
                tagged_question = ""
                last_pos = 0

                for pos, tag in tagged_parts:
                    tagged_question += remaining_text[last_pos:pos] + tag
                    last_pos = pos

                tagged_question += remaining_text[last_pos:]

                return tagged_question, entities
        else:
          print('ERROR during DBpedia Spotlight running. Start using GPT')
          # original extracting + generating uris
          tagged_question, entities = self._original_extract_entities(question)

          if not tagged_question:
            return {"error": "Failed to extract entities"}

          return tagged_question, self._original_generate_uris(tagged_question, entities)

    def generate_sparql(self, original_question: str, tagged_question: str, uris: Dict[str, str]) -> Optional[str]:
        uri_mapping = "\n".join([f"- <{entity}> : {uri}" for entity, uri in uris.items()])

        prompt = f"""{self.SPARQL_GENERATION_PROMPT}\n\n
                Input:
                Original Question: "{original_question}"
                Question with Entities: "{tagged_question}"
                DBpedia URIs:
                {uri_mapping}
                """

        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a precise SPARQL query generator that follows DINSQL-style reasoning. When given a natural language question with entity annotations and their corresponding DBpedia URIs, you generate accurate SPARQL queries by:

                                    1. Analyzing the input structure:
                                      - Original question (for context)
                                      - Entity-tagged question (showing relationships)
                                      - Provided DBpedia URIs (exact mappings)

                                    2. Following strict transformation rules:
                                      a) Maintain exact URI usage as provided - never modify or guess URIs
                                      b) Preserve all entity relationships from the tagged question
                                      c) Include all referenced entities in the query patterns

                                    3. Applying SPARQL best practices:
                                      - Use SELECT for information requests
                                      - Use ASK for verification questions
                                      - Include DISTINCT modifier unless duplicates are needed
                                      - Place class restrictions (a/rdf:type) when entities represent types
                                      - Connect variables through shared binding points

                                    4. Validation requirements:
                                      - Every mentioned entity appears in the query
                                      - All URIs match exactly what was provided
                                      - Variables are properly joined across patterns
                                      - Query type matches question intent
                                      - Class restrictions are applied where needed

                                    5. Output format:
                                      - Always begin with a 'Thought Process:' section explaining:
                                        * How you interpreted the question
                                        * Why you chose specific patterns
                                        * How variables connect
                                      - Provide the SPARQL query in a clean code block
                                      - Never modify the given URIs or invent new ones

                                    Remember:
                                    - If the question asks about properties of a specific instance, use its direct URI
                                    - When filtering by class, use 'a' or 'rdf:type' with the class URI
                                    - For missing information, don't make assumptions - use only provided URIs
                                    - Maintain consistent variable naming (?x, ?uri, etc.) across connected patterns
                                  """
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.0,
                max_tokens=800
            )

            full_response = response.choices[0].message.content.strip()
            sparql_match = re.search(r'SPARQL:\s*(.*?)$', full_response, re.DOTALL)

            if not sparql_match:
                print("Error: Couldn't extract SPARQL query from response")
                return None

            sparql_query = sparql_match.group(1).strip()

            if not (sparql_query.startswith("SELECT") or sparql_query.startswith("ASK")):
                print("Error: Generated query doesn't start with SELECT or ASK")
                return None

            return sparql_query

        except Exception as e:
            print(f"Error in SPARQL generation: {str(e)}")
            return None

    def postprocess_query(self, query) -> str:
        query = re.sub(r'^\s*#.*$', '', query, flags=re.MULTILINE)
        return query.strip()

    def validate_query(self, query):
      try:
          self.sparql_endpoint.setQuery(query)
          results = self.sparql_endpoint.query().convert()

          # Проверка на пустые результаты
          if 'boolean' in results.keys():
              if isinstance(results['boolean'], bool):
                  return True, None
          elif 'results' in results:
              #print(results['results']['bindings'])
              #print(len(results['results']['bindings']))
              if len(results['results']['bindings']) == 0:
                  return False, "Query executed successfully but returned empty results. Please regenerate the query with different parameters or conditions."
          elif isinstance(results, list) and not results:
              # Для CONSTRUCT/DESCRIBE запросов
              return False, "Query executed successfully but returned empty graph. Please regenerate the query with different parameters or conditions."

          return True, None

      except Exception as e:
          error_msg = re.sub(r"Endpoint returned:.*", "", str(e)).strip()
          return False, error_msg

    def repair_query(self, original_query, error, context):
        if "returned empty" in error:
            context['empty_result'] = True
            prompt = f"{self.QUERY_REPAIR_PROMPT}\n\nThe previous query returned empty results. Please modify it to return non-empty results.\n\nContext:\n- Original question: {context['original_question']}\n- Tagged question: {context['tagged_question']}\n- URIs: {context['uris']}\n\nInvalid query:\n{original_query}\n\nFixed query:"
        else:
            prompt = f"{self.QUERY_REPAIR_PROMPT}\n\nError: {error}\n\nContext:\n- Original question: {context['original_question']}\n- Tagged question: {context['tagged_question']}\n- URIs: {context['uris']}\n\nInvalid query:\n{original_query}\n\nFixed query:"

        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": '''Fix this SPARQL query based on the execution error.
                    Rules:
                    1. Preserve the original query intent
                    2. Keep all URIs unchanged
                    3. Only fix syntax/semantic issues
                    4. Return ONLY the fixed query'''},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0,
                max_tokens=600
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error in query repair: {str(e)}")
            return None

    def execute_pipeline(self, question: str, max_retries: int = 2) -> Dict:
        print('Original question: ', question)
        #print('Step 0: Translate')
        en_question = self.translate_to_english(question)
        print('Translated question: ', en_question)
        if not en_question:
          return {"error": "Failed to translate"}

        #print('Step 1-2: Entity recognition + URI generation')
        # Step 1-2: Entity recognition + URI generation
        tagged_question, uris = self.uris(en_question)
        if not uris:
            return {"error": "Failed to generate URIs", "tagged_question": tagged_question}

        print('Step 3: Query generation')
        # Step 3: Query generation
        sparql = self.generate_sparql(question, tagged_question, uris)
        if not sparql:
            return {"error": "Failed to generate SPARQL", "tagged_question": tagged_question, "uris": uris}

        #print('Step 4: Postprocessing')
        # Step 4: Postprocessing
        sparql = self.postprocess_query(sparql)

        #print('Steps 5-6: Validation and repair cycle')
        # Steps 5-6: Validation and repair cycle
        for attempt in range(max_retries + 1):
            is_valid, error = self.validate_query(sparql)
            if is_valid:
                return {
                    "status": "success",
                    "tagged_question": tagged_question,
                    "uris": uris,
                    "sparql": sparql
                }

            if attempt < max_retries:
                context = {
                    "original_question": question,
                    "tagged_question": tagged_question,
                    "uris": uris
                }
                repaired_query = self.repair_query(sparql, error, context)
                if repaired_query and repaired_query != sparql:
                    sparql = self.postprocess_query(repaired_query)
                    continue

            return {
                "status": "error",
                "error": error,
                "tagged_question": tagged_question,
                "uris": uris,
                "sparql": sparql
            }

# QALD Test

## Preparing


In [69]:
def extract_values(data):

    values = []
    for item in data:
        if isinstance(item, dict):
            for key, nested in item.items():
                if isinstance(nested, dict) and 'value' in nested:
                    values.append(nested['value'])
                elif isinstance(nested, (list, tuple)):
                    for element in nested:
                        if isinstance(element, dict) and 'value' in element:
                            values.append(element['value'])
    return values


In [70]:
endpoint="http://dbpedia.org/sparql"
query = """SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/school> <http://dbpedia.org/resource/Pietermaritzburg> . ?x <http://dbpedia.org/property/school> ?uri  . }
"""
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
bindings = results['results']['bindings']
bindings
extract_values(bindings)

['http://dbpedia.org/resource/Pietermaritzburg',
 'http://dbpedia.org/resource/Voortrekker_High_School_(Pietermaritzburg)',
 'http://dbpedia.org/resource/Maritzburg_College']

In [71]:
query = """ASK WHERE { <http://dbpedia.org/resource/Hyundai_Lavita> <http://dbpedia.org/property/assembly> <http://dbpedia.org/resource/Ulsan> }
"""
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results
#extract_values(bindings)

{'head': {'link': []}, 'boolean': True}

In [72]:
isinstance(results['boolean'], bool)

True

In [76]:
from SPARQLWrapper import SPARQLWrapper, JSON


def sparql_results_match(query1, query2, endpoint="http://dbpedia.org/sparql"):
    def execute_query(query):
        sparql = SPARQLWrapper(endpoint)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        try:
            results = sparql.query().convert()

            # Обработка ASK запросов (возвращают boolean)
            if 'boolean' in results:
                return results['boolean']

            # Обработка SELECT запросов
            if 'results' in results:
                bindings = results['results']['bindings']
                return extract_values(bindings)

            # Для других типов запросов (CONSTRUCT/DESCRIBE)
            return None

        except Exception as e:
            print(f"Error executing query: {e}")
            return None

    results1 = execute_query(query1)
    results2 = execute_query(query2)
    print(f"Results1: {results1}")
    print(f"Results2: {results2}")

    # Оба запроса вернули None (ошибка выполнения)
    if results1 is None and results2 is None:
        return True

    # Обработка ASK запросов
    if isinstance(results1, bool) and isinstance(results2, bool):
        return results1 == results2

    # Один из результатов - boolean, другой - нет
    if isinstance(results1, bool) or isinstance(results2, bool):
        return False

    elif len(results1) > 0 and len(results2) > 0:
      if results1 and results2:
        return results1 == results2
      else:
        return False
    else:
        return False

In [77]:
import json
import pandas as pd

with open('qald_9_plus_test_dbpedia.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

rows = []

for question in data['questions']:
    question_id = question['id']

    for q in question['question']:
        lang = q['language']
        question_text = q['string']

        sparql_query = question['query']['sparql']

        answers = []
        for answer in question['answers']:
            if 'results' in answer:
                for binding in answer['results']['bindings']:
                    for var, value in binding.items():
                        answers.append(value['value'])
            elif 'boolean' in answer:
                answers.append(str(answer['boolean']))

        rows.append({
            'id': question_id,
            'language': lang,
            'question': question_text,
            'sparql_query': sparql_query,
            'answers': ', '.join(answers) if answers else None
        })

test_df = pd.DataFrame(rows)
test_df.to_csv('qald_questions.csv', index=False, encoding='utf-8')
test_df

Unnamed: 0,id,language,question,sparql_query,answers
0,99,en,What is the time zone of Salt Lake City?,PREFIX res: <http://dbpedia.org/resource/> PRE...,http://dbpedia.org/resource/Mountain_Time_Zone
1,99,de,In welcher Zeitzone liegt Salt Lake City?,PREFIX res: <http://dbpedia.org/resource/> PRE...,http://dbpedia.org/resource/Mountain_Time_Zone
2,99,de,Was ist die Zeitzone von Salt Lake City?,PREFIX res: <http://dbpedia.org/resource/> PRE...,http://dbpedia.org/resource/Mountain_Time_Zone
3,99,ru,Какой часовой пояс в Солт-Лейк-Сити,PREFIX res: <http://dbpedia.org/resource/> PRE...,http://dbpedia.org/resource/Mountain_Time_Zone
4,99,ru,В каком часовом поясе расположен Солт-Лейк-Сити?,PREFIX res: <http://dbpedia.org/resource/> PRE...,http://dbpedia.org/resource/Mountain_Time_Zone
...,...,...,...,...,...
1349,179,ru,Как назывались три корабля Колумба?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Niña, http://dbped..."
1350,179,uk,Як називалися три кораблі Колумба?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Niña, http://dbped..."
1351,179,lt,Kokios buvo trys Kolumbo laivų pavadinimai?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Niña, http://dbped..."
1352,179,lt,Kaip vadinosi trys Kolumbo laivai?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Niña, http://dbped..."


In [78]:
test_df = test_df.sample(n=50)
test_df

Unnamed: 0,id,language,question,sparql_query,answers
423,137,lt,Išvardykite visas olandų partijas,SELECT DISTINCT ?uri WHERE { ?uri a <http://db...,http://dbpedia.org/resource/The_Greens_(Nether...
552,104,ba,Роберт Кеннединың ҡыҙың ире кем?,SELECT DISTINCT ?uri WHERE { <http://dbpedia.o...,http://dbpedia.org/resource/Mary_Richardson_Ke...
1116,194,ba,MN аҡш штаттары ҡыҫҡартылған атамаһы ниндәй бу...,SELECT DISTINCT ?uri WHERE { ?uri a yago:Wikic...,
1332,87,ba,Барыһы китап биттәрендә күберәк ниндәй?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,http://dbpedia.org/resource/The_Tolkien_Reader
732,154,uk,Які книги написала Даніела Стіл?,PREFIX rdfs: <http://www.w3.org/2000/01/rdf-sc...,http://dbpedia.org/resource/Hotel_Vendome_(nov...
424,137,ba,Бөтә һанап партия голланд,SELECT DISTINCT ?uri WHERE { ?uri a <http://db...,http://dbpedia.org/resource/The_Greens_(Nether...
1345,43,lt,Pasakyk man kompanijų svetaines su daugiau nei...,PREFIX foaf: <http://xmlns.com/foaf/0.1/> PREF...,"http://www.onoff.se, http://www.tesco.com, htt..."
355,150,ru,В каком городе родился президент Черногории?,PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Belgrade, http://d..."
371,149,lt,Kurioje JAV valstijoje gyventojų tankumas didž...,SELECT ?uri WHERE { ?uri <http://dbpedia.org/o...,
69,66,uk,"Які художники народилися того ж самого дня, що...",PREFIX dbo: <http://dbpedia.org/ontology/> PRE...,"http://dbpedia.org/resource/Cameron_Cartio, ht..."


## Цикл тестирования

In [92]:
api_key = "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"
pipeline = DBpediaPipeline(api_key)

total_questions = len(test_df)
correct_em = 0

for q in range(len(test_df)):
  try:
    result = pipeline.execute_pipeline(test_df.iloc[q]['question'])
    sparql = result['sparql']
    gold_sparql = test_df.iloc[q]["sparql_query"]

    generated_normalized = ' '.join(sparql.split()).lower()
    gold_normalized = ' '.join(gold_sparql.split()).lower()

    is_correct = sparql_results_match(sparql, gold_sparql)
    if is_correct:
      print(True)
      correct_em += 1
    #print(f"Question: {test_df.iloc[q]['question']}")
    print(f"Generated: {sparql}")
    print(f"GOLD: {gold_sparql}")
    print()
  except:
    print(test_df.iloc[q]['question'])
    print('GPT ERROR')

em_accuracy = correct_em / total_questions
print(f"\nMatch Accuracy: {em_accuracy:.2%} ({correct_em}/{total_questions})")

Original question:  Išvardykite visas olandų partijas
Translated question:  List all Dutch parties.
Step 3: Query generation
Results1: ['http://dbpedia.org/resource/Workers_Party_of_the_Netherlands_(build-up_organisation)', 'http://dbpedia.org/resource/Democratic_Political_Turning_Point', "http://dbpedia.org/resource/Communist_Workers'_Party_of_the_Netherlands", 'http://dbpedia.org/resource/Onafhankelijke_Burger_Partij', 'http://dbpedia.org/resource/Christian_Democratic_Appeal', 'http://dbpedia.org/resource/Frisian_National_Party', 'http://dbpedia.org/resource/Anti-Revolutionary_Party', 'http://dbpedia.org/resource/Liberal_State_Party', 'http://dbpedia.org/resource/Communist_Workers_Organisation_(Marxist–Leninist)', 'http://dbpedia.org/resource/Trots_op_Nederland', 'http://dbpedia.org/resource/League_of_Communists_in_the_Netherlands', 'http://dbpedia.org/resource/50PLUS', 'http://dbpedia.org/resource/Party_for_Zeeland', 'http://dbpedia.org/resource/Party_for_the_North', 'http://dbpedia

In [93]:
print(f"\nMatch Accuracy: {em_accuracy:.2%} ({correct_em}/{total_questions})")


Match Accuracy: 24.00% (12/50)


In [8]:
import requests
from urllib.parse import quote

def get_dbpedia_neighbors(entity_url: str) -> dict:
    """
    Извлекает всех соседей сущности в графе DBpedia.

    Args:
        entity_url (str): URL сущности в DBpedia (например, "http://dbpedia.org/resource/Danielle_Steel")

    Returns:
        dict: Словарь, где ключи - имена связанных сущностей, значения - их URL в DBpedia
    """
    # Проверяем и корректируем URL
    if not entity_url.startswith("http://dbpedia.org/resource/"):
        entity_url = f"http://dbpedia.org/resource/{entity_url.split('/')[-1]}"

    # SPARQL запрос для получения всех соседей (исправленная версия)
    sparql_query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT DISTINCT ?property ?neighbor ?neighborLabel WHERE {{
      <{entity_url}> ?property ?neighbor .
      FILTER (isURI(?neighbor) && STRSTARTS(STR(?neighbor), "http://dbpedia.org/resource/"))
      OPTIONAL {{
        ?neighbor rdfs:label ?neighborLabel .
        FILTER (LANG(?neighborLabel) = "en")
      }}
    }}
    LIMIT 1000
    """

    # Параметры запроса
    params = {
        'query': sparql_query,
        'format': 'json'
    }

    headers = {
        'Accept': 'application/sparql-results+json'
    }

    endpoint_url = "https://dbpedia.org/sparql"

    try:
        response = requests.get(endpoint_url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        results = response.json()

        neighbors = {}
        for binding in results["results"]["bindings"]:
            neighbor_url = binding["neighbor"]["value"]
            # Используем URL как имя, если нет метки
            neighbor_name = binding.get("neighborLabel", {}).get("value", neighbor_url.split("/")[-1].replace("_", " "))
            neighbors[neighbor_name] = neighbor_url

        return neighbors

    except Exception as e:
        print(f"Ошибка при выполнении запроса: {str(e)}")
        if hasattr(e, 'response') and e.response:
            print(f"Ответ сервера: {e.response.text}")
        return {}

In [9]:
example_url = "http://dbpedia.org/resource/Danielle_Steel"
print(f"Получаем соседей для: {example_url}")
neighbors = get_dbpedia_neighbors(example_url)

print(f"\nНайдено {len(neighbors)} связанных сущностей:")
for name, url in list(neighbors.items())[:20]:  # Выводим первые 20 для примера
    print(f"{name}: {url}")

Получаем соседей для: http://dbpedia.org/resource/Danielle_Steel

Найдено 94 связанных сущностей:
California: http://dbpedia.org/resource/California
Publishers Weekly: http://dbpedia.org/resource/Publishers_Weekly
Romance novel: http://dbpedia.org/resource/Romance_novel
San Francisco: http://dbpedia.org/resource/San_Francisco
Passion's Promise: http://dbpedia.org/resource/Passion's_Promise
List of best-selling fiction authors: http://dbpedia.org/resource/List_of_best-selling_fiction_authors
List of works by Danielle Steel: http://dbpedia.org/resource/List_of_works_by_Danielle_Steel
Vanished (1995 film): http://dbpedia.org/resource/Vanished_(1995_film)
Incest: http://dbpedia.org/resource/Incest
Novelist: http://dbpedia.org/resource/Novelist
Now and Forever (Danielle Steel novel): http://dbpedia.org/resource/Now_and_Forever_(Danielle_Steel_novel)
1947 births: http://dbpedia.org/resource/Category:1947_births
20th-century American novelists: http://dbpedia.org/resource/Category:20th-centur