# RAG and GPT-4o double checking for identifying properties

This notebook identifies the top-matching properties from a JSON file using RAG (embedding-based retrieval). Once the candidates have been identified, GPT-4o is asked again to do a final check between the context of the prompt and the top-matched properties in order to produce a final answer.

In [None]:
# Install necessary libraries
!pip install openai transformers torch scikit-learn faiss-cpu

In [None]:
# Import required libraries
import json
import openai
import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
from google.colab import userdata
from torch.utils.data import DataLoader, TensorDataset
from openai import OpenAI

# Fetch the OpenAI API key stored under the name 'OPENAI_API_KEY' through
# google.colab's userdata helper (the key is looked up, not typed in here).
openai_api_key = userdata.get('OPENAI_API_KEY')

# Shared OpenAI client; the GPT-4o helper functions in this cell call it.
client = OpenAI(api_key=openai_api_key)

# Function to use GPT-4o for processing the query
def use_gpt_4o(query, gpt_model="gpt-4o", max_tokens=150):
    """Ask a chat model to break a natural-language question into its parts.

    Args:
        query: The natural-language question to analyse.
        gpt_model: Name of the chat model to call. Defaults to "gpt-4o".
        max_tokens: Upper bound on the length of the reply. Defaults to 150.
            Replies that hit this limit come back truncated
            (finish_reason='length'); pass a larger value to avoid losing
            the part of the answer that names the property.

    Returns:
        The assistant's reply with surrounding whitespace removed, or an
        empty string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role": "system", "content": "You are an expert in generating SPARQL queries based on natural language queries."},
            {"role": "user", "content": f"Parse the following query and provide detailed properties: {query}"}
        ],
        max_tokens=max_tokens
    )
    # Print the entire response for inspection
    print(response)
    # content can be None; fall back to "" instead of failing on .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Load the text embedding model and its matching tokenizer by model id.
model_name = "TaylorAI/bge-micro-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to compute embeddings in batches
def compute_embeddings(texts, batch_size=32, max_length=128):
    """Embed texts with the module-level tokenizer and model.

    Each text is represented by the mean of its token vectors from the
    model's last hidden state. Padding positions are excluded from the mean
    so that a text gets the same embedding whether it is encoded alone or
    alongside longer texts.

    Args:
        texts: The texts to embed (a list of strings).
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Token limit per text; longer texts are truncated.

    Returns:
        A 2-D numpy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is an empty list or tuple.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        raise ValueError("compute_embeddings() requires at least one text")

    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_embeddings = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden = outputs.last_hidden_state
            # Weight every token by its attention-mask value (1 = real token,
            # 0 = padding) so padding does not dilute the average.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp avoids a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1e-9)
            embeddings = (token_sum / token_count).cpu().numpy()
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

# Load the Wikidata properties JSON file.
# The encoding is stated explicitly so non-ASCII labels and aliases decode the
# same way regardless of the platform's default encoding.
with open('/content/sample_data/props.json', 'r', encoding='utf-8') as f:
    wikidata_properties = json.load(f)

# Extract property labels and aliases and compute their embeddings.
# property_labels_and_aliases[i] is the text behind row i of the Faiss index.
property_labels_and_aliases = []
# Maps every searchable string (label or alias) to its property's label.
# All labels are registered up front so that an alias of one property that
# happens to equal another property's label cannot redirect that label.
property_labels_map = {prop['label']: prop['label'] for prop in wikidata_properties}
canonical_labels = set(property_labels_map)
for prop in wikidata_properties:
    property_labels_and_aliases.append(prop['label'])
    # Tolerate a missing or null 'aliases' entry.
    for alias in prop.get('aliases') or []:
        property_labels_and_aliases.append(alias)
        if alias not in canonical_labels:
            property_labels_map[alias] = prop['label']

property_embeddings = compute_embeddings(property_labels_and_aliases)

# Build Faiss index
d = property_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)  # L2 distance
index.add(property_embeddings)  # Add embeddings to the index

# Function to extract property from GPT-4o response
def extract_property_from_response(response):
    """Pull the property name out of a GPT-4o analysis of a question.

    Scans the response line by line for the first line that mentions
    'Predicate' or 'Property' and returns the value written after the colon
    that follows that keyword.

    Args:
        response: The text returned by GPT-4o (may be empty or None).

    Returns:
        The extracted property text, or None when no line yields a
        non-empty value.
    """
    if not response:
        return None

    for line in response.split('\n'):
        keyword_positions = [
            pos
            for pos in (line.find('Predicate'), line.find('Property'))
            if pos != -1
        ]
        if not keyword_positions:
            continue

        # Split on the FIRST colon after the keyword. Splitting on the last
        # colon breaks on values that contain one themselves, e.g.
        # "**Predicate (Property):** `birthDate` (... `dbo:birthDate`)".
        _, colon, value = line[min(keyword_positions):].partition(':')
        if not colon:
            # No colon after the keyword: keep the previous fallback of
            # using whatever follows the last colon (or the whole line).
            value = line.split(':')[-1]

        # Drop markdown bold markers left over from "**Predicate:** value".
        value = value.strip().strip('*').strip()
        if value.startswith('`'):
            # The value is given as inline code; keep only the quoted name.
            value = value[1:].partition('`')[0].strip()

        if value:
            return value
    return None

# Function to map a natural language question to Wikidata properties
def map_question_to_properties(question, top_n=10):
    """Retrieve the property labels that best match a question.

    GPT-4o is first asked to analyse the question; the property it names is
    embedded and used to search the Faiss index of labels and aliases.
    Matches are mapped back to their property label and de-duplicated.

    Args:
        question: The natural-language question.
        top_n: Number of nearest index entries to retrieve. Because several
            entries (a label and its aliases) can resolve to the same
            property, fewer than ``top_n`` properties may be returned.

    Returns:
        A list of ``(label, distance)`` tuples ordered by increasing
        distance; empty when ``top_n`` is not positive or no property could
        be extracted from the GPT-4o response.
    """
    if top_n <= 0:
        return []

    gpt_response = use_gpt_4o(question)
    print(f"GPT-4o response: {gpt_response}")

    # Extract the relevant property from the GPT-4o response
    relevant_property = extract_property_from_response(gpt_response)
    if not relevant_property:
        print("No relevant property found in the GPT-4o response.")
        return []

    print(f"Relevant property identified: {relevant_property}")

    # Compute the embedding for the relevant property
    relevant_property_embedding = compute_embeddings([relevant_property])[0].reshape(1, -1)
    distances, indices = index.search(relevant_property_embedding, top_n)
    print(f"indices: {indices}, distances: {distances}")  # Debugging line

    valid_top_properties = []
    seen_labels = set()  # To avoid duplicates
    for idx, dist in zip(indices[0], distances[0]):
        # Faiss fills missing results with index -1 when the index holds
        # fewer than top_n vectors. A bare "idx < len(...)" check lets -1
        # through and silently picks the LAST list entry, so the lower
        # bound is checked as well.
        if not 0 <= idx < len(property_labels_and_aliases):
            continue
        matched_label_or_alias = property_labels_and_aliases[idx]
        actual_label = property_labels_map[matched_label_or_alias]
        if actual_label in seen_labels:
            continue
        seen_labels.add(actual_label)
        valid_top_properties.append((actual_label, dist))

    return valid_top_properties

# Function to confirm the correct property with GPT-4o
def confirm_property_with_gpt4o(question, top_properties):
    """Ask GPT-4o which candidate property best fits the question.

    Args:
        question: The natural-language question.
        top_properties: Candidate entries whose first item is the property
            label and whose second item is its distance.

    Returns:
        GPT-4o's free-text answer with surrounding whitespace removed.
    """
    # One numbered line per candidate, numbering from 1.
    numbered_candidates = []
    for position, candidate in enumerate(top_properties, start=1):
        numbered_candidates.append(f"{position}. {candidate[0]} (distance: {candidate[1]:.4f})")
    properties_text = "\n".join(numbered_candidates)

    confirm_query = f"""
    Based on the question "{question}", the following properties were identified as potential matches:

    {properties_text}

    Which one of these properties is the most appropriate considering the context of the question?
    """

    system_message = {"role": "system", "content": "You are an expert in identifying the most appropriate property based on the context of a natural language question."}
    user_message = {"role": "user", "content": confirm_query}
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        max_tokens=150
    )
    # Dump the raw response for inspection.
    print(response)

    answer = response.choices[0].message.content
    return answer.strip()

In [None]:
# Prompt
question = "Where was Barack Obama born?"
top_properties = map_question_to_properties(question, top_n=10)
print("Top matching Wikidata properties for the question:")
for label, distance in top_properties:
    print(f"{label}: {distance:.4f}")

# Only ask GPT-4o to choose when retrieval produced candidates; with an empty
# list the confirmation prompt would contain nothing to choose from.
if top_properties:
    confirmed_property = confirm_property_with_gpt4o(question, top_properties)
else:
    confirmed_property = None
print(f"Confirmed property: {confirmed_property}")

ChatCompletion(id='chatcmpl-9WBv2B9J4Xn043YwowKaN8dUN18Da', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='To generate a SPARQL query for the natural language query "Where was Barack Obama born?", we need to identify the key components in the question. Here are the detailed properties:\n\n1. **Subject**: Barack Obama (the entity in question)\n2. **Predicate**: Place of birth (the property we are seeking about the entity)\n3. **Object**: The birth place (the result we are trying to retrieve)\n\nWe\'ll assume we\'re using a standard knowledge base like DBpedia, where entities and properties are well-defined and standardized.\n\n**Steps to Convert to SPARQL:**\n\n1. **Identify the URI for the Subject**:\n   - In DBpedia, the URI for Barack Obama is usually `dbr:Barack_', role='assistant', function_call=None, tool_calls=None))], created=1717459340, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_319be4768e'

# Adding the ID of the property

In [None]:
# Import required libraries
import json
import openai
import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
from google.colab import userdata
from torch.utils.data import DataLoader, TensorDataset
from openai import OpenAI

# Fetch the OpenAI API key stored under the name 'OPENAI_API_KEY' through
# google.colab's userdata helper (the key is looked up, not typed in here).
openai_api_key = userdata.get('OPENAI_API_KEY')

# Shared OpenAI client; the GPT-4o helper functions in this cell call it.
client = OpenAI(api_key=openai_api_key)

# Function to use GPT-4o for processing the query
def use_gpt_4o(query, gpt_model="gpt-4o", max_tokens=150):
    """Ask a chat model to break a natural-language question into its parts.

    Args:
        query: The natural-language question to analyse.
        gpt_model: Name of the chat model to call. Defaults to "gpt-4o".
        max_tokens: Upper bound on the length of the reply. Defaults to 150.
            Replies that hit this limit come back truncated
            (finish_reason='length'); pass a larger value to avoid losing
            the part of the answer that names the property.

    Returns:
        The assistant's reply with surrounding whitespace removed, or an
        empty string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role": "system", "content": "You are an expert in generating SPARQL queries based on natural language queries."},
            {"role": "user", "content": f"Parse the following query and provide detailed properties: {query}"}
        ],
        max_tokens=max_tokens
    )
    # Print the entire response for inspection
    print(response)
    # content can be None; fall back to "" instead of failing on .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Load the text embedding model and its matching tokenizer by model id.
model_name = "TaylorAI/bge-micro-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to compute embeddings in batches
def compute_embeddings(texts, batch_size=32, max_length=128):
    """Embed texts with the module-level tokenizer and model.

    Each text is represented by the mean of its token vectors from the
    model's last hidden state. Padding positions are excluded from the mean
    so that a text gets the same embedding whether it is encoded alone or
    alongside longer texts.

    Args:
        texts: The texts to embed (a list of strings).
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Token limit per text; longer texts are truncated.

    Returns:
        A 2-D numpy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is an empty list or tuple.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        raise ValueError("compute_embeddings() requires at least one text")

    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_embeddings = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden = outputs.last_hidden_state
            # Weight every token by its attention-mask value (1 = real token,
            # 0 = padding) so padding does not dilute the average.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp avoids a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1e-9)
            embeddings = (token_sum / token_count).cpu().numpy()
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

# Load the Wikidata properties JSON file.
# The encoding is stated explicitly so non-ASCII labels and aliases decode the
# same way regardless of the platform's default encoding.
with open('/content/sample_data/props.json', 'r', encoding='utf-8') as f:
    wikidata_properties = json.load(f)

# Extract property labels and aliases and compute their embeddings.
# property_labels_and_aliases[i] is the text behind row i of the Faiss index.
property_labels_and_aliases = []
# Maps every searchable string (label or alias) to its property's label.
# All labels are registered up front so that an alias of one property that
# happens to equal another property's label cannot redirect that label.
property_labels_map = {prop['label']: prop['label'] for prop in wikidata_properties}
canonical_labels = set(property_labels_map)
property_id_map = {}  # Maps labels to their respective IDs
for prop in wikidata_properties:
    property_labels_and_aliases.append(prop['label'])
    property_id_map[prop['label']] = prop['id']
    # Tolerate a missing or null 'aliases' entry.
    for alias in prop.get('aliases') or []:
        property_labels_and_aliases.append(alias)
        if alias not in canonical_labels:
            property_labels_map[alias] = prop['label']

property_embeddings = compute_embeddings(property_labels_and_aliases)

# Build Faiss index
d = property_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)  # L2 distance
index.add(property_embeddings)  # Add embeddings to the index

# Function to extract property from GPT-4o response
def extract_property_from_response(response):
    """Pull the property name out of a GPT-4o analysis of a question.

    Scans the response line by line for the first line that mentions
    'Predicate' or 'Property' and returns the value written after the colon
    that follows that keyword.

    Args:
        response: The text returned by GPT-4o (may be empty or None).

    Returns:
        The extracted property text, or None when no line yields a
        non-empty value.
    """
    if not response:
        return None

    for line in response.split('\n'):
        keyword_positions = [
            pos
            for pos in (line.find('Predicate'), line.find('Property'))
            if pos != -1
        ]
        if not keyword_positions:
            continue

        # Split on the FIRST colon after the keyword. Splitting on the last
        # colon breaks on values that contain one themselves, e.g.
        # "**Predicate (Property):** `birthDate` (... `dbo:birthDate`)".
        _, colon, value = line[min(keyword_positions):].partition(':')
        if not colon:
            # No colon after the keyword: keep the previous fallback of
            # using whatever follows the last colon (or the whole line).
            value = line.split(':')[-1]

        # Drop markdown bold markers left over from "**Predicate:** value".
        value = value.strip().strip('*').strip()
        if value.startswith('`'):
            # The value is given as inline code; keep only the quoted name.
            value = value[1:].partition('`')[0].strip()

        if value:
            return value
    return None

# Function to map a natural language question to Wikidata properties
def map_question_to_properties(question, top_n=10):
    """Retrieve the property labels that best match a question.

    GPT-4o is first asked to analyse the question; the property it names is
    embedded and used to search the Faiss index of labels and aliases.
    Matches are mapped back to their property label and de-duplicated.

    Args:
        question: The natural-language question.
        top_n: Number of nearest index entries to retrieve. Because several
            entries (a label and its aliases) can resolve to the same
            property, fewer than ``top_n`` properties may be returned.

    Returns:
        A list of ``(label, distance)`` tuples ordered by increasing
        distance; empty when ``top_n`` is not positive or no property could
        be extracted from the GPT-4o response.
    """
    if top_n <= 0:
        return []

    gpt_response = use_gpt_4o(question)
    print(f"GPT-4o response: {gpt_response}")

    # Extract the relevant property from the GPT-4o response
    relevant_property = extract_property_from_response(gpt_response)
    if not relevant_property:
        print("No relevant property found in the GPT-4o response.")
        return []

    print(f"Relevant property identified: {relevant_property}")

    # Compute the embedding for the relevant property
    relevant_property_embedding = compute_embeddings([relevant_property])[0].reshape(1, -1)
    distances, indices = index.search(relevant_property_embedding, top_n)
    print(f"indices: {indices}, distances: {distances}")  # Debugging line

    valid_top_properties = []
    seen_labels = set()  # To avoid duplicates
    for idx, dist in zip(indices[0], distances[0]):
        # Faiss fills missing results with index -1 when the index holds
        # fewer than top_n vectors. A bare "idx < len(...)" check lets -1
        # through and silently picks the LAST list entry, so the lower
        # bound is checked as well.
        if not 0 <= idx < len(property_labels_and_aliases):
            continue
        matched_label_or_alias = property_labels_and_aliases[idx]
        actual_label = property_labels_map[matched_label_or_alias]
        if actual_label in seen_labels:
            continue
        seen_labels.add(actual_label)
        valid_top_properties.append((actual_label, dist))

    return valid_top_properties

# Function to confirm the correct property with GPT-4o
def confirm_property_with_gpt4o(question, top_properties):
    """Ask GPT-4o to pick the best candidate and return its label.

    The candidates are listed in a prompt; the free-text answer is then
    searched for the candidate labels to find the one GPT-4o chose.

    Args:
        question: The natural-language question.
        top_properties: Candidate entries whose first item is the property
            label and whose second item is its distance, best match first.

    Returns:
        The label of the first candidate (in the given order) that is
        mentioned in GPT-4o's answer, or None when there are no candidates
        or none is mentioned.
    """
    if not top_properties:
        # Nothing to choose from; skip the API call.
        return None

    properties_text = "\n".join([f"{i+1}. {prop[0]} (distance: {prop[1]:.4f})" for i, prop in enumerate(top_properties)])
    confirm_query = f"""
    Based on the question "{question}", the following properties were identified as potential matches:

    {properties_text}

    Which one of these properties is the most appropriate considering the context of the question?
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in identifying the most appropriate property based on the context of a natural language question."},
            {"role": "user", "content": confirm_query}
        ],
        max_tokens=150
    )
    print(response)
    # content can be None; treat that as an empty answer.
    confirmed_property_text = (response.choices[0].message.content or "").strip()

    # Match case-insensitively: the answer may capitalise a label, e.g. at
    # the start of a sentence.
    answer_folded = confirmed_property_text.casefold()
    mentioned = [
        prop[0]
        for prop in top_properties
        if prop[0] and prop[0].casefold() in answer_folded
    ]

    for label in mentioned:
        label_folded = label.casefold()
        # A label contained in a longer mentioned label ("birth" inside
        # "place of birth") matches automatically whenever the longer one
        # does, so it carries no information; prefer the longer one.
        shadowed = any(
            label_folded != other.casefold() and label_folded in other.casefold()
            for other in mentioned
        )
        if not shadowed:
            return label

    return None

In [None]:
# Prompt
question = "What is the birth date of Albert Einstein?"
top_properties = map_question_to_properties(question, top_n=10)
print("Top matching Wikidata properties for the question:")
for label, distance in top_properties:
    print(f"{label}: {distance:.4f}")

# Only ask GPT-4o to choose when retrieval produced candidates; with an empty
# list there is nothing to confirm.
if top_properties:
    confirmed_property_label = confirm_property_with_gpt4o(question, top_properties)
else:
    confirmed_property_label = None

# Lookup the property ID from the JSON file
confirmed_property_id = property_id_map.get(confirmed_property_label, "ID not found")
print(f"Confirmed property: {confirmed_property_label}, ID: {confirmed_property_id}")

ChatCompletion(id='chatcmpl-9WBdkhKTkiD002IUgFIJiblGZOmVs', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='To generate a SPARQL query from a natural language question like "What is the birth date of Albert Einstein?", it\'s important to:\n\n1. **Identify the subject:** Albert Einstein\n2. **Identify the property being queried:** birth date\n\nGiven these components, we need the appropriate predicates and classes from the ontology to construct the SPARQL query. An example ontology that we could use is DBpedia. Below, I will break down the components and then generate the query.\n\n### Ontology Components\n- **Subject (Entity):** `Albert Einstein`\n- **Predicate (Property):** `birthDate` (In DBpedia, the corresponding property is `dbo:birthDate`)\n\n### SPARQL Query Components\n1. **Selecting', role='assistant', function_call=None, tool_calls=None))], created=1717458268, model='gpt-4o-2024-05-13', object='chat.completion', system_fi