In [1]:
import pandas as pd
from collections import Counter
from utils.openai_helpers import query_openai_model
from utils.wiki_helpers import get_label_for_qid, get_triplet_labels

In [2]:
# Read the Wikidata5M transductive validation split as a list of raw lines.
# The encoding is pinned so the read does not depend on the platform default.
with open('inputs/wikidata5m/wikidata5m_transductive_valid.txt', 'r', encoding='utf-8') as file:
    wiki_valid = file.readlines()

In [3]:
def convert_to_triplets(input_list):
    r"""
    Parse tab-separated lines into tuples of their fields.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of a file) are skipped instead of being turned into a one-element
    ``('',)`` tuple, which would break code that indexes position 2.

    :param input_list: List of strings formatted as 'head\trelation\ttail\n'
    :return: List of triplets (head, relation, tail), one per non-blank line
    """
    triplets = []
    for item in input_list:
        # Strip surrounding whitespace first; this also removes the newline.
        stripped = item.strip()
        if not stripped:
            # Nothing to parse on this line.
            continue
        triplets.append(tuple(stripped.split('\t')))
    return triplets

def top_n_entities(triplets, n=10):
    """
    Rank entities by how often they appear as head or tail of a triplet.

    :param triplets: List of triplets (head, relation, tail)
    :param n: Number of top entities to return
    :return: List of tuples (entity, count) for the top n occurring entities
    """
    heads = [triplet[0] for triplet in triplets]
    tails = [triplet[2] for triplet in triplets]

    # Count all heads before any tails: Counter remembers first-seen order and
    # most_common() breaks ties by it, so this ordering decides equal counts.
    entity_counts = Counter(heads)
    entity_counts.update(tails)

    return entity_counts.most_common(n)

def triplets_containing_entity(triplets, qid):
    """
    Select the triplets in which a given identifier appears.

    :param triplets: List of triplets (entity, relation, entity)
    :param qid: Identifier to look for; a triplet is kept when any of its
        elements equals this value
    :return: New list with the matching triplets, in their original order
    """
    matches = []
    for candidate in triplets:
        # Tuple membership compares whole elements, not substrings.
        if qid in candidate:
            matches.append(candidate)
    return matches

In [4]:
# Parse the raw lines read from the validation file into tuples.
list_of_triplets = convert_to_triplets(wiki_valid)

In [5]:
# Peek at the first five parsed triplets.
list_of_triplets[:5]

[('Q3576734', 'P495', 'Q30'),
 ('Q641724', 'P1412', 'Q1860'),
 ('Q959357', 'P39', 'Q49476'),
 ('Q4263990', 'P105', 'Q7432'),
 ('Q4119101', 'P171', 'Q2906912')]

In [6]:
# Total number of parsed triplets.
len(list_of_triplets)

5163

In [7]:
# Rank entities by how often they occur as head or tail; keep the top 30.
top_entities = top_n_entities(list_of_triplets, 30)

In [8]:
# Show the (entity, count) ranking.
top_entities

[('Q5', 377),
 ('Q30', 202),
 ('Q16521', 84),
 ('Q145', 67),
 ('Q7432', 64),
 ('Q2736', 53),
 ('Q1860', 52),
 ('Q82955', 42),
 ('Q6723', 38),
 ('Q486972', 33),
 ('Q6655', 33),
 ('Q937857', 33),
 ('Q16', 31),
 ('Q183', 30),
 ('Q532', 28),
 ('Q11424', 28),
 ('Q482994', 25),
 ('Q794', 23),
 ('Q142', 23),
 ('Q33999', 22),
 ('Q668', 21),
 ('Q408', 20),
 ('Q571', 20),
 ('Q134556', 19),
 ('Q34740', 15),
 ('Q4830453', 15),
 ('Q38', 14),
 ('Q336286', 13),
 ('Q8502', 13),
 ('Q188', 13)]

In [9]:
# List every triplet in which Q38 appears in any position.
triplets_containing_entity(list_of_triplets, 'Q38')

[('Q3835811', 'P495', 'Q38'),
 ('Q19577450', 'P27', 'Q38'),
 ('Q3770400', 'P27', 'Q38'),
 ('Q1223267', 'P17', 'Q38'),
 ('Q327719', 'P27', 'Q38'),
 ('Q7335559', 'P17', 'Q38'),
 ('Q2314711', 'P17', 'Q38'),
 ('Q955524', 'P27', 'Q38'),
 ('Q3631503', 'P20', 'Q38'),
 ('Q46462', 'P17', 'Q38'),
 ('Q1637348', 'P17', 'Q38'),
 ('Q53195', 'P17', 'Q38'),
 ('Q3138845', 'P495', 'Q38'),
 ('Q1131395', 'P17', 'Q38')]

In [10]:
# generated_text, usage = query_openai_model("What is the capital of France?")

In [11]:
# Resolve one QID via the project helper; the recorded output appears to be a (label, description) pair.
get_label_for_qid('Q3835811')

('Catch as Catch Can', '1967 film by Franco Indovina')

In [12]:
# Resolve all three IDs of one triplet; the recorded output appears to be a tuple of labels followed by a tuple of descriptions.
get_triplet_labels(('Q3835811', 'P495', 'Q38'))

(('Catch as Catch Can', 'country of origin', 'Italy'),
 ('1967 film by Franco Indovina',
  'country of origin of this item (creative work, food, phrase, product, etc.)',
  'country in Southern Europe'))