In [6]:
import copy
from random import randrange
import datasets

In [24]:
# Predicates treated as symmetrical: invert_one_triple never selects a triple
# carrying one of these predicates as the triple to invert.
SYMMETRICAL_RELATIONSHIPS = [
    "taxon synonym",
    "partner in business or sport",
    "opposite of",
    "partially coincident with",
    "physically interacts with",
    "partner",
    "relative",
    "related category",
    "connects with",
    "twinned administrative body",
    "different from",
    "said to be the same as",
    "sibling",
    "adjacent station",
    "shares border with",
]
# Special marker tokens. linearize_rdf writes S_TOKEN / P_TOKEN / O_TOKEN in front of
# the subject, predicate and object of each triple; Q_TOKEN and N_TOKEN are not used
# by the code visible in this notebook.
Q_TOKEN = "[Q]"
S_TOKEN = "[S]"
P_TOKEN = "[P]"
O_TOKEN = "[O]"
# NOTE(review): N_TOKEN is an empty string, unlike the other bracketed markers, and it
# ends up in SPECIAL_TOKENS as "" -- confirm this is intended and not a value lost
# during export.
N_TOKEN = ""
SPECIAL_TOKENS = [Q_TOKEN, S_TOKEN, P_TOKEN, O_TOKEN, N_TOKEN]

In [20]:
def linearize_rdf(triples):
    """Serialize a sequence of RDF triples into one marker-delimited string.

    Each item must have length 3 (subject, predicate, object) or length 4, in
    which case the extra element at index 2 is written right after the
    predicate and before the object marker. Every serialized item ends with a
    single space, so a non-empty result carries a trailing space; an empty
    input yields an empty string.

    Raises:
        ValueError: if an item has a length other than 3 or 4.
    """
    segments = []
    for triple in triples:
        width = len(triple)
        # Reject malformed items up front so the branches below only deal with valid shapes.
        if width != 3 and width != 4:
            raise ValueError(f"Triple length was {len(triple)} instead of the expected 3 or 4")
        if width == 3:
            segments.append(f"{S_TOKEN} {triple[0]} {P_TOKEN} {triple[1]} {O_TOKEN} {triple[2]} ")
        else:
            segments.append(f"{S_TOKEN} {triple[0]} {P_TOKEN} {triple[1]} {triple[2]} {O_TOKEN} {triple[3]} ")
    return "".join(segments)

In [32]:
def _invert_triple(triple):
    """Return a deep copy of ``triple`` with its first and last elements exchanged.

    Elements between the two ends keep their positions, and the input itself
    is not modified.
    """
    flipped = copy.deepcopy(triple)
    # Both values are read from the original, so the swap cannot overwrite itself.
    flipped[0], flipped[-1] = triple[-1], triple[0]
    return flipped

def invert_one_triple(example, rdf_key="triples"):
    """Add an ``"rdf_inverted"`` entry to ``example`` and return ``example``.

    The triples are read from ``example[rdf_key]``. If none of them has a
    predicate (element 1) outside SYMMETRICAL_RELATIONSHIPS, which includes the
    case of an empty list, ``"rdf_inverted"`` is set to None. Otherwise one
    eligible triple is picked at random, inverted with _invert_triple, and the
    resulting list is serialized with linearize_rdf.

    ``example`` is modified in place; the triples stored under ``rdf_key`` are
    left as they were.
    """
    triples = example[rdf_key]
    # Positions of the triples whose predicate is not a symmetrical relationship.
    eligible = [pos for pos, triple in enumerate(triples) if triple[1] not in SYMMETRICAL_RELATIONSHIPS]
    if not eligible:
        example["rdf_inverted"] = None
        return example

    # Draw one eligible position at random and invert only that triple.
    target = eligible[randrange(len(eligible))]
    with_inversion = [
        _invert_triple(triple) if pos == target else triple
        for pos, triple in enumerate(triples)
    ]
    example["rdf_inverted"] = linearize_rdf(with_inversion)
    return example

In [33]:
# Read the local JSON-lines file clean_kelm.jsonl through the generic "json" loader and
# keep its "train" split. The path is relative to the notebook's working directory.
kelm = datasets.load_dataset("json", data_files="../datasets/KELM/clean_kelm.jsonl", split="train")

Using custom data configuration default-4a0c9cbc9b685d80
Reusing dataset json (/home/teven/.cache/huggingface/datasets/json/default-4a0c9cbc9b685d80/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


In [37]:
# Spot-check on a single example (index 3). The triple to invert is chosen with
# randrange and no seed is set in the visible cells, so re-running this cell may
# show a different triple inverted.
invert_one_triple(kelm[3])

{'triples': [['Fazl Rural District ( Hamadan Province )',
   'located in the administrative territorial entity',
   'Zarrin Dasht District'],
  ['Fazl Rural District ( Hamadan Province )', 'country', 'Iran'],
  ['Fazl Rural District ( Hamadan Province )',
   'instance of',
   'Dehestan ( administrative division )']],
 'serialized_triples': 'Fazl Rural District ( Hamadan Province ) country Iran, located in the administrative territorial entity Zarrin Dasht District, instance of Dehestan ( administrative division ).',
 'gen_sentence': 'Fazl Rural District (Hamadan Province) is located in the Zarrin Dasht District, Iran.',
 'rdf_inverted': '[S] Fazl Rural District ( Hamadan Province ) [P] located in the administrative territorial entity [O] Zarrin Dasht District [S] Fazl Rural District ( Hamadan Province ) [P] country [O] Iran [S] Dehestan ( administrative division ) [P] instance of [O] Fazl Rural District ( Hamadan Province ) '}