In [2]:
import numpy as np
import pandas as pd
import json

In [11]:
# Load the FewRel training and validation splits from disk.
# The encoding is passed explicitly: the loaded entity names include non-ASCII
# characters, and open() otherwise falls back to the platform's locale encoding,
# which is not UTF-8 everywhere.
with open("Datasets/fewrel_train_wiki.json", encoding="utf-8") as file:
    train = json.load(file)

with open("Datasets/fewrel_val_wiki.json", encoding="utf-8") as file:
    val = json.load(file)

In [14]:
# Number of relation types in each split, followed by the training relation ids.
len(train), len(val), train.keys()

(64,
 16,
 dict_keys(['P931', 'P4552', 'P140', 'P1923', 'P150', 'P6', 'P27', 'P449', 'P1435', 'P175', 'P1344', 'P39', 'P527', 'P740', 'P706', 'P84', 'P495', 'P123', 'P57', 'P22', 'P178', 'P241', 'P403', 'P1411', 'P135', 'P991', 'P156', 'P176', 'P31', 'P1877', 'P102', 'P1408', 'P159', 'P3373', 'P1303', 'P17', 'P106', 'P551', 'P937', 'P355', 'P710', 'P137', 'P674', 'P466', 'P136', 'P306', 'P127', 'P400', 'P974', 'P1346', 'P460', 'P86', 'P118', 'P264', 'P750', 'P58', 'P3450', 'P105', 'P276', 'P101', 'P407', 'P1001', 'P800', 'P131']))

In [16]:
# Peek at the first two samples of one relation to see the record layout.
train['P931'][0:2]

[{'tokens': ['Merpati',
   'flight',
   '106',
   'departed',
   'Jakarta',
   '(',
   'CGK',
   ')',
   'on',
   'a',
   'domestic',
   'flight',
   'to',
   'Tanjung',
   'Pandan',
   '(',
   'TJQ',
   ')',
   '.'],
  'h': ['tjq', 'Q1331049', [[16]]],
  't': ['tanjung pandan', 'Q3056359', [[13, 14]]]},
 {'tokens': ['The',
   'name',
   'was',
   'at',
   'one',
   'point',
   'changed',
   'to',
   'Nottingham',
   'East',
   'Midlands',
   'Airport',
   'so',
   'as',
   'to',
   'include',
   'the',
   'name',
   'of',
   'the',
   'city',
   'that',
   'is',
   'supposedly',
   'most',
   'internationally',
   'recognisable',
   ',',
   'mainly',
   'due',
   'to',
   'the',
   'Robin',
   'Hood',
   'legend',
   '.'],
  'h': ['east midlands airport', 'Q8977', [[9, 10, 11]]],
  't': ['nottingham', 'Q41262', [[8]]]}]

We can see the general structure: each top-level key of the JSON is a relation type, and its value is a list of all the samples of that relation.

Each relation sample contains the tokenised sentence plus a head (`h`) and a tail (`t`) entity, each of which also carries some numbers whose meaning is not yet clear.

In [19]:
# Token count of every training sentence, grouped by relation id.
token_counts = {
    relation: [len(sample['tokens']) for sample in samples]
    for relation, samples in train.items()
}

# Mean sentence length per relation (this is the mapping displayed below).
sentence_lengths = {
    relation: np.mean(counts) for relation, counts in token_counts.items()
}

# Pool every sentence for the overall figure. Averaging the per-class means
# instead would only give the true overall mean when all classes contain the
# same number of samples.
all_counts = [count for counts in token_counts.values() for count in counts]

print("Overall mean sentence length:", np.mean(all_counts))
print("Mean sentence length per class:")
sentence_lengths

Overall mean sentence length: 24.98064732142857
Mean sentence length per class:


{'P931': 23.091428571428573,
 'P4552': 25.942857142857143,
 'P140': 25.415714285714287,
 'P1923': 25.077142857142857,
 'P150': 22.97,
 'P6': 25.041428571428572,
 'P27': 25.04,
 'P449': 25.041428571428572,
 'P1435': 22.568571428571428,
 'P175': 25.945714285714285,
 'P1344': 26.275714285714287,
 'P39': 25.315714285714286,
 'P527': 25.798571428571428,
 'P740': 23.074285714285715,
 'P706': 24.07857142857143,
 'P84': 25.964285714285715,
 'P495': 24.662857142857142,
 'P123': 25.6,
 'P57': 25.018571428571427,
 'P22': 25.745714285714286,
 'P178': 25.055714285714284,
 'P241': 25.382857142857144,
 'P403': 23.23857142857143,
 'P1411': 27.17142857142857,
 'P135': 26.195714285714285,
 'P991': 25.277142857142856,
 'P156': 26.924285714285713,
 'P176': 25.262857142857143,
 'P31': 24.93285714285714,
 'P1877': 25.982857142857142,
 'P102': 25.035714285714285,
 'P1408': 24.717142857142857,
 'P159': 24.478571428571428,
 'P3373': 25.635714285714286,
 'P1303': 24.19857142857143,
 'P17': 25.595714285714287,
 

In [21]:
# Gather the head and tail entity entries of every training sample into one
# flat list (head first, then tail) to inspect what their numbers mean.
heads = [
    entity
    for relation_samples in train.values()
    for sample in relation_samples
    for entity in (sample['h'], sample['t'])
]

In [22]:
# Show only a small preview: `heads` holds two entries per training sample,
# so displaying the whole list floods the notebook output.
heads[:20]

[['tjq', 'Q1331049', [[16]]],
 ['tanjung pandan', 'Q3056359', [[13, 14]]],
 ['east midlands airport', 'Q8977', [[9, 10, 11]]],
 ['nottingham', 'Q41262', [[8]]],
 ['fort lauderdale-hollywood international airport',
  'Q635361',
  [[9, 10, 11, 12, 13, 14]]],
 ['fort lauderdale, florida', 'Q165972', [[16, 17, 18, 19]]],
 ['jinnah international airport', 'Q61052', [[6, 7, 8]]],
 ['karachi', 'Q8660', [[10]]],
 ['margaret ekpo international airport', 'Q1030578', [[9, 10, 11, 12]]],
 ['calabar', 'Q844091', [[14]]],
 ['pakyong airport', 'Q7126092', [[0, 1]]],
 ['gangtok', 'Q186141', [[12]]],
 ['tinson pen aerodrome', 'Q2320308', [[3, 4, 5]]],
 ['kingston', 'Q34692', [[7]]],
 ['noumérat – moufdi zakaria airport', 'Q1432199', [[9, 10, 11, 12, 13]]],
 ['ghardaïa', 'Q622841', [[15]]],
 ['kazi nazrul islam airport', 'Q14941562', [[7, 8, 9, 10]]],
 ['durgapur', 'Q5088', [[12]]],
 ['robin hood airport', 'Q8996', [[4, 5, 6]]],
 ['doncaster', 'Q58900', [[8]]],
 ['large regional airport', 'Q180129', [[3

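There appears to be one number for each word in the entity. Comparing with the first sample above, the numbers look like positions in the `tokens` list (e.g. token 16 is 'TJQ' and tokens 13-14 are 'Tanjung Pandan'), i.e. they seem to mark where the entity is mentioned in the sentence, though this is not yet confirmed for the whole dataset.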