# Clean Dataset - Remove Unseen Entities/Relations in Target Triplets

Remove any triplets in `valid.txt` whose entities and/or relations are not seen in `train.txt`, and likewise remove any triplets in `test.txt` whose entities and/or relations are not seen in `msg.txt`.

In [1]:
import os
import pandas as pd

In [2]:
# Input folder with the raw splits and output folder for the cleaned ones
raw_path, processed_path = "raw", "processed"

# Sub-folder (dataset name) looked up under both folders above
data_name = "FBNELL_v1"

## Process `train.txt` and `valid.txt`

In [4]:
# Read train.txt
#
# Each non-blank line is split on whitespace; field 0 and field 2 are
# collected as entities, field 1 as the relation. The entity/relation lists
# keep one entry per occurrence (duplicates included); train_trips keeps the
# full token list of every line.

train_ents, train_rels, train_trips = [], [], []
with open(os.path.join(raw_path, data_name, "train.txt"), "r") as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Blank / whitespace-only line: split() returns [], so indexing
            # below would raise IndexError. It carries no triple, so skip it.
            continue
        train_ents.append(line[0])
        train_ents.append(line[2])
        train_rels.append(line[1])
        train_trips.append(line)

In [6]:
# Read valid.txt
#
# Each non-blank line is split on whitespace; field 0 and field 2 are
# collected as entities, field 1 as the relation. The entity/relation lists
# keep one entry per occurrence (duplicates included); valid_trips keeps the
# full token list of every line.

valid_ents, valid_rels, valid_trips = [], [], []
with open(os.path.join(raw_path, data_name, "valid.txt"), "r") as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Blank / whitespace-only line: split() returns [], so indexing
            # below would raise IndexError. It carries no triple, so skip it.
            continue
        valid_ents.append(line[0])
        valid_ents.append(line[2])
        valid_rels.append(line[1])
        valid_trips.append(line)

In [7]:
# Collect the entities and relations that occur in valid but never in train,
# then report how many of each there are

diff_ents = set(valid_ents).difference(train_ents)
diff_rels = set(valid_rels).difference(train_rels)

print("Number of valid entities not in train: {}".format(len(diff_ents)))
print("Number of valid relations not in train: {}".format(len(diff_rels)))

Number of valid entities not in train: 158
Number of valid relations not in train: 0


In [10]:
# Keep only the valid triples in which neither entity (fields 0 and 2) nor the
# relation (field 1) is missing from train

valid_trips_processed = [
    triple for triple in valid_trips
    if not (triple[0] in diff_ents
            or triple[2] in diff_ents
            or triple[1] in diff_rels)]

print("Removed {} triplets from raw valid".format(len(valid_trips) - len(valid_trips_processed)))

Removed 164 triplets from raw valid


In [12]:
# Save original train.txt and processed valid.txt to processed folder
#
# train.txt is written back unfiltered; valid.txt contains only the triples
# kept in valid_trips_processed. Both are written tab-separated, one triple
# per line, using the first three fields of each triple.

# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(processed_path, data_name), exist_ok=True)

with open(os.path.join(processed_path, data_name, "train.txt"), "w") as f:
    for trip in train_trips:
        f.write("{}\t{}\t{}\n".format(trip[0], trip[1], trip[2]))

with open(os.path.join(processed_path, data_name, "valid.txt"), "w") as f:
    for trip in valid_trips_processed:
        f.write("{}\t{}\t{}\n".format(trip[0], trip[1], trip[2]))

## Process `msg.txt` and `test.txt`

In [13]:
# Read msg.txt
#
# Each non-blank line is split on whitespace; field 0 and field 2 are
# collected as entities, field 1 as the relation. The entity/relation lists
# keep one entry per occurrence (duplicates included); msg_trips keeps the
# full token list of every line.

msg_ents, msg_rels, msg_trips = [], [], []
with open(os.path.join(raw_path, data_name, "msg.txt"), "r") as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Blank / whitespace-only line: split() returns [], so indexing
            # below would raise IndexError. It carries no triple, so skip it.
            continue
        msg_ents.append(line[0])
        msg_ents.append(line[2])
        msg_rels.append(line[1])
        msg_trips.append(line)

In [14]:
# Read test.txt
#
# Each non-blank line is split on whitespace; field 0 and field 2 are
# collected as entities, field 1 as the relation. The entity/relation lists
# keep one entry per occurrence (duplicates included); test_trips keeps the
# full token list of every line.

test_ents, test_rels, test_trips = [], [], []
with open(os.path.join(raw_path, data_name, "test.txt"), "r") as f:
    for line in f:
        line = line.strip().split()
        if not line:
            # Blank / whitespace-only line: split() returns [], so indexing
            # below would raise IndexError. It carries no triple, so skip it.
            continue
        test_ents.append(line[0])
        test_ents.append(line[2])
        test_rels.append(line[1])
        test_trips.append(line)

In [15]:
# Collect the entities and relations that occur in test but never in msg,
# then report how many of each there are.
# Note: this rebinds diff_ents / diff_rels from the train/valid section.

diff_ents = set(test_ents).difference(msg_ents)
diff_rels = set(test_rels).difference(msg_rels)

print("Number of test entities not in msg: {}".format(len(diff_ents)))
print("Number of test relations not in msg: {}".format(len(diff_rels)))

Number of test entities not in msg: 5
Number of test relations not in msg: 0


In [16]:
# Keep only the test triples in which neither entity (fields 0 and 2) nor the
# relation (field 1) is missing from msg

test_trips_processed = [
    triple for triple in test_trips
    if not (triple[0] in diff_ents
            or triple[2] in diff_ents
            or triple[1] in diff_rels)]

print("Removed {} triplets from raw test".format(len(test_trips) - len(test_trips_processed)))

Removed 5 triplets from raw test


In [17]:
# Save original msg.txt and processed test.txt to processed folder
#
# msg.txt is written back unfiltered; test.txt contains only the triples
# kept in test_trips_processed. Both are written tab-separated, one triple
# per line, using the first three fields of each triple.

# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(processed_path, data_name), exist_ok=True)

with open(os.path.join(processed_path, data_name, "msg.txt"), "w") as f:
    for trip in msg_trips:
        f.write("{}\t{}\t{}\n".format(trip[0], trip[1], trip[2]))

with open(os.path.join(processed_path, data_name, "test.txt"), "w") as f:
    for trip in test_trips_processed:
        f.write("{}\t{}\t{}\n".format(trip[0], trip[1], trip[2]))