## Pre-Processing and Visualization

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_examples = []

# Parse the training file into one dict per record. A record is read as four
# consecutive lines (passage, a line whose content is not used, comma-separated
# candidates, true label); the line after them acts as the record separator.
# NOTE(review): decoding is pinned to UTF-8 so parsing does not depend on the
# platform default -- confirm the corpus is UTF-8 encoded.
with open('WikiCREM/WikiCREM_train.txt', encoding='utf-8') as fp:
    i = 0
    example = {}

    for line in fp:
        i += 1

        if i == 1:
            example['tokens'] = line.split()
        elif i == 2:
            # The second line itself is ignored; this step records the index
            # of the (last) '[MASK]' token found in the passage.
            for j, token in enumerate(example['tokens']):
                if token == '[MASK]':
                    example['mask_idx'] = j
        elif i == 3:
            example['candidates'] = line.rstrip().split(',')
        elif i == 4:
            example['true_label'] = line.rstrip()
        else:
            # Separator line: the record is complete, start a fresh one.
            train_examples.append(example)
            example = {}
            i = 0

    # Keep the final record when the file ends right after a label line,
    # i.e. without a trailing separator line.
    if i == 4:
        train_examples.append(example)

In [3]:
test_examples = []

# Parse the dev file into one dict per record. A record is read as four
# consecutive lines (passage, a line whose content is not used, comma-separated
# candidates, true label); the line after them acts as the record separator.
# NOTE(review): decoding is pinned to UTF-8 so parsing does not depend on the
# platform default -- confirm the corpus is UTF-8 encoded.
with open('WikiCREM/WikiCREM_dev.txt', encoding='utf-8') as fp:
    i = 0
    example = {}

    for line in fp:
        i += 1

        if i == 1:
            example['tokens'] = line.split()
        elif i == 2:
            # The second line itself is ignored; this step records the index
            # of the (last) '[MASK]' token found in the passage.
            for j, token in enumerate(example['tokens']):
                if token == '[MASK]':
                    example['mask_idx'] = j
        elif i == 3:
            example['candidates'] = line.rstrip().split(',')
        elif i == 4:
            example['true_label'] = line.rstrip()
        else:
            # Separator line: the record is complete, start a fresh one.
            test_examples.append(example)
            example = {}
            i = 0

    # Keep the final record when the file ends right after a label line,
    # i.e. without a trailing separator line.
    if i == 4:
        test_examples.append(example)

In [4]:
def generate_matching_distance_feature(example):
    """Return the nearest-mention token distance for the true and other candidate.

    For each candidate, the distance is the smallest absolute offset between
    ``example['mask_idx']`` and any passage token equal to one of the
    candidate's whitespace-separated words; it is 9999 when no token matches.

    Returns:
        A ``(p, n)`` tuple: ``p`` is the distance for the candidate equal to
        ``example['true_label']``, ``n`` the distance for the last candidate
        that differs from it. Either stays 0 if no such candidate exists.
    """
    label_distance = 0
    other_distance = 0

    for candidate in example['candidates']:
        words = candidate.split()
        offsets = [
            abs(position - example['mask_idx'])
            for position, token in enumerate(example['tokens'])
            if token in words
        ]
        # 9999 doubles as the "never mentioned" value and as an upper cap.
        nearest = min([9999] + offsets)

        if candidate == example['true_label']:
            label_distance = nearest
        else:
            other_distance = nearest

    return label_distance, other_distance

In [5]:
def generate_non_matching_distance_feature(example):
    """Return the nearest-mention token distance of the *opposite* candidate.

    The per-candidate distance is the smallest absolute offset between
    ``example['mask_idx']`` and any passage token equal to one of the
    candidate's whitespace-separated words (9999 when nothing matches).

    Returns:
        A ``(p, n)`` tuple: ``p`` is the distance for the last candidate that
        differs from ``example['true_label']``, ``n`` the distance for the
        candidate equal to it. Either stays 0 if no such candidate exists.
    """
    other_distance = 0
    label_distance = 0

    for candidate in example['candidates']:
        words = candidate.split()
        nearest = 9999
        for position, token in enumerate(example['tokens']):
            if token not in words:
                continue
            offset = abs(position - example['mask_idx'])
            if offset < nearest:
                nearest = offset

        # Roles are swapped relative to the matching-distance feature.
        if candidate != example['true_label']:
            other_distance = nearest
        else:
            label_distance = nearest

    return other_distance, label_distance

In [6]:
def generate_sentence_distance_feature(example):
    """Count the '.' tokens between the mask and each candidate's nearest mention.

    For each candidate, the nearest mention is the passage token closest to
    ``example['mask_idx']`` that equals one of the candidate's
    whitespace-separated words (the earliest one on ties). The sentence
    distance is the number of ``'.'`` tokens in the half-open token span from
    the smaller of the two indices up to, but excluding, the larger one.

    A candidate with no matching token gets a sentence distance of 0. The
    previous implementation fell through with index -1, which made the span
    wrap around to the last token of the passage and count unrelated periods.

    Returns:
        A ``(p, n)`` tuple: ``p`` is the count for the candidate equal to
        ``example['true_label']``, ``n`` the count for the last candidate
        that differs from it. Either stays 0 if no such candidate exists.
    """
    p = 0
    n = 0

    for candidate in example['candidates']:
        words = candidate.split()
        tokens = example['tokens']
        mask_idx = example['mask_idx']

        matches = [i for i, token in enumerate(tokens) if token in words]

        if matches:
            # min() keeps the first of several equally close mentions.
            nearest = min(matches, key=lambda i: abs(i - mask_idx))
            start, end = sorted((nearest, mask_idx))
            sentence_distance = tokens[start:end].count('.')
        else:
            # No mention in the passage: there is no span to measure.
            sentence_distance = 0

        if candidate == example['true_label']:
            p = sentence_distance
        else:
            n = sentence_distance

    return p, n

In [7]:
def generate_repetition_count_feature(example):
    """Return how often each candidate's most frequent word occurs in the passage.

    A candidate's count is the largest number of times any one of its
    whitespace-separated words appears in ``example['tokens']`` (0 when the
    candidate has no words or none of them appear).

    Returns:
        A ``(p, n)`` tuple: ``p`` is the count for the candidate equal to
        ``example['true_label']``, ``n`` the count for the last candidate
        that differs from it. Either stays 0 if no such candidate exists.
    """
    label_count = 0
    other_count = 0

    for candidate in example['candidates']:
        occurrences = [example['tokens'].count(word) for word in candidate.split()]
        most_frequent = max(occurrences, default=0)

        if candidate == example['true_label']:
            label_count = most_frequent
        else:
            other_count = most_frequent

    return label_count, other_count

In [8]:
def generate_anaphor_feature(example):
    """Flag candidates that are mentioned before the mask position.

    A candidate's flag is 1 when some passage token located at an index
    smaller than ``example['mask_idx']`` equals one of the candidate's
    whitespace-separated words, otherwise 0.

    Returns:
        A ``(p, n)`` tuple: ``p`` is the flag for the candidate equal to
        ``example['true_label']``, ``n`` the flag for the last candidate
        that differs from it. Either stays 0 if no such candidate exists.
    """
    label_flag = 0
    other_flag = 0

    for candidate in example['candidates']:
        words = candidate.split()
        mentioned_before = any(
            token in words and position < example['mask_idx']
            for position, token in enumerate(example['tokens'])
        )
        flag = int(mentioned_before)

        if candidate == example['true_label']:
            label_flag = flag
        else:
            other_flag = flag

    return label_flag, other_flag

In [9]:
def generate_cataphor_feature(example):
    """Flag candidates that are mentioned after the mask position.

    A candidate's flag is 1 when some passage token located at an index
    greater than ``example['mask_idx']`` equals one of the candidate's
    whitespace-separated words, otherwise 0.

    Returns:
        A ``(p, n)`` tuple: ``p`` is the flag for the candidate equal to
        ``example['true_label']``, ``n`` the flag for the last candidate
        that differs from it. Either stays 0 if no such candidate exists.
    """
    label_flag = 0
    other_flag = 0

    for candidate in example['candidates']:
        words = candidate.split()
        later_positions = (
            position
            for position, token in enumerate(example['tokens'])
            if token in words
        )
        flag = 1 if any(position > example['mask_idx'] for position in later_positions) else 0

        if candidate == example['true_label']:
            label_flag = flag
        else:
            other_flag = flag

    return label_flag, other_flag

In [10]:
def feature_set_generation(examples):
    """Build feature rows and labels, two rows per example.

    Each row holds six features, in this order:
      1) Minimum distance to matching NP
      2) Minimum distance to non-matching NP
      3) Minimum sentence distance to matching NP
      4) Number of repetitions within the passage
      5) Presence of anaphor
      6) Presence of cataphor

    Every feature generator returns a pair; the first elements of the six
    pairs form the row labelled ``[1]`` and the second elements form the row
    labelled ``[0]``.

    Returns:
        ``(X, Y)`` where ``X`` is the list of feature rows and ``Y`` the
        parallel list of single-element label lists.
    """
    X, Y = [], []

    for example in examples:
        pairs = (
            generate_matching_distance_feature(example),
            generate_non_matching_distance_feature(example),
            generate_sentence_distance_feature(example),
            generate_repetition_count_feature(example),
            generate_anaphor_feature(example),
            generate_cataphor_feature(example),
        )
        # Transpose six (first, second) pairs into two six-element rows.
        first_row, second_row = (list(column) for column in zip(*pairs))

        X.extend([first_row, second_row])
        Y.extend([[1], [0]])

    return X, Y

In [11]:
# Turn the parsed training and dev examples into feature rows (X) and
# labels (Y).
X_Train, Y_Train = feature_set_generation(train_examples)
X_Test, Y_Test = feature_set_generation(test_examples)

In [12]:
# Inspect the generated dev-set feature rows and their labels.
print(X_Test)
print(Y_Test)

[[14, 4, 1, 1, 1, 0], [4, 14, 0, 2, 1, 1]]
[[1], [0]]
