In [None]:
import pickle

# Load the raw infobox dictionaries for both languages.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('data/en_infoboxes.pickle', 'rb') as f:
    en_raw = pickle.load(f)
en_infoboxes = en_raw

with open('data/nl_infoboxes.pickle', 'rb') as f:
    nl_raw = pickle.load(f)
nl_infoboxes = nl_raw

In [None]:
nl_infoboxes

In [None]:
import re

# Wiki-markup delimiter characters; each one is replaced by a single space.
clean = dict.fromkeys('[]{}|', ' ')

translation = str.maketrans(clean)
# NOTE: despite its name this pattern matches runs of NON-WORD characters
# (`\W+`), so punctuation is collapsed together with whitespace.
WHITESPACE_PATTERN = re.compile(r'\W+')
# Anything that looks like an HTML/XML tag: '<', one or more non-'>' chars, '>'.
TAG_PATTERN = re.compile(r'<[^>]+>')

def clean_str(string):
    """Strip wiki markup, HTML tags and non-word characters from `string`.

    Steps, in order:
      1. replace each of the characters ``[ ] { } |`` with a space,
      2. replace every ``<...>`` tag with a space,
      3. replace every run of non-word characters (whitespace *and*
         punctuation) with a single space,
      4. strip leading and trailing whitespace.

    Returns the cleaned string.
    """
    without_markup = string.translate(translation)
    without_tags = TAG_PATTERN.sub(' ', without_markup)
    collapsed = WHITESPACE_PATTERN.sub(' ', without_tags)
    return collapsed.strip()

In [None]:
def unpack_iterables(item):
    """Reduce a (possibly nested) infobox value to a single flat value.

    - Numbers (including bools) are returned unchanged.
    - Strings are returned cleaned by `clean_str`.
    - A non-empty container that holds no list/tuple/set is returned as-is;
      its elements are NOT cleaned.
    - A container that holds a nested list/tuple/set is unpacked by following
      its first element only; the remaining elements are discarded.
    - Empty containers and objects without a length (e.g. None) are returned
      unchanged instead of raising IndexError / TypeError.
    """
    if isinstance(item, (int, float)):
        return item
    if isinstance(item, str):
        return clean_str(item)

    try:
        size = len(item)
    except TypeError:
        # None and other unsized objects cannot be unpacked any further
        return item

    if size == 0:
        # nothing to unpack; indexing element 0 would raise
        return item

    if not any(isinstance(a, (list, tuple, set)) for a in item):
        return item

    # next(iter(...)) equals item[0] for sequences and also works for
    # containers that cannot be subscripted, such as sets
    return unpack_iterables(next(iter(item)))

In [None]:
# Flatten and clean every value of every English infobox in place.
# Entries whose infobox is not a dict are left untouched.
for title, infobox in en_infoboxes.items():
    if type(infobox) != dict:
        continue
    # list(...) snapshots the pairs; only existing keys are reassigned
    for field, raw_value in list(infobox.items()):
        en_infoboxes[title][field] = unpack_iterables(raw_value)
en_infoboxes

In [None]:
# Iterate over a shallow copy so that empty entries can be removed from
# nl_infoboxes while looping; non-empty dict infoboxes are cleaned in place.
tmp = nl_infoboxes.copy()
for title, infobox in tmp.items():
    if not infobox:
        # no usable Dutch infobox for this title: drop the entry
        del nl_infoboxes[title]
        continue
    if type(infobox) == dict:
        for field, raw_value in list(infobox.items()):
            nl_infoboxes[title][field] = unpack_iterables(raw_value)
nl_infoboxes

In [None]:
# Persist the cleaned infoboxes, Dutch first and then English.
with open('data/nl_infoboxes_clean.pickle', 'wb') as nl_out:
    pickle.dump(nl_infoboxes, nl_out)

with open('data/en_infoboxes_clean.pickle', 'wb') as en_out:
    pickle.dump(en_infoboxes, en_out)

In [None]:
# Titles that have an infobox in both the English and the Dutch data.
both_available = set(en_infoboxes.keys()).intersection(nl_infoboxes.keys())

In [None]:
# Sanity check: print every cleaned Dutch value that still contains a '['.
# The original tested and printed the whole infobox dict `v`, not the
# individual value `v1`.
for k, v in nl_infoboxes.items():
    if not isinstance(v, dict):
        # only dict infoboxes have key/value pairs to inspect
        continue
    for k1, v1 in v.items():
        # numeric values cannot contain markup; `in` on them would raise
        if isinstance(v1, str) and '[' in v1:
            print(v1)

In [None]:
len(en_infoboxes)

In [None]:
len(both_available)

In [None]:
# use a set so we only keep track of unique mappings
matches = {
    'exact_key': set(),
    'full_match': set(),
    'partial_match': set(),
}


def _contains(container, member):
    """Return `member in container`, or False when `in` is unsupported.

    Infobox values may be strings, numbers or lists; a test such as
    `'abc' in 5` raises TypeError, which is treated here as "no match".
    """
    try:
        return member in container
    except TypeError:
        return False


for key in both_available:
    # lower-case the English keys once per infobox instead of once per pair
    en_items = [(en_k.lower(), en_v) for en_k, en_v in en_infoboxes[key].items()]

    for nl_k, nl_v in nl_infoboxes[key].items():
        nl_k = nl_k.lower()
        for en_k, en_v in en_items:
            result = (en_k, nl_k)

            # TODO: group keys that share the same values here, so that
            # [isbn, issn] or [uitgeverij, originele uitgever, uitgever]
            # end up together; look in both directions to see whether keys
            # already exist for nl and en

            # NOTE(review): empty-string values compare equal to each other
            # and are "contained" in every string, so they can produce
            # spurious full/partial matches - confirm whether to skip them.
            if nl_k == en_k:
                matches['exact_key'].add(result)
            elif nl_v == en_v:
                matches['full_match'].add(result)
            elif _contains(en_v, nl_v) or _contains(nl_v, en_v):
                matches['partial_match'].add(result)

# len(matches) is always 3 (the number of categories); report the number of
# unique mappings found per category instead
print({category: len(pairs) for category, pairs in matches.items()})

In [None]:
matches

In [None]:
# Try to extend every Dutch infobox with values from the English infobox of
# the same title, using the key mappings collected in `matches`.
# expanded_boxes collects {'old': ..., 'new': ...} dicts for infoboxes that
# gained at least one key; left_untouched_boxes collects the unchanged ones.
expanded_boxes, left_untouched_boxes = [], []
for title, existing_infobox in en_infoboxes.items():
    # we have an existing infobox for Dutch, check if we need to expand it
    if title in nl_infoboxes.keys():
        nl_existing = nl_infoboxes[title]
        # shallow copy: new keys are added to the copy, not to nl_existing
        nl_new = nl_existing.copy()
        for nl_k, nl_v in nl_existing.items():
            en_existing = en_infoboxes[title]
            for en_k, en_v in en_existing.items():
                # the English value equals the current Dutch value, so there
                # is nothing to add for this pair
                if en_v == nl_v:
                    continue

                # TODO: this is not great; it would be better to collect the
                # keys we know exist in the English infobox and apply those,
                # but that is awkward with sets of tuples
                # (alternative that would also apply 'partial_match':)
                # for k in matches.keys():
                # only 'exact_key' and 'full_match' mappings are applied here
                for k in ['exact_key', 'full_match']:
                    # runtime label stored with each added value; 'Door' is
                    # Dutch for 'by' / 'because of'
                    reason = f'Door {k}'
                    for en_map, nl_map in matches[k]:
                        # add the mapped Dutch key only when it is not present
                        # yet and the English value is not already one of the
                        # Dutch values
                        # NOTE(review): the pairs in `matches` are lower-cased
                        # where they are built, while en_k and the nl_new keys
                        # are not - confirm mixed-case keys are handled as
                        # intended
                        if en_map == en_k and nl_map not in nl_new and en_v not in nl_new.values():
                            # store the English value together with the reason
                            nl_new[nl_map] = (en_existing[en_map], reason)
        if nl_new != nl_existing:
            expanded_boxes.append({'old': nl_existing, 'new': nl_new})
        else:
            left_untouched_boxes.append(nl_existing)

In [None]:
len(expanded_boxes), len(left_untouched_boxes)

In [None]:
expanded_boxes

In [None]:
for d in expanded_boxes:
    # TODO: the body of this loop was never written; `pass` keeps the cell
    # syntactically valid until it is filled in
    pass

In [None]:
# total = []
# with open('data/titlesAE.txt') as f:
#     total.extend(f.readlines())

# with open('data/titlesFZ.txt') as f:
#     total.extend(f.readlines())

# with open('data/sf_fantasy_light_mostpop.txt') as f:
#     total.extend(f.readlines())

# total = sorted(list(set(total)))

# with open('titles.txt', 'w') as f:
#     f.write(''.join(total))

# # Check manually for:
# #   - leftover Wikipedia dump XML boilerplate
# #   - entries that sort to the top (punctuation marks etc.)
# #   - replacing &amp;

In [1]:
import pickle
from extract_mappings import Mapping, Mapper


# Load the extracted mappings and the fitted mapper from disk.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('data/mappings.pickle', 'rb') as mappings_file:
    mappings = pickle.load(mappings_file)

with open('data/mapper.pickle', 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

In [None]:
mappings

In [None]:
from collections import Counter
Counter(mappings).items()

In [9]:
# Print every mapping whose reason is a normalized Levenshtein value match.
for map_key, candidates in mapper.map.items():
    for mapping in candidates:
        if mapping.reason != 'Normalized value Levenshtein match':
            continue
        print(mapping)

Mapping(en_key='name', nl_key='ja_naam_trans', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='genre', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='Huidige', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='tv_com', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='tagline', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='reeks', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='volgendeboek', train_count=1, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='cover-op-enwp', train_count=24, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='afbeelding', train_count=21, reason='Normalized value Levenshtein match')
Mapping(en_key='name', nl_key='poster-op-enwp', train_count=4, reason='Nor

In [2]:
mapper.get_mappings('publisher')

[Mapping(en_key='publisher', nl_key='uitgever', train_count=60, reason='Exact value match'),
 Mapping(en_key='publisher', nl_key='originele uitgever', train_count=53, reason='Exact value match'),
 Mapping(en_key='publisher', nl_key='auteur', train_count=1, reason='Normalized value Levenshtein match'),
 Mapping(en_key='publisher', nl_key='naam', train_count=1, reason='Normalized value Levenshtein match'),
 Mapping(en_key='publisher', nl_key='uitgeverij', train_count=1, reason='Exact value match')]

In [None]:
len({m.en_key for m in mappings})

In [None]:
mapper.get_mappings('title')

In [None]:
for m in mappings:
    # TODO: the body of this loop was never written; `pass` keeps the cell
    # syntactically valid until it is filled in
    pass

In [1]:
from utils import *

# Reload the cleaned training infoboxes for both languages.
en_infoboxes = load_pickle('data/train/en_infoboxes_clean.pickle')
nl_infoboxes = load_pickle('data/train/nl_infoboxes_clean.pickle')

# Titles that have an infobox in both languages.
both_available = set(en_infoboxes.keys()).intersection(nl_infoboxes.keys())

In [2]:
def avg(x):
    """Return the arithmetic mean of the numbers in the sized collection `x`.

    Raises ZeroDivisionError when `x` is empty.
    """
    total = sum(x)
    count = len(x)
    return total / count

In [3]:
# Number of keys in each infobox, for the titles present in both languages.
en_key_counts = [len(en_infoboxes[title]) for title in both_available]
nl_key_counts = [len(nl_infoboxes[title]) for title in both_available]

avg(en_key_counts), avg(nl_key_counts)

(14.066856330014225, 13.84068278805121)

In [None]:
# Average number of keys per infobox in the train set
# (titles available in both languages, rounded to one decimal):
# en: 14.1
# nl: 13.8

In [1]:
# Per-infobox averages of the test-set totals, in the order: correct keys,
# correct values, correct pairs, total pairs generated.
# NOTE(review): the divisor 40 is presumably the number of test infoboxes - confirm.
print(*(total / 40 for total in (301, 160, 135, 586)))

7.525 4.0 3.375 14.65


In [None]:
# Test set results
# total_correct_exact_k: 301
# total_correct_exact_v: 160
# total_correct_exact_pairs: 135
# total_pairs_generated: 586        

In [None]:
# average correct keys per infobox: 7.525 
# average correct values per infobox: 4.0 
# average correct pairs per infobox: 3.375 
# average total pairs generated per infobox: 14.65