In [1]:
import pickle

with open('data/en_infoboxes.pickle', 'rb') as f:
    en_infoboxes = pickle.load(f)

with open('data/nl_infoboxes.pickle', 'rb') as f:
    nl_infoboxes = pickle.load(f)

In [2]:
nl_infoboxes

{'A Batalha do Apocalipse': None,
 'A Clean, Well-Lighted Place': None,
 '334 (novel)': None,
 '1876 (novel)': None,
 'A Blues for Shindig': None,
 "A Buyer's Market": None,
 'A Change of Climate': None,
 'A Burnt-Out Case': None,
 '2312 (novel)': None,
 'A Concise Treatise on the Art of Angling': None,
 'A Certain Smile': {'titel': 'A Certain Smile<br /><small>Als een verre glimlach</small>',
  'regie': '[[Jean Negulesco]]',
  'producer': '[[Henry Ephron]]',
  'schrijver': '[[Frances Goodrich]]<br />[[Albert Hackett]]<br />[[Françoise Sagan]] (roman)',
  'spelers': '[[Rossano Brazzi]]<br />[[Joan Fontaine]]<br />[[Bradford Dillman]]',
  'muziek': '[[Alfred Newman]]',
  'montage': '[[Louis R. Loeffler]]',
  'cinematografie': '[[Milton R. Krasner]]',
  'distributeur': '[[20th Century Fox]]',
  'première': '[[22 september]] [[1958]]',
  'genre': 'Drama',
  'lengte': '104 minuten',
  'taal': '[[Engels]]',
  'land': '{{US}}',
  'budget': '$ 2.300.000',
  'opbrengst': '$ 1.300.000',
  'imdb

In [3]:
import re

clean = {
    '[': ' ',
    ']': ' ',
    '{': ' ',
    '}': ' ',
    '|': ' ',
}

translation = str.maketrans(clean)
WHITESPACE_PATTERN = re.compile(r'\W+')
TAG_PATTERN = re.compile(r'<[^>]+>')

def clean_str(string):
    # remove brackets
    string = string.translate(translation)
    # remove html tags
    string = re.sub(TAG_PATTERN, ' ', string)
    # Collapse whitespace
    string = re.sub(WHITESPACE_PATTERN, ' ', string)
    return string.strip()

In [4]:
def unpack_iterables(item):
    if (type(item) in (int, float)):
        return item
    elif type(item) == str:
        return clean_str(item)
    elif len(item) > 0 and all(type(a) not in (list, tuple, set) for a in item):
        return item
    
    return unpack_iterables(item[0])

In [5]:
for k, v in en_infoboxes.items():
    if type(v) == dict:
        for k1, v1 in v.items():
            en_infoboxes[k][k1] = unpack_iterables(v1)
en_infoboxes

{'A Batalha do Apocalipse': {'name': 'A Batalha do Apocalipse Da Queda dos Anjos ao Crepúsculo do Mundo The Battle of Apocalypse From the Fall of Angels to the Twilight of the World',
  'image': 'A Batalha do Apocalipse jpg',
  'author': 'Eduardo Spohr',
  'country': 'Brazil',
  'language': 'Portuguese language Portuguese',
  'genre': 'Fiction Fantastic Literature Angels',
  'publisher': 'Verus',
  'release_date': '2010',
  'pages': '586',
  'isbn': '9788576860761'},
 'A Clean, Well-Lighted Place': {'name': 'A Clean Well Lighted Place',
  'author': 'Ernest Hemingway',
  'country': 'United States',
  'language': 'English',
  'genre': 'Short story',
  'publication_type': 'Periodical',
  'media_type': 'Print',
  'pub_date': '1933'},
 '334 (novel)': {'name': '334',
  'image': '334 novel book cover jpg',
  'caption': 'Dust jacket from the first edition hardcover by Michael Hasted',
  'author': 'Thomas M Disch',
  'country': 'United States',
  'language': 'English',
  'genre': 'Dystopian nov

In [6]:
tmp = nl_infoboxes.copy()
for k, v in tmp.items():
    if not v:
        nl_infoboxes.pop(k)
    elif type(v) == dict:
        for k1, v1 in v.items():
            nl_infoboxes[k][k1] = unpack_iterables(v1)
nl_infoboxes

{'A Certain Smile': {'titel': 'A Certain Smile Als een verre glimlach',
  'regie': 'Jean Negulesco',
  'producer': 'Henry Ephron',
  'schrijver': 'Frances Goodrich Albert Hackett Françoise Sagan roman',
  'spelers': 'Rossano Brazzi Joan Fontaine Bradford Dillman',
  'muziek': 'Alfred Newman',
  'montage': 'Louis R Loeffler',
  'cinematografie': 'Milton R Krasner',
  'distributeur': '20th Century Fox',
  'première': '22 september 1958',
  'genre': 'Drama',
  'lengte': '104 minuten',
  'taal': 'Engels',
  'land': 'US',
  'budget': '2 300 000',
  'opbrengst': '1 300 000',
  'imdb': '0051466',
  'moviemeter': '48099'},
 "A Clergyman's Daughter": {'naam': 'De domineesdochter',
  'orig titel': 'A Clergyman s Daughter',
  'auteur': 'George Orwell',
  'vertaler': 'Elizabeth Stortenbeker',
  'land': 'Engeland',
  'taal': 'Nederlands',
  'originele taal': 'Engels',
  'uitgever': 'Meulenhoff',
  'uitgiftedatum origineel': '1935',
  'paginas': '302',
  'isbn': '90 290 0373 1'},
 'A Case of Conscie

In [7]:
with open('data/nl_infoboxes_clean.pickle', 'wb') as f:
    pickle.dump(nl_infoboxes, f)

with open('data/en_infoboxes_clean.pickle', 'wb') as f:
    pickle.dump(en_infoboxes, f)

In [8]:
both_available = en_infoboxes.keys() & nl_infoboxes.keys()

In [9]:
for k, v in nl_infoboxes.items():
    for k1, v1 in v.items():
        if '[' in v:
            print(v)

In [10]:
len(en_infoboxes)

4227

In [11]:
len(both_available)

703

In [12]:
# use a set so we only keep track of unique mappings
matches = {
    'exact_key': set(),
    'full_match': set(),
    'partial_match': set(),
}

for key in both_available:
    for nl_k, nl_v in nl_infoboxes[key].items():
        for en_k, en_v in en_infoboxes[key].items():
            nl_k = nl_k.lower()
            en_k = en_k.lower()
            result = (en_k, nl_k)

            # TODO hier keys groeperen die dezelfde waarden hebben
            # zodat [isbn, issn] samenkomen of
            # [uitgeverij, originele uitgever, uitgever]
            # beide kanten op kijken of je al bestaande keys hebt voor het nl en
            # en

            if nl_k == en_k:
                matches['exact_key'].add(result)
            elif nl_v == en_v:
                matches['full_match'].add(result)
            elif (nl_v in en_v) or (en_v in nl_v):
                matches['partial_match'].add(result)

print(len(matches))

3


In [13]:
matches

{'exact_key': {('awards', 'awards'),
  ('budget', 'budget'),
  ('editing', 'editing'),
  ('english_pub_date', 'english_pub_date'),
  ('fusiongenres', 'fusiongenres'),
  ('genre', 'genre'),
  ('illustrator', 'illustrator'),
  ('isbn', 'isbn'),
  ('name', 'name'),
  ('producer', 'producer'),
  ('setting', 'setting'),
  ('subgenres', 'subgenres'),
  ('taxon', 'taxon'),
  ('website', 'website')},
 'full_match': {('alt', 'naam'),
  ('artist', 'tekenaar'),
  ('author', 'auteur'),
  ('author', 'illustraties'),
  ('author', 'naam'),
  ('author', 'onderschrift'),
  ('author', 'schrijfster'),
  ('author', 'schrijver'),
  ('birth_name', 'geboortenaam'),
  ('birth_name', 'volledige naam'),
  ('birth_name', 'volledigenaam'),
  ('book', 'boek'),
  ('caption', 'titel'),
  ('cinematography', 'cinematografie'),
  ('country', 'land'),
  ('cover_artist', 'auteur'),
  ('cover_artist', 'cover-op-enwp'),
  ('cover_artist', 'illustraties'),
  ('cover_artist', 'kaftontwerp'),
  ('creator', 'auteur'),
  ('crea

In [45]:
expanded_boxes, left_untouched_boxes = [], []
for title, existing_infobox in en_infoboxes.items():
    # we have an existing infobox for Dutch, check if we need to expand it
    if title in nl_infoboxes.keys():
        nl_existing = nl_infoboxes[title]
        nl_new = nl_existing.copy()
        for nl_k, nl_v in nl_existing.items():
            en_existing = en_infoboxes[title]
            for en_k, en_v in en_existing.items():
                if en_v == nl_v:
                    continue

                # TODO: Dit is niet best, we kunnen beter de keys waarvan we weten
                # dat ze bestaan in de Engelse infobox verzamelen en die 
                # toepassen, maar met sets van tuples is dat niet handig
                # for k in matches.keys():
                for k in ['exact_key', 'full_match']:
                    reason = f'Door {k}'
                    for en_map, nl_map in matches[k]:
                        if en_map == en_k and nl_map not in nl_new and en_v not in nl_new.values():
                            nl_new[nl_map] = (en_existing[en_map], reason)
        if nl_new != nl_existing:
            expanded_boxes.append({'old': nl_existing, 'new': nl_new})
        else:
            left_untouched_boxes.append(nl_existing)

In [46]:
len(expanded_boxes), len(left_untouched_boxes)

(694, 9)

In [47]:
expanded_boxes

[{'old': {'titel': 'A Certain Smile Als een verre glimlach',
   'regie': 'Jean Negulesco',
   'producer': 'Henry Ephron',
   'schrijver': 'Frances Goodrich Albert Hackett Françoise Sagan roman',
   'spelers': 'Rossano Brazzi Joan Fontaine Bradford Dillman',
   'muziek': 'Alfred Newman',
   'montage': 'Louis R Loeffler',
   'cinematografie': 'Milton R Krasner',
   'distributeur': '20th Century Fox',
   'première': '22 september 1958',
   'genre': 'Drama',
   'lengte': '104 minuten',
   'taal': 'Engels',
   'land': 'US',
   'budget': '2 300 000',
   'opbrengst': '1 300 000',
   'imdb': '0051466',
   'moviemeter': '48099',
   'name': 'A Certain Smile',
   'english_pub_date': '1956'},
  'new': {'titel': 'A Certain Smile Als een verre glimlach',
   'regie': 'Jean Negulesco',
   'producer': 'Henry Ephron',
   'schrijver': 'Frances Goodrich Albert Hackett Françoise Sagan roman',
   'spelers': 'Rossano Brazzi Joan Fontaine Bradford Dillman',
   'muziek': 'Alfred Newman',
   'montage': 'Louis R

In [25]:
for d in expanded_boxes:
    

{'old': {'titel': 'A Scanner Darkly',
  'tagline': 'Everything Is Not Going To Be OK What Does A Scanner See',
  'regie': 'Richard Linklater',
  'producer': 'Tommy Pallotta Anne Walker McBay Palmer West Jonah Smith Erwin Stoff',
  'schrijver': 'Richard Linklater',
  'spelers': 'Keanu Reeves Robert Downey Jr Woody Harrelson Winona Ryder',
  'muziek': 'Graham Reynolds',
  'montage': 'Sandra Adair',
  'cinematografie': 'Shane F Kelly',
  'distributeur': 'Warner Home Video',
  'première': '7 juli 2006',
  'genre': 'Sciencefictionfilm Animatiefilm',
  'lengte': '100 minuten',
  'taal': 'Engels',
  'land': 'US',
  'budget': '8 7 miljoen dollar',
  'imdb': '0405296',
  'moviemeter': '28490'},
 'new': {'titel': 'A Scanner Darkly',
  'tagline': 'Everything Is Not Going To Be OK What Does A Scanner See',
  'regie': 'Richard Linklater',
  'producer': 'Tommy Pallotta Anne Walker McBay Palmer West Jonah Smith Erwin Stoff',
  'schrijver': 'Richard Linklater',
  'spelers': 'Keanu Reeves Robert Downey

In [37]:
# total = []
# with open('data/titlesAE.txt') as f:
#     total.extend(f.readlines())

# with open('data/titlesFZ.txt') as f:
#     total.extend(f.readlines())

# with open('data/sf_fantasy_light_mostpop.txt') as f:
#     total.extend(f.readlines())

# total = sorted(list(set(total)))

# with open('titles.txt', 'w') as f:
#     f.write(''.join(total))

# # Handmatig checken op:
# #   - wikipediadump xml blabla
# #   - Dingen die bovenaan staan (punct marks etc.)
# #   - &amp; vervangen