In [2]:
import followthemoney as ftm
import followthemoney_enrich as ftm_enrich
import followthemoney.model as model
from followthemoney.dedupe import Match, Linker
import json
import pandas as pd
import alephclient.api   as api

In [115]:
def read_ftm_json(path, filter="Person"):
    """Load entities from a line-delimited JSON file, keyed by Wikidata ID.

    Parameters
    ----------
    path : str
        File containing one JSON-serialised entity per line.
    filter : str, default "Person"
        Schema name an entity must have to be kept. The name shadows the
        built-in ``filter``; it is kept so existing callers keep working.

    Returns
    -------
    dict
        Maps each ``wikidataId`` value to the entity proxy that carries it.
        Entities without a Wikidata ID are dropped. If the same ID occurs
        more than once in the file, the entity read last wins.
    """
    entity_dict = {}
    # JSON text is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entity = model.get_proxy(json.loads(line))
            if entity.schema.name != filter:
                continue
            # Second positional argument is passed through unchanged
            # (presumably a "quiet" flag — confirm against the proxy API).
            wd = entity.first("wikidataId", True)
            if wd:
                entity_dict[wd] = entity
    return entity_dict
# Base directory for the data files; the following cell assigns the same value again.
path = "./data/output/"

In [116]:
# Base directory and the two entity dumps that are going to be reconciled.
path = "./data/output/"
ep_path = f"{path}everypolitician.json"
ma_path = f"{path}meineabgeordneten_wikidata.json"

# One {wikidataId: entity} mapping per source (default schema filter applies).
mein_abg = read_ftm_json(ma_path)
every_polit = read_ftm_json(ep_path)

In [128]:
# Print every loaded entity for which len(entity) exceeds 1.
# NOTE(review): what len() counts is defined by the entity proxy class, which
# is not shown in this notebook — confirm this is the intended check.
# Iterating over .values() avoids binding `_`, which in IPython would shadow
# the automatic "last output" variable for the rest of the session.
for every in every_polit.values():
    if len(every) > 1:
        print(every)

# Matching
We will check for equal Wikidata IDs and create a Match object for each pair. A match object holds two entity IDs and a decision about whether they refer to the same entity.

In [86]:
# Enricher instance; its make_match() is used further down to pair two entities into a match object.
enricher = ftm_enrich.enricher.Enricher()

In [113]:
# Pair up entities from both sources that share a Wikidata ID.
matches = []
for idx, abg in mein_abg.items():
    # idx is the Wikidata ID used as key in both mappings.
    polit = every_polit.get(idx)
    if not polit:
        continue
    # Build the match by hand: the everypolitician entity becomes `entity`,
    # the meineabgeordneten entity becomes `canonical`, decided as SAME.
    # An earlier `enricher.make_match(polit, abg)` result was overwritten on
    # the very next line and never used, so that call has been dropped.
    match = Match(model, {})
    match.entity = polit
    match.canonical = abg
    match.decision = match.SAME
    matches.append(match)


In total, there are 77 person entities that match on their Wikidata ID.

In [114]:
len(matches)

77

# Merging
The merging logic actually exists in the ftm [repository](https://github.com/alephdata/followthemoney/blob/6cb55e319f69443dff17bf1ee5dd1a37a31b5c4a/followthemoney/cli/dedupe.py) and works as follows:

1. Create a linker object, which takes match objects and checks if there is a sameness decision.
2. If so, add the pair to a hashmap (Python dict) in the linker object.
3. Iterate through both collections of to-be-merged entities and pass each entity to the linker object (which knows the links). If the entity's ID is stored in the hashmap, swap it for the linked ID. If not, keep the ID. This also applies to "edges", such as memberships.
4. Write to file. 
5. As the file now contains several items with the same ID, we aggregate, which merges items with the same ID. Merging simply takes the union of the properties and of their values: identical values are collapsed into one, and differing values are all kept in the property's multi-valued list.

## Example

In [143]:
# Toy example: two hand-made Person records for the same individual are
# matched and merged. The linker below receives the match but is not involved
# in the a.merge(b) call shown at the end of this cell.
linker_exmpl = Linker(model)

# First record: full birth date.
# NOTE(review): the name values spell "keslen" while the ID seeds spell
# "kelsen" — presumably a typo; left untouched because the recorded output
# below reflects these exact strings.
a = ftm.model.make_entity("Person")
a.add("name", "hans keslen")
a.add("title", "Dr")
a.add("birthDate", "1908-07-06")
a.make_id("hans kelsen")

# Second record: different name spelling, birth year only, different ID seed.
b = ftm.model.make_entity("Person")
b.add("name", "Dr. Dr. hans keslen")
b.add("birthDate", "1908")
b.add("title", "Dr")
b.make_id("Dr. Dr. Hans Kelsen")

# Declare both records to be the same entity and register that decision.
match = enricher.make_match(a, b)
match.decision = match.SAME
linker_exmpl.add(match)

# Per the recorded output, the merged entity carries a's ID and the union of
# both records' property values.
merged_ent  = a.merge(b)
# Last expression of the cell: displayed as the cell output.
{
    "a": a.id,
    "b": b.id,
    "merged": merged_ent.id,
    "result": merged_ent.to_dict()["properties"]}

{'a': '891bd4dbcf5506d489f8d6e757ace9411eccee55',
 'b': '3ac4e58acef775a8f047f1381dd2947fb0ee4464',
 'merged': '891bd4dbcf5506d489f8d6e757ace9411eccee55',
 'result': {'name': ['hans keslen', 'Dr. Dr. hans keslen'],
  'title': ['Dr'],
  'birthDate': ['1908', '1908-07-06']}}

In [90]:
# logic adapted from https://github.com/alephdata/followthemoney/blob/6cb55e319f69443dff17bf1ee5dd1a37a31b5c4a/followthemoney/cli/dedupe.py

# Register every Wikidata-based match with the linker that is later handed to mergeEntities.
linker = Linker(model)
for match in matches: 
    linker.add(match)


In [91]:
def mergeEntities(inpath, outfile, linker):
    """Pass every entity of a JSON-lines file through ``linker`` and write it out.

    Parameters
    ----------
    inpath : str
        Path of the input file with one JSON-serialised entity per line.
    outfile : writable text file object
        Receives one JSON line (keys sorted) per input entity. It is neither
        opened nor closed here, so several inputs can be written to one file.
    linker : object providing ``apply(entity)``
        Its ``apply`` result is what gets serialised for each entity.

    Returns
    -------
    None
    """
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(inpath, encoding="utf-8") as infile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entity = model.get_proxy(json.loads(line))
            applied = linker.apply(entity)
            json_ent = json.dumps(applied.to_dict(), sort_keys=True)
            outfile.write(json_ent + "\n")

# Output locations: the raw concatenation of both sources with linked IDs
# applied, and the file the aggregate step is meant to produce.
merged_path  = path + "/merged/merged.json"
merged_aggr_path = path + "/merged/merged_aggr.json"

# The `with` block closes (and thereby flushes) the file, so the merged data
# is completely on disk before the aggregate shell cell reads it.
# The target directory must already exist; open() raises FileNotFoundError otherwise.
with open(merged_path, "w", encoding="utf-8") as outfile:
    mergeEntities(ep_path, outfile, linker)
    mergeEntities(ma_path, outfile, linker)

### Aggregate CLI command

In [92]:
%%bash -s "$merged_path" "$merged_aggr_path"
# $1 = merged entity file (input), $2 = aggregated entity file (output).
# The result goes to "$2": writing to "$1" would overwrite the file that is
# still being read by the pipeline, and would leave "$2" unused.
cat "$1" | ftm aggregate -o "$2"


In [6]:
# NOTE(review): get_collection is invoked on the class itself with no arguments, while the output
# recorded for this cell is just the class `alephclient.api.AlephAPI` — presumably a leftover
# experiment that was edited after it last ran; confirm the intent or remove the cell.
api.AlephAPI.get_collection()

alephclient.api.AlephAPI