In [1]:
import pandas as pd
import numpy as np

import json

In [2]:
pairs = pd.read_csv('related_eq.tsv', sep='\t')

In [3]:
uri_to_cluster = dict()
clusters = []

for _, r in pairs.iterrows():
    s = r['a']
    o = r['b']
    cluster_no = uri_to_cluster.get(s, None)
    if cluster_no is None:
        cluster_no = uri_to_cluster.get(o, None)

    if cluster_no is None:
        uri_to_cluster[s] = len(clusters)
        uri_to_cluster[o] = len(clusters)
        clusters.append({s, o})
    else:
        uri_to_cluster[s] = cluster_no
        uri_to_cluster[o] = cluster_no
        clusters[cluster_no].add(s)
        clusters[cluster_no].add(o)

In [4]:
refs = pd.read_csv('dbpedia_refs.tsv', sep='\t').set_index('t', verify_integrity=True)

In [5]:
cluster_refs = []

for cluster in clusters:
    r = dict()
    for t in cluster:
        if t not in refs.index:
            continue

        dbpedia = refs.loc[t]['dbpedia']
        if dbpedia in r:
            r[dbpedia].add(t)
        else:
            r[dbpedia] = {t}

    cluster_refs.append(r)

Are there clusters with more than one unique DBpedia reference?

In [6]:
ref_counts = np.array([len(cluster) for cluster in cluster_refs])
more_than_one = np.where(ref_counts > 1)[0]
print(f'Clusters with more than one DBpedia reference: {len(more_than_one)}')

Clusters with more than one DBpedia reference: 130


In [7]:
mto_counts = ref_counts[more_than_one]
(unique, counts) = np.unique(mto_counts, return_counts=True)
np.asarray((unique, counts)).T

array([[  2, 124],
       [  3,   6]])

In clusters with one unique DBpedia reference, is it applied to every entity?

In [8]:
missing_refs = []
only_one_ix = np.where(ref_counts == 1)[0]

for ix in only_one_ix:
    with_ref = list(cluster_refs[ix].values())[0]
    all_ents = clusters[ix]
    difference = all_ents - with_ref
    if len(difference) == 0:
        continue

    missing_refs.append({
        'cluster': ix,
        'all_ents_count': len(all_ents),
        'refs_missing_count': len(difference),
        'refs_missing': difference,
    })

df_missing_refs = pd.DataFrame(missing_refs)

In [9]:
print(f'Clusters with missing DBpedia references: {len(df_missing_refs)}')
print(f'Entities with missing DBpedia references: {df_missing_refs["refs_missing_count"].sum()}')

Clusters with missing DBpedia references: 752
Entities with missing DBpedia references: 962


In [10]:
no_issues_count = len(clusters) - len(more_than_one) - len(df_missing_refs)
print(f'Clusters with consistent DBpedia references: {no_issues_count} / {len(clusters)}')

Clusters with consistent DBpedia references: 1343 / 2225


Export results

In [11]:
mto_result = [
    {
        'topics': list(clusters[ix]),
        'references': {
            k: list(v)
            for k, v in cluster_refs[ix].items()
        },
    }
    for ix in more_than_one
]

with open('more_than_one_reference.json', 'w') as fp:
    json.dump(mto_result, fp, indent=2)

In [12]:
missing_result = [
    {
        'topics': list(clusters[mr['cluster']]),
        'refs_missing': list(mr['refs_missing']),
        'dbpedia': list(cluster_refs[mr['cluster']])[0]
    }
    for mr in missing_refs
]

with open('missing_references.json', 'w') as fp:
    json.dump(missing_result, fp, indent=2)