In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# Cluster data saved by the conflation step; the pickle unpacks into four objects.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by this project.
clusters, uri_to_cluster, is_main_label, uri_to_label = pd.read_pickle(
    '../6_conflation/temp/cluster_data.pkl.bz2'
)

# Every line of the file, stripped of surrounding whitespace, is treated as one topic.
with open('results/all_topics.tsv') as fp:
    all_topics = [raw_line.strip() for raw_line in fp]

# Any topic not yet mapped to a cluster gets its own singleton cluster,
# appended at the end of `clusters` so that its index equals the previous length.
for topic in all_topics:
    if topic in uri_to_cluster:
        continue
    uri_to_cluster[topic] = len(clusters)
    clusters.append({topic})

In [3]:
# One independent issue list per cluster, positionally aligned with `clusters`.
cluster_issues = [list() for _ in clusters]
# One independent issue list and one warning list per entry of `all_topics`.
uri_issues = {topic_uri: [] for topic_uri in all_topics}
uri_warnings = {topic_uri: [] for topic_uri in all_topics}

### 2: externally inconsistent references

In [4]:
# Review results: a mapping whose keys match the 'dbpedia' field of the suspects below.
with open('../2_ext_ref_consistency/results/review_results.json') as fp:
    ex2_results = json.load(fp)

# Suspect records re-indexed by their 'dbpedia' field for direct lookup.
with open('../2_ext_ref_consistency/results/final_suspects.json') as fp:
    ex2_suspects = {
        v['dbpedia']: v
        for v in json.load(fp)
    }

for dbpedia, review in ex2_results.items():
    suspect = ex2_suspects[dbpedia]
    record = {
        'experiment': 'ex2',
        'comment': review['comment'],
        'comment2': suspect['reason'],
    }

    # 'invalid' decisions are recorded as issues, 'other' decisions as warnings;
    # every other decision value is skipped.
    if review['decision'] == 'invalid':
        record['issue'] = 'invalid DBpedia reference'
        target = uri_issues
    elif review['decision'] == 'other':
        record['issue'] = 'other'
        target = uri_warnings
    else:
        continue

    # Append an independent copy per topic: sharing one dict between several
    # topics' lists would make an in-place edit of one entry leak into the others.
    for uri in suspect['cso_topics']:
        target[uri].append(dict(record))

### 3: missing references to corresponding KBs

In [5]:
ex3_results = pd.read_csv('../3_missing_refs/results/missing_refs.tsv', sep='\t')

# Every value in column 't' is a key of `uri_issues`; each gets its own issue record.
for topic_uri in ex3_results['t'].tolist():
    uri_issues[topic_uri].append(
        dict(experiment='ex3', issue='missing Wikidata reference')
    )

### 4: logically invalid alignments

In [6]:
# The file is read without a header row; its two columns are named 'a' and 'b'.
ex4_results = pd.read_csv('../4_refs_reasoning/results/unique_pairs.tsv', sep='\t', names=['a', 'b'])

# Each row is a pair: the issue is recorded on both members ('a' first, then 'b'),
# with the other member of the pair stored in 'comment'.
for first, second in zip(ex4_results['a'], ex4_results['b']):
    for member, partner in ((first, second), (second, first)):
        uri_issues[member].append({
            'experiment': 'ex4',
            'issue': 'invalid inferred sameAs',
            'comment': partner,
        })

### 5: intra-cluster alignment inconsistencies

In [7]:
with open('../5_intra_ref_consistency/results/more_than_one_reference.json') as fp:
    ex5_too_many = json.load(fp)

# The affected cluster is looked up through the first topic listed in each entry.
for entry in ex5_too_many:
    first_topic = entry['topics'][0]
    cluster_idx = uri_to_cluster[first_topic]
    cluster_issues[cluster_idx].append(
        dict(
            experiment='ex5',
            issue='cluster contains more than one unique DBpedia reference',
        )
    )

In [8]:
with open('../5_intra_ref_consistency/results/missing_references.json') as fp:
    ex5_missing = json.load(fp)

# Every topic listed under 'refs_missing' gets an issue whose comment is the
# entry's 'dbpedia' value.
for entry in ex5_missing:
    for topic_uri in entry['refs_missing']:
        uri_issues[topic_uri].append(
            dict(
                experiment='ex5',
                issue='missing DBpedia reference',
                comment=entry['dbpedia'],
            )
        )

### 6: term conflation

In [9]:
ex6_results = pd.read_csv('../6_conflation/results/cluster_reviews.csv')

# Assume majority vote approach: only rows flagged by `is_wrong_majority` count.
# The 'cluster' value of each such row is used as an index into `cluster_issues`.
wrong_cluster_ids = ex6_results.loc[ex6_results.is_wrong_majority, 'cluster']
for cluster_idx in wrong_cluster_ids:
    cluster_issues[cluster_idx].append(
        dict(experiment='ex6', issue='inconsistent synonyms in cluster')
    )

"Propagate" issues from individual URIs to clusters

In [10]:
# Copy each per-URI issue into the issue list of the URI's cluster, tagging the
# copy with the originating URI under the 'uri' key. The per-URI records are not modified.
# Note: this appends, so running the cell again adds the same entries a second time.
for uri, issues in uri_issues.items():
    for issue in issues:
        tagged_issue = dict(issue, uri=uri)
        cluster_issues[uri_to_cluster[uri]].append(tagged_issue)

### Summarize

In [11]:
# Number of issues recorded for each cluster.
cluster_issue_count = np.array([len(issues) for issues in cluster_issues])

# Histogram: distinct issue counts and how many clusters have each.
cl_unique, cl_counts = np.unique(cluster_issue_count, return_counts=True)
# Count clusters with at least one issue directly. Summing cl_counts[1:] is only
# correct when the smallest observed count is 0; otherwise it drops a real bucket.
print(f'Nonzero: {np.count_nonzero(cluster_issue_count)}')
print(np.asarray((cl_unique, cl_counts)).T)

Nonzero: 3137
[[   0 8050]
 [   1  854]
 [   2  571]
 [   3  379]
 [   4  257]
 [   5  188]
 [   6  123]
 [   7  109]
 [   8  129]
 [   9   68]
 [  10   69]
 [  11   58]
 [  12   61]
 [  13   24]
 [  14   34]
 [  15   12]
 [  16   16]
 [  17   12]
 [  18   17]
 [  19   42]
 [  20   11]
 [  21    2]
 [  22    7]
 [  23    2]
 [  24   22]
 [  25    4]
 [  26    5]
 [  27    3]
 [  28    2]
 [  29   22]
 [  30    2]
 [  32    7]
 [  33    2]
 [  34    1]
 [  35    1]
 [  36    6]
 [  37    2]
 [  38    1]
 [  39    1]
 [  40    1]
 [  41    2]
 [  43    1]
 [  56    5]
 [  68    1]
 [  81    1]]


In [12]:
# Number of issues recorded for each topic URI.
uri_issue_count = np.array([len(issues) for issues in uri_issues.values()])

# Histogram: distinct issue counts and how many URIs have each.
uri_unique, uri_counts = np.unique(uri_issue_count, return_counts=True)
# Count URIs with at least one issue directly. Summing uri_counts[1:] is only
# correct when the smallest observed count is 0; otherwise it drops a real bucket.
print(f'Nonzero: {np.count_nonzero(uri_issue_count)}')
print(np.asarray((uri_unique, uri_counts)).T)

Nonzero: 4287
[[    0 10003]
 [    1  1875]
 [    2   610]
 [    3   439]
 [    4   247]
 [    5   224]
 [    6   129]
 [    7   144]
 [    8   147]
 [    9    91]
 [   10    79]
 [   11    73]
 [   12    46]
 [   13    24]
 [   14    19]
 [   15     7]
 [   16    10]
 [   17    12]
 [   18    12]
 [   19    44]
 [   24    25]
 [   27     3]
 [   28     6]
 [   29    21]]
