In [1]:
import networkx as nx
import pandas as pd

import itertools
import json

In [2]:
# Load the link table. Later cells read its 'dbpedia', 'others', 'linkCount'
# and 'cso_topics' columns.
raw_data = pd.read_json('results/mutual_links.json')

In [3]:
# Build an undirected graph: one node per 'dbpedia' value, and one edge from
# each row's 'dbpedia' value to every entry of its ' || '-separated 'others' field.
g = nx.Graph()
g.add_nodes_from(raw_data['dbpedia'])

for source, linked in zip(raw_data['dbpedia'], raw_data['others']):
    if pd.isna(linked):
        continue  # this row lists no linked resources
    g.add_edges_from((source, target) for target in linked.split(' || '))

In [4]:
def get_unconnected(graph, max_component_size=10):
    """Collect the nodes that sit in small connected components of ``graph``.

    Args:
        graph: Graph handed to ``nx.connected_components``; it is only read,
            never modified here.
        max_component_size: Largest component size (in nodes) that still counts
            as "small". Components with more nodes than this are skipped.
            Defaults to 10, the threshold that used to be hard-coded.

    Returns:
        A list holding every node of every component that has at most
        ``max_component_size`` nodes, appended component by component.
    """
    not_connected = [
        node
        for component in nx.connected_components(graph)
        if len(component) <= max_component_size
        for node in component
    ]

    # Printed as well as returned so the count shows up in the cell output.
    print(f'Nodes outside large connected components: {len(not_connected)}')
    return not_connected

# First pass: nodes that sit in small components of the graph as built above.
suspects_unconnected = get_unconnected(g)

Nodes outside large connected components: 76


Remove suspects from previous step. Seek out communities in the connected graph.

In [5]:
# Prune the first-pass suspects from g in place; the cells below operate on
# this pruned graph.
g.remove_nodes_from(suspects_unconnected)

In [6]:
# Snapshot the bridge edges into a list before g is mutated on the next line.
g_bridges = list(nx.bridges(g))
g.remove_edges_from(g_bridges)
# With the bridges gone, collect the nodes now left in small components and
# drop them from g as well.
# NOTE: this cell mutates g in place; re-running it works on the already-pruned
# graph and overwrites suspects_unconnected2.
suspects_unconnected2 = get_unconnected(g)
g.remove_nodes_from(suspects_unconnected2)

Nodes outside large connected components: 55


In [7]:
# Partition what is left of the graph with label propagation, then flag every
# member of a community that has 10 nodes or fewer.
communities = list(nx.community.label_propagation_communities(g))

suspects_community = [
    node
    for members in communities
    if len(members) <= 10
    for node in members
]

In [8]:
# One row per suspect URI, tagged with the step that flagged it. The groups are
# listed in the order their rows should appear in the frame.
suspect_groups = [
    ('not connected', suspects_unconnected),
    ('not connected after bridge removal', suspects_unconnected2),
    ('community', suspects_community),
]

results = pd.DataFrame(
    [(uri, reason) for reason, uris in suspect_groups for uri in uris],
    columns=['dbpedia', 'reason'],
)

# Displayed as the cell output: number of suspects per reason.
results.groupby('reason').count()

Unnamed: 0_level_0,dbpedia
reason,Unnamed: 1_level_1
community,9
not connected,76
not connected after bridge removal,55


In [9]:
# Index raw_data by 'dbpedia' so the suspects table can be joined on that key.
# Guarded so that re-running this cell does not raise KeyError once 'dbpedia'
# has already been moved from the columns into the index.
if 'dbpedia' in raw_data.columns:
    raw_data = raw_data.set_index('dbpedia')

In [10]:
# Attach the raw_data columns to each suspect row (matching the 'dbpedia' column
# against raw_data's index), then drop the columns that are not exported.
# Selecting only the two base columns first keeps this cell re-runnable:
# joining an already-joined frame would raise on the overlapping column names.
results = (
    results[['dbpedia', 'reason']]
    .join(raw_data, on='dbpedia')
    .drop(columns=['linkCount', 'others'])
)

In [11]:
# Serialise each suspect as {dbpedia, reason, cso_topics}, expanding the
# whitespace-separated topic ids into full CSO topic URLs.
cso_topic_prefix = 'https://cso.kmi.open.ac.uk/topics/'

results_arr = [
    {
        'dbpedia': uri,
        'reason': reason,
        # 'cso_topics' may be NaN if the join found no raw_data row for this
        # URI; emit an empty list instead of failing on .split().
        'cso_topics': (
            []
            if pd.isna(topics)
            else [cso_topic_prefix + topic for topic in topics.split()]
        ),
    }
    for uri, reason, topics in zip(
        results['dbpedia'], results['reason'], results['cso_topics']
    )
]

# Use a context manager so the output file is flushed and closed
# deterministically rather than left to garbage collection.
with open('results/final_suspects.json', 'w') as out_file:
    json.dump(results_arr, out_file, indent=2)
