In [1]:
import json

# Load our preprocessed abstract data
with open('visvssrelationships_data_2016.json', 'r') as f:
    abstracts = json.load(f)

In [None]:
import difflib

list_of_lists = [abstracts[a]['author'] for a in abstracts]
originalList = set([val for sublist in list_of_lists for val in sublist])

authorList = set()
while originalList:
    authorA = originalList.pop()
    deleteSet = set()
    for authorB in originalList:
        ratio = difflib.SequenceMatcher(None, authorA, authorB).ratio()
        if ((ratio > 0.9) & (ratio != 1.0)):
            print('authorA:', authorA)
            print('authorB:', authorB)
            print(' Ratio:', ratio)
            deleteSet.add(authorB)
    originalList.difference_update(deleteSet)
    authorList.add(authorA)
    print(len(authorList),'+',len(originalList),'=',len(authorList)+len(originalList))

1 + 3512 = 3513
2 + 3511 = 3513
3 + 3510 = 3513
4 + 3509 = 3513
5 + 3508 = 3513
6 + 3507 = 3513
7 + 3506 = 3513
8 + 3505 = 3513
9 + 3504 = 3513
10 + 3503 = 3513
11 + 3502 = 3513
12 + 3501 = 3513
authorA: Judith E. Fan
authorB: Judith E Fan
 Ratio: 0.96
13 + 3499 = 3512
14 + 3498 = 3512
15 + 3497 = 3512
authorA: Rosemary A Cowell
authorB: Rosemary A. Cowell
 Ratio: 0.9714285714285714
16 + 3495 = 3511
17 + 3494 = 3511
18 + 3493 = 3511
authorA: Susana T.L. Chung
authorB: Susana T L Chung
 Ratio: 0.9090909090909091
19 + 3491 = 3510
20 + 3490 = 3510
21 + 3489 = 3510
22 + 3488 = 3510
23 + 3487 = 3510
authorA: Carmel A. Levitan
authorB: Carmel Levitan
 Ratio: 0.9032258064516129
24 + 3485 = 3509
authorA: Jonathan R Folstein
authorB: Jonathan R. Folstein
 Ratio: 0.9743589743589743
25 + 3483 = 3508
26 + 3482 = 3508
27 + 3481 = 3508
28 + 3480 = 3508
29 + 3479 = 3508
30 + 3478 = 3508
authorA: Hans P. Op de Beeck
authorB: Hans Op de Beeck
 Ratio: 0.9142857142857143
authorA: Hans P. Op de Beeck
auth

In [None]:
import networkx as nx
import json
from networkx.readwrite import json_graph
from tqdm import tqdm

# Use this to find the closest authors (assuming the authorList is pre-pruned)
def findClosestAuthor(name,authorList):
    maxRatio = 0.0
    outAuthor = None
    for author in authorList:
        ratio = difflib.SequenceMatcher(None, author, name).ratio()
        if (ratio > maxRatio):
            outAuthor = author
            maxRatio = ratio
    return outAuthor

# Loop through each abstract and add edges between the authors' names
# and the title of the abstract.
G = nx.Graph()
for a in tqdm(abstracts):
    title = abstracts[a]['title']
    
    for name in abstracts[a]['author']:
        author = findClosestAuthor(name, authorList)
        G.add_edge(author, title)
        if not 'group' in G.node[author]:
            G.node[author]['group'] = 1
        
    firstAuthor = findClosestAuthor(abstracts[a]['author'][0], authorList)
    G.node[firstAuthor]['group'] = 2
    G.node[title]['group'] = 3

# Loop through all the nodes and add the name property for D3.js
for n in G:
    G.node[n]['name'] = n
    
# Remove parallel edges
G = nx.Graph(G)
# Remove self loops
G.remove_edges_from(G.selfloop_edges())

# Export for D3.js
d = json_graph.node_link_data(G) # node-link format to serialize
json.dump(d, open('html/force.json','w'))

# Use NetworkX to plot the data (I've had limited success)
#import matplotlib.pyplot as plt
#pos=nx.spring_layout(G) # positions for all nodes
#nx.draw_networkx_nodes(G,pos,node_size=2,alpha=0.5)
#nx.draw_networkx_edges(G,pos,alpha=0.25)
#nx.draw_networkx_labels(G,pos,font_size=7,font_family='sans-serif')
#plt.axis('off')
#plt.show() # display

100%|█████████▉| 1458/1459 [24:39<00:01,  1.21s/it]