In [2]:
import os
import codecs

# Root directory of the corpus; every '.txt' file below it is one author mention.
rexa_dir = './rexa/'
# Field names a record may carry; used elsewhere in the notebook to pad
# missing fields and as the CSV column list.
field_names = [
    'author-in-focus', 'author-in-focus-score',
    'authorlist', 'alt-authorlist', 'altTitle',
    'editor', 'email', 'institution', 'journal',
    'abstract', 'body', 'keyword', 'title', 'year'
]
authors_f = []

# Collect every mention file together with the name of the directory that
# holds it; that directory name is used as the cluster label.
# NOTE(review): os.walk lists entries in whatever order the OS returns them,
# so the position of a record in authors_f may differ between machines.
for root, dirs, files in os.walk(rexa_dir, topdown=False):
    # normpath drops a trailing separator so basename always returns the
    # directory's own name, even for the top-level directory.
    cluster_name = os.path.basename(os.path.normpath(root))
    for f in files:
        if f.endswith('.txt'):
            authors_f.append({
                'path': os.path.join(root, f),
                'cluster_name': cluster_name
            })

# Show one entry as a sanity check without crashing when nothing was found.
if authors_f:
    print(authors_f[0])
else:
    print('no .txt files found under {}'.format(rexa_dir))

{'cluster_name': 'SusanJoRussell', 'path': './rexa/russell_s/SusanJoRussell/mention#882379.txt'}


In [3]:
def raw2dict(raw):
    """Parse ``key: value`` lines into a dictionary.

    Args:
        raw: iterable of text lines, e.g. the result of ``readlines()``.

    Returns:
        dict mapping each stripped key to its stripped value. A line is split
        on its first ':' only, so values may themselves contain colons. When a
        key occurs more than once the last occurrence wins.

    Blank lines and lines that contain no ':' are skipped instead of raising
    IndexError.

    NOTE(review): the records printed in this notebook contain journal values
    such as 'J Biol Chemyear: 1999', which suggests some inputs carry two
    fields on one line. Everything after the first ':' is kept as the value
    here; confirm whether such lines need to be split further.
    """
    parsed = {}
    for entry in raw:
        key, separator, value = entry.partition(':')
        if not separator:
            # Blank line or a line without a key/value delimiter.
            continue
        parsed[key.strip()] = value.strip()
    return parsed

In [4]:
from xml.etree import ElementTree as ET

def clean_name(xml_string):
    """Flatten an XML name element into a plain, space-separated string.

    Args:
        xml_string: XML text whose root element has one child per name part.

    Returns:
        The text of the root's direct children joined by single spaces, in
        document order. Children without text (for example an empty
        ``<middle/>``) are left out; previously their ``None`` text made
        ``' '.join`` raise TypeError.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_string`` is not well-formed
        XML.
    """
    root = ET.fromstring(xml_string)
    return ' '.join(child.text for child in root if child.text is not None)

In [5]:
def fill_na(author):
    """Make sure every name in ``field_names`` is a key of ``author``.

    Keys that are absent are added with the value None; keys that are already
    present keep their value. The dict is changed in place and also returned.
    """
    for name in field_names:
        author.setdefault(name, None)
    return author

In [6]:
# Parse every mention file into a dict and tag it with its cluster label.
authors_d = []

for mention in authors_f:
    # Undecodable bytes are dropped (errors='ignore') rather than aborting the load.
    with codecs.open(mention['path'], 'r', 'utf-8', errors='ignore') as handle:
        lines = handle.readlines()
    record = raw2dict(lines)
    record['cluster_name'] = mention['cluster_name']
    authors_d.append(record)
print(len(authors_d))

3002


In [7]:
# for author_d in authors_d:
#     author_d['author-in-focus'] = clean_name(author_d['author-in-focus'])
#     author_d['authorlist'] = ', '.join([clean_name(author_name) for author_name in author_d['authorlist'].split('%%')])
#     if author_d.get('alt-authorlist', None) is not None:
#         author_d['alt-authorlist'] = ', '.join(clean_name(author_name) for author_name in author_d['alt-authorlist'].split('%%'))
#     if author_d.get('keyword', None) is not None:
#         author_d['keyword'] = ', '.join([keyword.strip() for keyword in author_d['keyword'].split(',') ])
#     if author_d.get('author-in-focus-score', None) is not None:
#         author_d['author-in-focus-score'] = float(author_d['author-in-focus-score'])
# print(authors_d[0])

In [8]:
# Clean author data
# Clean author data
# Builds authors_dict: record position in authors_d -> cleaned record.
authors_dict = {}

for index, author_d in enumerate(authors_d):
    # Records are cleaned in place, and the grouping cell reads the cleaned
    # values back out of authors_d. A record whose 'authorlist' is already a
    # tuple was processed by an earlier run of this cell; parsing it again
    # would feed plain names to the XML parser and crash, so it is only
    # re-registered.
    if not isinstance(author_d.get('authorlist'), tuple):
        fill_na(author_d)
        author_d['author-in-focus'] = clean_name(author_d['author-in-focus'])
        # Author lists are '%%'-separated XML fragments, one per author.
        author_d['authorlist'] = tuple(
            clean_name(author_name)
            for author_name in author_d['authorlist'].split('%%')
        )
        if author_d.get('alt-authorlist') is not None:
            author_d['alt-authorlist'] = tuple(
                clean_name(author_name)
                for author_name in author_d['alt-authorlist'].split('%%')
            )
        if author_d.get('keyword') is not None:
            author_d['keyword'] = tuple(
                keyword.strip() for keyword in author_d['keyword'].split(',')
            )
        if author_d.get('author-in-focus-score') is not None:
            author_d['author-in-focus-score'] = float(author_d['author-in-focus-score'])
    authors_dict[index] = author_d

In [9]:
# Inspect one cleaned record. Index it instead of pop(): pop() would delete
# record 1 from authors_dict, hiding it from the dedupe steps that consume
# authors_dict and raising KeyError when this cell is run a second time.
print(authors_dict[1])

{'body': None, u'authorlist': ('SJ Russell', 'KA Steger', 'SA Johnston'), u'alt-authorlist': ('SJ Russell', 'KA Steger', 'SA Johnston'), 'institution': None, u'title': u'Subcellular localization, stoichiometry, and protein levels of 26 S proteasome subunits in yeast', u'journal': u'J Biol Chemyear: 1999', 'abstract': None, u'author-in-focus': 'SJ Russell', 'cluster_name': 'SarahJRussell', 'altTitle': None, 'editor': None, 'year': None, 'keyword': None, u'author-in-focus-score': 0.6251166, 'email': None}


In [10]:
from itertools import groupby
      

# Map each cluster name to the tuple of its records.
# groupby() only merges *adjacent* items with equal keys, so the records are
# sorted by cluster name first; otherwise a cluster that shows up in two
# separate runs would keep only its last run. sorted() is stable, so records
# keep their original relative order inside each cluster.
training_data = dict(
    (key, tuple(group))
    for key, group in groupby(
        sorted(authors_d, key=lambda item: item['cluster_name']),
        lambda item: item['cluster_name']
    )
)

In [11]:
import dedupe

# Field definitions handed to dedupe.Dedupe. The 'Set' fields are the ones the
# cleaning cell stores as tuples; the fields flagged 'has missing' are the ones
# that may be None after fill_na.
fields = [
    {'field' : 'author-in-focus', 'type': 'String'},
#     {'field' : 'author-in-focus-score', 'type': 'Price'},
    {'field' : 'authorlist', 'type': 'Set'},
    {'field' : 'alt-authorlist', 'type': 'Set', 'has missing' : True},
    {'field' : 'email', 'type': 'String', 'has missing' : True},
    {'field' : 'keyword', 'type': 'Set', 'has missing' : True},
#     {'field' : 'abstract', 'type': 'Text', 'has missing' : True},
#     {'field' : 'body', 'type': 'Text', 'has missing' : True},
    {'field' : 'journal', 'type': 'String', 'has missing' : True},
    {'field' : 'institution', 'type': 'String', 'has missing' : True},
]

deduper = dedupe.Dedupe(fields)

# NOTE(review): sample() presumably draws the candidate record pairs used for
# training - confirm against the installed dedupe version.
deduper.sample(authors_dict)

# Cluster names from which one labelled 'match' pair is taken (two records of
# the same cluster).
match_clusters = [
    'SarahJRussell',
    'DMAllen-ohu',
    'AlvinBlum',
    'SJonesKnowEng',
    'MAJordan',
    'DKoller',
    'LHLee-elec',
    'JGMcGuire',
    'AlanMoore',
    'RajeevMotwani',
    'SebastianThrun',
    'StephenJYoung'
]

# Pairs of cluster names from which one labelled 'distinct' pair is taken (one
# record of each cluster).
# NOTE(review): ('SAYoung', 'SCKYoung') is listed twice, so the same pair is
# submitted twice - confirm whether that is intended or a different pair was
# meant.
distinct_clusters = [
    ('SAYoung', 'SCKYoung'),
    ('SAYoung', 'SCKYoung'),
    ('StephenRussell', 'StephenRussellBIO'),
    ('RajeevMotwani', 'RaviMotwani'),
    ('AndrewJMoore', 'AndrewMMoore'),
    ('JBMcGuire', 'JGMcGuire'),
    ('LALee1', 'LALee2'),
    ('DKoller', 'DanielKoller'),
    ('MarilynJordan', 'MauriceJordan'),
    ('SCJones1', 'SCJones2'),
    ('AlvinBlum', 'AvrimBlum'),
    ('DAllen-jr', 'DAllen-ucla'),
]

# Build the labelled examples from training_data (cluster name -> records):
#   match    - the last two records of each cluster in match_clusters
#   distinct - the first record of each cluster in a distinct_clusters pair
# .get() returns None for a cluster name that is not in training_data, so a
# misspelled name surfaces as a TypeError on the subscript; a match cluster
# with a single record raises IndexError on [-2].
deduper.markPairs({
    'match': [
        (training_data.get(cluster_name)[-1], training_data.get(cluster_name)[-2])
        for cluster_name in match_clusters
    ],
    'distinct': [
        (training_data.get(cluster_l)[0], training_data.get(cluster_r)[0])
        for cluster_l, cluster_r in distinct_clusters
    ]
})

print('start training...')

deduper.train()

print('finished...')

training_file = 'author_training.json'

# NOTE(review): writeTraining() presumably saves the labelled pairs so they can
# be reloaded later - confirm against the installed dedupe version.
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)
    
# The value returned by threshold() is passed straight to match() below.
# NOTE(review): recall_weight=1 presumably weighs recall and precision equally
# - confirm with the dedupe documentation.
threshold = deduper.threshold(authors_dict, recall_weight=1)

print('clustering...')
clustered_dupes = deduper.match(authors_dict, threshold)

print('# duplicate sets', len(clustered_dupes))

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...


start training...


INFO:rlr.crossvalidation:optimum alpha: 0.100000
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.4, authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.2, authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.8, authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.6, authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.6, alt-authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.4, alt-authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.2, alt-authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.8, alt-authorlist)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.2, keyword)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.4, keyword)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.8, keyword)
INFO:dedupe.blocking:Canopy: TfidfSetCanopyPredicate: (0.6, keyword)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, journal)
INFO:dedupe.

INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (3, email)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, email)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (2, email)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (4, email)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.2, email)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.4, email)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, email)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.8, email)
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(ExistsPredicate: (Exists, journal), TfidfNGramCanopyPredicate: (0.2, author-in-focus))
INFO:dedupe.training:(SimplePredicate: (magnitudeOfCardinality, authorlist), SimplePredicate: (twoGramFingerprint, author-in-focus))
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.2, author-in-focus)


finished...


INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.786
INFO:dedupe.api:precision: 0.650
INFO:dedupe.api:With threshold: 0.355
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.2, author-in-focus)


clustering...
('# duplicate sets', 138)


In [33]:
# Display the last entry of the match() result.
# NOTE(review): the recorded output has the shape ((id, id), (score, score)) -
# presumably the keys of the matched records in authors_dict and one score per
# record; confirm with the dedupe documentation.
clustered_dupes[-1]


((2559, 2560), (0.88590854, 0.88590854))

In [14]:
# import pandas as pd

# authors_df = pd.DataFrame.from_dict(authors_d)
# authors_df.describe()

Unnamed: 0,abstract,alt-authorlist,altTitle,author-in-focus,author-in-focus-score,authorlist,body,editor,email,institution,journal,keyword,title,year
count,746,2965,219,3002,3002.0,3002,719,93,275,629,1227,114,3002,65
unique,726,2138,214,312,48.0,2136,708,89,161,373,1163,112,2864,17
top,Abstract Many algorithms rely critically on be...,Simon L Peyton Jones,Experiences with an interactive museum tour-gu...,R Motwani,1.0,Simon L Peyton Jones,1 Introduction The performance of many learnin...,"ed.,",russell@cs.berkeley.edu,School of Computer Science Carnegie Mellon Uni...,"Technical report,year: 1992","Keywords: Approximation, Prize Collecting Trav...",Artificial Intelligence: A Modern Approach,2001
freq,2,28,2,161,1169.0,30,2,2,13,19,7,2,4,8


In [15]:
# authors_df

Unnamed: 0,abstract,alt-authorlist,altTitle,author-in-focus,author-in-focus-score,authorlist,body,editor,email,institution,journal,keyword,title,year
0,"""Investigations in Number, Data and Space"" is ...",,,Susan Jo Russell,0.60451025,"Susan Jo Russell, Karen Economopoulos",,,,,,,"A Revision of Investigatons in Number, Data an...",
1,,"SJ Russell, KA Steger, SA Johnston",,SJ Russell,0.6251166,"SJ Russell, KA Steger, SA Johnston",,,,,J Biol Chemyear: 1999,,"Subcellular localization, stoichiometry, and p...",
2,,"P J Russell, J M Doenias, S J Russell",,S J Russell,0.60451025,"P J Russell, J M Doenias, S J Russell",,,,,,,GELYMAC: a Macintosh application for calculati...,
3,,"C J Lowenstein, E W Alley, P Raval, A M Snowma...",,S W Russell,0.60451025,"C J Lowenstein, E W Alley, P Raval, A M Snowma...",,,,,Proc.Natl.Acad.Sci.year: 1993,,Macrophage nitric oxide synthase gene: two ups...,
4,,"R B Lorsbach, W J Murphy, C J Lowenstein, S H ...",,S W Russell,0.60451025,"R B Lorsbach, W J Murphy, C J Lowenstein, S H ...",,,,,J.Biol.Chem.year: 1993,,Expression of the nitric oxide synthase gene i...,
5,,S Russel,,S Russel,1.0,S Russel,,,,,,,"Analogy By Similarity, Analogical reasoning,",
6,,"A Y Ng, D Harada, S Russell",,S Russell,0.6251166,"A Y Ng, D Harada, S Russell",,,,,In Proceedings of the Sixteenth International ...,,Theory and application to reward shaping,
7,Abstract. Memory-bounded algorithms such as Ko...,Stuart Russell,,Stuart Russell,0.6251166,Stuart Russell,1 Introduction This paper adopts the standard ...,,,,,,Efficient memory-bounded search methods,
8,,S Russell,,S Russell,0.6251166,S Russell,,,,,Proc. International Workshop on automatic Spee...,,Expressive probability models for speech recog...,
9,,"Ronald Parr, Stuart Russell, Mike Malone",,Stuart Russell,0.6251166,"Ronald Parr, Stuart Russell, Mike Malone",,,,,"Technical report,year: 1992",,The RALPH system,


In [52]:
# data_frame = pd.read_csv('./authors.csv')
# data_frame.describe()

Unnamed: 0,author-in-focus-score,year
count,3002.0,65.0
mean,0.925739,1997.738462
std,0.112081,4.338335
min,0.507084,1983.0
25%,0.92777,1995.0
50%,0.975576,1998.0
75%,1.0,2001.0
max,1.0,2004.0


In [65]:
from csv import DictWriter

# Export the records to CSV using the columns listed in field_names.
# Every record also carries a 'cluster_name' key that is not in field_names;
# with DictWriter's default extrasaction='raise' that is a ValueError on the
# first row, so keys outside field_names are dropped explicitly instead.
# Non-string values (tuples, floats, None) are written as the csv module
# converts them.
with open('authors.csv', 'w') as authors_file:
    writer = DictWriter(authors_file, field_names, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(authors_d)