### Clean data

In [None]:
import os
import codecs
import pandas as pd
import itertools


In [None]:
# Root directory that is scanned for record files.
# NOTE(review): hard-coded absolute path into one user's home directory;
# consider making this configurable.
rexa_dir = '/home/is2548/ML/entity-resolution/data/'

# Every field a record is expected to expose; fill_na() adds the missing ones.
field_names = [
    'author-in-focus', 'author-in-focus-score',
    'authorlist', 'alt-authorlist', 'altTitle', 
    'editor', 'email', 'institution', 'journal',
    'abstract', 'body', 'keyword', 'title', 'year'
]

# One entry per *.txt file: its path and the name of the directory holding it
# (used as the cluster label).
authors_f = []

for root, dirs, files in os.walk(rexa_dir, topdown=False):
    for f in files:
        if f.endswith('.txt'):
            authors_f.append({
                'path': os.path.join(root, f),
                'cluster_name': os.path.basename(root)
            })

# os.walk yields entries in filesystem order; sort by (directory, file name)
# so that the positional slices taken from the data later are reproducible.
authors_f.sort(key=lambda entry: os.path.split(entry['path']))

# Show a sample entry, without crashing when the directory holds no records.
if authors_f:
    print(authors_f[0])
else:
    print('no .txt files found under {}'.format(rexa_dir))

In [None]:
def raw2dict(raw):
    """Turn an iterable of 'key: value' lines into a dict.

    Each line is split on its first ':'; key and value are stripped of
    surrounding whitespace. Lines without a ':' (including blank lines) are
    skipped instead of raising IndexError. When a key occurs more than once,
    the last occurrence wins.

    :param raw: iterable of text lines, e.g. the result of ``file.readlines()``
    :return: dict mapping stripped keys to stripped values
    """
    record = {}
    for entry in raw:
        key, separator, value = entry.partition(':')
        if not separator:
            # No ':' on this line, so there is no key/value pair to record.
            continue
        record[key.strip()] = value.strip()
    return record

In [None]:
from xml.etree import ElementTree as ET

def clean_name(xml_string):
    """Flatten an XML name fragment into a plain, space-separated string.

    The fragment is parsed and the text of each direct child of the root
    element is joined with single spaces, in document order.

    :param xml_string: XML text with a single root element
    :return: the children's text joined by ' ' ('' when there is none)
    :raises xml.etree.ElementTree.ParseError: if xml_string is not valid XML
    """
    root = ET.fromstring(xml_string)
    # Element.text is None for empty elements (e.g. <middle/>); joining None
    # would raise TypeError, so such children are left out.
    return ' '.join(child.text for child in root if child.text is not None)

In [None]:
def fill_na(author, fields=None):
    """Make sure `author` has an entry for every expected field.

    Fields that are absent are added with the value None; existing values are
    left untouched. The dict is modified in place.

    :param author: record dict to complete
    :param fields: iterable of field names to guarantee; defaults to the
        module-level ``field_names`` list
    :return: the same `author` dict, for convenience
    """
    if fields is None:
        fields = field_names
    for field in fields:
        author.setdefault(field, None)
    return author

In [None]:
# Parse every discovered record file into a dict and tag it with the cluster
# (directory) it was found in.
authors_d = []

for file_info in authors_f:
    # errors='ignore' drops bytes that are not valid UTF-8 instead of failing.
    with codecs.open(file_info['path'], 'r', 'utf-8', errors='ignore') as handle:
        record = raw2dict(handle.readlines())
    record['cluster_name'] = file_info['cluster_name']
    authors_d.append(record)

print(len(authors_d))

In [None]:
# Clean author data
def _other_authors(raw_list, focus_name):
    """Clean a '%%'-separated author list and drop the author in focus.

    Each name is parsed once with clean_name(). Returns a tuple of the
    remaining names, or ('',) when none are left, so the field is never empty.
    """
    cleaned = (clean_name(raw_name) for raw_name in raw_list.split('%%'))
    return tuple(name for name in cleaned if name != focus_name) or ('',)


# Cleaned records keyed by their position in authors_d.
authors_dict = {}

# Records are normalised IN PLACE: the DataFrame built afterwards reads
# authors_d directly. As a consequence this cell cannot be re-run on its own
# (the cleaned tuples no longer have .split); re-run the loading cell first.
for index, author_d in enumerate(authors_d):
    fill_na(author_d)

    focus_name = clean_name(author_d['author-in-focus'])
    author_d['author-in-focus'] = focus_name

    # 'authorlist' is treated as mandatory: a record without it fails here.
    author_d['authorlist'] = _other_authors(author_d['authorlist'], focus_name)

    if author_d.get('alt-authorlist') is not None:
        author_d['alt-authorlist'] = _other_authors(author_d['alt-authorlist'], focus_name)

    if author_d.get('keyword') is not None:
        author_d['keyword'] = tuple(keyword.strip() for keyword in author_d['keyword'].split(','))

    # A missing journal becomes an empty string so it can be compared as text.
    if author_d.get('journal') is None:
        author_d['journal'] = ''

    if author_d.get('author-in-focus-score') is not None:
        author_d['author-in-focus-score'] = float(author_d['author-in-focus-score'])

    authors_dict[index] = author_d
    

In [None]:
# One row per cleaned record. authors_d is a list of dicts, which the plain
# DataFrame constructor accepts directly.
df = pd.DataFrame(authors_d)

# Keep only the known fields plus the cluster label, in a fixed column order.
df = df[[
    'author-in-focus', 'author-in-focus-score', 'authorlist', 'title',
    'alt-authorlist', 'journal', 'abstract', 'body', 'institution',
    'email', 'altTitle', 'keyword', 'editor', 'year', 'cluster_name'
]]

In [None]:
# Positional train/test split: the first `cutoff` rows are used for training,
# the following cutoff // 2 rows for testing.
cutoff = 1000
df1 = df[:cutoff]
# Floor division keeps the slice bound an integer on Python 3 as well;
# `cutoff / 2` is a float there and cannot be used as a slice bound.
df2 = df[cutoff:cutoff + cutoff // 2]
# Relabel the test rows from 0. Without drop=True the previous labels are kept
# in an extra 'index' column.
df2 = df2.reset_index()

### Run with dedupe

In [None]:
# from itertools import groupby
# training_data = dict([(key, tuple(group)) for key, group in groupby(authors_d, lambda item: item['cluster_name'])])

In [None]:
# import dedupe

# fields = [
#     {'field' : 'author-in-focus', 'type': 'String'},
# #     {'field' : 'author-in-focus-score', 'type': 'Price'},
#     {'field' : 'authorlist', 'type': 'Set'},
#     {'field' : 'alt-authorlist', 'type': 'Set', 'has missing' : True},
#     {'field' : 'email', 'type': 'String', 'has missing' : True},
#     {'field' : 'keyword', 'type': 'Set', 'has missing' : True},
# #     {'field' : 'abstract', 'type': 'Text', 'has missing' : True},
# #     {'field' : 'body', 'type': 'Text', 'has missing' : True},
#     {'field' : 'journal', 'type': 'String', 'has missing' : True},
#     {'field' : 'institution', 'type': 'String', 'has missing' : True},
# ]

# deduper = dedupe.Dedupe(fields)

# deduper.sample(authors_dict)

# match_clusters = [
#     'SarahJRussell',
#     'DMAllen-ohu',
#     'AlvinBlum',
#     'SJonesKnowEng',
#     'MAJordan',
#     'DKoller',
#     'LHLee-elec',
#     'JGMcGuire',
#     'AlanMoore',
#     'RajeevMotwani',
#     'SebastianThrun',
#     'StephenJYoung'
# ]

# distinct_clusters = [
#     ('SAYoung', 'SCKYoung'),
#     ('SAYoung', 'SCKYoung'),
#     ('StephenRussell', 'StephenRussellBIO'),
#     ('RajeevMotwani', 'RaviMotwani'),
#     ('AndrewJMoore', 'AndrewMMoore'),
#     ('JBMcGuire', 'JGMcGuire'),
#     ('LALee1', 'LALee2'),
#     ('DKoller', 'DanielKoller'),
#     ('MarilynJordan', 'MauriceJordan'),
#     ('SCJones1', 'SCJones2'),
#     ('AlvinBlum', 'AvrimBlum'),
#     ('DAllen-jr', 'DAllen-ucla'),
# ]

# deduper.markPairs({
#     'match': [
#         (training_data.get(cluster_name)[-1], training_data.get(cluster_name)[-2])
#         for cluster_name in match_clusters
#     ],
#     'distinct': [
#         (training_data.get(cluster_l)[0], training_data.get(cluster_r)[0])
#         for cluster_l, cluster_r in distinct_clusters
#     ]
# })

# print('start training...')

# deduper.train()

# print('finished...')

# training_file = 'author_training.json'

# with open(training_file, 'w') as tf:
#     deduper.writeTraining(tf)
    
# threshold = deduper.threshold(authors_dict, recall_weight=1)

# print('clustering...')
# clustered_dupes = deduper.match(authors_dict, threshold)

# print('# duplicate sets', len(clustered_dupes))

### Build Features

In [None]:
import jellyfish
import pandas as pd

# Feature columns are named '<field>_<metric>': all metrics for the first
# field, then all metrics for the second field, and so on. creat_row() emits
# its values in this same order.
_feature_fields = ('author', 'authorlist', 'title', 'journal')
_feature_metrics = (
    'damerau_levenshtein_distance',
    'hamming_distance',
    'jaro_distance',
    'jaro_winkler',
    'levenshtein_distance',
)
features = [
    '{}_{}'.format(field_name, metric_name)
    for field_name in _feature_fields
    for metric_name in _feature_metrics
]

# Label column: 1 when both records of a pair share a cluster, else 0.
target = 'identical'

# Empty frames with the final column layout, one for each split.
pairwise_columns = features + [target]
pairwise_df_train = pd.DataFrame(columns=pairwise_columns)
pairwise_df_test = pd.DataFrame(columns=pairwise_columns)

def _as_text(value):
    """Return `value` as a text string that can be passed to jellyfish.

    None and NaN become u''; byte strings are decoded as UTF-8 (undecodable
    bytes are replaced); text is returned unchanged. This replaces the former
    ``value.decode('unicode-escape')``, which does not exist on Python 3 text
    and reinterprets backslashes in the data as escape sequences.
    """
    if value is None or (isinstance(value, float) and value != value):
        return u''
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def creat_row(df, row_1, row_2):
    """Build the pairwise feature row for two records of `df`.

    :param df: frame with 'author-in-focus', 'authorlist', 'title', 'journal'
        and 'cluster_name' columns
    :param row_1: index label of the first record
    :param row_2: index label of the second record
    :return: list of 21 values: five string metrics each for the author in
        focus, the co-author lists (minimum over all name pairs), the title
        and the journal, followed by 1 if both records share a cluster_name
        and 0 otherwise. The order matches ``features + [target]``.
    """
    # Label-based lookup; .ix is no longer available in current pandas.
    row0 = df.loc[row_1]
    row1 = df.loc[row_2]

    # NOTE(review): recent jellyfish releases appear to expose jaro_distance /
    # jaro_winkler as jaro_similarity / jaro_winkler_similarity -- confirm
    # against the installed version.
    metrics = (
        jellyfish.damerau_levenshtein_distance,
        jellyfish.hamming_distance,
        jellyfish.jaro_distance,
        jellyfish.jaro_winkler,
        jellyfish.levenshtein_distance,
    )

    aio0 = _as_text(row0['author-in-focus'])
    aio1 = _as_text(row1['author-in-focus'])

    # A missing or empty co-author list is treated as a single empty name so
    # that min() below always has something to compare.
    al0 = [_as_text(name) for name in (row0['authorlist'] or ('',))]
    al1 = [_as_text(name) for name in (row1['authorlist'] or ('',))]

    title_0 = _as_text(row0['title'])
    title_1 = _as_text(row1['title'])

    journal_0 = _as_text(row0['journal'])
    journal_1 = _as_text(row1['journal'])

    values = [metric(aio0, aio1) for metric in metrics]
    # NOTE(review): min() picks the closest pair for the edit distances, but
    # the Jaro scores look like similarities (higher = closer), for which
    # min() picks the least similar pair -- confirm whether max() was intended.
    values.extend(
        min(metric(x, y) for x in al0 for y in al1)
        for metric in metrics
    )
    values.extend(metric(title_0, title_1) for metric in metrics)
    values.extend(metric(journal_0, journal_1) for metric in metrics)

    values.append(int(row0['cluster_name'] == row1['cluster_name']))
    return values


In [None]:
# row0 = df.ix[0]
# row1 = df.ix[1]

# aio0 = row0['author-in-focus']
# aio1 = row1['author-in-focus']

# al0 = row0['authorlist']
# al1 = row1['authorlist']
# print min([jellyfish.damerau_levenshtein_distance(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1])
# print min([jellyfish.hamming_distance(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1])
# print min([jellyfish.jaro_distance(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1])
# print min([jellyfish.jaro_winkler(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1])
# print min([jellyfish.levenshtein_distance(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1])
# [jellyfish.match_rating_comparison(x.decode('unicode-escape'), y.decode('unicode-escape')) for x in al0 for y in al1]

In [None]:
# Training pairs: every ordered pair (n1, n2) of distinct rows of df1.
# Rows are collected in plain lists and the frame is built once at the end;
# growing a DataFrame with .loc[i] = ... one row at a time is very slow for
# the roughly len(df1) ** 2 rows produced here.
train_rows = []
train_index = []
train_size = len(df1)

for i, (n1, n2) in enumerate(itertools.product(range(train_size), repeat=2)):
    if i % 1000 == 0 or i % 1000 == 999:
        # Progress trace; single-argument print() works on Python 2 and 3.
        print('{} {} {}'.format(i, n1, n2))
    if n1 != n2:
        # Keep the enumerate position as the row label.
        train_index.append(i)
        train_rows.append(creat_row(df1, n1, n2))

pairwise_df_train = pd.DataFrame(
    train_rows, index=train_index, columns=features + [target]
)
pairwise_df_train.head()

In [None]:
# Test pairs: every ordered pair (n1, n2) of distinct rows of df2.
# Rows are collected in plain lists and the frame is built once at the end;
# growing a DataFrame with .loc[i] = ... one row at a time is very slow for
# the roughly len(df2) ** 2 rows produced here.
test_rows = []
test_index = []
test_size = len(df2)

for i, (n1, n2) in enumerate(itertools.product(range(test_size), repeat=2)):
    if i % 1000 == 0 or i % 1000 == 999:
        # Progress trace; single-argument print() works on Python 2 and 3.
        print('{} {} {}'.format(i, n1, n2))
    if n1 != n2:
        # Keep the enumerate position as the row label.
        test_index.append(i)
        test_rows.append(creat_row(df2, n1, n2))

pairwise_df_test = pd.DataFrame(
    test_rows, index=test_index, columns=features + [target]
)
pairwise_df_test.head()

In [None]:
# Number of training pairs (rows of the pairwise training frame).
pairwise_df_train.shape[0]

In [None]:
# Sanity check: distinct label values present in the training pairs.
# Uses the `target` constant instead of repeating the column name.
pairwise_df_train[target].unique()

In [None]:
# Model inputs as NumPy arrays. The dtypes are pinned explicitly: the pairwise
# frames start out as empty frames without numeric column dtypes, so the dtype
# of .values would otherwise depend on how the rows were inserted.
X_train = pairwise_df_train[features].values.astype(float)
y_train = pairwise_df_train[target].values.astype(int)

X_test = pairwise_df_test[features].values.astype(float)
y_test = pairwise_df_test[target].values.astype(int)