In [8]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0





In [13]:
# The CSV inputs live in a 'csv_files' directory under the parent of the
# current working directory; the path keeps a trailing separator so file
# names can simply be appended to it.
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = os.sep.join([working_dir, 'csv_files', ''])

# Read the two source tables, each with its own key column.
AOM = em.read_csv_metadata(
    path_to_csv_dir + '_aom.csv',
    key=csv_headers.AOM_INDEX,
)
WHED = em.read_csv_metadata(
    path_to_csv_dir + '_whed.csv',
    key=csv_headers.WHED_INDEX,
)

# Read the labeled pairs; WHED is passed as the left table and AOM as the
# right table, with foreign-key columns named after each table's key.
labeled_data = em.read_csv_metadata(
    path_to_csv_dir + 'golden_data_labeled_nomissing.csv',
    key='_id',
    ltable=WHED,
    rtable=AOM,
    fk_ltable='ltable_' + csv_headers.WHED_INDEX,
    fk_rtable='rtable_' + csv_headers.AOM_INDEX,
    encoding="ISO-8859-1",
)



In [14]:
# Split the labeled pairs 50/50 with a fixed seed: I is the training half,
# J is the held-out test half.
IJ = em.split_train_test(labeled_data, train_proportion=0.5, random_state=0)
I, J = IJ['train'], IJ['test']

In [15]:
# Candidate ML matchers to compare. A fixed random_state is passed to every
# matcher except LinReg, which is created with a name only.
dt = em.DTMatcher(random_state=0, name='DecisionTree')
svm = em.SVMMatcher(random_state=0, name='SVM')
rf = em.RFMatcher(random_state=0, name='RF')
lg = em.LogRegMatcher(random_state=0, name='LogReg')
ln = em.LinRegMatcher(name='LinReg')

In [46]:
# Build the feature table for the WHED (left) / AOM (right) table pair
F = em.get_features_for_matching(ltable=WHED, rtable=AOM)

def is_same_server(ltuple, rtuple):
    """Blackbox feature: does the left record's web value occur in the right
    record's email server value?

    The arguments are NOT symmetric: ``ltuple`` must provide ``'a_web'`` and
    ``rtuple`` must provide ``'a_email_server'``.

    Args:
        ltuple: Mapping-like row (e.g. a pandas Series) with an ``'a_web'`` entry.
        rtuple: Mapping-like row with an ``'a_email_server'`` entry.

    Returns:
        1 if the (whitespace-stripped) web value is a substring of the email
        server value, 0 otherwise. Also 0 when either value is missing or the
        web value is empty.
    """
    web = ltuple['a_web']
    email_server = rtuple['a_email_server']

    # Missing data on either side can never count as a match.
    if pd.isnull(web) or pd.isnull(email_server):
        return 0

    # An empty string is a substring of every string, so without this guard a
    # blank web value would "match" every email server.
    web = str(web).strip()
    if not web:
        return 0

    return 1 if web in str(email_server) else 0

# Registration of the custom is_same_server feature is currently disabled:
# em.add_blackbox_feature(F, 'is_same_server', is_same_server)

# Display the names of the features in F
F['feature_name']

0               a_name_a_name_jac_qgm_3_qgm_3
1           a_name_a_name_cos_dlm_dc0_dlm_dc0
2           a_name_a_name_jac_dlm_dc0_dlm_dc0
3                           a_name_a_name_mel
4                      a_name_a_name_lev_dist
5                       a_name_a_name_lev_sim
6                           a_name_a_name_nmw
7                            a_name_a_name_sw
8         a_country_a_country_jac_qgm_3_qgm_3
9     a_country_a_country_cos_dlm_dc0_dlm_dc0
10    a_country_a_country_jac_dlm_dc0_dlm_dc0
11                    a_country_a_country_mel
12               a_country_a_country_lev_dist
13                a_country_a_country_lev_sim
14                    a_country_a_country_nmw
15                     a_country_a_country_sw
16              a_city_a_city_jac_qgm_3_qgm_3
17          a_city_a_city_cos_dlm_dc0_dlm_dc0
18          a_city_a_city_jac_dlm_dc0_dlm_dc0
19                          a_city_a_city_mel
20                     a_city_a_city_lev_dist
21                      a_city_a_c

In [47]:
# Turn the training pairs I into feature vectors using the features in F,
# carrying the gold label along as a trailing column.
H = em.extract_feature_vecs(
    I,
    show_progress=False,
    attrs_after='gold_label',
    feature_table=F,
)
# Preview the first rows
H.head()


Unnamed: 0,_id,ltable_a_id,rtable_person_id,a_name_a_name_jac_qgm_3_qgm_3,a_name_a_name_cos_dlm_dc0_dlm_dc0,a_name_a_name_jac_dlm_dc0_dlm_dc0,a_name_a_name_mel,a_name_a_name_lev_dist,a_name_a_name_lev_sim,a_name_a_name_nmw,...,a_city_a_city_sw,a_prov_a_prov_jac_qgm_3_qgm_3,a_prov_a_prov_cos_dlm_dc0_dlm_dc0,a_prov_a_prov_jac_dlm_dc0_dlm_dc0,a_prov_a_prov_mel,a_prov_a_prov_lev_dist,a_prov_a_prov_lev_sim,a_prov_a_prov_nmw,a_prov_a_prov_sw,gold_label
408,8607,20499,31346,0.348837,0.67082,0.5,0.725774,20.0,0.285714,2.0,...,1.0,0.0,0.0,0.0,0.711111,13.0,0.133333,-11.0,1.0,0
405,8536,20480,43972,0.333333,0.612372,0.428571,0.6711,22.0,0.371429,-1.0,...,1.0,,,,,,,,,0
108,2217,3081,57232,0.37931,0.632456,0.4,0.888,14.0,0.44,-3.0,...,8.0,0.0,0.0,0.0,0.537037,7.0,0.222222,-5.0,1.0,1
367,7654,20222,46848,0.1,0.408248,0.25,0.511905,19.0,0.095238,-11.0,...,1.0,0.333333,0.707107,0.5,0.890909,6.0,0.454545,-1.0,5.0,0
218,4368,16426,52334,0.181818,0.654654,0.428571,0.614251,28.0,0.243243,-17.0,...,0.0,0.0,0.0,0.0,0.537037,7.0,0.222222,-5.0,1.0,0


In [18]:
# Check if the feature vectors contain missing values.
# A return value of True means that there are missing values.
# Note: any() over a DataFrame iterates its column labels rather than its
# cells, so the null mask is reduced explicitly instead.
bool(pd.isnull(H).values.any())

True

In [48]:
# Fill missing feature values with the per-column mean. The id, foreign-key
# and label columns are excluded from imputation.
H = em.impute_table(
    H,
    strategy='mean',
    exclude_attrs=[
        '_id',
        'ltable_' + csv_headers.WHED_INDEX,
        'rtable_' + csv_headers.AOM_INDEX,
        'gold_label',
    ],
)

In [49]:
# Compare the candidate matchers with 5-fold cross-validation on H, scoring
# by precision; id, foreign-key and label columns are not used as features.
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
    table=H,
    target_attr='gold_label',
    exclude_attrs=[
        '_id',
        'ltable_' + csv_headers.WHED_INDEX,
        'rtable_' + csv_headers.AOM_INDEX,
        'gold_label',
    ],
    metric='precision',
    k=5,
    random_state=0,
)
# Per-fold scores for each matcher
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x114f94898>,5,0.833333,0.727273,0.85,0.9375,1.0,0.869621
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x114f947f0>,5,0.933333,1.0,0.947368,0.933333,1.0,0.962807
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x114f94860>,5,0.846154,1.0,0.933333,1.0,1.0,0.955897
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x115d6d400>,5,0.875,0.833333,0.944444,1.0,1.0,0.930556
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x114f94780>,5,0.928571,0.75,0.941176,0.928571,0.8,0.869664


In [41]:
# Dump the whole select_matcher result as plain text.
# NOTE(review): the saved output of this cell carries an earlier execution
# count than the select_matcher cell and shows different fold scores, so it
# appears to be stale -- re-run after select_matcher before relying on it.
print(result)

OrderedDict([('selected_matcher', <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x115d6d400>), ('cv_stats',            Name  \
0  DecisionTree   
1            RF   
2           SVM   
3        LinReg   
4        LogReg   

                                                                         Matcher  \
0          <py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x114f94898>   
1          <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x114f947f0>   
2        <py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x114f94860>   
3  <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x115d6d400>   
4  <py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x114f94780>   

   Num folds  Fold 1    Fold 2    Fold 3    Fold 4    Fold 5  Mean score  
0          5  0.9375  0.727273  0.944444  0.941176  0.666667    0.843412  
1          5  0.8750  0.727273  0.944444  0.823529  0.714286    0.816906  
2          5  0.6875  0.3636

In [31]:
# Turn the test pairs J into feature vectors using the same feature table F
K = em.extract_feature_vecs(
    J,
    show_progress=False,
    attrs_after='gold_label',
    feature_table=F,
)
# Mean-impute the test vectors; note this call only sees K, not H.
K = em.impute_table(
    K,
    strategy='mean',
    exclude_attrs=[
        '_id',
        'ltable_' + csv_headers.WHED_INDEX,
        'rtable_' + csv_headers.AOM_INDEX,
        'gold_label',
    ],
)

In [44]:
# Train the RF matcher on the training vectors H
rf.fit(
    table=H,
    target_attr='gold_label',
    exclude_attrs=[
        '_id',
        'ltable_' + csv_headers.WHED_INDEX,
        'rtable_' + csv_headers.AOM_INDEX,
        'gold_label',
    ],
)
# Predict on the test vectors K; predictions are appended as a 'predicted'
# column and K itself is not modified in place (inplace=False).
predictions = rf.predict(
    table=K,
    exclude_attrs=[
        '_id',
        'ltable_' + csv_headers.WHED_INDEX,
        'rtable_' + csv_headers.AOM_INDEX,
        'gold_label',
    ],
    target_attr='predicted',
    append=True,
    inplace=False,
)


In [45]:
# Compare the 'predicted' column against the 'gold_label' column of the
# predictions table and print the resulting summary.
eval_summary = em.eval_matches(predictions, 'gold_label', 'predicted')
print(eval_summary)

OrderedDict([('prec_numerator', 72.0), ('prec_denominator', 76.0), ('precision', 0.9473684210526315), ('recall_numerator', 72.0), ('recall_denominator', 79.0), ('recall', 0.9113924050632911), ('f1', 0.9290322580645162), ('pred_pos_num', 76.0), ('false_pos_num', 4.0), ('false_pos_ls', [(11527, 35914), (19863, 17955), (18647, 57879), (13965, 35547)]), ('pred_neg_num', 163.0), ('false_neg_num', 7.0), ('false_neg_ls', [(20607, 50974), (18757, 5307), (11313, 46559), (18757, 16712), (20629, 39304), (20621, 31808), (10172, 56019)])])
