In [28]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0


In [30]:
# Locate the 'csv_files' directory that sits next to the current working directory.
working_dir = os.path.dirname(os.getcwd())
# The trailing '' component keeps a trailing separator on the directory path,
# so code that appends a file name by plain concatenation keeps working.
path_to_csv_dir = os.path.join(working_dir, 'csv_files', '')

# Load the two cleaned input tables, each keyed by its own index column.
AOM = em.read_csv_metadata(os.path.join(path_to_csv_dir, '_aom_cleaned.csv'),
                           key=csv_headers.AOM_INDEX)
WHED = em.read_csv_metadata(os.path.join(path_to_csv_dir, '_whed_cleaned.csv'),
                            key=csv_headers.WHED_INDEX)

# Load the labeled pair table and link it to its base tables:
# WHED is the left table, AOM is the right table.
labeled_data = em.read_csv_metadata(
    os.path.join(path_to_csv_dir, 'G_combined.csv'),
    key='_id',
    ltable=WHED,
    rtable=AOM,
    fk_ltable='ltable_' + csv_headers.WHED_INDEX,
    fk_rtable='rtable_' + csv_headers.AOM_INDEX,
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [31]:
# Split the labeled pairs 50/50 into a development set (I) and an
# evaluation set (J); the fixed random_state makes the split repeatable.
IJ = em.split_train_test(
    labeled_data,
    train_proportion=0.5,
    random_state=0,
)
I, J = IJ['train'], IJ['test']

In [32]:
# Candidate ML matchers to compare. The seedable ones share the same
# random_state; no random_state is passed to the linear regression matcher.
_seed_kwargs = dict(random_state=0)
dt = em.DTMatcher(name='DecisionTree', **_seed_kwargs)
svm = em.SVMMatcher(name='SVM', **_seed_kwargs)
rf = em.RFMatcher(name='RF', **_seed_kwargs)
lg = em.LogRegMatcher(name='LogReg', **_seed_kwargs)
ln = em.LinRegMatcher(name='LinReg')

In [33]:
# Generate the set of matching features for the WHED / AOM table pair.
# The resulting feature table F is extended with a custom feature below.
F = em.get_features_for_matching(WHED, AOM)

def is_same_server(ltuple, rtuple):
    """Blackbox feature: does the left web value appear in the right e-mail server?

    The arguments are not symmetric: ``ltuple`` supplies ``'a_web'`` and
    ``rtuple`` supplies ``'a_email_server'``.

    Returns:
        1 if ``ltuple['a_web']`` equals one of the dot-separated components
        of ``rtuple['a_email_server']``; 0 otherwise, including when either
        value is missing.
    """
    web = ltuple['a_web']
    # Missing value on the left side: report "no match".
    if pd.isnull(web):
        return 0

    email_server = rtuple['a_email_server']
    # Missing value on the right side: report "no match".
    if pd.isnull(email_server):
        return 0

    # Compare against whole dot-separated components, not substrings.
    return int(web in email_server.split('.'))

# Register the custom is_same_server function in F as a blackbox feature
# under the name 'is_same_server'.
em.add_blackbox_feature(F, 'is_same_server', is_same_server)

# Display the names of the features currently in F.
F.feature_name

0               a_name_a_name_jac_qgm_3_qgm_3
1           a_name_a_name_cos_dlm_dc0_dlm_dc0
2           a_name_a_name_jac_dlm_dc0_dlm_dc0
3                           a_name_a_name_mel
4                      a_name_a_name_lev_dist
5                       a_name_a_name_lev_sim
6                           a_name_a_name_nmw
7                            a_name_a_name_sw
8         a_country_a_country_jac_qgm_3_qgm_3
9     a_country_a_country_cos_dlm_dc0_dlm_dc0
10    a_country_a_country_jac_dlm_dc0_dlm_dc0
11                    a_country_a_country_mel
12               a_country_a_country_lev_dist
13                a_country_a_country_lev_sim
14                    a_country_a_country_nmw
15                     a_country_a_country_sw
16              a_city_a_city_jac_qgm_3_qgm_3
17          a_city_a_city_cos_dlm_dc0_dlm_dc0
18          a_city_a_city_jac_dlm_dc0_dlm_dc0
19                          a_city_a_city_mel
20                     a_city_a_city_lev_dist
21                      a_city_a_c

In [34]:
# Turn every labeled pair in I into a feature vector using the features in F,
# carrying the 'gold_label' column along after the feature columns.
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='gold_label',
    show_progress=False,
)
# Preview the first few feature vectors.
H.head()


Unnamed: 0,_id,ltable_a_id,rtable_person_id,a_name_a_name_jac_qgm_3_qgm_3,a_name_a_name_cos_dlm_dc0_dlm_dc0,a_name_a_name_jac_dlm_dc0_dlm_dc0,a_name_a_name_mel,a_name_a_name_lev_dist,a_name_a_name_lev_sim,a_name_a_name_nmw,...,a_prov_a_prov_jac_qgm_3_qgm_3,a_prov_a_prov_cos_dlm_dc0_dlm_dc0,a_prov_a_prov_jac_dlm_dc0_dlm_dc0,a_prov_a_prov_mel,a_prov_a_prov_lev_dist,a_prov_a_prov_lev_sim,a_prov_a_prov_nmw,a_prov_a_prov_sw,is_same_server,gold_label
80,759,12554,36459,1.0,1.0,1.0,1.0,0.0,1.0,29.0,...,,,,,,,,,0,1
51,521,10527,24053,0.515152,0.707107,0.5,0.856667,14.0,0.533333,2.0,...,,,,,,,,,0,0
2,21,15,28818,1.0,1.0,1.0,1.0,0.0,1.0,17.0,...,0.0,0.0,0.0,0.0,7.0,0.0,-6.0,0.0,0,1
216,1857,20242,7908,0.395349,0.632456,0.4,0.887179,22.0,0.435897,-5.0,...,,,,,,,,,0,1
104,967,15293,57956,0.416667,0.816497,0.666667,0.596649,24.0,0.111111,-4.0,...,,,,,,,,,0,0


In [35]:
# Check whether the feature vectors contain missing values.
# A return value of True means that at least one cell of H is missing.
# (Built-in any() over a DataFrame iterates its column labels, not its
# values, so the check has to be done on the underlying boolean array.)
H.isnull().values.any()

True

In [36]:
# Fill missing feature values with the mean of their column.
# Key, foreign-key and label columns are excluded from imputation.
impute_skip_cols = [
    '_id',
    'ltable_' + csv_headers.WHED_INDEX,
    'rtable_' + csv_headers.AOM_INDEX,
    'gold_label',
]
H = em.impute_table(H, exclude_attrs=impute_skip_cols, strategy='mean')

In [37]:
# Compare the candidate matchers with 5-fold cross-validation on H, scored by F1.
# Key, foreign-key and label columns are not used as features.
non_feature_cols = [
    '_id',
    'ltable_' + csv_headers.WHED_INDEX,
    'rtable_' + csv_headers.AOM_INDEX,
    'gold_label',
]
candidate_matchers = [dt, rf, svm, ln, lg]
result = em.select_matcher(
    candidate_matchers,
    table=H,
    exclude_attrs=non_feature_cols,
    k=5,
    target_attr='gold_label',
    metric='f1',
    random_state=0,
)
# Per-fold and mean scores for every matcher.
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x10b19a668>,5,0.904762,0.909091,0.75,0.810811,0.954545,0.865842
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10b19a0b8>,5,0.909091,0.956522,0.846154,0.857143,0.952381,0.904258
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x10b19a6a0>,5,0.933333,0.958333,0.6875,0.809524,0.913043,0.860347
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10b18bb38>,5,0.954545,0.977778,0.758621,0.833333,0.930233,0.890902
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x10b19aef0>,5,0.883721,0.977778,0.647059,0.820513,0.930233,0.851861


In [23]:
# Dump the full selection result (selected matcher plus the CV statistics table).
# NOTE(review): this cell's execution count (23) is out of sequence with the
# cells above, and its scores differ from the cv_stats table shown above, so
# this output comes from a different run. Re-run the notebook top to bottom.
print(result)

OrderedDict([('selected_matcher', <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10c2e5240>), ('cv_stats',            Name  \
0  DecisionTree   
1            RF   
2           SVM   
3        LinReg   
4        LogReg   

                                                                         Matcher  \
0          <py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x10c2e51d0>   
1          <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x10c2e5240>   
2        <py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x10c2e5208>   
3  <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x10c2e5d68>   
4  <py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x10c2e5f60>   

   Num folds    Fold 1    Fold 2    Fold 3    Fold 4    Fold 5  Mean score  
0          5  0.956522  0.909091  0.785714  0.780488  0.926829    0.871729  
1          5  0.930233  0.978723  0.785714  0.894737  0.930233    0.903928  
2          5  0.936170  0.9019