# Matching 

In [1]:
import py_entitymatching as em
import os
import pandas as pd
import time

In [2]:
# start time
# time.clock() was removed in Python 3.8. Use it where it still exists (so the
# reading is taken from the same clock as on older interpreters) and fall back
# to time.perf_counter() otherwise.
tic0 = (getattr(time, 'clock', None) or time.perf_counter)()

## 1. Data preparation

### 1.1 Load datasets

In [3]:
# Input file locations, all resolved relative to the dataset directory
dataset_dir = 'data'
path_ama = os.path.join(dataset_dir, 'amazon_products.csv')   # Amazon dataset
path_egg = os.path.join(dataset_dir, "newegg_products.csv")   # NewEgg dataset
path_labeled_data = os.path.join(dataset_dir, 'labeled.csv')  # Labeled (G) sample after blocking

In [4]:
# Load data with utf-8 encoding
# Amazon product table, keyed by its "ASIN" column
tb_ama = em.read_csv_metadata(path_ama, key="ASIN", encoding='utf-8')
# NewEgg product table, keyed by its "NID" column
tb_egg = em.read_csv_metadata(path_egg, key="NID", encoding='utf-8')
# Sample S: the labeled pairs, keyed by "_id". ltable_ASIN / rtable_NID are
# declared as its foreign keys into tb_ama and tb_egg respectively.
S = em.read_csv_metadata(path_labeled_data, 
                        key="_id", 
                        ltable=tb_ama, rtable=tb_egg, 
                        fk_ltable="ltable_ASIN", fk_rtable="rtable_NID", 
                        encoding='utf-8')

No handlers could be found for logger "py_entitymatching.io.parsers"


### 1.2 Divide dataset into I and J

In [5]:
# Split the labeled sample S 50/50 (random_state fixed at 0):
# I is used for feature/matcher development, J for the evaluation in section 4.
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I, J = IJ['train'], IJ['test']

## 2. Features Extraction

### 2.1 Feature table

### Generate features automatically 

In [6]:
# use automatic feature generator
# (builds the feature table F from the two input product tables)
F = em.get_features_for_matching(tb_ama, tb_egg)

In [7]:
# Show the names of the automatically generated features
F.feature_name

0                 NAME_NAME_jac_qgm_3_qgm_3
1             NAME_NAME_cos_dlm_dc0_dlm_dc0
2         CATEGORY_CATEGORY_jac_qgm_3_qgm_3
3     CATEGORY_CATEGORY_cos_dlm_dc0_dlm_dc0
4     CATEGORY_CATEGORY_jac_dlm_dc0_dlm_dc0
5                     CATEGORY_CATEGORY_mel
6                CATEGORY_CATEGORY_lev_dist
7                 CATEGORY_CATEGORY_lev_sim
8                     CATEGORY_CATEGORY_nmw
9                      CATEGORY_CATEGORY_sw
10              NUM_REVIEWS_NUM_REVIEWS_exm
11              NUM_REVIEWS_NUM_REVIEWS_anm
12         NUM_REVIEWS_NUM_REVIEWS_lev_dist
13          NUM_REVIEWS_NUM_REVIEWS_lev_sim
14              BRAND_BRAND_jac_qgm_3_qgm_3
15          BRAND_BRAND_cos_dlm_dc0_dlm_dc0
16          BRAND_BRAND_jac_dlm_dc0_dlm_dc0
17                          BRAND_BRAND_mel
18                     BRAND_BRAND_lev_dist
19                      BRAND_BRAND_lev_sim
20                          BRAND_BRAND_nmw
21                           BRAND_BRAND_sw
Name: feature_name, dtype: object

### 2.2 Extract feature vectors from set I

In [8]:
# Compute feature vectors for the pairs in I using feature table F.
# attrs_after='label' keeps the 'label' column in the result; the later
# training cells use it as target_attr.
i_vectors = em.extract_feature_vecs(I, 
                                    feature_table=F, 
                                    attrs_after='label', show_progress=False)

### 2.3 Impute missing values

In [9]:
# Check missing values: True if at least one cell of i_vectors is null.
# (Calling the builtin any() on a DataFrame iterates over its column labels,
# not its values, so the check has to be reduced over the underlying array.)
pd.isnull(i_vectors).values.any()

True

In [10]:
# Fill missing feature values using the 'median' strategy.
# The id, foreign-key and label columns are excluded from imputation.
i_vectors = em.impute_table(i_vectors, 
                            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
                            strategy='median')

## 3. Matcher Selection

### 3.1 Initialize matchers

In [11]:
# Instantiate the six candidate matchers compared in the rest of the notebook.
# random_state is pinned to 0 on every matcher that is given one here;
# NaiveBayes and LinearRegression are created without it.
# Decision Tree
dt = em.DTMatcher(name='DecisionTree', random_state=0)
# Random Forest
rf = em.RFMatcher(name='RandomForest', random_state=0)
# SVM
svm = em.SVMMatcher(name='SVM', random_state=0)
# Naive Bayes
nb = em.NBMatcher(name='NaiveBayes')
# Logistic Regression
lg = em.LogRegMatcher(name='LogisticRegression', random_state=0)
# LinearRegression
ln = em.LinRegMatcher(name='LinearRegression')

### 3.2 Cross validation on I set (1st time)

In [12]:
def cv(vectors):
    """Print 5-fold cross-validation scores of the six matchers on `vectors`.

    For each of 'precision', 'recall' and 'f1', em.select_matcher is run over
    the module-level matchers (dt, rf, svm, nb, lg, ln) with 'label' as the
    target and the id/foreign-key/label columns excluded from the features.
    One numbered line is printed per row of the returned 'cv_stats' table:
    the 'Name' and 'Mean score' columns, separated by a tab.

    Args:
        vectors: feature-vector table containing a 'label' column.
    """
    for metric_name in ('precision', 'recall', 'f1'):
        print('\n' + metric_name)
        selection = em.select_matcher(
            [dt, rf, svm, nb, lg, ln],
            table=vectors,
            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'],
            k=5,
            target_attr='label',
            metric=metric_name,
            random_state=0)
        stats = selection['cv_stats']
        scored_names = zip(stats['Name'], stats['Mean score'])
        for rank, (matcher_name, mean_score) in enumerate(scored_names):
            print(str(rank) + ". " + str(matcher_name) + "\t" + str(mean_score))

In [13]:
# Baseline: cross-validate all matchers on the automatically generated features
cv(i_vectors)


precision
0. DecisionTree	0.705211558308
1. RandomForest	0.767395104895
2. SVM	0.723869463869
3. NaiveBayes	0.615305242664
4. LogisticRegression	0.555649350649
5. LinearRegression	0.718095238095

recall
0. DecisionTree	0.78099219621
1. RandomForest	0.663623188406
2. SVM	0.523366778149
3. NaiveBayes	0.753021181717
4. LogisticRegression	0.229096989967
5. LinearRegression	0.315785953177

f1
0. DecisionTree	0.723386394307
1. RandomForest	0.69230020284
2. SVM	0.597563726596
3. NaiveBayes	0.637416463295
4. LogisticRegression	0.315084029486
5. LinearRegression	0.415632183908


### 3.3 Debug (on Random Forest)

#### Data

In [14]:
# Split I's feature vectors in half (random_state fixed at 0) into P and Q,
# the two tables used when debugging the Random Forest matcher.
PQ = em.split_train_test(i_vectors, train_proportion=0.5, random_state=0)
P, Q = PQ['train'], PQ['test']

In [15]:
# Debug RF matcher using GUI
# em.vis_debug_rf(rf, P, Q, 
#         exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'],
#         target_attr='label')

#### Debug 1

In [16]:
# Update attribute type of 'PRICE' from string to numeric
# (done on the attribute-type dictionaries of both tables, which are passed
# to em.get_features in the following cells)
atypes1 = em.get_attr_types(tb_ama)
atypes1['PRICE'] = 'numeric'
atypes2 = em.get_attr_types(tb_egg)
atypes2['PRICE'] = 'numeric'

In [17]:
# Gather the inputs em.get_features needs, then build feature table F1 using
# the attribute types overridden above (PRICE treated as numeric).
block_c = em.get_attr_corres(tb_ama, tb_egg) # block corres
sim = em.get_sim_funs_for_matching() # similarity functions
tok = em.get_tokenizers_for_matching() # tokenizing functions
# matching features
F1 = em.get_features(tb_ama, tb_egg, 
                    atypes1, atypes2, 
                    block_c, tok, sim)

In [18]:
def price_small(ltuple, rtuple):
    """Blackbox feature: 1 if the two PRICE values differ by less than 1, else 0.

    Both PRICE attributes are converted with float() before comparison.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap < 1 else 0

In [19]:
# Register price_small in feature table F1 under the name 'PRICE_PRICE_small'
em.add_blackbox_feature(F1, 'PRICE_PRICE_small', price_small)

True

In [20]:
# Debug 1 feature vectors: extract from I with feature table F1, then fill
# missing values with the 'median' strategy (id/foreign-key/label columns excluded).
i_vectors1 = em.extract_feature_vecs(I, 
                                    feature_table=F1, 
                                    attrs_after='label', show_progress=False)
i_vectors1 = em.impute_table(i_vectors1, 
                            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
                            strategy='median')

  if d1 == d2:


In [21]:
# Cross-validate all matchers on the Debug 1 feature vectors (F1)
cv(i_vectors1)


precision
0. DecisionTree	0.697130004498
1. RandomForest	0.8
2. SVM	0.687459207459
3. NaiveBayes	0.601158536585
4. LogisticRegression	0.54683982684
5. LinearRegression	0.631666666667

recall
0. DecisionTree	0.664347826087
1. RandomForest	0.622296544036
2. SVM	0.532062430323
3. NaiveBayes	0.788361204013
4. LogisticRegression	0.20762541806
5. LinearRegression	0.393946488294

f1
0. DecisionTree	0.660907645458
1. RandomForest	0.687476190476
2. SVM	0.587254480287
3. NaiveBayes	0.652499134077
4. LogisticRegression	0.283238866397
5. LinearRegression	0.470443722944


#### Debug 2

In [22]:
def price_exact(ltuple, rtuple):
    """Blackbox feature: 1 if the two PRICE values differ by less than 0.01, else 0.

    Both PRICE attributes are converted with float() before comparison.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap < 0.01 else 0
    
def price_large(ltuple, rtuple):
    """Blackbox feature: 1 if the two PRICE values differ by more than 10, else 0.

    Both PRICE attributes are converted with float() before comparison.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap > 10 else 0

In [23]:
# Debug 2 feature table: restrict the attribute correspondences to NAME only,
# regenerate the features, then add the three price blackbox features.
# Note: block_c is modified in place, so later cells see the NAME-only value.
block_c['corres'] = [('NAME', 'NAME')]
F2 = em.get_features(tb_ama, tb_egg, atypes1, atypes2, block_c, tok, sim)
em.add_blackbox_feature(F2, 'PRICE_PRICE_exact', price_exact)
em.add_blackbox_feature(F2, 'PRICE_PRICE_small', price_small)
em.add_blackbox_feature(F2, 'PRICE_PRICE_large', price_large)

True

In [24]:
# Debug 2 feature vectors: extract from I with feature table F2, then fill
# missing values with the 'median' strategy (id/foreign-key/label columns excluded).
i_vectors2 = em.extract_feature_vecs(I, 
                                    feature_table=F2, 
                                    attrs_after='label', show_progress=False)
i_vectors2 = em.impute_table(i_vectors2, 
                            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
                            strategy='median')

In [25]:
# Cross-validate all matchers on the Debug 2 feature vectors (F2)
cv(i_vectors2)


precision
0. DecisionTree	0.725386996904
1. RandomForest	0.796478375426
2. SVM	0.590649350649
3. NaiveBayes	0.391692307692
4. LogisticRegression	0.59
5. LinearRegression	0.658095238095

recall
0. DecisionTree	0.775774804905
1. RandomForest	0.773745819398
2. SVM	0.32508361204
3. NaiveBayes	0.916666666667
4. LogisticRegression	0.255596432553
5. LinearRegression	0.411917502787

f1
0. DecisionTree	0.737316948285
1. RandomForest	0.77382847038
2. SVM	0.372794485455
3. NaiveBayes	0.527160486393
4. LogisticRegression	0.32294453305
5. LinearRegression	0.487009222661


#### Debug 3

In [26]:
def model_in_name(ltuple, rtuple):
    """Blackbox feature: 1 if the right INFO occurs as a token on the left side.

    The left tuple's INFO and NAME are split on single spaces. The result is 1
    when the right tuple's whole INFO value equals one of those tokens, else 0.
    """
    info_tokens = ltuple.INFO.split(' ')
    name_tokens = ltuple.NAME.split(' ')
    wanted = rtuple.INFO
    return 1 if (wanted in info_tokens or wanted in name_tokens) else 0
    
def refurbished(ltuple, rtuple):
    """Blackbox feature: 1 if exactly one of the two NAMEs mentions 'refurbished'.

    The check is a case-insensitive substring test on each NAME. The result is
    0 when both names contain the word or neither does.
    """
    left_flag = 'refurbished' in ltuple.NAME.lower()
    right_flag = 'refurbished' in rtuple.NAME.lower()
    return int(left_flag != right_flag)

In [27]:
# Debug 3 feature table F3: automatic NAME features, one Levenshtein-similarity
# feature each for BRAND and CATEGORY, and the hand-written blackbox features.

# Reuse automatic features of Name
# (same NAME-only correspondence as assigned in the Debug 2 cell)
block_c['corres'] = [('NAME', 'NAME')]

F3 = em.get_features(tb_ama, tb_egg, atypes1, atypes2, block_c, tok, sim)

# Brand feature
# The feature is declared as an expression string and compiled with
# em.get_feature_fn before being added to F3.
BRAND_BRAND_lev_sim = 'lev_sim(ltuple.BRAND, rtuple.BRAND)'
feature = em.get_feature_fn(BRAND_BRAND_lev_sim, sim, tok)
em.add_feature(F3, 'BRAND_BRAND_lev_sim', feature)

# Category feature
# Note: `feature` is reused here and now refers to the CATEGORY feature.
CATEGORY_CATEGORY_lev_sim = 'lev_sim(ltuple.CATEGORY, rtuple.CATEGORY)'
feature = em.get_feature_fn(CATEGORY_CATEGORY_lev_sim, sim, tok)
em.add_feature(F3, 'CATEGORY_CATEGORY_lev_sim', feature)

# Price features
em.add_blackbox_feature(F3, 'PRICE_PRICE_exact', price_exact)
em.add_blackbox_feature(F3, 'PRICE_PRICE_small', price_small)
em.add_blackbox_feature(F3, 'PRICE_PRICE_large', price_large)

# Info and name features
em.add_blackbox_feature(F3, 'INFONAME_INFO_contain', model_in_name)
em.add_blackbox_feature(F3, 'NAME_NAME_refurbished', refurbished)

True

In [28]:
# Debug 3 feature vectors: extract from I with feature table F3, then fill
# missing values with the 'median' strategy (id/foreign-key/label columns excluded).
i_vectors3 = em.extract_feature_vecs(I, 
                                    feature_table=F3, 
                                    attrs_after='label', show_progress=False)
i_vectors3 = em.impute_table(i_vectors3, 
                            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
                            strategy='median')

In [29]:
# Cross-validate all matchers on the Debug 3 feature vectors (F3)
cv(i_vectors3)


precision
0. DecisionTree	0.851282051282
1. RandomForest	0.868452380952
2. SVM	0.97032967033
3. NaiveBayes	0.398285714286
4. LogisticRegression	0.97032967033
5. LinearRegression	0.97032967033

recall
0. DecisionTree	0.861884057971
1. RandomForest	0.850579710145
2. SVM	0.747079152731
3. NaiveBayes	0.933333333333
4. LogisticRegression	0.747079152731
5. LinearRegression	0.747079152731

f1
0. DecisionTree	0.849029138503
1. RandomForest	0.854742830605
2. SVM	0.840472312002
3. NaiveBayes	0.53823740947
4. LogisticRegression	0.840472312002
5. LinearRegression	0.840472312002


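Conclusion: the best matcher is SVM. On the Debug 3 features it attains the highest cross-validated precision (0.970, tied with Logistic Regression and Linear Regression), and its F1 (0.840) is within 0.015 of the best F1 (Random Forest, 0.855).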

In [30]:
# Carry the Debug 3 feature table, its training vectors and the SVM matcher
# forward into the evaluation section.
best_features, best_vectors, best_model = F3, i_vectors3, svm

## 4. Evaluation

### 4.1 Extract Feature vectors from J

In [31]:
# Generate test feature vectors
# (same feature table as selected above, applied to the held-out pairs in J)
j_vectors = em.extract_feature_vecs(J, 
                                    feature_table=best_features, 
                                    attrs_after='label', show_progress=False)
# Impute missing values 
# NOTE(review): imputation is run on j_vectors by itself, so presumably the
# medians come from J rather than from the training set I - confirm intended.
j_vectors = em.impute_table(j_vectors, 
                            exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
                            strategy='median')

### 4.2 Test on J

In [32]:
# General Evaluation function
def evaluate(model, train, test):
    # Train on train set
    model.fit(table=train, exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'],
        target_attr='label')
    # Predict on test set
    predictions = model.predict(table=test, exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'], 
              append=True, target_attr='predicted', inplace=False)
    # Evaluate 
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    # Print out
    em.print_eval_summary(eval_result)

#### Evaluation results for all 6 matchers

In [33]:
# Train each matcher on the selected training vectors and report its
# performance on J, numbering the matchers from 1.
for count, model in enumerate([dt, rf, svm, nb, lg, ln], 1):
    print(str(count) + ". " + model.name)
    evaluate(model, best_vectors, j_vectors)
    print("")

1. DecisionTree
Precision : 92.21% (71/77)
Recall : 81.61% (71/87)
F1 : 86.59%
False positives : 6 (out of 77 positive predictions)
False negatives : 16 (out of 173 negative predictions)

2. RandomForest
Precision : 89.16% (74/83)
Recall : 85.06% (74/87)
F1 : 87.06%
False positives : 9 (out of 83 positive predictions)
False negatives : 13 (out of 167 negative predictions)

3. SVM
Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)

4. NaiveBayes
Precision : 33.74% (83/246)
Recall : 95.4% (83/87)
F1 : 49.85%
False positives : 163 (out of 246 positive predictions)
False negatives : 4 (out of 4 negative predictions)

5. LogisticRegression
Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)

6. LinearRegression
Precision : 95.65% (66/69)
Recall : 75.86% (66/87)

#### Evaluation on the best matcher - SVM

In [34]:
# Re-fit the selected matcher on the training vectors and report its scores on J
evaluate(best_model, best_vectors, j_vectors)

Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)


In [35]:
# End time
# time.clock() was removed in Python 3.8. Use it where it still exists and
# fall back to time.perf_counter() otherwise. tic0 must have been read from
# the same clock for the difference below to be meaningful.
toc0 = (getattr(time, 'clock', None) or time.perf_counter)()
print("Total time: " + str(toc0 - tic0))

Total time: 9.656946
