# Matching 

In [2]:
import py_entitymatching as em
import os
import pandas as pd
import time

## 1. Data preparation

### 1.1 Load datasets

In [3]:
# Locations of the input CSV files, all relative to the dataset directory.
dataset_dir = 'data'  # Directory holding the input CSV files
# os.path.join inserts the platform separator, replacing the hand-rolled
# `dir + os.sep + name` concatenation; the resulting strings are the same.
path_ama = os.path.join(dataset_dir, 'amazon_products.csv')  # Amazon dataset
path_egg = os.path.join(dataset_dir, 'newegg_products.csv')  # Newegg dataset
path_block = os.path.join(dataset_dir, 'blocked_pairs.csv')  # Labeled (G) sample after blocking

In [5]:
# Read the three CSV files as UTF-8 and register their key metadata.
# Amazon products, keyed by ASIN
tb_ama = em.read_csv_metadata(path_ama, encoding='utf-8', key='ASIN')
# Newegg products, keyed by NID
tb_egg = em.read_csv_metadata(path_egg, encoding='utf-8', key='NID')
# Labeled sample of blocked pairs; each row points back at one row of each
# product table through the foreign-key columns named below.
tb_block = em.read_csv_metadata(
    path_block,
    key='_id',
    ltable=tb_ama,
    rtable=tb_egg,
    fk_ltable='ltable_ASIN',
    fk_rtable='rtable_NID',
    encoding='utf-8',
)

In [6]:
# Fetch the attribute types of both product tables, then override the type
# recorded for 'PRICE' so that it is treated as numeric in each of them.
atypes1 = em.get_attr_types(tb_ama)
atypes2 = em.get_attr_types(tb_egg)
atypes1['PRICE'] = 'numeric'
atypes2['PRICE'] = 'numeric'

## 2. Feature Extraction

### 2.1 Feature table

### Generate features automatically 

In [12]:
# Candidate matchers to compare. random_state=0 is passed to every matcher
# that is given one below, so their results can be reproduced across runs.
dt = em.DTMatcher(random_state=0, name='DecisionTree')            # Decision tree
rf = em.RFMatcher(random_state=0, name='RandomForest')            # Random forest
svm = em.SVMMatcher(random_state=0, name='SVM')                   # Support vector machine
nb = em.NBMatcher(name='NaiveBayes')                              # Naive Bayes
lg = em.LogRegMatcher(random_state=0, name='LogisticRegression')  # Logistic regression
ln = em.LinRegMatcher(name='LinearRegression')                    # Linear regression

In [16]:
def price_small(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by less than 1, else 0.

    Both PRICE attributes are converted with float() before comparing, so
    numeric strings are accepted.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return int(gap < 1)
    
def price_exact(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by less than 0.01, else 0.

    Both PRICE attributes are converted with float() before comparing, so
    numeric strings are accepted.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return int(gap < 0.01)
    
def price_large(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by more than 10, else 0.

    Both PRICE attributes are converted with float() before comparing, so
    numeric strings are accepted.
    """
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return int(gap > 10)
def model_in_name(ltuple, rtuple):
    """Return 1 when rtuple.INFO equals a token of ltuple.INFO or ltuple.NAME.

    The left-hand INFO and NAME strings are split on single spaces and the
    right-hand INFO value must equal one of the resulting tokens exactly
    (case-sensitive, whole token); otherwise 0 is returned.
    """
    info_tokens = ltuple.INFO.split(' ')
    name_tokens = ltuple.NAME.split(' ')
    wanted = rtuple.INFO
    return int(wanted in info_tokens or wanted in name_tokens)
    
def refurbished(ltuple, rtuple):
    """Return 1 when exactly one of the two NAME values mentions 'refurbished'.

    The check is a case-insensitive substring test on each NAME; 0 is
    returned when both names mention it or neither does.
    """
    left_flag = 'refurbished' in ltuple.NAME.lower()
    right_flag = 'refurbished' in rtuple.NAME.lower()
    return int(left_flag != right_flag)

In [22]:
# Build feature table F2: start from the automatically generated features,
# restricted to the NAME <-> NAME attribute pair, then add hand-picked ones.
# The order of the add_* calls below is the order in which features are added.
block_c = em.get_attr_corres(tb_ama, tb_egg) # attribute correspondences between the two tables
block_c['corres'] = [('NAME', 'NAME')] # overwrite them so only NAME is paired with NAME
sim = em.get_sim_funs_for_matching() # similarity functions
tok = em.get_tokenizers_for_matching() # tokenizing functions
F2 = em.get_features(tb_ama, tb_egg, atypes1, atypes2, block_c, tok, sim)

# Brand feature: lev_sim between the two BRAND values, declared as an expression string
BRAND_BRAND_lev_sim = 'lev_sim(ltuple.BRAND, rtuple.BRAND)'
feature = em.get_feature_fn(BRAND_BRAND_lev_sim, sim, tok)
em.add_feature(F2, 'BRAND_BRAND_lev_sim', feature)

# Category feature: same construction for the CATEGORY values
# (the name `feature` is reused and now refers to the category feature)
CATEGORY_CATEGORY_lev_sim = 'lev_sim(ltuple.CATEGORY, rtuple.CATEGORY)'
feature = em.get_feature_fn(CATEGORY_CATEGORY_lev_sim, sim, tok)
em.add_feature(F2, 'CATEGORY_CATEGORY_lev_sim', feature)

# Price features: the three price-difference indicator functions, added as black-box features
em.add_blackbox_feature(F2, 'PRICE_PRICE_exact', price_exact)
em.add_blackbox_feature(F2, 'PRICE_PRICE_small', price_small)
em.add_blackbox_feature(F2, 'PRICE_PRICE_large', price_large)

# Info and name features: token containment of the right INFO value, and a refurbished mismatch flag
em.add_blackbox_feature(F2, 'INFONAME_INFO_contain', model_in_name)
em.add_blackbox_feature(F2, 'NAME_NAME_refurbished', refurbished)

True

In [23]:
# Turn the pair table `I` into feature vectors using feature table F2, with
# the 'label' column carried along after the features.
# NOTE(review): `I` is not defined in the cells visible here -- confirm it
# exists when running on a fresh kernel.
i_vectors2 = em.extract_feature_vecs(
    I,
    feature_table=F2,
    attrs_after='label',
    show_progress=False,
)
# Impute missing values with the median strategy; the id, foreign-key and
# label columns are excluded from imputation.
i_vectors2 = em.impute_table(
    i_vectors2,
    strategy='median',
    exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID', 'label'],
)

In [25]:
# Selected configuration used by the prediction cells below:
# feature table F2, its imputed feature vectors, and the SVM matcher.
best_features, best_vectors, best_model = F2, i_vectors2, svm

## 3. Predict

In [27]:
# General evaluation helper
def evaluate(model, train, test):
    """Fit `model` on `train`, predict on `test`, and print the evaluation summary.

    Args:
        model: matcher object whose `fit` and `predict` accept the keyword
            arguments used below.
        train: feature-vector table used for fitting; 'label' is the target.
        test: feature-vector table to predict on; its 'label' column is
            compared against the 'predicted' column.

    Returns:
        None. The summary is printed through em.print_eval_summary.
    """
    # Identifier, foreign-key and label columns: excluded from the features.
    skip_cols = ('_id', 'ltable_ASIN', 'rtable_NID', 'label')

    # A fresh list is handed to each call, as the original passed two
    # separate list literals.
    model.fit(table=train, target_attr='label', exclude_attrs=list(skip_cols))

    # Predictions are requested as an appended 'predicted' column on a
    # returned table (append=True, inplace=False).
    scored = model.predict(table=test, exclude_attrs=list(skip_cols),
                           target_attr='predicted', append=True, inplace=False)

    # Compare gold labels with predictions and print the result.
    em.print_eval_summary(em.eval_matches(scored, 'label', 'predicted'))

#### Evaluation results for all 6 methods

In [28]:
# Train and evaluate each matcher in turn, printing a numbered heading
# followed by its summary and a blank separator line.
# NOTE(review): j_vectors is not defined in the cells visible here -- confirm
# it exists when running on a fresh kernel.
for count, model in enumerate([dt, rf, svm, nb, lg, ln], start=1):
    print("{}. {}".format(count, model.name))
    evaluate(model, best_vectors, j_vectors)
    print("")

1. DecisionTree
Precision : 92.21% (71/77)
Recall : 81.61% (71/87)
F1 : 86.59%
False positives : 6 (out of 77 positive predictions)
False negatives : 16 (out of 173 negative predictions)

2. RandomForest
Precision : 89.16% (74/83)
Recall : 85.06% (74/87)
F1 : 87.06%
False positives : 9 (out of 83 positive predictions)
False negatives : 13 (out of 167 negative predictions)

3. SVM
Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)

4. NaiveBayes
Precision : 33.74% (83/246)
Recall : 95.4% (83/87)
F1 : 49.85%
False positives : 163 (out of 246 positive predictions)
False negatives : 4 (out of 4 negative predictions)

5. LogisticRegression
Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)

6. LinearRegression
Precision : 95.65% (66/69)
Recall : 75.86% (6

#### Evaluation on the best matcher - SVM

In [29]:
# Re-run the evaluation for the selected matcher only (best_model is assigned svm earlier).
# NOTE(review): j_vectors is not defined in the cells visible here -- confirm
# it exists when running on a fresh kernel.
evaluate(best_model, best_vectors, j_vectors)

Precision : 95.65% (66/69)
Recall : 75.86% (66/87)
F1 : 84.62%
False positives : 3 (out of 69 positive predictions)
False negatives : 21 (out of 181 negative predictions)


In [30]:
# End time
# time.clock() was removed in Python 3.8 and raises AttributeError there, so
# it is used only where it still exists; otherwise time.perf_counter() is used.
# NOTE(review): tic0 is set outside the cells visible here; it must come from
# the same clock for the difference to be meaningful -- confirm.
toc0 = (time.clock if hasattr(time, 'clock') else time.perf_counter)()
print("Total time: " + str(toc0 - tic0))

Total time: 6.514814
