# Prediction

In [1]:
import py_entitymatching as em
import os
import pandas as pd

## 1. Data preparation

In [2]:
# Input file locations, all relative to the dataset directory
dataset_dir = 'data'
path_ama = os.sep.join([dataset_dir, 'amazon_products.csv'])   # Amazon product table
path_egg = os.sep.join([dataset_dir, 'newegg_products.csv'])   # Newegg product table
path_block = os.sep.join([dataset_dir, 'blocked_pairs.csv'])   # pairs file produced after blocking

In [3]:
# Read the three CSV inputs as UTF-8, registering key metadata for each table.

# Amazon products, keyed by ASIN
tb_ama = em.read_csv_metadata(path_ama, encoding='utf-8', key="ASIN")

# Newegg products, keyed by NID
tb_egg = em.read_csv_metadata(path_egg, encoding='utf-8', key="NID")

# Blocked pairs: each row points at one Amazon row and one Newegg row
# through the two foreign-key columns
tb_block = em.read_csv_metadata(
    path_block,
    encoding='utf-8',
    key="_id",
    ltable=tb_ama,
    fk_ltable="ltable_ASIN",
    rtable=tb_egg,
    fk_rtable="rtable_NID",
)

No handlers could be found for logger "py_entitymatching.io.parsers"


In [4]:
# Fetch the attribute types of both product tables, then override 'PRICE'
# so it is treated as numeric instead of string
atypes1, atypes2 = (em.get_attr_types(table) for table in (tb_ama, tb_egg))
atypes1['PRICE'] = atypes2['PRICE'] = 'numeric'

## 2. Feature Extraction

### 2.1 Define feature table

In [5]:
def price_small(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by less than 1, otherwise 0."""
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap < 1 else 0
    
def price_exact(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by less than 0.01, otherwise 0."""
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap < 0.01 else 0
    
def price_large(ltuple, rtuple):
    """Return 1 when the two PRICE values differ by more than 10, otherwise 0."""
    gap = abs(float(ltuple.PRICE) - float(rtuple.PRICE))
    return 1 if gap > 10 else 0
def model_in_name(ltuple, rtuple):
    """Return 1 if rtuple.INFO is one of the tokens of ltuple.INFO or ltuple.NAME, else 0.

    Tokens are produced by splitting each left-side field on single spaces,
    and the comparison against rtuple.INFO is exact (case-sensitive).

    A left-side field that has no ``split`` method (for example a float NaN,
    which is how pandas typically represents an empty CSV cell) contributes
    no tokens instead of raising AttributeError, so a missing value yields
    "no match" for that field rather than aborting feature extraction.
    """
    tokens = []
    for text in (ltuple.INFO, ltuple.NAME):
        try:
            tokens.extend(text.split(' '))
        except AttributeError:
            # Non-string value (missing cell): nothing to tokenize
            continue
    return 1 if rtuple.INFO in tokens else 0
    
def refurbished(ltuple, rtuple):
    """Return 1 when exactly one of the two NAME values mentions 'refurbished', else 0.

    The check is case-insensitive: each NAME is lower-cased before the
    substring test.
    """
    flags = ['refurbished' in side.NAME.lower() for side in (ltuple, rtuple)]
    return 1 if flags[0] != flags[1] else 0

In [6]:
# Build the feature table F.
# NOTE(review): the order in which features are added presumably determines the
# column order of the extracted feature vectors, which likely has to match what
# the pickled model was trained on -- confirm before reordering anything here.

# Reuse the automatically generated features, restricted to the NAME attribute
block_c = em.get_attr_corres(tb_ama, tb_egg) # attribute correspondences between the two tables
block_c['corres'] = [('NAME', 'NAME')] # keep only the NAME <-> NAME correspondence
sim = em.get_sim_funs_for_matching() # similarity functions
tok = em.get_tokenizers_for_matching() # tokenizing functions
# atypes1 / atypes2 are the attribute-type dicts in which 'PRICE' was set to 'numeric'
F = em.get_features(tb_ama, tb_egg, atypes1, atypes2, block_c, tok, sim)

# Brand feature: lev_sim between the two BRAND values, declared as a feature string
BRAND_BRAND_lev_sim = 'lev_sim(ltuple.BRAND, rtuple.BRAND)'
feature = em.get_feature_fn(BRAND_BRAND_lev_sim, sim, tok)
em.add_feature(F, 'BRAND_BRAND_lev_sim', feature)

# Category feature: same construction as the brand feature, on CATEGORY
CATEGORY_CATEGORY_lev_sim = 'lev_sim(ltuple.CATEGORY, rtuple.CATEGORY)'
feature = em.get_feature_fn(CATEGORY_CATEGORY_lev_sim, sim, tok)
em.add_feature(F, 'CATEGORY_CATEGORY_lev_sim', feature)

# Price features: the three 0/1 price-difference functions defined above, added as black boxes
em.add_blackbox_feature(F, 'PRICE_PRICE_exact', price_exact)
em.add_blackbox_feature(F, 'PRICE_PRICE_small', price_small)
em.add_blackbox_feature(F, 'PRICE_PRICE_large', price_large)

# Info and name features: the 0/1 token-containment and 'refurbished' mismatch functions
em.add_blackbox_feature(F, 'INFONAME_INFO_contain', model_in_name)
# Final expression of the cell: its return value is what the notebook displays
em.add_blackbox_feature(F, 'NAME_NAME_refurbished', refurbished)

True

### 2.2 Extract feature vectors

In [8]:
# Compute feature vectors for the blocked pairs with feature table F, then
# fill missing feature values with the median; the id columns are excluded
# from imputation.
i_vectors = em.impute_table(
    em.extract_feature_vecs(tb_block, feature_table=F, show_progress=False),
    exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID'],
    strategy='median',
)

In [9]:
# Use the full feature table and the imputed vectors as-is for prediction
best_features, best_vectors = F, i_vectors

## 3. Predict

In [10]:
import pickle

### Classify Matches

In [11]:
# Load the previously trained matcher from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
filename = 'svm_model.sav'
with open(filename, 'rb') as model_file:
    # The context manager closes the file handle even if unpickling fails
    model = pickle.load(model_file)

# Predict on the feature vectors of the blocked pairs. The id columns are not
# features, so they are excluded; the result carries a 'predicted' column.
predictions = model.predict(table=best_vectors, exclude_attrs=['_id', 'ltable_ASIN', 'rtable_NID'],
                            append=True, target_attr='predicted', inplace=False)

### Store matched tuples

In [12]:
# Keep only the rows predicted as matches (predicted == 1), reduced to the
# two id columns, and write them to CSV without the index
cols = ['ltable_ASIN', 'rtable_NID']
df = predictions.loc[predictions['predicted'] == 1, cols]
df.to_csv('matched_tuples.csv', index=False)