# Blocking

In [1]:
import csv
import py_entitymatching as em
import pandas as pd
import os
# import numpy as np
import math

In [2]:
AMAZON_PRODUCTS_FN = "data/amazon_products_clean.csv"
NEWEGG_PRODUCTS_FN = "data/newegg_products.csv"

DEBUG_OUT_FN = "output/debug_blocking.txt"
SURVIVING_TUPLE_PAIRS_FN = "output/blocked_pairs_details_all_large.csv"

In [3]:
def custom_block_func(ltuple, rtuple):
    try:
        if len(rtuple["INFO"]) > 5:
            if (rtuple["INFO"] in ltuple["NAME"]) or (rtuple["INFO"] in ltuple["INFO"]):
                return False
            else:
                return True
        else:
            return True
    except TypeError as e:
        return True

## Main function

In [4]:
# Load csv files as dataframes and set the key attribute in the dataframe
amazon_products = em.read_csv_metadata(AMAZON_PRODUCTS_FN, key="ASIN")
newegg_products = em.read_csv_metadata(NEWEGG_PRODUCTS_FN, key="NID")

print("Number of tuples in A X B:", str(len(amazon_products) * len(newegg_products)))

('Number of tuples in A X B:', '72874880')


No handlers could be found for logger "py_entitymatching.io.parsers"


In [5]:
ab = em.AttrEquivalenceBlocker()
blocked = ab.block_tables(amazon_products, newegg_products, "CATEGORY", "CATEGORY",
                          l_output_attrs=["NAME", "INFO"], r_output_attrs=["NAME", "INFO"])

print("Number of blocked tuples (after blocking on category):", len(blocked))

('Number of blocked tuples (after blocking on category):', 3842626)


In [6]:
ab2 = em.AttrEquivalenceBlocker()
blocked = ab2.block_candset(blocked, "BRAND", "BRAND", allow_missing=True, show_progress=False)
print("Number of blocked tuples (after blocking on brand):", len(blocked))

('Number of blocked tuples (after blocking on brand):', 391609)


### Setup for rules based blocker

In [7]:
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()
atypes1 = em.get_attr_types(amazon_products)
atypes2 = em.get_attr_types(newegg_products)
block_c = em.get_attr_corres(amazon_products, newegg_products)
block_f = em.get_features(amazon_products, newegg_products, atypes1, atypes2, block_c, block_t, block_s)

### Jaccard Score Between The Names

In [8]:
r1 = em.get_feature_fn('jaccard(wspace(ltuple.NAME), wspace(rtuple.NAME))', block_t, block_s)
# weird workaround for weird bug
r1["right_attribute"] = "NAME"
r1["left_attribute"] = "NAME"
em.add_feature(block_f, 'name_name_jac', r1)

True

In [9]:
rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac(ltuple, rtuple) < 0.5"], block_f)
# rb.add_rule(["custom_block(ltuple, rtuple) > 0"], block_f)

'_rule_0'

In [10]:
blocked_rule = rb.block_candset(blocked, n_jobs=1, show_progress=False)
print("Tuples after blocking on name: {}".format(len(blocked_rule)))

Tuples after blocking on name: 3997


In [11]:
bb = em.BlackBoxBlocker()
bb.set_black_box_function(custom_block_func)
blocked_black = bb.block_candset(blocked, n_jobs=1, show_progress=False)
print("Tuples after blocking on custom: {}".format(len(blocked_black)))

Tuples after blocking on custom: 1314


In [12]:
blocked = em.combine_blocker_outputs_via_union([blocked_rule, blocked_black])
print("Number of blocked tuples (after union):", len(blocked))

('Number of blocked tuples (after union):', 4938)


### Debug again

In [13]:
dbg = em.debug_blocker(blocked, amazon_products, newegg_products, output_size=50, attr_corres=[('NAME', 'NAME')])
print(dbg.head)

<bound method DataFrame.head of     _id  similarity ltable_ASIN       rtable_NID  \
0     0    1.000000  B002O0L0GC   9SIA3912EU4290   
1     1    1.000000  B00PDDMN6S  N82E16811133274   
2     2    1.000000  B004LRO1BW   9SIA4RE4T20421   
3     3    1.000000  B01CRVKTCS   9SIA4P03S12378   
4     4    0.846154  B0062FZ2WS   9SIA0AJ2U28695   
5     5    0.833333  B0013IQL4C   9SIA2TN40R4778   
6     6    0.800000  B00F0JXDUU   9SIA3TB1JE6631   
7     7    0.789474  B00B1LW3W0   9SIA9HJ3JX9155   
8     8    0.789474  B00B1LW3W0   9SIA0AJ1RH1076   
9     9    0.785714  B007FQNLR6   9SIA3TB1HF4865   
10   10    0.777778  B004U3MQQY  N82E16820239976   
11   11    0.769231  B01HCXHBIQ   9SIA4UB1UP5595   
12   12    0.764706  B00GIHPP8Q  N82E16820231704   
13   13    0.764706  B01CRVKTCS   9SIA3FA36D7258   
14   14    0.764706  B01CRVKTCS   9SIA6ZP4YF0813   
15   15    0.764706  B01CRVKTCS   9SIACGB57B8118   
16   16    0.750000  B00GIHPP8Q  N82E16820231581   
17   17    0.750000  B002CSRF9M 

### Save surviving tuple pairs

In [14]:
blocked.to_csv(SURVIVING_TUPLE_PAIRS_FN)