In [1]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import joblib

In [2]:
# Load the raw X2 table, replace every missing value with the -999 sentinel,
# then persist the filled table as a new CSV (without the index column).
A = pd.read_csv('../data/sigmod/X2.csv').fillna(-999)
A.to_csv('X2_cleaned.csv', index=False)

In [3]:
# Load the cleaned CSV twice, keyed on 'instance_id': both sides of the
# comparison (A and B) come from the same file.
cleaned_path = 'X2_cleaned.csv'
A = em.read_csv_metadata(cleaned_path, key='instance_id')
B = em.read_csv_metadata(cleaned_path, key='instance_id')

# Report the table sizes and the size of the full cross product.
n_left = len(A)
n_right = len(B)
print('Number of tuples in A: ' + str(n_left))
print('Number of tuples in B: ' + str(n_right))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(n_left * n_right))

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Number of tuples in A: 343
Number of tuples in B: 343
Number of tuples in A X B (i.e the cartesian product): 117649


In [4]:
# Blocking step: build the candidate set C from A and B by comparing their
# 'title' columns with an overlap blocker (overlap_size=1).
# The same attributes are carried over from both sides, so list them once.
output_attrs = ['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
                'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
                'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title']

ob = em.OverlapBlocker()

C = ob.block_tables(
    A, B, 'title', 'title',
    # Separate copies so each side gets its own list object, as before.
    l_output_attrs=list(output_attrs),
    r_output_attrs=list(output_attrs),
    l_output_prefix='left_',
    r_output_prefix='right_',
    overlap_size=1,
    show_progress=True,
)

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [5]:
# Get features
# Builds the feature table used to compare a tuple of A with a tuple of B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the
# interactive confirmation of the inferred attribute types -- confirm against
# the py_entitymatching documentation.
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# Attributes carried alongside the feature vectors: each descriptive column,
# first with the 'left_' prefix and then with the 'right_' prefix.
base_attrs = ['brand',
              'cpu_brand', 'cpu_model', 'cpu_type',
              'cpu_frequency', 'ram_capacity', 'ram_type',
              'ram_frequency', 'hdd_capacity', 'ssd_capacity',
              'weight', 'dimensions', 'title']
attrs_from_table = [side + name
                    for side in ('left_', 'right_')
                    for name in base_attrs]

# Columns to exclude when predicting: '_id', both instance ids, and every
# carried-through attribute listed above.
attrs_to_be_excluded = ['_id', 'left_instance_id', 'right_instance_id'] + attrs_from_table

In [6]:
# Convert the candidate set to feature vectors using the feature table.
# attrs_before hands the left_/right_ attribute columns to the extractor
# (presumably so they are kept in L ahead of the computed features -- confirm).
# NOTE(review): n_jobs=-1 presumably means "use all available cores" -- confirm
# against the py_entitymatching documentation.
L = em.extract_feature_vecs(C, feature_table=feature_table,
                            attrs_before=attrs_from_table,
                            show_progress=True, n_jobs=-1)

In [7]:
# Load the persisted matcher from disk (presumably a trained random forest,
# going by the file name).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading; only load model files that come from a trusted source.
loaded_rf = joblib.load("./random_forest.joblib")

In [8]:
# Predict the matches
# exclude_attrs lists the columns of L that must not be used as features
# (row id, instance ids and the carried-through attributes).
# NOTE(review): with append=True / inplace=False the result is presumably a
# copy of L with the prediction added in the 'predicted' column -- confirm.
predictions = loaded_rf.predict(table=L, exclude_attrs=attrs_to_be_excluded,
                                append=True, target_attr='predicted', inplace=False, )

In [9]:
# Prepare the output
def duplicates(x):
    """Tell whether a row pairs an instance with itself.

    `x` is anything indexable by 'left_instance_id' and 'right_instance_id'
    (for example a DataFrame row); the result is the outcome of comparing
    those two values for equality.
    """
    left_id = x['left_instance_id']
    right_id = x['right_instance_id']
    return left_id == right_id


def prepare_sigmod_output(res):
    """Reduce a prediction table to the distinct matched id pairs.

    Parameters
    ----------
    res : pandas.DataFrame
        Table holding at least the columns 'predicted',
        'left_instance_id' and 'right_instance_id'.

    Returns
    -------
    pandas.DataFrame
        The ('left_instance_id', 'right_instance_id') columns of the rows
        where ``predicted == 1`` and the two ids differ, with exact duplicate
        rows removed. Original index labels are preserved. An input with no
        qualifying rows yields an empty frame with the same two columns.
    """
    id_columns = ['left_instance_id', 'right_instance_id']
    matched = res.loc[res['predicted'] == 1, id_columns]

    # Compare the two id columns directly instead of applying a Python
    # function to every row: same mask, but much faster, and it is always a
    # boolean Series -- including when `matched` is empty.
    is_distinct_pair = matched['left_instance_id'] != matched['right_instance_id']

    # NOTE(review): only exact duplicate rows are removed; (a, b) and (b, a)
    # are both kept. Confirm whether mirrored pairs should be collapsed too.
    return matched[is_distinct_pair].drop_duplicates()

In [10]:
# Keep only the predicted matches as distinct (left, right) instance-id pairs.
ret = prepare_sigmod_output(predictions)

In [11]:
# Display the resulting pairs (the cell's last expression is rendered).
ret

Unnamed: 0,left_instance_id,right_instance_id
20,www.flexshopper.com//884,www.softwarecity.ca//737
34,www.flexshopper.com//488,www.softwarecity.ca//737
40,www.tigerdirect.com//12,www.softwarecity.ca//737
79,buy.net//1960,www.softwarecity.ca//737
88,www.vology.com//80,www.softwarecity.ca//737
...,...,...
102509,www.vology.com//4484,www.vology.com//3017
102525,www.vology.com//105,www.vology.com//3017
102528,www.vology.com//1068,www.vology.com//3017
102543,www.vology.com//3356,www.vology.com//3017
