In [1]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import re
import joblib

In [3]:
def preprocess_laptop_dataset(df, known_brands=None, brands_csv='laptops.csv'):
    """Clean and normalise raw laptop records before blocking/matching.

    Steps, in order:
      1. Every text column except ``instance_id`` is lower-cased, characters
         outside ``[a-z0-9.\\- ]`` are replaced by a space and runs of
         whitespace are collapsed to a single space.
      2. ``brand`` is replaced by a known brand (taken from the ``brand``
         column, else found in ``title``) or by the placeholder ``"NNN"``.
      3. ``cpu_brand`` becomes ``'intel'`` if that word occurs in any of
         ``cpu_brand``/``title``/``cpu_model``/``cpu_type``, else ``'amd'``.
      4. ``ram_capacity`` becomes an integer number of GB (``0`` if unknown).
      5. Remaining missing values are replaced by the sentinel ``-999``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain the columns ``brand``, ``title``,
        ``cpu_brand``, ``cpu_model``, ``cpu_type`` and ``ram_capacity``.
        The input frame is not modified.
    known_brands : iterable of str, optional
        Brand vocabulary. When omitted it is read from the ``Company``
        column of ``brands_csv``.
    brands_csv : str, optional
        CSV file providing the ``Company`` column used when ``known_brands``
        is not given.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()
    if df.empty:
        # Row-wise apply() on an empty frame yields a DataFrame instead of a
        # Series, and there is nothing to normalise anyway.
        return df

    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    # Keep only lower-case alphanumerics, '.', '-' and whitespace.
    irrelevant_regex = re.compile(r'[^a-z0-9.\-\s]')
    multispace_regex = re.compile(r'\s\s+')

    for column in df.columns:
        if column == 'instance_id':
            continue
        # The .str accessor raises on numeric columns, so only touch text.
        if not (is_object(df[column].dtype) or is_string(df[column].dtype)):
            continue
        # regex=True is required: pandas >= 2 defaults to literal matching
        # and rejects compiled patterns in that mode.
        df[column] = (df[column].str.lower()
                      .str.replace(irrelevant_regex, ' ', regex=True)
                      .str.replace(multispace_regex, ' ', regex=True))

    # Row filtering by NaN count is currently disabled.

    # Brand assignment
    if known_brands is None:
        known_brands = pd.read_csv(brands_csv)['Company'].dropna().unique()
    all_brands = {str(b).strip().lower() for b in known_brands if pd.notna(b)}
    all_brands.discard('')  # '' is a substring of every title
    # Deterministic search order (set iteration order is not): longest brand
    # first, ties broken alphabetically.
    brands_by_priority = sorted(all_brands, key=lambda b: (-len(b), b))

    def assign_brand(record):
        # Search in brand first
        brand = record['brand']
        if isinstance(brand, str) and brand.strip() in all_brands:
            return brand.strip()
        # then in the title (which may be missing)
        title = record['title']
        if isinstance(title, str):
            for candidate in brands_by_priority:
                if candidate in title:
                    return candidate
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    def assign_cpu_brand(record):
        # 'intel' anywhere in the CPU-related fields or the title wins;
        # everything else is labelled 'amd'.
        fields = ('cpu_brand', 'title', 'cpu_model', 'cpu_type')
        if any('intel' in str(record[name]) for name in fields):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    # TODO: cpu_model normalisation is not implemented yet.

    # ram capacity
    # Whole numbers only: the look-behind stops '1.5gb' from being read as
    # '5gb', and matching the full digit run stops '12gb' being read as '2gb'.
    gb_regex = re.compile(r'(?<![\d.])(\d+)\s*gb')
    ram_labelled_regex = re.compile(r'(?<![\d.])(\d+)\s*gb\s*(?:ram|ddr|sdram|memory)')
    bare_number_regex = re.compile(r'(?<![\w.])(\d+)(?:\.0+)?(?!\w|\.\d)')
    plausible_sizes = {2, 4, 6, 8, 10, 12, 16, 32, 64, 128}
    field_gb_sizes = plausible_sizes | {256, 512}

    def first_allowed(pattern, text, allowed):
        # First match (left to right) whose value is an accepted size.
        for match in pattern.finditer(text):
            value = int(match.group(1))
            if value in allowed:
                return value
        return None

    def assign_ram_capacity(record):
        field = str(record['ram_capacity'])
        title = str(record['title'])
        attempts = (
            (gb_regex, field, field_gb_sizes),           # '8 gb', '16gb ddr3'
            (bare_number_regex, field, plausible_sizes),  # '8', '8.0'
            # In the title prefer sizes explicitly labelled as memory, since
            # titles also mention disk sizes in GB.
            (ram_labelled_regex, title, plausible_sizes),
            (gb_regex, title, plausible_sizes),
        )
        for pattern, text, allowed in attempts:
            value = first_allowed(pattern, text, allowed)
            if value is not None:
                return value
        return 0

    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    # Dedicated string dtypes cannot hold the integer sentinel; widen them
    # to object first (object columns are left as they are).
    widen = {c: object for c in df.columns
             if is_string(df[c].dtype) and not is_object(df[c].dtype)}
    if widen:
        df = df.astype(widen)
    df = df.fillna(-999)

    # TODO: unit standardisation of weight
    return df

In [4]:
# Load the raw X2 records, run them through the cleaning step and persist
# the cleaned table so the next cell can reread it with metadata.
A = preprocess_laptop_dataset(pd.read_csv('../data/sigmod/X2.csv'))
A.to_csv('X2_cleaned.csv', index=False)

In [5]:
# Load the cleaned table twice (left and right side of the self-join),
# registering instance_id as the key of each copy.
A = em.read_csv_metadata('X2_cleaned.csv', key='instance_id')
B = em.read_csv_metadata('X2_cleaned.csv', key='instance_id')

n_left = len(A)
n_right = len(B)
print('Number of tuples in A: ' + str(n_left))
print('Number of tuples in B: ' + str(n_right))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(n_left * n_right))

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Number of tuples in A: 343
Number of tuples in B: 343
Number of tuples in A X B (i.e the cartesian product): 117649


In [6]:
# Blocking: keep only candidate pairs whose titles overlap (overlap_size=1).
# The same attributes are carried over from both sides, so the list is
# written once and a fresh copy is handed to each side.
blocking_output_attrs = [
    'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
    'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
    'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
]

ob = em.OverlapBlocker()
C = ob.block_tables(
    A, B, 'title', 'title',
    l_output_attrs=list(blocking_output_attrs),
    r_output_attrs=list(blocking_output_attrs),
    overlap_size=1,
    show_progress=True,
    l_output_prefix='left_',
    r_output_prefix='right_',
)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [7]:
# Build the feature table for matching A against B.
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# Raw attributes that travel alongside the feature vectors (left side first,
# then right side).
left_attrs = [
    'left_brand',
    'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
    'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
    'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
    'left_weight', 'left_dimensions', 'left_title',
]
right_attrs = [
    'right_brand',
    'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
    'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
    'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
    'right_weight', 'right_dimensions', 'right_title',
]
attrs_from_table = left_attrs + right_attrs

# Columns excluded when predicting: the id columns plus every raw attribute
# listed above.
attrs_to_be_excluded = ['_id', 'left_instance_id', 'right_instance_id'] + attrs_from_table

In [None]:
# Convert the candidate set to feature vectors using the feature table.
# n_jobs=-1 is passed through to py_entitymatching for parallel extraction.
L = em.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    show_progress=True,
    n_jobs=-1,
)

In [7]:
# Load the previously trained matcher from disk.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
# NOTE(review): presumably a py_entitymatching matcher saved with joblib.dump,
# given that a later cell calls loaded_rf.predict(table=..., exclude_attrs=...)
# -- confirm.
loaded_rf = joblib.load("./random_forest.joblib")

In [8]:
# Score every candidate pair; the result is a new table (inplace=False) with
# the prediction appended in a 'predicted' column.
predictions = loaded_rf.predict(
    table=L,
    exclude_attrs=attrs_to_be_excluded,
    append=True,
    target_attr='predicted',
    inplace=False,
)

In [9]:
# Prepare the output
def duplicates(x):
    """Tell whether the left and right instance ids are identical.

    ``x`` may be a single record (row Series or mapping), giving one boolean,
    or a whole DataFrame, giving a boolean Series with one entry per row.
    """
    return x['left_instance_id'] == x['right_instance_id']


def prepare_sigmod_output(res):
    """Reduce a prediction table to the distinct predicted match pairs.

    Parameters
    ----------
    res : pandas.DataFrame
        Must contain ``left_instance_id``, ``right_instance_id`` and a
        ``predicted`` column in which 1 marks a predicted match.

    Returns
    -------
    pandas.DataFrame
        Only the two id columns, restricted to rows with ``predicted == 1``,
        without self-pairs (left id equal to right id) and without repeated
        rows. The original index labels of the surviving rows are kept.
    """
    id_columns = ['left_instance_id', 'right_instance_id']
    pairs = res.loc[res['predicted'] == 1, id_columns]
    # Vectorised comparison over the whole frame: avoids a Python-level call
    # per row and still yields a boolean Series when no row was selected.
    pairs = pairs[~duplicates(pairs)]
    return pairs.drop_duplicates()

In [10]:
# Keep only the id pairs predicted as matches (self-pairs and repeated rows removed).
ret = prepare_sigmod_output(predictions)

In [11]:
# Display the resulting id pairs.
ret

Unnamed: 0,left_instance_id,right_instance_id
20,www.flexshopper.com//884,www.softwarecity.ca//737
34,www.flexshopper.com//488,www.softwarecity.ca//737
40,www.tigerdirect.com//12,www.softwarecity.ca//737
79,buy.net//1960,www.softwarecity.ca//737
88,www.vology.com//80,www.softwarecity.ca//737
...,...,...
102509,www.vology.com//4484,www.vology.com//3017
102525,www.vology.com//105,www.vology.com//3017
102528,www.vology.com//1068,www.vology.com//3017
102543,www.vology.com//3356,www.vology.com//3017
