In [1]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import re

In [2]:
# Anything outside lowercase letters, digits, '.', '-' and whitespace is noise.
_IRRELEVANT_CHARS_RE = re.compile(r'[^a-z0-9.\-\s]')
_MULTISPACE_RE = re.compile(r'\s\s+')

# RAM sizes (GB) accepted as plausible values. The larger set is only trusted
# when the number is explicitly followed by "gb" inside the ram_capacity field.
_RAM_SIZES_GB = frozenset((2, 4, 6, 8, 10, 12, 16, 32, 64, 128))
_RAM_SIZES_GB_WITH_UNIT = _RAM_SIZES_GB | frozenset((256, 512))

# A whole number that is not the tail of a longer token ("ddr3", "1.5"),
# optionally written with a zero fraction ("8.0").
_WHOLE_NUMBER = r'(?<![\w.])(\d+)(?:\.0+)?'
_GB_AMOUNT_RE = re.compile(_WHOLE_NUMBER + r'\s*gb\b')
_BARE_AMOUNT_RE = re.compile(_WHOLE_NUMBER + r'(?!\w|\.\d)')


def _clean_text(value):
    """Lowercase a string, blank out noise characters and collapse whitespace runs."""
    if not isinstance(value, str):
        # Numbers, NaN and sentinels pass through untouched.
        return value
    return _MULTISPACE_RE.sub(' ', _IRRELEVANT_CHARS_RE.sub(' ', value.lower()))


def _as_text(value):
    """Return a searchable string for a cell value; missing values become ''."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ''
    return str(value)


def _first_allowed(pattern, text, allowed):
    """Return the first number captured by `pattern` in `text` that is in `allowed`, else None."""
    for match in pattern.finditer(text):
        amount = int(match.group(1))
        if amount in allowed:
            return amount
    return None


def _load_brands(brands_path):
    """Read the lowercase brand names from the 'Company' column of the CSV at `brands_path`.

    The result is ordered longest-first (ties alphabetical) so that title
    matching is deterministic and prefers the more specific name.
    """
    companies = pd.read_csv(brands_path)['Company'].dropna()
    names = {str(name).strip().lower() for name in companies}
    names.discard('')
    return sorted(names, key=lambda name: (-len(name), name))


def _assign_brand(brand, title, ordered_brands, known_brands):
    """Pick the brand from the brand field if it is known, else from the title, else "NNN"."""
    brand = _as_text(brand)
    if brand in known_brands:
        return brand
    title = _as_text(title)
    for name in ordered_brands:
        if name in title:
            return name
    return "NNN"


def _assign_cpu_brand(*fields):
    """Return 'intel' if any of the given fields mentions intel, otherwise 'amd'."""
    return 'intel' if any('intel' in _as_text(field) for field in fields) else 'amd'


def _assign_ram_capacity(ram_field, title):
    """Extract the RAM size in GB as an int; 0 when nothing plausible is found."""
    field_text = _as_text(ram_field)
    # Prefer an explicit "<n> gb" in the dedicated field, then a bare number there.
    for pattern, allowed in ((_GB_AMOUNT_RE, _RAM_SIZES_GB_WITH_UNIT),
                             (_BARE_AMOUNT_RE, _RAM_SIZES_GB)):
        amount = _first_allowed(pattern, field_text, allowed)
        if amount is not None:
            return amount
    # Fall back to the first plausible "<n> gb" in the title. This can still
    # pick up a storage size (e.g. "128gb ssd"), so treat it as a best effort.
    amount = _first_allowed(_GB_AMOUNT_RE, _as_text(title), _RAM_SIZES_GB)
    return amount if amount is not None else 0


def preprocess_laptop_dataset(df, brands_path='laptops.csv'):
    """Normalize a laptop records table for entity matching.

    Steps:
      1. Every text column except 'instance_id' is lowercased, characters
         outside ``[a-z0-9.-]``/whitespace are replaced by a space and runs of
         whitespace are collapsed. Non-string cells are left as they are.
      2. 'brand' is set to the brand field when it is a known brand, otherwise
         to the first known brand found in the title, otherwise "NNN".
      3. 'cpu_brand' is set to 'intel' when cpu_brand, title, cpu_model or
         cpu_type mentions intel, otherwise 'amd'.
      4. 'ram_capacity' is replaced by the RAM size in GB as an int (0 if unknown).
      5. Remaining missing values are filled with -999.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'brand', 'title', 'cpu_brand', 'cpu_model',
        'cpu_type' and 'ram_capacity'. The input frame is not modified.
    brands_path : str, optional
        CSV file with a 'Company' column listing known brand names.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of `df`.

    Raises
    ------
    KeyError
        If one of the required columns (or 'Company' in the brands file) is missing.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    for column in df.columns:
        if column == 'instance_id':
            continue
        series = df[column]
        # Only text-capable columns are cleaned; numeric columns have nothing to normalize.
        if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
            continue
        # Element-wise cleaning avoids the `.str` accessor, which fails on
        # non-text data and whose `regex` default differs between pandas versions.
        df[column] = series.map(_clean_text).astype(object)

    # NOTE: a filter dropping records with many missing fields was tried here and is disabled.

    # Brand assignment
    ordered_brands = _load_brands(brands_path)
    known_brands = set(ordered_brands)
    df['brand'] = pd.Series(
        [_assign_brand(brand, title, ordered_brands, known_brands)
         for brand, title in zip(df['brand'], df['title'])],
        index=df.index, dtype=object)

    # cpu brand
    df['cpu_brand'] = pd.Series(
        [_assign_cpu_brand(*fields)
         for fields in zip(df['cpu_brand'], df['title'], df['cpu_model'], df['cpu_type'])],
        index=df.index, dtype=object)

    # TODO: cpu model normalization is not implemented yet.

    # ram capacity
    df['ram_capacity'] = [_assign_ram_capacity(ram, title)
                          for ram, title in zip(df['ram_capacity'], df['title'])]

    df = df.fillna(-999)

    # TODO: unit standardization of weight.
    return df

In [3]:
# Read the raw X2 records, replace missing values with the -999 sentinel,
# run the laptop preprocessing and persist the result for the matching steps.
A = preprocess_laptop_dataset(
    pd.read_csv('../data/sigmod/X2.csv').fillna(-999)
)
A.to_csv('X_cleaned.csv', index=False)
len(A)

343

In [4]:
# The cleaned table is matched against itself, so it is loaded twice (A first,
# then B), each time with 'instance_id' as the key.
A, B = (em.read_csv_metadata('X_cleaned.csv', key='instance_id') for _ in range(2))

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [5]:
# Optional NaN clean-up, currently disabled; the only live statement in this
# cell is the row count of A.
# A = A.fillna("")
# B = B.fillna("")
len(A)

343

In [6]:
# Report the size of both tables and of the full pairwise comparison space.
print('Number of tuples in A: {}'.format(len(A)))
print('Number of tuples in B: {}'.format(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): {}'.format(len(A) * len(B)))

Number of tuples in A: 343
Number of tuples in B: 343
Number of tuples in A X B (i.e the cartesian product): 117649


In [7]:
# Preview the first rows of the cleaned table A.
A.head()

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title
0,www.softwarecity.ca//737,lenovo,intel,i5-3320m,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
1,www.isupplyhub.com//1256,acer,intel,-999,1.6 ghz intel core i5-4200u,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,-999,500 gb mechanical hard drive,-999,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers access...
2,www.isupplyhub.com//326,acer,intel,-999,1.6 ghz intel core i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,-999,500 gb mechanical hard drive,-999,5.2 pounds,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch laptop intel i5 4200u 1.6ghz processor 4gb ram 500g...
3,www.isupplyhub.com//821,hp,amd,-999,-999,-999,4,ddr3 sdram. 4 gb sdram ddr3,-999,500 gb,-999,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 cert...
4,www.isupplyhub.com//157,asus,intel,-999,1.7 ghz core i5-3317u,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,-999,256 mb,-999,2.9 pounds,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor 4gb ddr3 25...


In [8]:
# List the attribute names of A; the same names are used as blocker output attributes.
A.columns

Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'],
      dtype='object')

In [9]:
# Build the candidate set C by blocking A against B on their 'title' columns
# with overlap_size=1. The same attributes are carried over from both sides,
# prefixed with 'left_' / 'right_'.
ob = em.OverlapBlocker()
_block_output_attrs = [
    'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
    'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
    'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
]
C = ob.block_tables(
    A, B, 'title', 'title',
    l_output_attrs=list(_block_output_attrs),
    r_output_attrs=list(_block_output_attrs),
    l_output_prefix='left_',
    r_output_prefix='right_',
    overlap_size=1,
    show_progress=True,
)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [10]:
# Preview the first candidate pairs produced by the blocker.
C.head()

Unnamed: 0,_id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,right_cpu_type,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title
0,0,www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,...,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
1,1,www.isupplyhub.com//326,www.softwarecity.ca//737,acer,intel,-999,1.6 ghz intel core i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,...,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
2,2,www.isupplyhub.com//821,www.softwarecity.ca//737,hp,amd,-999,-999,-999,4,ddr3 sdram. 4 gb sdram ddr3,...,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
3,3,www.isupplyhub.com//157,www.softwarecity.ca//737,asus,intel,-999,1.7 ghz core i5-3317u,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,...,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
4,4,www.isupplyhub.com//985,www.softwarecity.ca//737,lenovo,intel,-999,intel core i7,intel core i7,0,3 gb,...,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...


In [11]:
# List the columns of the candidate set (pair id, both instance ids, prefixed attributes).
C.columns

Index(['_id', 'left_instance_id', 'right_instance_id', 'left_brand',
       'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
       'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
       'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
       'left_weight', 'left_dimensions', 'left_title', 'right_brand',
       'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
       'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
       'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
       'right_weight', 'right_dimensions', 'right_title'],
      dtype='object')

In [12]:
# Load the labeled training pairs; 'id' is the key and the two instance-id
# columns are registered as foreign keys into A and B.
G = em.read_csv_metadata(
    "x2_train.csv",
    key='id',
    fk_ltable='left_instance_id',
    fk_rtable='right_instance_id',
    ltable=A,
    rtable=B,
)

Metadata file is not present in the given path; proceeding to read the csv file.


In [13]:
# Generate the matching features automatically from the attributes of A and B.
# validate_inferred_attr_types=False presumably skips the interactive confirmation
# of the inferred attribute types — TODO confirm against the py_entitymatching docs.
# NOTE(review): the displayed feature table starts with features over the key
# column `instance_id`; confirm that these should be fed to the matcher.
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [14]:
# Display the generated feature definitions.
feature_table

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,instance_id_instance_id_lev_dist,instance_id,instance_id,,,lev_dist,<function instance_id_instance_id_lev_dist at 0x7fa8777d2a70>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,instance_id_instance_id_lev_sim,instance_id,instance_id,,,lev_sim,<function instance_id_instance_id_lev_sim at 0x7fa8777d2c20>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,instance_id_instance_id_jar,instance_id,instance_id,,,jaro,<function instance_id_instance_id_jar at 0x7fa8777d2d40>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,instance_id_instance_id_jwn,instance_id,instance_id,,,jaro_winkler,<function instance_id_instance_id_jwn at 0x7fa8777d2e60>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,instance_id_instance_id_exm,instance_id,instance_id,,,exact_match,<function instance_id_instance_id_exm at 0x7fa8777d2f80>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
5,instance_id_instance_id_jac_qgm_3_qgm_3,instance_id,instance_id,qgm_3,qgm_3,jaccard,<function instance_id_instance_id_jac_qgm_3_qgm_3 at 0x7fa87af0e0e0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
6,brand_brand_lev_dist,brand,brand,,,lev_dist,<function brand_brand_lev_dist at 0x7fa87af0e200>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
7,brand_brand_lev_sim,brand,brand,,,lev_sim,<function brand_brand_lev_sim at 0x7fa87af0e320>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
8,brand_brand_jar,brand,brand,,,jaro,<function brand_brand_jar at 0x7fa87af0e440>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
9,brand_brand_jwn,brand,brand,,,jaro_winkler,<function brand_brand_jwn at 0x7fa87af0e560>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [15]:
# Raw attribute columns to carry next to the computed features: every
# attribute once with the 'left_' prefix, then once with the 'right_' prefix.
attrs_from_table = [
    '{}_{}'.format(side, attr)
    for side in ('left', 'right')
    for attr in (
        'brand', 'cpu_brand', 'cpu_model', 'cpu_type', 'cpu_frequency',
        'ram_capacity', 'ram_type', 'ram_frequency', 'hdd_capacity',
        'ssd_capacity', 'weight', 'dimensions', 'title',
    )
]

# Turn the labeled pairs into feature vectors; the raw attributes go before
# the features and the 'label' column after them.
H = em.extract_feature_vecs(
    G,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    attrs_after='label',
    show_progress=True,
)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:15


In [16]:
# Preview the first ten labeled feature vectors.
H.head(10)

Unnamed: 0,id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,weight_weight_mel,weight_weight_lev_dist,weight_weight_lev_sim,weight_weight_nmw,weight_weight_sw,dimensions_dimensions_jac_qgm_3_qgm_3,dimensions_dimensions_cos_dlm_dc0_dlm_dc0,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0,label
0,0,www.flexshopper.com//1098,www.amazon.com//1389,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,0.0,7.0,0.0,-3.0,0.0,1.0,1.0,0.21393,0.308607,1
1,1,www.amazon.com//291,www.amazon.com//1081,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.0,9.0,0.0,-5.0,0.0,0.0,0.0,0.415584,0.563621,1
2,2,buy.net//634,www.amazon.com//1014,acer,intel,,intel core i3,1.70 ghz,4,ddr3l sdram,...,0.706667,7.0,0.3,1.0,2.0,0.2,0.223607,0.087912,0.111111,1
3,3,www.amazon.com//2395,buy.net//393,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.706667,7.0,0.3,1.0,2.0,0.142857,0.223607,0.644444,0.821995,1
4,4,www.flexshopper.com//2173,buy.net//634,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,0.895238,2.0,0.75,5.0,5.0,0.0,0.0,0.438356,0.667823,1
5,5,www.amazon.com//1313,www.amazon.com//291,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.0,10.0,0.0,-6.0,0.0,0.0,0.0,0.351254,0.452589,1
6,6,www.amazon.com//1313,www.amazon.com//1014,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,1.0,0.0,1.0,10.0,10.0,0.703704,0.8,0.220532,0.273998,1
7,7,www.amazon.com//1081,www.amazon.com//2395,acer,intel,,,,4,,...,0.825926,3.0,0.7,6.0,7.0,1.0,1.0,0.609195,0.714575,1
8,8,www.amazon.com//1081,www.amazon.com//1389,acer,intel,,,,4,,...,0.0,9.0,0.0,-5.0,0.0,0.0,0.0,0.450893,0.536111,1
9,9,www.amazon.com//291,www.flexshopper.com//1098,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.0,7.0,0.0,-3.0,0.0,1.0,1.0,0.211823,0.324443,1


In [17]:
# List all columns of the feature-vector table (ids, raw attributes, features, label).
H.columns

Index(['id', 'left_instance_id', 'right_instance_id', 'left_brand',
       'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
       'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
       'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
       'left_weight', 'left_dimensions', 'left_title', 'right_brand',
       'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
       'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
       'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
       'right_weight', 'right_dimensions', 'right_title',
       'instance_id_instance_id_lev_dist', 'instance_id_instance_id_lev_sim',
       'instance_id_instance_id_jar', 'instance_id_instance_id_jwn',
       'instance_id_instance_id_exm',
       'instance_id_instance_id_jac_qgm_3_qgm_3', 'brand_brand_lev_dist',
       'brand_brand_lev_sim', 'brand_brand_jar', 'brand_brand_jwn',
       'brand_brand_exm', 'brand_brand_jac_qgm_3_qgm_3',
       'cpu_bra

In [18]:
# Total number of columns in H, raw attributes and bookkeeping columns included.
len(H.columns)

81

In [19]:
# Train on an independent (deep) copy so H itself stays untouched.
H2 = H.copy()
H2.head(5)

Unnamed: 0,id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,weight_weight_mel,weight_weight_lev_dist,weight_weight_lev_sim,weight_weight_nmw,weight_weight_sw,dimensions_dimensions_jac_qgm_3_qgm_3,dimensions_dimensions_cos_dlm_dc0_dlm_dc0,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0,label
0,0,www.flexshopper.com//1098,www.amazon.com//1389,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,0.0,7.0,0.0,-3.0,0.0,1.0,1.0,0.21393,0.308607,1
1,1,www.amazon.com//291,www.amazon.com//1081,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.0,9.0,0.0,-5.0,0.0,0.0,0.0,0.415584,0.563621,1
2,2,buy.net//634,www.amazon.com//1014,acer,intel,,intel core i3,1.70 ghz,4,ddr3l sdram,...,0.706667,7.0,0.3,1.0,2.0,0.2,0.223607,0.087912,0.111111,1
3,3,www.amazon.com//2395,buy.net//393,acer,intel,intel core i3,intel core i3,intel core i3,4,4 gb,...,0.706667,7.0,0.3,1.0,2.0,0.142857,0.223607,0.644444,0.821995,1
4,4,www.flexshopper.com//2173,buy.net//634,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,0.895238,2.0,0.75,5.0,5.0,0.0,0.0,0.438356,0.667823,1


In [20]:
# Instantiate the RF Matcher.
# The seed is fixed so that training, the predictions below and the dumped
# model are reproducible from a fresh kernel; the keyword is forwarded to the
# underlying scikit-learn random forest.
rf = em.RFMatcher(random_state=0)

In [21]:
# Columns that are not features and must be hidden from the matcher during
# training: the bookkeeping ids, the label and the raw attribute columns.
attrs_to_be_excluded = (
    ['id', 'left_instance_id', 'right_instance_id', 'label']
    + list(attrs_from_table)
)

In [22]:
# Train using feature vectors from the labeled data.
# Only the columns not listed in attrs_to_be_excluded are used as features;
# 'label' is the training target.
rf.fit(table=H2, exclude_attrs=attrs_to_be_excluded, target_attr='label')

In [23]:
# Columns that are not features and must be hidden from the matcher at
# prediction time (the candidate set uses '_id' as its key and has no label).
attrs_to_be_excluded = (
    ['_id', 'left_instance_id', 'right_instance_id']
    + list(attrs_from_table)
)

# Convert the candidate set to feature vectors using the feature table
L = em.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    n_jobs=-1,
    show_progress=True,
)

In [24]:
# Preview the feature vectors computed for the candidate pairs.
L.head()

Unnamed: 0,_id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,weight_weight_jac_dlm_dc0_dlm_dc0,weight_weight_mel,weight_weight_lev_dist,weight_weight_lev_sim,weight_weight_nmw,weight_weight_sw,dimensions_dimensions_jac_qgm_3_qgm_3,dimensions_dimensions_cos_dlm_dc0_dlm_dc0,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0
0,0,www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,...,1.0,1.0,0.0,1.0,7.0,7.0,1.0,1.0,1.0,1.0
1,1,www.isupplyhub.com//326,www.softwarecity.ca//737,acer,intel,-999,1.6 ghz intel core i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,...,0.0,0.495238,9.0,0.1,-2.0,1.0,0.0,0.0,0.09799,0.157174
2,2,www.isupplyhub.com//821,www.softwarecity.ca//737,hp,amd,-999,-999,-999,4,ddr3 sdram. 4 gb sdram ddr3,...,0.0,0.57619,8.0,0.2,-1.0,2.0,0.0,0.0,0.052897,0.069171
3,3,www.isupplyhub.com//157,www.softwarecity.ca//737,asus,intel,-999,1.7 ghz core i5-3317u,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,...,0.0,0.495238,9.0,0.1,-2.0,1.0,0.0,0.0,0.116625,0.188608
4,4,www.isupplyhub.com//985,www.softwarecity.ca//737,lenovo,intel,-999,intel core i7,intel core i7,0,3 gb,...,0.0,0.0,7.0,0.0,-3.0,0.0,1.0,1.0,0.205742,0.309875


In [42]:
# Predict the matches: the result is stored in a new table with an extra
# 'predicted' column, while L itself is left unmodified (inplace=False).
predictions = rf.predict(
    table=L,
    exclude_attrs=attrs_to_be_excluded,
    target_attr='predicted',
    append=True,
    inplace=False,
)

In [45]:
# Prepare the output 
def duplicates(x):
    """Tell whether a candidate pair links a record to itself.

    `x` is any mapping-like row exposing 'left_instance_id' and
    'right_instance_id'; the result is the outcome of comparing the two.
    """
    left_id = x['left_instance_id']
    right_id = x['right_instance_id']
    return left_id == right_id
def prepare_sigmod_output(res):
    """Reduce a prediction table to the distinct predicted matching pairs.

    Parameters
    ----------
    res : pandas.DataFrame
        Must contain 'left_instance_id', 'right_instance_id' and 'predicted'.

    Returns
    -------
    pandas.DataFrame
        The two instance-id columns of the rows with ``predicted == 1``,
        without self-pairs (left id equal to right id) and without repeated
        rows. The original row index is preserved.
    """
    id_columns = ['left_instance_id', 'right_instance_id']
    matched = res.loc[res['predicted'] == 1, id_columns]
    # Vectorized comparison instead of a Python-level apply over every row.
    is_distinct_pair = matched['left_instance_id'] != matched['right_instance_id']
    return matched[is_distinct_pair].drop_duplicates()

In [46]:
# Keep only the distinct predicted matching pairs (self-pairs removed).
res = prepare_sigmod_output(predictions)

In [47]:
# Fraction of candidate pairs that end up in the prepared match output.
len(prepare_sigmod_output(predictions)) / len(predictions)

0.04194890888003041

In [36]:
# Share of positive labels in the Y2 ground-truth file. The file is read
# once and reused instead of being parsed twice within one expression.
y2_labels = pd.read_csv('../data/sigmod/Y2.csv')
y2_labels.label.sum() / len(y2_labels)

0.03669036536920533

In [30]:
import joblib

In [32]:
# Persist the trained matcher to disk so it can be reloaded without retraining.
joblib.dump(rf, "../src/random_forest.joblib")

['../src/random_forest.joblib']

In [51]:
# Reload the matcher written by joblib.dump above.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load files produced by a trusted source.
loaded_rf = joblib.load("../src/random_forest.joblib")

In [52]:
# Predict the matches again, this time with the matcher reloaded from disk;
# the result is a new table with an extra 'predicted' column.
predictions = loaded_rf.predict(
    table=L,
    exclude_attrs=attrs_to_be_excluded,
    target_attr='predicted',
    append=True,
    inplace=False,
)

In [53]:
# Display the distinct matching pairs predicted by the reloaded matcher.
prepare_sigmod_output(predictions)

Unnamed: 0,left_instance_id,right_instance_id
20,www.flexshopper.com//884,www.softwarecity.ca//737
34,www.flexshopper.com//488,www.softwarecity.ca//737
40,www.tigerdirect.com//12,www.softwarecity.ca//737
79,buy.net//1960,www.softwarecity.ca//737
88,www.vology.com//80,www.softwarecity.ca//737
...,...,...
102525,www.vology.com//4484,www.vology.com//3017
102541,www.vology.com//105,www.vology.com//3017
102544,www.vology.com//1068,www.vology.com//3017
102559,www.vology.com//3356,www.vology.com//3017
