In [84]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import re
import spacy

In [85]:
sp = spacy.load('en_core_web_sm')

In [86]:
def formatNumber(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.
        
        Input: - dataframe with nulls as NaN
        
        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), ' ')
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!
        
        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number
        
        Ouputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df

extra_brands = set(pd.read_csv('laptops.csv').Company.str.lower().unique())
screen_sizes = set(pd.read_csv('laptops.csv').Inches)
screen_sizes = [str(formatNumber(str(s).lower())) for s in screen_sizes]

def preprocess_laptop_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
    
    
    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]
    
    df['new_title'] = df.title
    irrelevant_regex = re.compile(r'[^a-z0-9.\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')
    df['new_title_tokens'] =  df.apply(tokenize_new_tile, axis=1) 

    # Brand assignment
    all_brands = set()
    
    all_brands.update(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    def assign_cpu_brand(record):
        # Search in brand first
        if 'intel' in str(record['cpu_brand']) or 'intel' in str(record['title']) or \
                'intel' in str(record['cpu_model']) or 'intel' in str(record['cpu_type']):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)
        
        for sc in screen_sizes:
            if str(sc) in arr:
                return str(sc)
        
        else:
            return str(15.6) # Some relaxation
    df['screen_size'] = df.apply(assign_screen_size, axis=1)
    
    # ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity']).replace(' ', '')
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb', '256gb', '512gb', '2', '4',
                         '6', '8', '10', '12', '16', '32', '64', '128']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        s = str(record['title']).replace(' ', '')  # This will be wrong, please change
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        return 0
    
    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        
        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbhdd", s2):
            return int(re.findall("\d{3,4}gbhdd", s2)[0][:-5])
        if re.search("hdd\d{3,4}gb", s2):
            return int(re.findall("hdd\d{3,4}gb", s2)[0][3:-2])
        if re.search("hdd\d{1}tb", s2):
            return int(re.findall("hdd\d{1}tb", s2)[0][3:4] + '000')
        if re.search("\d{1}tbhdd", s2):
            return int(re.findall("\d{1}tbhdd", s2)[0][0] + '000')
        return 0
    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
                                
    def assign_hdd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbssd", s2):
            return int(re.findall("\d{3,4}gbssd", s2)[0][:-5])
        if re.search("ssd\d{3,4}gb", s2):
            return int(re.findall("ssd\d{3,4}gb", s2)[0][3:-2])
        if re.search("ssd\d{1}tb", s2):
            return int(re.findall("ssd\d{1}tb", s2)[0][3:4] + '000')
        if re.search("\d{1}tbssd", s2):
            return int(re.findall("\d{1}tbssd", s2)[0][0] + '000')
        return 0
            
    df['ssd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
    
    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            should_break = False
            for i in range(2 + brand_index, 5 + brand_index, 1):
                for sc in screen_sizes:
                    if (sc in brand_tokens[i]):
                        should_break = True
                        break
                if should_break:
                    if finish_index == i:
                        finish_index -=1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i 
                else:
                    break
        except:
            brand_index = -1

        if brand_index == -1:
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])        

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)
    
    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])
    # Unit stand. in weight
    
    def assign_cpu_type(record):
    # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom", 
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)
    
    return df

In [92]:
x2.columns

Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
       'new_title', 'new_title_tokens', 'screen_size', 'model'],
      dtype='object')

In [87]:
x2 = pd.read_csv("../data/sigmod/X2.csv")
x2 = preprocess_laptop_dataset(x2.copy(deep=True)).loc[:, []]
x2.to_csv('X_cleaned.csv', index=False)
len(x2)

343

In [88]:
A = em.read_csv_metadata('X_cleaned.csv', key='instance_id')
B = em.read_csv_metadata('X_cleaned.csv', key='instance_id')

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [89]:
# clean, remove nans
# A = A.fillna("")
# B = B.fillna("")
len(A)

343

In [90]:
print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(len(A)*len(B)))

Number of tuples in A: 343
Number of tuples in B: 343
Number of tuples in A X B (i.e the cartesian product): 117649


In [91]:
A.head()

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
0,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...,lenovo thinkpad x230 34352jf tablet pc 12.5 inplane switching ips technology wireless lan intel ...,"['lenovo', 'thinkpad', 'x230', '34352jf', 'tablet', 'pc', '12.5', 'inplane', 'switching', 'ips',...",12.5,thinkpad x230 34352jf
1,www.isupplyhub.com//1256,acer,intel,,i5,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,,500,0,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers access...,amazon.com acer aspire v7582pg6479 15.6inch touchscreen ultrabook cool steel computers accessories,"['amazon.com', 'acer', 'aspire', 'v7582pg6479', '15.6inch', 'touchscreen', 'ultrabook', 'cool', ...",15.6,aspire v7582pg6479
2,www.isupplyhub.com//326,acer,intel,,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,,500,0,5.2 pounds,15.02 x 10.08 x 1 inches,"amazon.com acer aspire e1-572-6870 15.6 inch laptop intel i5 4200u 1.6ghz processor, 4gb ram, 50...",amazon.com acer aspire e15726870 15.6 inch laptop intel i5 4200u 1.6ghz processor 4gb ram 500gb ...,"['amazon.com', 'acer', 'aspire', 'e15726870', '15.6', 'inch', 'laptop', 'intel', 'i5', '4200u', ...",15.6,aspire
3,www.isupplyhub.com//821,hp,amd,,,,4,ddr3 sdram. 4 gb sdram ddr3,,500,0,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 cert...,amazon.com 15.6 hp 15f009wm amd dualcore e12100 4gb ddr3 ram 500gb hd webcam windows 8.1 certifi...,"['amazon.com', '15.6', 'hp', '15f009wm', 'amd', 'dualcore', 'e12100', '4', 'gb', 'ddr3', 'ram', ...",15.6,15f009wm amd
4,www.isupplyhub.com//157,asus,intel,,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,,0,256,2.9 pounds,8.80 x 0.70 x 12.80 inches,"amazon.com asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor, 4gb ddr3, ...",amazon.com asus ux31axb52 13.3inch ultrabook 1.7 ghz intel core i53317u processor 4gb ddr3 256gb...,"['amazon.com', 'asus', 'ux31axb52', '13.3inch', 'ultrabook', '1.7', 'ghz', 'intel', 'core', 'i53...",13.3,ux31axb52


In [68]:
A.columns

Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
       'new_title', 'new_title_tokens', 'screen_size', 'model'],
      dtype='object')

In [69]:
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'title', 'title', 
                    l_output_attrs=['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'], 
                    r_output_attrs=['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'],
                    overlap_size=1, show_progress=True, l_output_prefix='left_',
    r_output_prefix='right_',)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table[overlap_attr] = values
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed:

In [70]:
C.head()

Unnamed: 0,_id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,right_cpu_type,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title
0,0,www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,...,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
1,1,www.isupplyhub.com//326,www.softwarecity.ca//737,acer,intel,-1,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,...,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
2,2,www.isupplyhub.com//821,www.softwarecity.ca//737,hp,amd,-1,,-1,4,ddr3 sdram. 4 gb sdram ddr3,...,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
3,3,www.isupplyhub.com//157,www.softwarecity.ca//737,asus,intel,-1,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,...,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...
4,4,www.isupplyhub.com//985,www.softwarecity.ca//737,lenovo,intel,-1,i7,intel core i7,0,3 gb,...,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...


In [71]:
C.columns

Index(['_id', 'left_instance_id', 'right_instance_id', 'left_brand',
       'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
       'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
       'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
       'left_weight', 'left_dimensions', 'left_title', 'right_brand',
       'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
       'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
       'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
       'right_weight', 'right_dimensions', 'right_title'],
      dtype='object')

In [72]:
G = em.read_csv_metadata("../data/sigmod/x2_train.csv", 
                         key='id',
                         ltable=A, rtable=B, 
                         fk_ltable='left_instance_id', fk_rtable='right_instance_id')

Metadata file is not present in the given path; proceeding to read the csv file.


In [73]:
# Generate features automatically 
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [74]:
feature_table

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,instance_id_instance_id_lev_dist,instance_id,instance_id,,,lev_dist,<function instance_id_instance_id_lev_dist at 0x00000283CCC8CB80>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,instance_id_instance_id_lev_sim,instance_id,instance_id,,,lev_sim,<function instance_id_instance_id_lev_sim at 0x00000283CCC8C670>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,instance_id_instance_id_jar,instance_id,instance_id,,,jaro,<function instance_id_instance_id_jar at 0x00000283CCC8C700>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,instance_id_instance_id_jwn,instance_id,instance_id,,,jaro_winkler,<function instance_id_instance_id_jwn at 0x00000283CCC8C940>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,instance_id_instance_id_exm,instance_id,instance_id,,,exact_match,<function instance_id_instance_id_exm at 0x00000283CCC8C820>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
...,...,...,...,...,...,...,...,...,...
67,model_model_mel,model,model,,,monge_elkan,<function model_model_mel at 0x00000283CDC02D30>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
68,model_model_lev_dist,model,model,,,lev_dist,<function model_model_lev_dist at 0x00000283CDC02B80>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
69,model_model_lev_sim,model,model,,,lev_sim,<function model_model_lev_sim at 0x00000283CDC021F0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
70,model_model_nmw,model,model,,,needleman_wunsch,<function model_model_nmw at 0x00000283CDC02160>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [75]:
# Select the attrs. to be included in the feature vector table
attrs_from_table = ['left_brand',
       'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
       'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
       'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity',
       'left_weight', 'left_dimensions', 'left_title', 'right_brand',
       'right_cpu_brand', 'right_cpu_model', 'right_cpu_type',
       'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type',
       'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity',
       'right_weight', 'right_dimensions', 'right_title']
# Convert the labeled data to feature vectors using the feature table
H = em.extract_feature_vecs(G, 
                            feature_table=feature_table, 
                            attrs_before = attrs_from_table,
                            attrs_after='label',
                            show_progress=True)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:52


In [76]:
H.head(10)

Unnamed: 0,id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,screen_size_screen_size_lev_sim,model_model_jac_qgm_3_qgm_3,model_model_cos_dlm_dc0_dlm_dc0,model_model_jac_dlm_dc0_dlm_dc0,model_model_mel,model_model_lev_dist,model_model_lev_sim,model_model_nmw,model_model_sw,label
0,0,www.flexshopper.com//1098,www.amazon.com//1389,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,1.0,0.047619,0.0,0.0,0.625,9.0,0.25,-3.0,2.0,1
1,1,www.amazon.com//291,www.amazon.com//1081,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1
2,2,buy.net//634,www.amazon.com//1014,acer,intel,-1,i3,1.70 ghz,4,ddr3l sdram,...,1.0,0.26087,0.707107,0.5,0.863158,13.0,0.315789,-7.0,6.0,1
3,3,www.amazon.com//2395,buy.net//393,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,0.26087,0.707107,0.5,0.312865,13.0,0.315789,-7.0,6.0,1
4,4,www.flexshopper.com//2173,buy.net//634,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1
5,5,www.amazon.com//1313,www.amazon.com//291,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1
6,6,www.amazon.com//1313,www.amazon.com//1014,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,0.26087,0.707107,0.5,0.863158,13.0,0.315789,-7.0,6.0,1
7,7,www.amazon.com//1081,www.amazon.com//2395,acer,intel,-1,i3,-1,4,-1,...,1.0,0.26087,0.707107,0.5,0.312865,13.0,0.315789,-7.0,6.0,1
8,8,www.amazon.com//1081,www.amazon.com//1389,acer,intel,-1,i3,-1,4,-1,...,1.0,0.047619,0.0,0.0,0.625,9.0,0.25,-3.0,2.0,1
9,9,www.amazon.com//291,www.flexshopper.com//1098,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1


In [77]:
H.columns

Index(['id', 'left_instance_id', 'right_instance_id', 'left_brand',
       'left_cpu_brand', 'left_cpu_model', 'left_cpu_type',
       'left_cpu_frequency', 'left_ram_capacity', 'left_ram_type',
       ...
       'screen_size_screen_size_lev_sim', 'model_model_jac_qgm_3_qgm_3',
       'model_model_cos_dlm_dc0_dlm_dc0', 'model_model_jac_dlm_dc0_dlm_dc0',
       'model_model_mel', 'model_model_lev_dist', 'model_model_lev_sim',
       'model_model_nmw', 'model_model_sw', 'label'],
      dtype='object', length=102)

In [78]:
len(H.columns)

102

In [79]:
H2 = H.copy(deep=True)
H2.head()

Unnamed: 0,id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,screen_size_screen_size_lev_sim,model_model_jac_qgm_3_qgm_3,model_model_cos_dlm_dc0_dlm_dc0,model_model_jac_dlm_dc0_dlm_dc0,model_model_mel,model_model_lev_dist,model_model_lev_sim,model_model_nmw,model_model_sw,label
0,0,www.flexshopper.com//1098,www.amazon.com//1389,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,1.0,0.047619,0.0,0.0,0.625,9.0,0.25,-3.0,2.0,1
1,1,www.amazon.com//291,www.amazon.com//1081,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1
2,2,buy.net//634,www.amazon.com//1014,acer,intel,-1,i3,1.70 ghz,4,ddr3l sdram,...,1.0,0.26087,0.707107,0.5,0.863158,13.0,0.315789,-7.0,6.0,1
3,3,www.amazon.com//2395,buy.net//393,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,1.0,0.26087,0.707107,0.5,0.312865,13.0,0.315789,-7.0,6.0,1
4,4,www.flexshopper.com//2173,buy.net//634,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,6.0,1


In [80]:
import numpy as np
pd.isna(H2).sum()

id                      0
left_instance_id        0
right_instance_id       0
left_brand              0
left_cpu_brand          0
                       ..
model_model_lev_dist    0
model_model_lev_sim     0
model_model_nmw         0
model_model_sw          0
label                   0
Length: 102, dtype: int64

In [81]:
# Instantiate the RF Matcher
rf = em.RFMatcher()

In [82]:
# Get the attributes to be projected while training
attrs_to_be_excluded = []
attrs_to_be_excluded.extend(['id', 'left_instance_id', 'right_instance_id', 'label'])
attrs_to_be_excluded.extend(attrs_from_table)

In [83]:
# Train using feature vectors from the labeled data.
rf.fit(table=H2, exclude_attrs=attrs_to_be_excluded, target_attr='label')

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [23]:
# Get the attributes to be projected while predicting
attrs_to_be_excluded = []
attrs_to_be_excluded.extend(['_id', 'left_instance_id', 'right_instance_id'])
attrs_to_be_excluded.extend(attrs_from_table)

# Convert the cancidate set to feature vectors using the feature table
L = em.extract_feature_vecs(C, feature_table=feature_table,
                             attrs_before= attrs_from_table,
                             show_progress=True, n_jobs=-1)

In [24]:
L.head()

Unnamed: 0,_id,left_instance_id,right_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,weight_weight_jac_dlm_dc0_dlm_dc0,weight_weight_mel,weight_weight_lev_dist,weight_weight_lev_sim,weight_weight_nmw,weight_weight_sw,dimensions_dimensions_jac_qgm_3_qgm_3,dimensions_dimensions_cos_dlm_dc0_dlm_dc0,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0
0,0,www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,...,1.0,1.0,0.0,1.0,7.0,7.0,1.0,1.0,1.0,1.0
1,1,www.isupplyhub.com//326,www.softwarecity.ca//737,acer,intel,-999,1.6 ghz intel core i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,...,0.0,0.495238,9.0,0.1,-2.0,1.0,0.0,0.0,0.09799,0.157174
2,2,www.isupplyhub.com//821,www.softwarecity.ca//737,hp,amd,-999,-999,-999,4,ddr3 sdram. 4 gb sdram ddr3,...,0.0,0.57619,8.0,0.2,-1.0,2.0,0.0,0.0,0.052897,0.069171
3,3,www.isupplyhub.com//157,www.softwarecity.ca//737,asus,intel,-999,1.7 ghz core i5-3317u,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,...,0.0,0.495238,9.0,0.1,-2.0,1.0,0.0,0.0,0.116625,0.188608
4,4,www.isupplyhub.com//985,www.softwarecity.ca//737,lenovo,intel,-999,intel core i7,intel core i7,0,3 gb,...,0.0,0.0,7.0,0.0,-3.0,0.0,1.0,1.0,0.205742,0.309875


In [42]:
# Predict the matches
predictions = rf.predict(table=L, exclude_attrs=attrs_to_be_excluded,                          
              append=True, target_attr='predicted', inplace=False,)

In [45]:
# Prepare the output 
def duplicates(x):
    return x['left_instance_id'] == x['right_instance_id']
def prepare_sigmod_output(res):
    ret = res[res.predicted == 1]
    ret = ret[['left_instance_id', 'right_instance_id']]
    ret = ret[~ret.apply(duplicates, axis=1)]
    return ret.drop_duplicates()

In [46]:
res = prepare_sigmod_output(predictions)

In [47]:
len(prepare_sigmod_output(predictions)) / len(predictions)

0.04194890888003041

In [36]:
pd.read_csv('../data/sigmod/Y2.csv').label.sum() / len(pd.read_csv('../data/sigmod/Y2.csv')) 

0.03669036536920533

In [30]:
import joblib

In [32]:
joblib.dump(rf, "../src/random_forest.joblib")

['../src/random_forest.joblib']

In [51]:
loaded_rf = joblib.load("../src/random_forest.joblib")

In [52]:
# Predict the matches
predictions = loaded_rf.predict(table=L, exclude_attrs=attrs_to_be_excluded,                          
              append=True, target_attr='predicted', inplace=False,)

In [53]:
prepare_sigmod_output(predictions)

Unnamed: 0,left_instance_id,right_instance_id
20,www.flexshopper.com//884,www.softwarecity.ca//737
34,www.flexshopper.com//488,www.softwarecity.ca//737
40,www.tigerdirect.com//12,www.softwarecity.ca//737
79,buy.net//1960,www.softwarecity.ca//737
88,www.vology.com//80,www.softwarecity.ca//737
...,...,...
102525,www.vology.com//4484,www.vology.com//3017
102541,www.vology.com//105,www.vology.com//3017
102544,www.vology.com//1068,www.vology.com//3017
102559,www.vology.com//3356,www.vology.com//3017
