In [2]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [3]:
sp = spacy.load('en_core_web_sm')

In [4]:
def formatNumber(num):
    """Parse ``num`` as a float and return an ``int`` when it has no fractional part.

    Examples: ``"15.0" -> 15``, ``"15.6" -> 15.6``.
    """
    value = float(num)
    # Whole numbers are narrowed to int so they print without a trailing ".0".
    return int(value) if value.is_integer() else value
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        Columns that contain nulls are cast to ``object`` first: a numeric
        column cannot hold ``None``, so on recent pandas versions
        ``where(..., None)`` would silently put NaN back.
        Columns without nulls keep their dtype.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None (the input is not modified)
    """
    new_df = df.copy()
    for col in df.columns:
        column = new_df[col]
        present = column.notnull()
        if present.all():
            # Nothing to replace; leave the dtype untouched.
            continue
        new_df[col] = column.astype(object).where(present, None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        None values are kept as None (not turned into the string 'None').

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want a trailing
                                       '.0' removed from the number ('15.0' -> '15')

        Outputs: - new dataframe with converted number types (input not modified)
    """
    def _to_text(value):
        if value is None:
            return value
        text = str(value)
        if remove_point_zero:
            # Only strip a *trailing* ".0"; a plain str.replace would also
            # corrupt values such as 10.05 -> '105'.
            text = re.sub(r'\.0+$', '', text)
        return text

    new_df = df.copy()
    for col in cols_to_convert:
        new_df[col] = new_df[col].apply(_to_text)
    return new_df

# Reference laptop catalogue: read the file once and reuse it for both lookups.
_laptop_catalogue = pd.read_csv('laptops.csv')

# Lower-cased values of the Company column. Missing values are dropped so every
# entry is a string that can be used in substring checks against titles.
extra_brands = set(_laptop_catalogue.Company.dropna().str.lower().unique())

# Distinct values of the Inches column as strings, whole numbers without ".0"
# (15.0 -> '15', 15.6 -> '15.6'). Missing values are dropped so 'nan' never
# becomes a candidate screen size.
screen_sizes = [str(formatNumber(size)) for size in set(_laptop_catalogue.Inches.dropna())]

def preprocess_laptop_dataset(df):
    """Clean a raw laptop-spec dataframe and derive the features used for deduplication.

    The passed frame is modified in place during cleaning (callers in this
    notebook pass a copy); the returned frame is a new object in which nulls
    are ``None``.

    Reads the columns ``title``, ``brand``, ``cpu_brand``, ``cpu_model``,
    ``cpu_type``, ``cpu_frequency``, ``ram_capacity``, ``hdd_capacity`` and
    ``ssd_capacity``. ``title`` must not contain nulls (string methods are
    called on it directly). ``instance_id`` is left untouched.

    Relies on the module-level ``sp`` (spaCy pipeline), ``extra_brands`` and
    ``screen_sizes``, and on ``fill_nulls_with_none`` /
    ``convert_numbers_to_strings``.

    Columns added: ``new_title``, ``new_title_tokens``, ``screen_size``, ``model``.
    Columns overwritten with normalised values: ``brand``, ``cpu_brand``,
    ``cpu_type``, ``ram_capacity``, ``hdd_capacity``, ``ssd_capacity``.
    """
    # ---- text normalisation -------------------------------------------------
    # Drop every non-ASCII character from all cells.
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+')
    for column in df.columns:
        # Numeric columns have no .str accessor; instance_id must stay verbatim.
        if column == 'instance_id' or pd.api.types.is_numeric_dtype(df[column]):
            continue
        # regex=True is required: recent pandas treats the pattern as a literal
        # by default and rejects compiled patterns in that mode.
        df[column] = (df[column].str.lower()
                      .str.replace(irrelevant_regex, ' ', regex=True)
                      .str.replace(multispace_regex, ' ', regex=True))

    # ---- tokenised title ----------------------------------------------------
    def tokenize_new_title(record):
        return [w.text for w in sp(record['new_title'])]

    # Stricter cleaning for the tokenised title: punctuation other than '.' is
    # removed outright (not replaced by a space), so "v7-582pg" -> "v7582pg".
    title_noise_regex = re.compile(r'[^a-z0-9.\s]')
    df['new_title'] = (df['title'].str.lower()
                       .str.replace(title_noise_regex, '', regex=True)
                       .str.replace(multispace_regex, ' ', regex=True))
    df['new_title_tokens'] = df.apply(tokenize_new_title, axis=1)

    # ---- brand --------------------------------------------------------------
    all_brands = set(extra_brands)

    def assign_brand(record):
        # Trust the brand column when it holds a known brand ...
        if record['brand'] in all_brands:
            return record['brand']
        # ... otherwise look for a known brand anywhere in the title.
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # ---- cpu brand ----------------------------------------------------------
    def assign_cpu_brand(record):
        # Anything that does not mention intel is labelled amd.
        sources = ('cpu_brand', 'title', 'cpu_model', 'cpu_type')
        if any('intel' in str(record[name]) for name in sources):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    # ---- screen size --------------------------------------------------------
    def assign_screen_size(record):
        # "15.6inch" / "15.6in" tokens are reduced to the bare number.
        stripped = [tok.replace('inch', '').replace('in', '')
                    for tok in record['new_title_tokens']]
        for sc in screen_sizes:
            if str(sc) in stripped:
                return str(sc)
        return str(15.6) # Some relaxation: default when no size is found

    df['screen_size'] = df.apply(assign_screen_size, axis=1)

    # ---- ram capacity -------------------------------------------------------
    ram_sizes = ['2', '4', '6', '8', '10', '12', '16', '32', '64', '128', '256', '512']

    def find_ram_size(text, sizes, suffix):
        """Return the first size (in list order) present in ``text`` as a whole number."""
        for size in sizes:
            # The look-behind stops "12gb" / "512 gb" from matching as "2gb".
            if re.search(r'(?<![\d.])' + size + suffix, text):
                return int(size)
        return None

    def assign_ram_capacity(record):
        spec = str(record['ram_capacity'])
        title = str(record['title'])
        attempts = (
            (spec, ram_sizes, r'\s*gb'),         # "8 gb" in the RAM field
            (spec, ram_sizes[:10], r'(?!\d)'),   # bare number in the RAM field
            # Title fallback: may pick up a storage size when RAM is not listed.
            (title, ram_sizes[:10], r'\s*gb'),
        )
        for text, sizes, suffix in attempts:
            found = find_ram_size(text, sizes, suffix)
            if found is not None:
                return found
        return 0

    # ---- disk capacities ----------------------------------------------------
    def parse_capacity(spec, title, kind):
        """Capacity in GB from the spec field or, failing that, next to ``kind`` in the title."""
        attempts = (
            (spec, r'(\d{3,4})gb', ''),
            (spec, r'(\d)tb', '000'),
            (title, r'(\d{3,4})gb' + kind, ''),
            (title, kind + r'(\d{3,4})gb', ''),
            (title, kind + r'(\d)tb', '000'),
            (title, r'(\d)tb' + kind, '000'),
        )
        for text, pattern, multiplier in attempts:
            hit = re.search(pattern, text)
            if hit:
                # "1tb" is recorded as 1000 (GB).
                return int(hit.group(1) + multiplier)
        return 0

    def assign_hdd_capacity(record):
        spec = str(record['hdd_capacity']).replace(' ', '')
        title = str(record['title'].replace(' ', ''))
        # An HDD field that actually describes an SSD means "no HDD".
        if 'ssd' in spec:
            return 0
        return parse_capacity(spec, title, 'hdd')

    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)

    def assign_ssd_capacity(record):
        spec = str(record['ssd_capacity']).replace(' ', '')
        title = str(record['title'].replace(' ', ''))
        return parse_capacity(spec, title, 'ssd')

    df['ssd_capacity'] = df.apply(assign_ssd_capacity, axis=1)

    # ---- model --------------------------------------------------------------
    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            # Extend the model over up to three more tokens while they contain
            # digits; stop at a screen-size token or a purely alphabetic token.
            for i in range(2 + brand_index, 5 + brand_index, 1):
                if any(sc in brand_tokens[i] for sc in screen_sizes):
                    if finish_index == i:
                        finish_index -= 1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i
                else:
                    break
        except (ValueError, IndexError):
            # Brand token not in the title, or the title ends before the scan
            # window does: no model can be derived.
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])

    # ---- cpu type -----------------------------------------------------------
    def assign_cpu_type(record):
        # Find the cpu type; returns None when nothing is recognised.
        cpu_list = ["i5", "i3", "i7", "atom",
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            # NOTE(review): this "e-NNN" check runs inside the loop, so it takes
            # precedence over every cpu_list entry after the first one. Kept as
            # is; confirm whether it was meant to run after the loop.
            if re.search(r"e-[0-9]{3}", record['title']):
                return re.findall(r"e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search(r"e-[0-9]{3}", record['cpu_model']):
                return re.findall(r"e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)

    return df

In [5]:
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        NOTE: this helper is also defined in an earlier cell of this notebook;
        this later definition is the one in effect after a full run.

        Columns that contain nulls are cast to ``object`` first: a numeric
        column cannot hold ``None``, so on recent pandas versions
        ``where(..., None)`` would silently put NaN back.
        Columns without nulls keep their dtype.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None (the input is not modified)
    """
    new_df = df.copy()
    for col in df.columns:
        column = new_df[col]
        present = column.notnull()
        if present.all():
            # Nothing to replace; leave the dtype untouched.
            continue
        new_df[col] = column.astype(object).where(present, None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        None values are kept as None (not turned into the string 'None').

        NOTE: this helper is also defined in an earlier cell of this notebook;
        this later definition is the one in effect after a full run.

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want a trailing
                                       '.0' removed from the number ('15.0' -> '15')

        Outputs: - new dataframe with converted number types (input not modified)
    """
    def _to_text(value):
        if value is None:
            return value
        text = str(value)
        if remove_point_zero:
            # Only strip a *trailing* ".0"; a plain str.replace would also
            # corrupt values such as 10.05 -> '105'.
            text = re.sub(r'\.0+$', '', text)
        return text

    new_df = df.copy()
    for col in cols_to_convert:
        new_df[col] = new_df[col].apply(_to_text)
    return new_df

In [6]:
# Load X2, index it by instance_id (keeping the column) and preprocess a deep copy.
x2 = preprocess_laptop_dataset(
    pd.read_csv("../data/sigmod/X2.csv")
    .set_index('instance_id', drop=False)
    .copy(deep=True)
)

In [7]:
# Load X3, index it by instance_id (keeping the column) and preprocess a deep copy.
x3 = preprocess_laptop_dataset(
    pd.read_csv("../data/sigmod/X3.csv")
    .set_index('instance_id', drop=False)
    .copy(deep=True)
)
# x3 = convert_numbers_to_strings(x3, ['ram_capacity', 'screen_size'])

In [8]:
# Number of records in each preprocessed dataset.
len(x2), len(x3)

(343, 337)

In [9]:
# x3.head()

In [10]:
# Instance ids that occur in both datasets (an empty index means they are disjoint).
x2.index.intersection(x3.index)

Index([], dtype='object', name='instance_id')

In [11]:
# Only X2 is used from here on; the combination with X3 is left disabled.
# x23 = x3.append(x2)
x23 = x2

In [12]:
# Preview the first preprocessed records.
x23.head()

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,lenovo thinkpad x230 34352jf tablet pc 12.5 in...,"[lenovo, thinkpad, x230, 34352jf, tablet, pc, ...",12.5,thinkpad x230 34352jf
www.isupplyhub.com//1256,www.isupplyhub.com//1256,acer,intel,,i5,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,,500,0,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch...,amazon.com acer aspire v7582pg6479 15.6inch to...,"[amazon.com, acer, aspire, v7582pg6479, 15.6in...",15.6,aspire v7582pg6479
www.isupplyhub.com//326,www.isupplyhub.com//326,acer,intel,,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,,500,0,5.2 pounds,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch l...,amazon.com acer aspire e15726870 15.6 inch lap...,"[amazon.com, acer, aspire, e15726870, 15.6, in...",15.6,aspire
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,,,,4,ddr3 sdram. 4 gb sdram ddr3,,500,0,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,amazon.com 15.6 hp 15f009wm amd dualcore e1210...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore...",15.6,15f009wm amd
www.isupplyhub.com//157,www.isupplyhub.com//157,asus,intel,,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,,0,256,2.9 pounds,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,amazon.com asus ux31axb52 13.3inch ultrabook 1...,"[amazon.com, asus, ux31axb52, 13.3inch, ultrab...",13.3,ux31axb52


In [13]:
# Column names after preprocessing, and how many there are.
x23.columns, len(x23.columns)

(Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
        'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
        'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
        'new_title', 'new_title_tokens', 'screen_size', 'model'],
       dtype='object'),
 18)

In [14]:
# Keep only the id plus the columns used as dedupe fields; copy so later edits
# do not touch x23.
to_dedupe = x23.loc[:, ['instance_id', 'brand', 'cpu_brand', 'cpu_type',
                        'ram_capacity', 'hdd_capacity', 'ssd_capacity',
                        'title', 'screen_size', 'model']].copy()

In [15]:
# Preview the reduced frame.
to_dedupe.head()

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_type,ram_capacity,hdd_capacity,ssd_capacity,title,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5,4,320,0,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,12.5,thinkpad x230 34352jf
www.isupplyhub.com//1256,www.isupplyhub.com//1256,acer,intel,i5,8,500,0,amazon.com acer aspire v7-582pg-6479 15.6-inch...,15.6,aspire v7582pg6479
www.isupplyhub.com//326,www.isupplyhub.com//326,acer,intel,i5,4,500,0,amazon.com acer aspire e1-572-6870 15.6 inch l...,15.6,aspire
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,,4,500,0,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,15.6,15f009wm amd
www.isupplyhub.com//157,www.isupplyhub.com//157,asus,intel,i5,4,0,256,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,13.3,ux31axb52


In [16]:
# Convert to a {index value: {column: value}} mapping; the index is instance_id.
to_dedupe_dict = to_dedupe.to_dict(orient = 'index')

In [17]:
# Inspect a single record of the mapping.
to_dedupe_dict['www.softwarecity.ca//737']

{'instance_id': 'www.softwarecity.ca//737',
 'brand': 'lenovo',
 'cpu_brand': 'intel',
 'cpu_type': 'i5',
 'ram_capacity': 4,
 'hdd_capacity': 320,
 'ssd_capacity': 0,
 'title': 'lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan - intel core i5 i5-3320m 2.60 ghz - black - 4 gb ram - 320 gb hdd - windows 7 professional 64-bit - convertible - 1366 x 768 multi-touch screen display led backlight - bluetooth - french keyboard - 34352jf - softwarecity.ca - canada',
 'screen_size': '12.5',
 'model': 'thinkpad x230 34352jf'}

In [18]:
# Persist the record mapping to disk and immediately read it back.
# The file is written by this notebook; do not unpickle files from untrusted sources.
with open('to_dedupe_dict.pkl', 'wb') as f:
    f.write(pickle.dumps(to_dedupe_dict))
with open('to_dedupe_dict.pkl', 'rb') as f:
    to_dedupe_dict = pickle.loads(f.read())

In [19]:
# Field definitions for dedupe's data model. See the "Variable definitions"
# page of the dedupe documentation for the available types and options.
fields = [
          # Categories are passed as a sorted list so their order is the same on
          # every run (a set has no stable iteration order across kernels).
          {'field' : 'brand', 'type' : 'Categorical', 'categories' : sorted(extra_brands, key=str)},
          
          {'field' : 'cpu_brand', 'type': 'Categorical', 'categories' : ['amd', 'intel']}, 
          
#           {'field' : 'cpu_model', 'type': 'String', 'has_missing' : True},
          
          # cpu_type is None whenever preprocessing recognises no CPU token
          # (see the missing values in the previews above), so it must be
          # declared as possibly missing.
          {'field' : 'cpu_type', 'type': 'Exact', 'has_missing' : True},
          
          {'field' : 'ram_capacity', 'type': 'Price', 'has_missing' : False},
          
          # NOTE(review): capacities use 0 for "not present"; confirm how the
          # Price type treats non-positive values before relying on them.
          {'field' : 'hdd_capacity', 'type': 'Price', 'has_missing' : False},
          
          {'field' : 'ssd_capacity', 'type': 'Price', 'has_missing' : False},
          
          {'field' : 'title', 'type': 'Text', 'has_missing' : False},
          
          {'field' : 'screen_size', 'type': 'Categorical', 'has_missing' : False, 'categories' : screen_sizes},
          
          {'field' : 'model', 'type': 'String', 'has_missing' : True},
          
         ]

In [20]:
# Build the deduper from the field definitions.
# NOTE: there is a bug later on that requires num_cores to be 1, but we can make
# use of multiple cores in the meantime.
deduper = dedupe.Dedupe(fields, num_cores=6)

In [21]:
# Labelled pairs for each dataset; the code below reads the columns
# left_instance_id, right_instance_id and label from them.
y2 = pd.read_csv('../data/sigmod/Y2.csv')
y3 = pd.read_csv('../data/sigmod/Y3.csv')
# Only the X2 labels are used, matching x23 = x2 above; y3 is loaded but unused.
# y = y3.append(y2)
y = y2
len(y)

58653

In [22]:
# Container for the labelled pairs, keyed by 'match' and 'distinct'.
trainig_data = {'match': [], 'distinct': []}

In [23]:
# Split the labelled pairs by label and turn each row into a dict.
# 'records' is the valid orient name; the abbreviation 'row' is rejected by
# recent pandas versions.
match = y[y.label == 1].to_dict(orient='records')
distinct = y[y.label == 0].to_dict(orient='records')



In [24]:
# Resolve each matching id pair into a pair of full records.
trainig_data['match'].extend(
    (to_dedupe_dict[m['left_instance_id']], to_dedupe_dict[m['right_instance_id']])
    for m in match
)

In [25]:
# Number of matching training pairs.
len(trainig_data['match'])

2152

In [26]:
# Resolve each non-matching id pair into a pair of full records.
trainig_data['distinct'].extend(
    (to_dedupe_dict[d['left_instance_id']], to_dedupe_dict[d['right_instance_id']])
    for d in distinct
)

In [27]:
# Number of non-matching training pairs.
len(trainig_data['distinct'])

56501

In [28]:
# trainig_data['match'].extend(my_own_annotation['match'])

In [29]:
# trainig_data['distinct'].extend(my_own_annotation['distinct'])

In [30]:
# Write the labelled pairs to disk as JSON.
serialized_pairs = json.dumps(trainig_data)
with open('y_laptop.json', 'w') as fout:
    fout.write(serialized_pairs)

In [31]:
# Initialise training from the labelled pairs written to y_laptop.json.
# sample_size and blocked_proportion are passed straight through to dedupe.
training_file = 'y_laptop.json'
with open(training_file) as tf:
    deduper.prepare_training(to_dedupe_dict, training_file=tf, sample_size=1500, blocked_proportion=0.9)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (twoGramFingerprint, title)
  final_betas = opt.minimize(loss,
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, screen_size)
INFO:dedupe.training:SimplePredicate: (sortedAcronym, title)


In [54]:
# Interactive step: dedupe prompts for labels in the console, so this cell
# needs manual input when the notebook is run.
print('starting active labeling...')
dedupe.console_label(deduper)

brand : lenovo
cpu_brand : intel
cpu_model : None
cpu_type : None
ram_capacity : 4
hdd_capacity : None
ssd_capacity : None
title : miniprice.ca - lenovo thinkpad x230 34352sf tablet pc - 12.5 - in-plane switching ips technology - wireless lan - intel core i7 i7-3520m 2.90 ghz - black - 4 gb ram - 320 gb hdd - windows 7 professional 64-bit 34352sf-ddo 
screen_size : 12.5
model : thinkpad x230 34352sf

brand : lenovo
cpu_brand : intel
cpu_model : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
cpu_type : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
ram_capacity : 4
hdd_capacity : 320 gb hdd 7200 rpm. 320 gb hdd 7200 rpm. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64

starting active labeling...
f


Finished labeling


In [None]:
# Save the deduper's current training pairs as JSON.
with open('y_laptop_augmented.json', 'w') as fout:
    json.dump(deduper.training_pairs, fout)

In [None]:
# Train the deduper on the labelled pairs; recall=0.8 is passed to dedupe's train().
deduper.train(recall=0.8)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...


In [27]:
# Persist the training pairs (text file) and the learned settings (binary file).
# Note: this rebinds training_file, which earlier pointed at 'y_laptop.json'.
training_file = 'trained_model_laptops.json'
settings_file = 'trained_model_laptops_settings.json'
with open(training_file, 'w') as tf:
    deduper.write_training(tf)
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [28]:
# Cluster the records; 0.5 is the threshold argument passed to partition().
# Each result is a (record ids, scores) pair, as unpacked in the cells below.
clustered_dupes = deduper.partition(to_dedupe_dict, 0.5)

print('# duplicate sets', len(clustered_dupes))

# duplicate sets 343


In [29]:
# Show the deduper's predicates.
deduper.predicates

((SimplePredicate: (oneGramFingerprint, model),
  TfidfTextCanopyPredicate: (0.8, cpu_model)),
 (SimplePredicate: (wholeFieldPredicate, hdd_capacity),
  SimplePredicate: (oneGramFingerprint, title)),
 (SimplePredicate: (wholeFieldPredicate, hdd_capacity),
  TfidfNGramCanopyPredicate: (0.4, cpu_model),
  LevenshteinCanopyPredicate: (1, screen_size)),
 (SimplePredicate: (firstIntegerPredicate, model),
  SimplePredicate: (sameFiveCharStartPredicate, ram_capacity)),
 (SimplePredicate: (firstIntegerPredicate, cpu_model),
  TfidfNGramCanopyPredicate: (0.8, cpu_model),
  SimplePredicate: (firstIntegerPredicate, ram_capacity)))

In [30]:
# Display every cluster (long output; consider slicing, e.g. clustered_dupes[:10]).
clustered_dupes

[(('www.vology.com//3330',), (1.0,)),
 (('www.vology.com//2129',), (1.0,)),
 (('www.vology.com//756',), (1.0,)),
 (('www.vology.com//1026',), (1.0,)),
 (('www.flexshopper.com//326',), (1.0,)),
 (('www.vology.com//2064',), (1.0,)),
 (('www.isupplyhub.com//822',), (1.0,)),
 (('www.vology.com//3679',), (1.0,)),
 (('www.amazon.com//953',), (1.0,)),
 (('www.flexshopper.com//703',), (1.0,)),
 (('buy.net//1963',), (1.0,)),
 (('www.vology.com//3401',), (1.0,)),
 (('www.vology.com//135',), (1.0,)),
 (('www.vology.com//3944',), (1.0,)),
 (('www.vology.com//3017',), (1.0,)),
 (('www.vology.com//81',), (1.0,)),
 (('www.isupplyhub.com//459',), (1.0,)),
 (('www.amazon.com//1014',), (1.0,)),
 (('www.vology.com//1978',), (1.0,)),
 (('buy.net//2109',), (1.0,)),
 (('www.vology.com//1005',), (1.0,)),
 (('www.vology.com//3957',), (1.0,)),
 (('www.vology.com//2097',), (1.0,)),
 (('www.vology.com//4413',), (1.0,)),
 (('www.flexshopper.com//488',), (1.0,)),
 (('www.vology.com//248',), (1.0,)),
 (('www.vology

In [31]:
# Map every record id to the cluster it landed in and its score in that cluster.
cluster_membership = {
    record_id: {"Cluster ID": cluster_id, "confidence_score": score}
    for cluster_id, (records, scores) in enumerate(clustered_dupes)
    for record_id, score in zip(records, scores)
}

In [32]:
# Display the full record -> cluster mapping (long output).
cluster_membership

{'www.vology.com//3330': {'Cluster ID': 0, 'confidence_score': 1.0},
 'www.vology.com//2129': {'Cluster ID': 1, 'confidence_score': 1.0},
 'www.vology.com//756': {'Cluster ID': 2, 'confidence_score': 1.0},
 'www.vology.com//1026': {'Cluster ID': 3, 'confidence_score': 1.0},
 'www.flexshopper.com//326': {'Cluster ID': 4, 'confidence_score': 1.0},
 'www.vology.com//2064': {'Cluster ID': 5, 'confidence_score': 1.0},
 'www.isupplyhub.com//822': {'Cluster ID': 6, 'confidence_score': 1.0},
 'www.vology.com//3679': {'Cluster ID': 7, 'confidence_score': 1.0},
 'www.amazon.com//953': {'Cluster ID': 8, 'confidence_score': 1.0},
 'www.flexshopper.com//703': {'Cluster ID': 9, 'confidence_score': 1.0},
 'buy.net//1963': {'Cluster ID': 10, 'confidence_score': 1.0},
 'www.vology.com//3401': {'Cluster ID': 11, 'confidence_score': 1.0},
 'www.vology.com//135': {'Cluster ID': 12, 'confidence_score': 1.0},
 'www.vology.com//3944': {'Cluster ID': 13, 'confidence_score': 1.0},
 'www.vology.com//3017': {'Cl

In [37]:
def laptop_test(x):
    """Cluster a raw laptop dataset with the trained ``deduper`` and list the matched pairs.

    Input:  - x -> raw dataframe with an ``instance_id`` column plus the columns
                   read by ``preprocess_laptop_dataset``; it is not modified.

    Output: - dataframe with the columns ``left_instance_id`` and
              ``right_instance_id``, one row per pair of records that share a
              cluster (empty, but with both columns, when nothing matched).

    Relies on the module-level ``deduper`` and on ``preprocess_laptop_dataset``.
    """
    from itertools import combinations  # local import keeps this cell self-contained

    # Non-inplace set_index returns a new frame, so the caller's frame is left alone.
    x = preprocess_laptop_dataset(x.set_index('instance_id', drop=False))

    # Same columns, in the same representation, as the records used for
    # training: every field in the deduper's definition must be present, and
    # ram_capacity stays numeric because it is declared as a Price field.
    to_dedupe = x[['instance_id',
                   'brand',
                   'cpu_brand',
                   'cpu_type',
                   'ram_capacity',
                   'hdd_capacity',
                   'ssd_capacity',
                   'title',
                   'screen_size',
                   'model']].copy()

    to_dedupe_dict = to_dedupe.to_dict(orient = 'index')

    clustered_dupes = deduper.partition(to_dedupe_dict, 0.5)

    print('# duplicate sets', len(clustered_dupes))

    # Every unordered pair of ids inside a cluster, in cluster order.
    res = [pair
           for record_ids, _scores in clustered_dupes
           for pair in combinations(record_ids, 2)]
    # Passing columns= here (rather than assigning them afterwards) also works
    # when no pairs were found.
    return pd.DataFrame(res, columns=['left_instance_id', 'right_instance_id'])

In [38]:
# Re-read the raw X2 file (x2 previously held the preprocessed frame) and cluster it.
x2 = pd.read_csv("../data/sigmod/X2.csv")
res = laptop_test(x2)

# duplicate sets 343


In [67]:
# Re-read the raw X3 file and cluster it; this overwrites the X2 result in res.
x3 = pd.read_csv("../data/sigmod/X3.csv")
res = laptop_test(x3)

# duplicate sets 268


In [39]:
# Number of matched pairs found.
len(res)

0

In [40]:
# Display the matched pairs.
res

In [72]:
# Write the matched pairs to output.csv without the row index.
res.to_csv("output.csv", index=False)