In [14]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [15]:
sp = spacy.load('en_core_web_sm')

In [16]:
with open('translations_lookup_all.json') as fin:
    variants = json.load(fin)

with open('langs_dict.json') as fin:
    json.load(fin)
    
def get_type(record):
    name = record['name'].lower()
    
    if pd.isna(record['size']):
        if 'tv' in name:
            return 'tv'
        return 'mobile'
    
    
    flash_keywords = ['usb', 'drive']
    memory_stick_keywords = ['card', 'stick', 'sd', 'microsd', 'hc', 'class', 'speicherkarte'] # Add variants here
    
    is_flash = False
    is_memory = False
    
    for w in flash_keywords:
        if w in name:
            is_flash = True
            break
            
    for w in memory_stick_keywords:
        if w in name:
            is_memory = True
            break
    
    if is_flash:
        return 'flash'
    
    if is_memory:
        return 'stick'
    
    return 'stick'

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!
        
        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number
        
        Ouputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df


def preprocess_products_dataset(x4_dev):
    # Clean x4
    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    x4_dev.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in x4_dev.columns:
        if column == 'instance_id':
            continue
        x4_dev[column] = x4_dev[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
        
    x4_dev['product_type'] = x4_dev.apply(get_type, axis=1)
    x4_dev.drop('price', inplace=True, axis=1)
    x4_dev['size'] = x4_dev['size'].str.lower().str.replace(' ', '')
    x4_dev['size'] = x4_dev['size'].where(x4_dev['size'].notnull(), 0)
    
    # Remove unwanted words from the name
    for i in range(len(x4_dev)):
        record = x4_dev.iloc[i]

        name = record['name']

        # remove unnecessary characters
        basic_punct = '-/\*_,:;/()®™' 
        punct_to_space = str.maketrans(basic_punct, ' ' * len(basic_punct))  # map punctuation to space
        name = name.translate(punct_to_space)

        # remove brand
        name = name.replace(record['brand'], '')

        # remove size

        if record.product_type in ['flash', 'stick']:
            name = re.sub('\d\d\d\s?gb', '', name, 6)
            name = re.sub('\d\d\s?gb', '', name, 6)
            name = re.sub('\d\s?gb', '', name, 6)

        tokens = name.split(' ')
        for wd, wdtl in variants.items():
            while wd in tokens:
                tokens.remove(wd)
            for wdt in wdtl:
                while wdt in tokens:
                    tokens.remove(wdt) 

        unneeded_words = ['mmoire', 'speicherkarte', 'flashgeheugenkaart', 'flash', 'stick', 'speed', 'high']
        for w in unneeded_words:
            while w in tokens:
                tokens.remove(w)
        x4_dev.iloc[i]['name'] = ' '.join(tokens)

    for column in x4_dev.columns:
        if column == 'instance_id':
            continue
        x4_dev[column] = x4_dev[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
        
    return x4_dev


In [41]:
x4 = pd.read_csv('../data/sigmod/X4.csv')
x4_dev = convert_numbers_to_strings(x4, ['price']).copy(deep=True)
x4_dev.set_index('instance_id', inplace=True)
x4_dev = preprocess_products_dataset(x4_dev)

In [43]:
to_dedupe = x4_dev[[
    'name',
    'brand',
    'size',
    'product_type']].copy()

In [44]:
to_dedupe.head()

Unnamed: 0_level_0,name,brand,size,product_type
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altosight.com//0,1400x 210mb s professional xqd memriakrtya,lexar,32gb,stick
altosight.com//25,microsdxc uhs 1 u3 memriakrtya adapter srg1uxa,sony,128gb,stick
altosight.com//66,dual drive type c usb 3.0 130 mb s,sandisk,16gb,flash
altosight.com//68,dual drive type c usb 3.0 150 mb s,sandisk,64gb,flash
altosight.com//94,xqd x1400 professional xqd kupon premium 50 n...,lexar,32gb,stick


In [45]:
to_dedupe_dict = to_dedupe.to_dict(orient = 'index')
list(to_dedupe_dict.keys())[0]

'altosight.com//0'

In [46]:
# docs for this are here: 
fields = [{'field' : 'name', 'type' : 'String', 'has_missing' : False},
          {'field' : 'brand', 'type': 'Categorical', 'categories' : list(x4_dev.brand.unique()), 'has_missing' : False}, 
          {'field' : 'size', 'type': 'Exact', 'has_missing' : False},
          {'field' : 'product_type', 'type': 'Categorical', 'categories' : list(x4_dev.product_type.unique()),
           'has_missing' : False}]

In [47]:
# There is a bug later on that requires num_cores to be 1, but we can make use of
# multi-threaded processes in the meantime
deduper = dedupe.Dedupe(fields, num_cores=6)

In [48]:
y = pd.read_csv('../data/sigmod/Y4.csv')

In [49]:
trainig_data = {}
trainig_data['match'] = []
trainig_data['distinct'] = []

In [50]:
match = y[y.label == 1].to_dict(orient='row')
distinct = y[y.label == 0].to_dict(orient='row')



In [51]:
for m in match:
    trainig_data['match'].append( ( to_dedupe_dict[m['left_instance_id']], to_dedupe_dict[m['right_instance_id']] ) )

In [52]:
len(trainig_data['match'])

4082

In [53]:
for d in distinct:
    trainig_data['distinct'].append( ( to_dedupe_dict[d['left_instance_id']], to_dedupe_dict[d['right_instance_id']] ) )

In [54]:
len(trainig_data['distinct'])

344113

In [None]:
with open('y4_train.json', 'w') as fout:
    json.dump(trainig_data, fout)
    
training_file = 'y4_train.json'
with open(training_file) as tf:
    deduper.prepare_training(to_dedupe_dict, training_file=tf, sample_size=1500, blocked_proportion=0.9)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, name)


In [54]:
print('starting active labeling...')
dedupe.console_label(deduper)

brand : lenovo
cpu_brand : intel
cpu_model : None
cpu_type : None
ram_capacity : 4
hdd_capacity : None
ssd_capacity : None
title : miniprice.ca - lenovo thinkpad x230 34352sf tablet pc - 12.5 - in-plane switching ips technology - wireless lan - intel core i7 i7-3520m 2.90 ghz - black - 4 gb ram - 320 gb hdd - windows 7 professional 64-bit 34352sf-ddo 
screen_size : 12.5
model : thinkpad x230 34352sf

brand : lenovo
cpu_brand : intel
cpu_model : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
cpu_type : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
ram_capacity : 4
hdd_capacity : 320 gb hdd 7200 rpm. 320 gb hdd 7200 rpm. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64

starting active labeling...
f


Finished labeling


In [None]:
with open('y_laptop_augmented.json', 'w') as fout:
    json.dump(deduper.training_pairs, fout)

In [32]:
deduper.train(recall=0.8)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.21225558005515902
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (suffixArray, model), SimplePredicate: (wholeFieldPredicate, cpu_type), SimplePredicate: (wholeFieldPredicate, screen_size))


In [27]:
training_file = 'trained_model_laptops.json'
settings_file = 'trained_model_laptops_settings.json'
with open(training_file, 'w') as tf:
    deduper.write_training(tf)
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [28]:
clustered_dupes = deduper.partition(to_dedupe_dict, 0.5)

print('# duplicate sets', len(clustered_dupes))

# duplicate sets 343


In [29]:
deduper.predicates

((SimplePredicate: (oneGramFingerprint, model),
  TfidfTextCanopyPredicate: (0.8, cpu_model)),
 (SimplePredicate: (wholeFieldPredicate, hdd_capacity),
  SimplePredicate: (oneGramFingerprint, title)),
 (SimplePredicate: (wholeFieldPredicate, hdd_capacity),
  TfidfNGramCanopyPredicate: (0.4, cpu_model),
  LevenshteinCanopyPredicate: (1, screen_size)),
 (SimplePredicate: (firstIntegerPredicate, model),
  SimplePredicate: (sameFiveCharStartPredicate, ram_capacity)),
 (SimplePredicate: (firstIntegerPredicate, cpu_model),
  TfidfNGramCanopyPredicate: (0.8, cpu_model),
  SimplePredicate: (firstIntegerPredicate, ram_capacity)))

In [30]:
clustered_dupes

[(('www.vology.com//3330',), (1.0,)),
 (('www.vology.com//2129',), (1.0,)),
 (('www.vology.com//756',), (1.0,)),
 (('www.vology.com//1026',), (1.0,)),
 (('www.flexshopper.com//326',), (1.0,)),
 (('www.vology.com//2064',), (1.0,)),
 (('www.isupplyhub.com//822',), (1.0,)),
 (('www.vology.com//3679',), (1.0,)),
 (('www.amazon.com//953',), (1.0,)),
 (('www.flexshopper.com//703',), (1.0,)),
 (('buy.net//1963',), (1.0,)),
 (('www.vology.com//3401',), (1.0,)),
 (('www.vology.com//135',), (1.0,)),
 (('www.vology.com//3944',), (1.0,)),
 (('www.vology.com//3017',), (1.0,)),
 (('www.vology.com//81',), (1.0,)),
 (('www.isupplyhub.com//459',), (1.0,)),
 (('www.amazon.com//1014',), (1.0,)),
 (('www.vology.com//1978',), (1.0,)),
 (('buy.net//2109',), (1.0,)),
 (('www.vology.com//1005',), (1.0,)),
 (('www.vology.com//3957',), (1.0,)),
 (('www.vology.com//2097',), (1.0,)),
 (('www.vology.com//4413',), (1.0,)),
 (('www.flexshopper.com//488',), (1.0,)),
 (('www.vology.com//248',), (1.0,)),
 (('www.vology

In [31]:
cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

In [32]:
cluster_membership

{'www.vology.com//3330': {'Cluster ID': 0, 'confidence_score': 1.0},
 'www.vology.com//2129': {'Cluster ID': 1, 'confidence_score': 1.0},
 'www.vology.com//756': {'Cluster ID': 2, 'confidence_score': 1.0},
 'www.vology.com//1026': {'Cluster ID': 3, 'confidence_score': 1.0},
 'www.flexshopper.com//326': {'Cluster ID': 4, 'confidence_score': 1.0},
 'www.vology.com//2064': {'Cluster ID': 5, 'confidence_score': 1.0},
 'www.isupplyhub.com//822': {'Cluster ID': 6, 'confidence_score': 1.0},
 'www.vology.com//3679': {'Cluster ID': 7, 'confidence_score': 1.0},
 'www.amazon.com//953': {'Cluster ID': 8, 'confidence_score': 1.0},
 'www.flexshopper.com//703': {'Cluster ID': 9, 'confidence_score': 1.0},
 'buy.net//1963': {'Cluster ID': 10, 'confidence_score': 1.0},
 'www.vology.com//3401': {'Cluster ID': 11, 'confidence_score': 1.0},
 'www.vology.com//135': {'Cluster ID': 12, 'confidence_score': 1.0},
 'www.vology.com//3944': {'Cluster ID': 13, 'confidence_score': 1.0},
 'www.vology.com//3017': {'Cl

In [37]:
def laptop_test(x):
    x.set_index('instance_id',  inplace=True, drop=False)
    x = fill_nulls_with_none(x)
    x = preprocess_laptop_dataset(x.copy(deep=True))
    x = convert_numbers_to_strings(x, ['ram_capacity', 'screen_size'])
    
    to_dedupe = x[['instance_id',
                 'brand',
                 'cpu_brand',
                 'cpu_model',
                'ram_capacity',
                'hdd_capacity', 
                'title',
                'screen_size',
                'model']].copy()
    
    to_dedupe_dict = to_dedupe.to_dict(orient = 'index')
    
    clustered_dupes = deduper.partition(to_dedupe_dict, 0.5)

    print('# duplicate sets', len(clustered_dupes))
    
    res = []
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i+1, len(el[0])):
                res.append((el[0][i], el[0][j]))
    res_df =pd.DataFrame(res)            
#     res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df

In [38]:
x2 = pd.read_csv("../data/sigmod/X2.csv")
res = laptop_test(x2)

# duplicate sets 343


In [67]:
x3 = pd.read_csv("../data/sigmod/X3.csv")
res = laptop_test(x3)

# duplicate sets 268


In [39]:
len(res)

0

In [40]:
res

In [72]:
res.to_csv("output.csv", index=False)