In [1]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [2]:
# spaCy's small English pipeline; preprocess_laptop_dataset calls it to split cleaned titles into tokens.
sp = spacy.load('en_core_web_sm')

In [3]:
def formatNumber(num):
    """Parse ``num`` as a float and drop the fractional part when it is zero.

    Returns an ``int`` for whole values (e.g. ``'15.0'`` -> ``15``) and the
    ``float`` itself otherwise (e.g. ``'15.6'`` -> ``15.6``).
    """
    value = float(num)
    return int(value) if value.is_integer() else value
    
# Reference laptop catalogue: parsed once and reused for both lookups below
# (the file used to be read from disk twice).
_laptops_reference = pd.read_csv('laptops.csv')

# Lower-cased manufacturer names found in the catalogue.
extra_brands = set(_laptops_reference.Company.str.lower().unique())

# Distinct screen sizes as strings, with whole numbers written without ".0"
# (via formatNumber) so they can be compared with title tokens.
screen_sizes = [str(formatNumber(str(s).lower())) for s in set(_laptops_reference.Inches)]

def preprocess_laptop_dataset(df):
    """Clean a laptop-listing frame and derive the features used for matching.

    Relies on the module-level names ``sp`` (tokenizer), ``extra_brands`` and
    ``screen_sizes``.

    Steps, in order:
      1. Every column except ``instance_id`` is lower-cased, characters outside
         ``[a-z0-9.-]`` and whitespace are replaced by a space, and runs of
         whitespace are collapsed.
      2. ``new_title`` (title without ``-``) and ``new_title_tokens`` are added.
      3. ``brand``, ``cpu_brand``, ``screen_size``, ``model`` and
         ``ram_capacity`` are (re)assigned from the cleaned columns.

    Input:  - df -> frame with at least the columns ``instance_id``, ``brand``,
                    ``cpu_brand``, ``cpu_model``, ``cpu_type``, ``ram_capacity``
                    and ``title``; it is modified in place.

    Output: - the same frame, with the columns above updated/added.
    """
    # Anything that is not a lower-case letter, digit, '.', '-' or whitespace.
    irrelevant_regex = re.compile(r'[^a-z0-9.\-\s]')
    # Two or more consecutive whitespace characters.
    multispace_regex = re.compile(r'\s\s+')

    for column in df.columns:
        if column == 'instance_id':
            continue
        # regex=True is passed explicitly: pandas >= 2.0 defaults to
        # regex=False and then rejects compiled patterns with a ValueError.
        df[column] = (
            df[column].str.lower()
            .str.replace(irrelevant_regex, ' ', regex=True)
            .str.replace(multispace_regex, ' ', regex=True)
        )

    # Title variant without '-' so that tokens are plain words and numbers.
    title_noise_regex = re.compile(r'[^a-z0-9.\s]')
    df['new_title'] = (
        df['title'].str.lower()
        .str.replace(title_noise_regex, '', regex=True)
        .str.replace(multispace_regex, ' ', regex=True)
    )
    df['new_title_tokens'] = df['new_title'].map(
        lambda text: [token.text for token in sp(text)]
    )

    # Count the number of nans in a certain row and remove records with more than 3 nans
    #     nans_count = df.isnull().sum(axis=1)
    #     mask = nans_count > 4
    #     print("removing {} records containing nans".format(len(df[mask])))
    #     df = df[~mask]

    # Brand assignment
    all_brands = set(extra_brands)

    def assign_brand(record):
        """Return the known brand from the brand column, else from the title, else "NNN"."""
        if record['brand'] in all_brands:
            return record['brand']
        # Substring search: the first known brand (in set order) found in the title wins.
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    def assign_cpu_brand(record):
        """Return 'intel' if any CPU column or the title mentions it, otherwise 'amd'."""
        searched_columns = ('cpu_brand', 'title', 'cpu_model', 'cpu_type')
        if any('intel' in str(record[name]) for name in searched_columns):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    # TODO: cpu model/type normalisation was only stubbed out here (a helper
    # that read cpu_model/cpu_type and returned nothing) and was never applied.

    def assign_screen_size(record):
        """Return the first known screen size present among the title tokens.

        Falls back to 15.6 (a float, unlike the string sizes) when none is found.
        """
        stripped_tokens = [
            token.replace('inch', '').replace('in', '')
            for token in record['new_title_tokens']
        ]
        for sc in screen_sizes:
            if str(sc) in stripped_tokens:
                return sc
        return 15.6  # Some relaxation

    df['screen_size'] = df.apply(assign_screen_size, axis=1)

    # ram capacity
    # Candidate sizes in priority order (earlier entries win when several occur).
    ram_sizes = (2, 4, 6, 8, 10, 12, 16, 32, 64, 128)
    # A whole number (optionally written "N.0") directly followed by "gb".
    gb_number_regex = re.compile(r'(?<![\w.])(\d+)(?:\.0+)?\s*gb')
    # A whole number standing on its own (not part of a word or a decimal).
    bare_number_regex = re.compile(r'(?<![\w.])(\d+)(?:\.0+)?(?!\w|\.\d)')

    def first_listed_size(candidates, text, pattern):
        """Return the first candidate that ``pattern`` finds as a whole number in ``text``."""
        found = {int(number) for number in pattern.findall(text)}
        for size in candidates:
            if size in found:
                return size
        return None

    def assign_ram_capacity(record):
        """Return the RAM size in GB as an int, or 0 when none is recognised.

        Numbers are matched as whole numbers. The previous substring test
        returned 6 for "16gb", 2 for "32gb" and 8 for "128gb" because the
        smaller value was checked first and is a substring of the larger one.
        Lookup order: "<n> gb" in ram_capacity, a bare number in ram_capacity,
        then "<n> gb" in the title.
        """
        field_text = str(record['ram_capacity'])
        title_text = str(record['title'])
        lookups = (
            (ram_sizes + (256, 512), field_text, gb_number_regex),
            (ram_sizes, field_text, bare_number_regex),
            (ram_sizes, title_text, gb_number_regex),
        )
        for candidates, text, pattern in lookups:
            size = first_listed_size(candidates, text, pattern)
            if size is not None:
                return size
        return 0

    def assign_laptop_model(record):
        """Return the title tokens that follow the brand, or None.

        The model always starts with the token after the brand; it is extended
        over the next tokens (up to brand + 4) while they are not purely
        alphabetic, and stops before a token that contains a known screen size.
        None is returned when the brand is not a title token or the title ends
        inside the scanned window.
        """
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            for i in range(brand_index + 2, brand_index + 5):
                if any(sc in brand_tokens[i] for sc in screen_sizes):
                    # A screen size right at the default end is not part of the model.
                    if finish_index == i:
                        finish_index -= 1
                    break
                if brand_tokens[i].isalpha():
                    break
                finish_index = i
        except (ValueError, IndexError):
            # ValueError: brand not among the tokens; IndexError: title too short.
            return None

        return ' '.join(brand_tokens[brand_index + 1:finish_index + 1])

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    # TODO: unit standardisation of the weight column
    return df

In [4]:
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        Columns that contain nulls are converted to object dtype first:
        on pandas >= 1.3 a numeric column silently turns None back into NaN,
        so `.where(..., None)` alone would leave those nulls untouched.
        Columns without nulls keep their dtype.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None (the input is not modified)
    """
    new_df = df.copy()
    for col in df.columns:
        missing = new_df[col].isnull()
        if missing.any():
            new_df[col] = new_df[col].astype(object).where(~missing, None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        NoneTypes are kept as NoneTypes (they are not turned into 'None').

        Only a trailing '.0' is removed. The previous `.replace('.0', '')`
        removed '.0' anywhere in the text, so 10.05 became '105'.

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want a trailing
                                       '.0' removed from the number

        Outputs: - new dataframe with converted number types
    """
    def to_text(value):
        # None must survive unchanged for the steps that follow.
        if value is None:
            return value
        text = str(value)
        if remove_point_zero and text.endswith('.0'):
            text = text[:-2]
        return text

    new_df = df.copy()
    for col in cols_to_convert:
        new_df[col] = new_df[col].apply(to_text)
    return new_df

In [5]:
# Source X2: index by instance_id (also kept as a column), turn nulls into
# None, derive the matching features, then stringify the numeric features.
x2 = pd.read_csv("../data/sigmod/X2.csv").set_index('instance_id', drop=False)
x2 = preprocess_laptop_dataset(fill_nulls_with_none(x2).copy(deep=True))
x2 = convert_numbers_to_strings(x2, ['ram_capacity', 'screen_size'])

In [6]:
# Source X3: index by instance_id (also kept as a column), turn nulls into
# None, derive the matching features, then stringify the numeric features.
x3 = pd.read_csv("../data/sigmod/X3.csv").set_index('instance_id', drop=False)
x3 = preprocess_laptop_dataset(fill_nulls_with_none(x3).copy(deep=True))
x3 = convert_numbers_to_strings(x3, ['ram_capacity', 'screen_size'])

In [7]:
# Row counts of the two preprocessed sources.
len(x2), len(x3)

(343, 337)

In [8]:
# x3.head()

In [9]:
# instance_ids that appear in both sources' indexes.
x2.index.intersection(x3.index)

Index([], dtype='object', name='instance_id')

In [10]:
# Stack X3 on top of X2. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
x23 = pd.concat([x3, x2])

In [11]:
# Row count of the combined frame.
len(x23)

680

In [12]:
# Column names of the combined frame and how many there are.
x23.columns, len(x23.columns)

(Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
        'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
        'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title',
        'new_title', 'new_title_tokens', 'screen_size', 'model'],
       dtype='object'),
 18)

In [13]:
# Columns handed to dedupe (instance_id plus the comparison fields).
dedupe_columns = [
    'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'ram_capacity',
    'hdd_capacity', 'title', 'screen_size', 'model',
]
to_dedupe = x23.loc[:, dedupe_columns].copy()

In [14]:
# to_dedupe.head()

In [15]:
# One dict per record, keyed by the frame's index (instance_id).
to_dedupe_dict = to_dedupe.to_dict(orient = 'index')

In [16]:
# Inspect a single record of the dictionary.
to_dedupe_dict['www.softwarecity.ca//737']

{'instance_id': 'www.softwarecity.ca//737',
 'brand': 'lenovo',
 'cpu_brand': 'intel',
 'cpu_model': 'i5-3320m',
 'ram_capacity': '4',
 'hdd_capacity': '320 gb',
 'title': 'lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan - intel core i5 i5-3320m 2.60 ghz - black - 4 gb ram - 320 gb hdd - windows 7 professional 64-bit - convertible - 1366 x 768 multi-touch screen display led backlight - bluetooth - french keyboard - 34352jf - softwarecity.ca - canada',
 'screen_size': '12.5',
 'model': 'thinkpad x230 34352jf'}

In [17]:
# Save the record dictionary to a local pickle file and read it straight back.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('to_dedupe_dict.pkl', 'wb') as f:
    pickle.dump(to_dedupe_dict, f)
with open('to_dedupe_dict.pkl', 'rb') as f:
    to_dedupe_dict = pickle.load(f)

In [18]:
# Field definitions passed to dedupe.Dedupe: every compared field is declared as
# a 'String'; 'has_missing' marks fields that may be None.
# (See the dedupe documentation on variable definitions.)
fields = [{'field' : 'brand', 'type' : 'String'},
          {'field' : 'cpu_brand', 'type': 'String'}, 
          {'field' : 'cpu_model', 'type': 'String', 'has_missing' : True},
#           {'field' : 'cpu_type', 'type': 'String', 'has_missing' : True},
          {'field' : 'ram_capacity', 'type': 'String', 'has_missing' : False},
          {'field' : 'hdd_capacity', 'type': 'String', 'has_missing' : True},
#           {'field' : 'ssd_capacity', 'type': 'String', 'has_missing' : True},
          {'field' : 'title', 'type': 'String', 'has_missing' : False},
          {'field' : 'screen_size', 'type': 'String', 'has_missing' : True},
          {'field' : 'model', 'type': 'String', 'has_missing' : True},
         ]

In [19]:
# Build the deduper from the field definitions above.
# There is a bug later on that requires num_cores to be 1, but we can make use of
# multi-threaded processes in the meantime
deduper = dedupe.Dedupe(fields, num_cores=6)

In [20]:
# Labelled pairs for both sources, stacked into one frame. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
y2 = pd.read_csv('../data/sigmod/Y2.csv')
y3 = pd.read_csv('../data/sigmod/Y3.csv')
y = pd.concat([y3, y2])
len(y)

115269

In [21]:
# Training pairs for dedupe, grouped under the 'match' and 'distinct' keys.
trainig_data = {'match': [], 'distinct': []}

In [22]:
# Split the labelled pairs by label and turn each row into a dict.
# 'records' is the valid orient name; 'row' was only accepted by older pandas
# as an abbreviation of it, and pandas >= 2.0 raises a ValueError for it.
match = y[y.label == 1].to_dict(orient='records')
distinct = y[y.label == 0].to_dict(orient='records')



In [23]:
# Resolve every matching id pair to its (left record, right record) tuple.
trainig_data['match'].extend(
    (to_dedupe_dict[pair['left_instance_id']], to_dedupe_dict[pair['right_instance_id']])
    for pair in match
)

In [24]:
# Number of matching training pairs collected.
len(trainig_data['match'])

3405

In [25]:
# Resolve every distinct id pair to its (left record, right record) tuple.
trainig_data['distinct'].extend(
    (to_dedupe_dict[pair['left_instance_id']], to_dedupe_dict[pair['right_instance_id']])
    for pair in distinct
)

In [26]:
# Number of distinct training pairs collected.
len(trainig_data['distinct'])

111864

In [27]:
# trainig_data['match'].extend(my_own_annotation['match'])

In [28]:
# trainig_data['distinct'].extend(my_own_annotation['distinct'])

In [29]:
# Write the match/distinct training pairs to y_laptop.json.
with open('y_laptop.json', 'w') as fout:
    json.dump(trainig_data, fout)

In [31]:
# Prepare the deduper for training on all records, passing the labelled pairs
# from y_laptop.json as an open file handle.
training_file = 'y_laptop.json'
with open(training_file) as tf:
    deduper.prepare_training(to_dedupe_dict, training_file=tf, sample_size=500, blocked_proportion=0.1)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (fingerprint, cpu_model)
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramCanopyPredicate: (0.6, brand)


In [54]:
# Interactive labelling session: dedupe prints record pairs on the console.
# NOTE(review): this appears to wait for typed answers, so the cell may block a "Run All" — confirm.
print('starting active labeling...')
dedupe.console_label(deduper)

brand : lenovo
cpu_brand : intel
cpu_model : None
cpu_type : None
ram_capacity : 4
hdd_capacity : None
ssd_capacity : None
title : miniprice.ca - lenovo thinkpad x230 34352sf tablet pc - 12.5 - in-plane switching ips technology - wireless lan - intel core i7 i7-3520m 2.90 ghz - black - 4 gb ram - 320 gb hdd - windows 7 professional 64-bit 34352sf-ddo 
screen_size : 12.5
model : thinkpad x230 34352sf

brand : lenovo
cpu_brand : intel
cpu_model : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
cpu_type : intel core i5 3rd gen 3320m 2.6 ghz. intel core i5 3rd gen 3320m 2.6 ghz 3.3 ghz 3 mb cache. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64-bit - 4 gb ram - 320 gb hdd
ram_capacity : 4
hdd_capacity : 320 gb hdd 7200 rpm. 320 gb hdd 7200 rpm. lenovo thinkpad x230 tablet 3438 - 12.5 - core i5 3320m - windows 7 pro 64

starting active labeling...
f


Finished labeling


In [None]:
# Save the deduper's current training pairs (deduper.training_pairs) as JSON.
with open('y_laptop_augmented.json', 'w') as fout:
    json.dump(deduper.training_pairs, fout)

In [33]:
# Train the deduper on the labelled pairs, passing recall=0.8.
deduper.train(recall=0.8)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.1577526638971606
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, model), SimplePredicate: (firstTokenPredicate, ram_capacity))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, cpu_model), TfidfTextCanopyPredicate: (0.6, cpu_model), TfidfTextCanopyPredicate: (0.8, hdd_capacity))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, hdd_capacity), SimplePredicate: (commonSixGram, brand))
INFO:dedupe.training:(SimplePredicate: (firstIntegerPredicate, model), SimplePredicate: (alphaNumericPredicate, cpu_model))
INFO:dedupe.training:(SimplePredicate: (firstIntegerPredicate, model), LevenshteinCanopyPredicate: (1, hdd_capacity), SimplePredicate: (alphaNumericPredicate, model))
INFO:dedupe.training:(SimplePredicate: (firstIntegerPredicate, cpu_model), SimplePredicate: (wholeFieldPredicate, model), Si

In [34]:
# Persist the training data (text file) and the learned settings (binary file).
# Note: this rebinds training_file, which earlier pointed at 'y_laptop.json'.
training_file = 'trained_model_laptops.json'
settings_file = 'trained_model_laptops_settings.json'
with open(training_file, 'w') as tf:
    deduper.write_training(tf)
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [36]:
# Group the records into duplicate clusters; 0.3 is passed as partition()'s second
# argument (presumably the score threshold — TODO confirm against the dedupe docs).
clustered_dupes = deduper.partition(to_dedupe_dict, 0.3)

print('# duplicate sets', len(clustered_dupes))

# duplicate sets 253


In [35]:
# Show the predicates held by the trained deduper.
deduper.predicates

((SimplePredicate: (oneGramFingerprint, model),
  SimplePredicate: (firstTokenPredicate, ram_capacity)),
 (LevenshteinCanopyPredicate: (4, cpu_model),
  TfidfTextCanopyPredicate: (0.6, cpu_model),
  TfidfTextCanopyPredicate: (0.8, hdd_capacity)),
 (SimplePredicate: (twoGramFingerprint, hdd_capacity),
  SimplePredicate: (commonSixGram, brand)),
 (SimplePredicate: (firstIntegerPredicate, model),
  SimplePredicate: (alphaNumericPredicate, cpu_model)),
 (SimplePredicate: (firstIntegerPredicate, model),
  LevenshteinCanopyPredicate: (1, hdd_capacity),
  SimplePredicate: (alphaNumericPredicate, model)),
 (SimplePredicate: (firstIntegerPredicate, cpu_model),
  SimplePredicate: (wholeFieldPredicate, model),
  SimplePredicate: (commonFourGram, cpu_brand)),
 (SimplePredicate: (firstIntegerPredicate, model),
  SimplePredicate: (suffixArray, cpu_brand)),
 (SimplePredicate: (firstIntegerPredicate, model),
  TfidfTextCanopyPredicate: (0.2, model),
  SimplePredicate: (commonFourGram, cpu_model)),
 (S

In [37]:
# Full clustering result: one (record ids, scores) pair per cluster.
clustered_dupes

[(('source4__121', 'source4__92'), array([0.50633454, 0.50633454])),
 (('buy.net//121', 'source13__239'), array([0.42757975, 0.42757975])),
 (('buy.net//1759',
   'buy.net//1801',
   'buy.net//2012',
   'buy.net//393',
   'buy.net//634',
   'buy.net//93',
   'source1__130',
   'source2__181',
   'source2__393',
   'source2__470',
   'source2__89',
   'source7__1655',
   'source8__970',
   'www.amazon.com//1081',
   'www.amazon.com//1313',
   'www.amazon.com//1664',
   'www.amazon.com//2191',
   'www.amazon.com//2284',
   'www.amazon.com//291',
   'www.flexshopper.com//1098',
   'www.flexshopper.com//1352',
   'www.flexshopper.com//1905',
   'www.flexshopper.com//2173',
   'www.flexshopper.com//2217',
   'www.flexshopper.com//43',
   'www.isupplyhub.com//326'),
  array([0.33934953, 0.17859371, 0.34246383, 0.34298704, 0.34335662,
         0.34048375, 0.08632393, 0.08531495, 0.08538482, 0.15285421,
         0.1436774 , 0.15129462, 0.30165405, 0.32719195, 0.19673201,
         0.13212553, 0

In [38]:
# Map every clustered record id to its cluster number and confidence score.
cluster_membership = {
    record_id: {"Cluster ID": cluster_id, "confidence_score": score}
    for cluster_id, (records, scores) in enumerate(clustered_dupes)
    for record_id, score in zip(records, scores)
}

In [39]:
# Show the record id -> cluster id / confidence mapping.
cluster_membership

{'source4__121': {'Cluster ID': 0, 'confidence_score': 0.5063345408083064},
 'source4__92': {'Cluster ID': 0, 'confidence_score': 0.5063345408083064},
 'buy.net//121': {'Cluster ID': 1, 'confidence_score': 0.4275797509265582},
 'source13__239': {'Cluster ID': 1, 'confidence_score': 0.4275797509265582},
 'buy.net//1759': {'Cluster ID': 2, 'confidence_score': 0.33934952766061754},
 'buy.net//1801': {'Cluster ID': 2, 'confidence_score': 0.17859370829012877},
 'buy.net//2012': {'Cluster ID': 2, 'confidence_score': 0.3424638318266203},
 'buy.net//393': {'Cluster ID': 2, 'confidence_score': 0.34298703993627633},
 'buy.net//634': {'Cluster ID': 2, 'confidence_score': 0.34335661916499804},
 'buy.net//93': {'Cluster ID': 2, 'confidence_score': 0.34048374568126294},
 'source1__130': {'Cluster ID': 2, 'confidence_score': 0.0863239309628161},
 'source2__181': {'Cluster ID': 2, 'confidence_score': 0.08531494995904043},
 'source2__393': {'Cluster ID': 2, 'confidence_score': 0.08538482159534644},
 's

In [43]:
# First cluster: (record ids, scores).
clustered_dupes[0]

(('source4__121', 'source4__92'), array([0.50633454, 0.50633454]))

In [44]:
# Second cluster: (record ids, scores).
clustered_dupes[1]

(('buy.net//121', 'source13__239'), array([0.42757975, 0.42757975]))

In [45]:
# Third cluster: (record ids, scores).
clustered_dupes[2]

(('buy.net//1759',
  'buy.net//1801',
  'buy.net//2012',
  'buy.net//393',
  'buy.net//634',
  'buy.net//93',
  'source1__130',
  'source2__181',
  'source2__393',
  'source2__470',
  'source2__89',
  'source7__1655',
  'source8__970',
  'www.amazon.com//1081',
  'www.amazon.com//1313',
  'www.amazon.com//1664',
  'www.amazon.com//2191',
  'www.amazon.com//2284',
  'www.amazon.com//291',
  'www.flexshopper.com//1098',
  'www.flexshopper.com//1352',
  'www.flexshopper.com//1905',
  'www.flexshopper.com//2173',
  'www.flexshopper.com//2217',
  'www.flexshopper.com//43',
  'www.isupplyhub.com//326'),
 array([0.33934953, 0.17859371, 0.34246383, 0.34298704, 0.34335662,
        0.34048375, 0.08632393, 0.08531495, 0.08538482, 0.15285421,
        0.1436774 , 0.15129462, 0.30165405, 0.32719195, 0.19673201,
        0.13212553, 0.14581085, 0.13121542, 0.16480847, 0.16830851,
        0.16573698, 0.16572288, 0.16830999, 0.1505267 , 0.15045543,
        0.29146888]))

In [52]:
# Every unordered pair of record ids that share a cluster, in cluster order.
res = [
    (members[i], members[j])
    for members in (cluster[0] for cluster in clustered_dupes)
    for i in range(len(members))
    for j in range(i + 1, len(members))
]

In [54]:
# Number of record pairs placed in the same cluster.
len(res)

30720