In [6]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [7]:
sp = spacy.load('en_core_web_sm')

In [8]:
def format_number(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number

        Outputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df

extra_brands = set(pd.read_csv('laptops.csv').Company.str.lower().unique())
screen_sizes = set(pd.read_csv('laptops.csv').Inches)
screen_sizes = [str(format_number(str(s).lower())) for s in screen_sizes]


In [9]:
# From Jerry's code. Might not be needed:
title_remove_words = ["price", "comparison", "at", "buy.net", "amazon.com", ":",
"computers", "&", "accessories", "laptop", "vology", "tigerdirect", ".com", "ultraportable", "cool",
"audiophile", "wireless", "bluetooth", "speaker", "portable", "with", "built-in", "microphone", "and",
"micro", "sd","card","slot", "-", "(", ")", "high", "performance", "new", "core", "high", "end", "bes",
"audio", "nx.m8eaa.007", "/", "notebook", "pc", '"', "brand", "new", "hewlett-packard"]

def preprocess_laptop_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r"[^a-z0-9,.\-\s]")
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')


    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]

    df['new_title'] = df.title
    irrelevant_regex = re.compile(r"[^a-z0-9.\s]")
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')
    df['new_title_tokens'] =  df.apply(tokenize_new_tile, axis=1)

    # Brand assignment
    all_brands = set()

    all_brands.update(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    intel=['intel', 'i3', 'i5', 'i7'] #Needed because not all entries have intel
    def assign_cpu_brand(record):
        # Search in brand first
        for blue in intel:
            if blue in str(record['cpu_brand']) or blue in str(record['title']) or \
                    blue in str(record['cpu_model']) or blue in str(record['cpu_type']):
                return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)

        for sc in screen_sizes:
            if str(sc) in arr:
                return str(sc)

        else:
            return str(15.6) # Some relaxation
    df['screen_size'] = df.apply(assign_screen_size, axis=1)


    # # ram capacity
    # def assign_ram_capacity(record):
    #     s = str(record['ram_capacity']).replace(' ', '')
    #     possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
    #                      '32gb', '64gb', '128gb', '256gb', '512gb', '2', '4',
    #                      '6', '8', '10', '12', '16', '32', '64', '128']
    #     for val in possible_vals:
    #         if val in s:
    #             return int(val.replace('gb', ''))
    #
    #     s = str(record['title']).replace(' ', '')  # This will be wrong, please change
    #     possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
    #                      '32gb', '64gb', '128gb']
    #     for val in possible_vals:
    #         if val in s:
    #             return int(val.replace('gb', ''))
    #
    #     return 0
        #new ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity'])
        t = str(record['title'])
        regex = re.compile(r"(\d{1,3})\s?([gm]b)") # rare chance of encountering MB as an error
        m = None
        #ram_c = df['ram_capacity'].str.extract(regex)
        #title_ram = df['title'].str.extract(regex)
        if s:
            m=re.search(regex, s)
        if m is None:
            m=re.search(regex, t)
        if m is None:
            return None
        else:
            m= m.group()
            return re.sub(r'([gm]b)', "", m) # remove MB and GB

    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    def assign_ram_type(record):
        m = None
        if "ddr3" in record['ram_type']:
            return "ddr3"

    df['ram_type'] = df.apply(assign_ram_type, axis=1)


    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))

        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbhdd", s2):
            return int(re.findall("\d{3,4}gbhdd", s2)[0][:-5])
        if re.search("hdd\d{3,4}gb", s2):
            return int(re.findall("hdd\d{3,4}gb", s2)[0][3:-2])
        if re.search("hdd\dtb", s2):
            return int(re.findall("hdd\dtb", s2)[0][3:4] + '000')
        if re.search("\dtbhdd", s2):
            return int(re.findall("\dtbhdd", s2)[0][0] + '000')
        return 0
    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)

    def assign_ssd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))


        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbssd", s2):
            return int(re.findall("\d{3,4}gbssd", s2)[0][:-5])
        if re.search("ssd\d{3,4}gb", s2):
            return int(re.findall("ssd\d{3,4}gb", s2)[0][3:-2])
        if re.search("ssd\dtb", s2):
            return int(re.findall("ssd\dtb", s2)[0][3:4] + '000')
        if re.search("\dtbssd", s2):
            return int(re.findall("\dtbssd", s2)[0][0] + '000')
        return 0

    df['ssd_capacity'] = df.apply(assign_ssd_capacity, axis=1)

    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            should_break = False
            for i in range(2 + brand_index, 5 + brand_index, 1):
                for sc in screen_sizes:
                    if sc in brand_tokens[i]:
                        should_break = True
                        break
                if should_break:
                    if finish_index == i:
                        finish_index -=1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i
                else:
                    break
        except Exception:
            brand_index = -1

        if brand_index == -1:
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])

    # Intermediate column
    df['model'] = df.apply(assign_laptop_model, axis=1)

    def assign_model_name(record): # laptop Line
        #print(record['model'].split())
        if record['model'] is None:
            return None
        ans = record['model'].split(" ")[0]
        if ans.isalpha():
            return ans
        return None

    df['model_name'] = df.apply(assign_model_name, axis=1)

    def assign_model_number(record):
        '''
        if "x230" in org_title and "3435" in org_title:
                        mod_item["model"] = "3435"

                    if "hp" in org_title:
                        #regex for specific HP laptops
                        hp_li = hp_new_model.findall(org_title)
                        if len(hp_li) > 0:
                            mod_item["model"] = " ".join(hp_li[0].replace("-","").replace(" ","").split())

                    if "hp" in org_title and "revolve" in org_title and "810" in org_title:
                        mod_item["model"] = "revolve 810 "
                        if "g1" in org_title.lower():
                            mod_item["model"] += "g1"
                        elif "g2" in org_title.lower():
                            mod_item["model"] += "g2"

                    if "hp" in org_title and "compaq" in org_title and "nc6400" in org_title:
                        mod_item["model"] = "nc6400"

                    if "lenovo" in org_title or "thinkpad" in org_title:
                        tp_li = lenovo_thinkpad_model.findall(org_title)
                        if len(tp_li) > 0:
                            mod_item["model"] = " ".join(tp_li[0].split())
        '''
        return "232";

    df['model_number'] = df.apply(assign_model_number)

    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])
    # Unit stand. in weight
    def assign_weight(record): #TO DO: Convert kg to lb if needed
        regex=re.compile('.?(\d{1,2}\.\d{1,2})\s?[lpk]')
        s = record['weight']
        m = None
        if s:
            m = re.search(regex, s)
        if m is None:
            m = re.search(regex, record['title'])
        if m is None:
            return None
        else:
            m = m.group()
            return re.sub(r"\s?[lpk]", "", m)

    df['weight'] = df.apply(assign_weight, axis=1)

    def assign_cpu_type(record):
    # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom",
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)

    #TO DO: there are laptops called E1-572 and cpus called E1-2100 or E-300
    def assign_cpu_model(record):
        model=record['cpu_model']
        regex=re.compile(r"-?\d{1,4}([mu])") #For intel cpus
        regex2=re.compile(r"[ea]\d?-\d{1,4}[m]?") #for amd A and E series. Needs detection after AMD tag in title
        m=None
        if record['cpu_brand']=='intel' and model is not None :
            m = re.search(regex, model)
            if m is not None:
                m=m.group()
                return re.sub(r'-', "", m)
        if re.search("intel", record['title']): # one case where laptop model is 50m and gets caught
            m = re.search(regex, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'-', "", m)
        if record['cpu_brand']=='amd' and model is not None:
            m = re.search(regex2, model)
            if m is not None:
                m=m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if re.search("amd", record['title']):
            m = re.search(regex2, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if m is None:
            return None

    df['cpu_model'] = df.apply(assign_cpu_model, axis=1)

    def assign_cpu_frequency(record):
        s = record['cpu_frequency']
        regex=re.compile(r"\d?.\d{1,2}\s?ghz")
        m = None
        if s:
            m=re.search(regex, s)
            if m is not None:
                m=m.group()
                return re.sub(r'ghz', "", m)
        if re.search("ghz", record['title']):
            m = re.search(regex, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'ghz', "", m)
        if m is None:
            return None
    df['cpu_frequency'] = df.apply(assign_cpu_frequency, axis=1)

    return df

In [10]:
'''
Redundant with previous functions
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.
        
        Input: - dataframe with nulls as NaN
        
        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!
        
        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number
        
        Outputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df
'''

'\nRedundant with previous functions\ndef fill_nulls_with_none(df):\n    """ Fills nulls in a dataframe with None.\n        This is required for the Dedupe package to work properly.\n        \n        Input: - dataframe with nulls as NaN\n        \n        Output: - new dataframe with nulls as None\n    """\n    new_df = df.copy()\n    for col in df.columns:\n        new_df[col] = new_df[col].where(new_df[col].notnull(), None)\n    return new_df\n\ndef convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):\n    """ Convert number types to strings in a dataframe.\n        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!\n        \n        Inputs: - df -> dataframe to convert number types\n                - cols_to_convert -> list of columns to convert\n                - remove_point_zero -> bool to say whether you want \'.0\' removed from number\n        \n        Outputs: - dataframe with converted number types\n    """\n    new_df = df.c

In [11]:
x2 = pd.read_csv("../data/sigmod/X2.csv")
x2.set_index('instance_id',  inplace=True, drop=False)
x2 = preprocess_laptop_dataset(x2.copy(deep=True))
#x2.ram_capacity[50:60]

TypeError: argument of type 'float' is not iterable

In [None]:
x3 = pd.read_csv("../data/sigmod/X3.csv")
x3.set_index('instance_id',  inplace=True, drop=False)
x3 = preprocess_laptop_dataset(x3.copy(deep=True))
# x3 = convert_numbers_to_strings(x3, ['ram_capacity', 'screen_size'])

In [None]:
len(x2), len(x3)

In [None]:
#x3.head()

In [None]:
x2.index.intersection(x3.index)

In [None]:
# x23 = x3.append(x2)
x23 = x2

In [None]:
x23.head()

In [None]:
x23.columns, len(x23.columns)

In [None]:
to_dedupe = x23[[
    'instance_id',
    'brand',
    'cpu_brand',
    'cpu_type',
    'ram_capacity',
    'hdd_capacity', 
    'ssd_capacity',
    'title',
    'screen_size',
    'model']].copy()

In [None]:
to_dedupe.head()

In [None]:
to_dedupe_dict = to_dedupe.to_dict(orient = 'index')

In [None]:
to_dedupe_dict['www.softwarecity.ca//737']

In [None]:
with open('to_dedupe_dict.pkl', 'wb') as f:
    pickle.dump(to_dedupe_dict, f)
with open('to_dedupe_dict.pkl', 'rb') as f:
    to_dedupe_dict = pickle.load(f)

In [None]:
# docs for this are here: 
fields = [{'field' : 'brand', 'type' : 'Categorical', 'categories' : extra_brands},
          
          {'field' : 'cpu_brand', 'type': 'Categorical', 'categories' : ['amd', 'intel']}, 
          
#           {'field' : 'cpu_model', 'type': 'String', 'has_missing' : True},
          
          {'field' : 'cpu_type', 'type': 'Exact', 'has_missing' : False},
          
          {'field' : 'ram_capacity', 'type': 'Price', 'has_missing' : False},
          
          {'field' : 'hdd_capacity', 'type': 'Price', 'has_missing' : False},
          
          {'field' : 'ssd_capacity', 'type': 'Price', 'has_missing' : False},
          
          {'field' : 'title', 'type': 'Text', 'has_missing' : False},
          
          {'field' : 'screen_size', 'type': 'Categorical', 'has_missing' : False, 'categories' : screen_sizes},
          
          {'field' : 'model', 'type': 'String', 'has_missing' : True},
          
         ]

In [None]:
# There is a bug later on that requires num_cores to be 1, but we can make use of
# multi-threaded processes in the meantime
deduper = dedupe.Dedupe(fields, num_cores=24)


In [None]:
y2 = pd.read_csv('../data/sigmod/Y2.csv')
y3 = pd.read_csv('../data/sigmod/Y3.csv')
# y = y3.append(y2)
y = y2
len(y)

In [None]:
training_data = {'match': [], 'distinct': []}

In [None]:
match = y[y.label == 1].to_dict(orient='row')
distinct = y[y.label == 0].to_dict(orient='row')

In [None]:
for m in match:
    training_data['match'].append( ( to_dedupe_dict[m['left_instance_id']], to_dedupe_dict[m['right_instance_id']] ) )

In [None]:
len(training_data['match'])

In [None]:
for d in distinct:
    training_data['distinct'].append( ( to_dedupe_dict[d['left_instance_id']], to_dedupe_dict[d['right_instance_id']] ) )

In [None]:
len(training_data['distinct'])

In [None]:
# training_data['match'].extend(my_own_annotation['match'])

In [None]:
# training_data['distinct'].extend(my_own_annotation['distinct'])

In [None]:
with open('y_laptop.json', 'w') as fout:
    json.dump(training_data, fout)

In [None]:
training_file = 'y_laptop.json'
with open(training_file) as tf:
    deduper.prepare_training(to_dedupe_dict, training_file=tf, sample_size=1500, blocked_proportion=0.9)

In [None]:
print('starting active labeling...')
dedupe.console_label(deduper)

In [None]:
with open('y_laptop_augmented.json', 'w') as fout:
    json.dump(deduper.training_pairs, fout)

In [None]:
deduper.train(recall=0.9)

In [None]:
training_file = 'trained_model_laptops.json'
settings_file = 'trained_model_laptops_settings.json'
with open(training_file, 'w') as tf:
    deduper.write_training(tf)
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [None]:
clustered_dupes = deduper.partition(to_dedupe_dict, 0.5)

print('# duplicate sets', len(clustered_dupes))

In [None]:
deduper.predicates

In [None]:
clustered_dupes

In [None]:
cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

In [None]:
cluster_membership

In [None]:
def laptop_test(x):
    x.set_index('instance_id',  inplace=True, drop=False)
    x = preprocess_laptop_dataset(x.copy(deep=True))
    
    to_dedupe = x[[ 'instance_id',
    'brand',
    'cpu_brand',
    'cpu_type',
    'ram_capacity',
    'hdd_capacity', 
    'ssd_capacity',
    'title',
    'screen_size',
    'model']].copy()
    
    with open('trained_model_laptops_settings.json', 'rb') as fin:
        dr = dedupe.StaticDedupe(fin)
    to_dedupe_dict = to_dedupe.to_dict(orient = 'index')
    
    clustered_dupes = dr.partition(to_dedupe_dict, 0.5)

    print('# duplicate sets', len(clustered_dupes))
    
    res = []
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i+1, len(el[0])):
                res.append((el[0][i], el[0][j]))
    res_df =pd.DataFrame(res)            
#     res_df.columns = ['left_instance_id', 'right_instance_id']
    res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df

In [None]:
x2 = pd.read_csv("../data/sigmod/X2.csv")
res = laptop_test(x2)

In [None]:
x3 = pd.read_csv("../data/sigmod/X3.csv")
res = laptop_test(x3)

In [None]:
len(res)

In [None]:
res

In [None]:
res.to_csv("output_x2.csv", index=False)

In [None]:
deduper.predicates

In [None]:
deduper.data_model.predicates