In [4]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [5]:
sp = spacy.load('en_core_web_sm')

In [76]:
def format_number(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number

        Outputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df

extra_brands = set(pd.read_csv('laptops.csv').Company.str.lower().unique())
screen_sizes = set(pd.read_csv('laptops.csv').Inches)
screen_sizes = [str(format_number(str(s).lower())) for s in screen_sizes]


In [81]:
# From Jerry's code. Might not be needed:
title_remove_words = ["price", "comparison", "at", "buy.net", "amazon.com", ":",
"computers", "&", "accessories", "laptop", "vology", "tigerdirect", ".com", "ultraportable", "cool",
"audiophile", "wireless", "bluetooth", "speaker", "portable", "with", "built-in", "microphone", "and",
"micro", "sd","card","slot", "-", "(", ")", "high", "performance", "new", "core", "high", "end", "bes",
"audio", "nx.m8eaa.007", "/", "notebook", "pc", '"', "brand", "new", "hewlett-packard"]

def preprocess_laptop_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r"[^a-z0-9,.\-\s]")
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')


    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]

    df['new_title'] = df.title
    irrelevant_regex = re.compile(r"[^a-z0-9.\s]")
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')
    df['new_title_tokens'] =  df.apply(tokenize_new_tile, axis=1)

    # Brand assignment
    all_brands = set()

    all_brands.update(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    intel=['intel', 'i3', 'i5', 'i7'] #Needed because not all entries have intel
    def assign_cpu_brand(record):
        # Search in brand first
        for blue in intel:
            if blue in str(record['cpu_brand']) or blue in str(record['title']) or \
                    blue in str(record['cpu_model']) or blue in str(record['cpu_type']):
                return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)

        for sc in screen_sizes:
            if str(sc) in arr:
                return str(sc)

        else:
            return str(15.6) # Some relaxation
    df['screen_size'] = df.apply(assign_screen_size, axis=1)


    # # ram capacity
    # def assign_ram_capacity(record):
    #     s = str(record['ram_capacity']).replace(' ', '')
    #     possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
    #                      '32gb', '64gb', '128gb', '256gb', '512gb', '2', '4',
    #                      '6', '8', '10', '12', '16', '32', '64', '128']
    #     for val in possible_vals:
    #         if val in s:
    #             return int(val.replace('gb', ''))
    #
    #     s = str(record['title']).replace(' ', '')  # This will be wrong, please change
    #     possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
    #                      '32gb', '64gb', '128gb']
    #     for val in possible_vals:
    #         if val in s:
    #             return int(val.replace('gb', ''))
    #
    #     return 0
        #new ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity'])
        t = str(record['title'])
        regex = re.compile(r'(\d{1,3})\s?([gm]b)') # rare chance of encountering MB as an error
        m = None
        #ram_c = df['ram_capacity'].str.extract(regex)
        #title_ram = df['title'].str.extract(regex)
        if s:
            m=re.search(regex, s)
        if m is None:
            m=re.search(regex, t)
        if m is None:
            return None
        else:
            m= m.group()
            return re.sub(r'([gm]b)', "", m) # remove MB and GB

    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    def assign_ram_type(record):
        s = str(record['ram_type'])
        regex = re.compile(r"DDR3")
        m = None
        if "ddr3" in s:
            return "ddr3"

    df['ram_type'] = df.apply(assign_ram_type, axis=1)


    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))

        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbhdd", s2):
            return int(re.findall("\d{3,4}gbhdd", s2)[0][:-5])
        if re.search("hdd\d{3,4}gb", s2):
            return int(re.findall("hdd\d{3,4}gb", s2)[0][3:-2])
        if re.search("hdd\dtb", s2):
            return int(re.findall("hdd\dtb", s2)[0][3:4] + '000')
        if re.search("\dtbhdd", s2):
            return int(re.findall("\dtbhdd", s2)[0][0] + '000')
        return 0
    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)

    def assign_ssd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))


        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbssd", s2):
            return int(re.findall("\d{3,4}gbssd", s2)[0][:-5])
        if re.search("ssd\d{3,4}gb", s2):
            return int(re.findall("ssd\d{3,4}gb", s2)[0][3:-2])
        if re.search("ssd\dtb", s2):
            return int(re.findall("ssd\dtb", s2)[0][3:4] + '000')
        if re.search("\dtbssd", s2):
            return int(re.findall("\dtbssd", s2)[0][0] + '000')
        return 0

    df['ssd_capacity'] = df.apply(assign_ssd_capacity, axis=1)

    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            should_break = False
            for i in range(2 + brand_index, 5 + brand_index, 1):
                for sc in screen_sizes:
                    if sc in brand_tokens[i]:
                        should_break = True
                        break
                if should_break:
                    if finish_index == i:
                        finish_index -=1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i
                else:
                    break
        except:
            brand_index = -1

        if brand_index == -1:
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])

    df['model'] = df.apply(assign_laptop_model, axis=1)


    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])
    # Unit stand. in weight
    def assign_weight(record): #TO DO: Convert kg to lb if needed
        regex=re.compile('.?(\d{1,2}\.\d{1,2})\s?[lpk]')
        s = record['weight']
        m = None
        if s:
            m = re.search(regex, s)
        if m is None:
            m = re.search(regex, record['title'])
        if m is None:
            return None
        else:
            m = m.group()
            return re.sub(r"\s?[lpk]", "", m)

    df['weight'] = df.apply(assign_weight, axis=1)

    def assign_cpu_type(record):
    # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom",
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)

    #TO DO: there are laptops called E1-572 and cpus called E1-2100 or E-300
    def assign_cpu_model(record):
        model=record['cpu_model']
        regex=re.compile(r"-?\d{1,4}([mu])") #For intel cpus
        regex2=re.compile(r"[ea]\d?-\d{1,4}[m]?") #for amd A and E series. Needs detection after AMD tag in title
        m=None
        if record['cpu_brand']=='intel' and model is not None :
            m = re.search(regex, model)
            if m is not None:
                m=m.group()
                return re.sub(r'-', "", m)
        if re.search("intel", record['title']): # one case where laptop model is 50m and gets caught
            m = re.search(regex, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'-', "", m)
        if record['cpu_brand']=='amd' and model is not None:
            m = re.search(regex2, model)
            if m is not None:
                m=m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if re.search("amd", record['title']):
            m = re.search(regex2, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if m is None:
            return None

    df['cpu_model'] = df.apply(assign_cpu_model, axis=1)

    def assign_cpu_frequency(record):
        s = record['cpu_frequency']
        regex=re.compile(r"\d?.\d{1,2}\s?ghz")
        m = None
        if s:
            m=re.search(regex, s)
            if m is not None:
                m=m.group()
                return re.sub(r'ghz', "", m)
        if re.search("ghz", record['title']):
            m = re.search(regex, record['title'])
            if m is not None:
                m=m.group()
                return re.sub(r'ghz', "", m)
        if m is None:
            return None
    df['cpu_frequency'] = df.apply(assign_cpu_frequency, axis=1)

    return df


In [82]:
x2 = pd.read_csv("../data/sigmod/X2.csv")
x2.set_index('instance_id',  inplace=True, drop=False)
x2 = preprocess_laptop_dataset(x2.copy(deep=True))
x2.head()

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,3320m,i5,2.6,4,ddr3,ddr3-1600 pc3-12800,320,0,1.8,,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,lenovo thinkpad x230 34352jf tablet pc 12.5 in...,"[lenovo, thinkpad, x230, 34352jf, tablet, pc, ...",12.5,thinkpad x230 34352jf
www.isupplyhub.com//1256,www.isupplyhub.com//1256,acer,intel,,i5,1.6,8,ddr3,,500,0,4.8,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch...,amazon.com acer aspire v7582pg6479 15.6inch to...,"[amazon.com, acer, aspire, v7582pg6479, 15.6in...",15.6,aspire v7582pg6479
www.isupplyhub.com//326,www.isupplyhub.com//326,acer,intel,4200u,i5,1.6,4,ddr3,,500,0,5.2,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch l...,amazon.com acer aspire e15726870 15.6 inch lap...,"[amazon.com, acer, aspire, e15726870, 15.6, in...",15.6,aspire
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,2100,,,4,ddr3,,500,0,4.8,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,amazon.com 15.6 hp 15f009wm amd dualcore e1210...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore...",15.6,15f009wm amd
www.isupplyhub.com//157,www.isupplyhub.com//157,asus,intel,3317u,i5,1.7,4,ddr3,,0,256,2.9,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,amazon.com asus ux31axb52 13.3inch ultrabook 1...,"[amazon.com, asus, ux31axb52, 13.3inch, ultrab...",13.3,ux31axb52


In [83]:
# x2['new_title'].head()
x2[85:89]

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
www.amazon.com//1904,www.amazon.com//1904,acer,intel,,celeron,2.16,2,ddr3,,320,0,2.8,11.46 x 8.31 x 0.83 inches,amazon.com acer aspire e3-111-c5gl 11.6-inch l...,amazon.com acer aspire e3111c5gl 11.6inch lapt...,"[amazon.com, acer, aspire, e3111c5gl, 11.6inch...",11.6,aspire e3111c5gl
www.amazon.com//1836,www.amazon.com//1836,acer,intel,2348m,i3,,4,,,0,0,15.6,,amazon.com lb1 high performance new wireless b...,amazon.com lb1 high performance new wireless b...,"[amazon.com, lb1, high, performance, new, wire...",15.6,black silver
www.amazon.com//2226,www.amazon.com//2226,acer,intel,50m,i3,1.7,4,,,500,0,15.6,,amazon.com brand new acer america corp. acer a...,amazon.com brand new acer america corp. acer a...,"[amazon.com, brand, new, acer, america, corp, ...",15.6,america corp
www.amazon.com//848,www.amazon.com//848,hp,intel,3120m,i3,,6,ddr3,,750,0,6.1,1.17 x 10.20 x 14.88 inches,amazon.com hp 15-d053cl touchsmart - 15.6 hd t...,amazon.com hp 15d053cl touchsmart 15.6 hd touc...,"[amazon.com, hp, 15d053cl, touchsmart, 15.6, h...",15.6,15d053cl touchsmart
