In [2]:
import pandas as pd
import os
import csv
import re
import logging
import optparse
import re
import spacy
import dedupe
import pickle
import copy
import json
from unidecode import unidecode

In [3]:
def formatNumber(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num


def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.

        Input: - dataframe with nulls as NaN

        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), None)
    return new_df


def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!

        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number

        Ouputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0', '') \
                if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x) \
                if not isinstance(x, type(None)) else x)
    return new_df

In [12]:
def clean_laptops_dataset(x_org):
    # Copy the dataset
    df = x_org.copy(deep=True)

    # Set the index
    df.set_index('instance_id', inplace=True, drop=False)

    spacy.cli.download("en_core_web_sm")
    sp = spacy.load('en_core_web_sm')

    # Read helper datasets stats
    extra_brands = set(
        pd.read_csv('../data/sigmod/laptops.csv', encoding='windows-1251').Company.str.lower().unique())
    screen_sizes = set(pd.read_csv('../data/sigmod/laptops.csv', encoding='windows-1251').Inches)
    screen_sizes = [str(formatNumber(str(s).lower())) for s in screen_sizes]

    # Keep only Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+')  # Why it doesn't work
    multispace_regex_2 = re.compile(r'\s-\s')  # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+': ''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')

    # Tokenize the new title
    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]

    remove_words = ['with', 'clarinet', 'audiophile', 'end', 'pc', 'french', 'performance', '"', 'burner', 'sd',
                    'canada', 'certified',
                    'keyboard', 'backlight', 'professional', 'at', 'beats', 'drive', 'microphone', 'vology',
                    'america',
                    'refurbished', 'computer', 'dimm', 'ultrabase', 'audio', ':', 'switching', 'premium', 'special',
                    'dvd', 'portable',
                    'speaker', 'buy.net', 'downgrade', '/', '&', 'wireless', 'home', 'notebook', ')', 'edition',
                    'built-in',
                    'dualcore', 'high', 'revolve', 'cool', 'and', 'micro', 'aluminum', 'g', 'tigerdirect', 'voice',
                    'nx.m8eaa.007',
                    'comfyview', 'amazon.com', 'bes', 'ultraportable', 'gb', 'core', 'computers', 'screen', 'slot',
                    'lan', 'supermulti', 'technology', 'bluray', 'price', 'display', 'dvdrw', '.com',
                    'internationalaccessories',
                    'touch', 'card', 'us', 'bluetooth', 'dvdwriter', 'for', 'new', 'comparison', 'webcam', '(',
                    'laptop',
                    'accessories', 'brand', 'builtin']
    replace_words = {'hewlett-packard': 'hp'}

    def clean_title(record):
        title = record['title']

        # Remove unneeded words
        for w in remove_words:
            title = title.replace(w, '')

        # Replace words with common word
        for w, fix_w in replace_words.items():
            title = title.replace(w, fix_w)

        return title

    df['new_title'] = df.apply(clean_title, axis=1)
    irrelevant_regex = re.compile(r'[^a-z0-9.\s]')
    multispace_regex = re.compile(r'\s\s+')
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex,
                                                                                             ' ').str.replace(
        multispace_regex_2, ' ')
    df['new_title_tokens'] = df.apply(tokenize_new_tile, axis=1)

    # Brand assignment
    all_brands = set(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return None

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    intel = ['intel', 'i3', 'i5', 'i7', 'celeron', 'pentium']  # Needed because not all entries have intel

    def assign_cpu_brand(record):
        # Search in brand first
        for blue in intel:
            if blue in str(record['cpu_brand']) or blue in str(record['title']) or \
                    blue in str(record['cpu_model']) or blue in str(record['cpu_type']):
                return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)

        for sc in screen_sizes:
            if str(sc) in arr:
                return str(sc)

        else:
            return None

    df['screen_size'] = df.apply(assign_screen_size, axis=1)

    # ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity'])
        t = str(record['title'])
        regex = re.compile(r'(\d{1,3})\s?([gm]b)')  # rare chance of encountering MB as an error
        m = None
        # ram_c = df['ram_capacity'].str.extract(regex)
        # title_ram = df['title'].str.extract(regex)
        if s:
            m = re.search(regex, s)
        if m is None:
            m = re.search(regex, t)
        if m is None:
            return None
        else:
            m = m.group()
            return re.sub(r'([gm]b)', "gb", m)

    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))

        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return str(re.findall("\d{3,4}gb", s)[0][:-2]) + ' gb'
        if re.search("\dtb", s):
            return str(re.findall("\dtb", s)[0][:-2] + '000') + ' gb'
        if re.search("\d{3,4}gbhdd", s2):
            return str(re.findall("\d{3,4}gbhdd", s2)[0][:-5]) + ' gb'
        if re.search("hdd\d{3,4}gb", s2):
            return str(re.findall("hdd\d{3,4}gb", s2)[0][3:-2]) + ' gb'
        if re.search("hdd\d{1}tb", s2):
            return str(re.findall("hdd\d{1}tb", s2)[0][3:4] + '000') + ' gb'
        if re.search("\d{1}tbhdd", s2):
            return str(re.findall("\d{1}tbhdd", s2)[0][0] + '000') + ' gb'
        return None

    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)

    def assign_ssd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))

        if re.search("\d{3,4}gbssd", s):
            return str(re.findall("\d{3,4}gb", s)[0][:-2]) + ' gb'
        if re.search("\dtbssd", s):
            return str(re.findall("\dtb", s)[0][:-2] + '000') + ' gb'
        if re.search("\d{3,4}gbssd", s2):
            return str(re.findall("\d{3,4}gbssd", s2)[0][:-5]) + ' gb'
        if re.search("ssd\d{3,4}gb", s2):
            return str(re.findall("ssd\d{3,4}gb", s2)[0][3:-2]) + ' gb'
        if re.search("ssd\d{1}tb", s2):
            return str(re.findall("ssd\d{1}tb", s2)[0][3:4] + '000') + ' gb'
        if re.search("\d{1}tbssd", s2):
            return str(re.findall("\d{1}tbssd", s2)[0][0] + '000') + ' gb'
        return None

        # if re.search("\d{3,4}gbssd", s):
        #     return str(re.findall("\d{3,4}gb", s)[0][:-2]) + ' gb'
        # if re.search("\dtbssd", s):
        #     return str(re.findall("\dtb", s)[0][:-2] + '000') + ' gb'
        # if re.search("\d{3,4}gbssd", s2):
        #     return str(re.findall("\d{3,4}gbssd", s2)[0][:-5]) + ' gb'
        # if re.search("ssd\d{3,4}gb", s2):
        #     return str(re.findall("ssd\d{3,4}gb", s2)[0][3:-2]) + ' gb'
        # if re.search("ssd\d{1}tb", s2):
        #     return str(re.findall("ssd\d{1}tb", s2)[0][3:4] + '000') + ' gb'
        # if re.search("\d{1}tbssd", s2):
        #     return str(re.findall("\d{1}tbssd", s2)[0][0] + '000') + ' gb'
        # return None

    df['ssd_capacity'] = df.apply(assign_ssd_capacity, axis=1)

    def assign_laptop_model(record):
        brand = record['brand']
        t = record['new_title']

        if brand == 'acer':
            acer_regex = [r'\sv.-.....-', r'\se.-.....-?....?']
            for r in acer_regex:
                cr = re.compile(r)
                if re.search(cr, t):
                    return re.search(cr, t).group()

        if brand == 'asus':
            regex = [r'\sux...-.....']
            for r in regex:
                cr = re.compile(r)
                if re.search(cr, t):
                    return re.search(cr, t).group()

        if brand == 'lenovo':
            regex = [r'\sx\d{3}\s?tablet?\s?\d{0,4}', r'\sx\d{3}\s?laptop?\s?\d{0,4}', r'\sx\d{3}\s?\d{0,4}',
                     r'\sx\d{1}\scarbon\s\d{4}', r'\sx\d{1}\scarbon touch\s\d{4}']

            for r in regex:
                cr = re.compile(r)
                if re.search(cr, t):
                    res = re.search(cr, t).group()
                    for w in ['carbon', 'touch', 'tablet', 'laptop']:
                        res = res.replace(w, '')
                    return res

        if brand == 'hp':
            regex = [r'\sfolio\s?\d{4}.', r'\selitebook\s?\d{3,4}.']
            for r in regex:
                cr = re.compile(r)
                if re.search(cr, t):
                    res = re.search(cr, t).group()
                    for w in ['folio', 'elitebook']:
                        res = res.replace(w, '')
                    return res

        if brand == 'dell':
            regex = [r'\s[nmi]\d{3,4}(-\d{4})?']
            for r in regex:
                cr = re.compile(r)
                if re.search(cr, t):
                    res = re.search(cr, t).group()
                    for w in ['folio', 'elitebook']:
                        res = res.replace(w, '')
                    return res

        return None

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)

    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])

    def assign_model_name(record):  # laptop Line
        # print(record['model'].split())
        if record['model'] is None:
            return None;
        ans = record['model'].split(" ")[0]
        if ans.isalpha():
            return ans
        return None

    df['model_name'] = df.apply(assign_model_name, axis=1)

    def assign_cpu_model(record):
        model = record['cpu_model']
        if record['cpu_type'] is not None:
            if model is not None:
                model += ' '
                model += record['cpu_type']
            else:
                model = record['cpu_type']

        regex = re.compile(r"-?\d{3,4}([mul]{1,2})")  # For intel cpus
        regex2 = re.compile(r"[ea]\d?-\d{1,4}[m]?")  # for amd A and E series. Needs detection after AMD tag in title
        m = None
        if record['cpu_brand'] == 'intel' and model is not None:
            m = re.search(regex, model)
            if m is not None:
                m = m.group()
                return re.sub(r'-', "", m)
        if re.search("intel", record['title']):  # one case where laptop model is 50m and gets caught
            m = re.search(regex, record['title'])
            if m is not None:
                m = m.group()
                return re.sub(r'-', "", m)
        if record['cpu_brand'] == 'amd' and model is not None:
            m = re.search(regex2, model)
            if m is not None:
                m = m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if re.search("amd", record['title']):
            m = re.search(regex2, record['title'])
            if m is not None:
                m = m.group()
                return re.sub(r'[ea]\d?-', "", m)
        if m is None:
            return None

    df['cpu_model'] = df.apply(assign_cpu_model, axis=1)

    def assign_cpu_type(record):
        # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom",
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in str(record['cpu_type']):
                return cpu

            if record['cpu_model'] is not None and cpu in str(record['cpu_model']):
                return cpu
            if record['cpu_frequency'] is not None and cpu in str(record['cpu_frequency']):
                return cpu

            if cpu in str(record['title']):
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

        return None

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)

    def assign_cpu_frequency(record):
        s = record['cpu_frequency']
        regex = re.compile(r"\d?.\d{1,2}\s?ghz")
        m = None
        if s:
            m = re.search(regex, s)
            if m is not None:
                m = m.group()
                return re.sub(r'ghz', "", m)
        if re.search("ghz", record['title']):
            m = re.search(regex, record['title'])
            if m is not None:
                m = m.group()
                return re.sub(r'ghz', "", m)
        if m is None:
            return None

    df['cpu_frequency'] = df.apply(assign_cpu_frequency, axis=1)

    def assign_new_title(record):
        # Remove extracted data from the title

        # Remove model name
        record['new_title'] = record['nwe_titl']

        # Remove brand
        # Remove screen size
        # Remove cpu brand
        # Remove cpu type
        # Ram capacity, hdd capacity, ssd capacity

    def assign_cpu(record):
        cpu_type = record['cpu_type']
        cpu_model = record['cpu_model']

        res = ""
        if cpu_type is not None:
            res += cpu_type

        if cpu_model is not None:
            res += '-'
            res += cpu_model

        return res

    df['cpu_model'] = df.apply(assign_cpu, axis=1)

    return df

In [13]:
# Read the X2 dataset and see what else can we get from just 
x2_org = pd.read_csv('../data/sigmod/X2.csv')

# Current cleaning output
x2_dev = clean_laptops_dataset(x2_org)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
x2_dev.head()

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model,model_name
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.6,4 gb,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,,1.80 kg,,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,lenovo thinkpad x230 34352jf tablet 12.5 inpe ...,"[lenovo, thinkpad, x230, 34352jf, tablet, 12.5...",12.5,x230 3435,
www.isupplyhub.com//1256,www.isupplyhub.com//1256,acer,intel,i5-4200u,i5,1.6,8 gb,ddr3 sdram. 8 gb ddr3l sdram,,500 gb,,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch...,acer aspire v7582p6479 15.6inch ultrabook ste...,"[ , acer, aspire, v7582p6479, 15.6inch, ultrab...",15.6,,
www.isupplyhub.com//326,www.isupplyhub.com//326,acer,intel,i5-4200u,i5,1.6,4 gb,ddr3 sdram. 4 gb ddr3-sdram,,500 gb,,5.2 pounds,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch l...,acer aspire e15726870 15.6 inch intel i5 4200...,"[ , acer, aspire, e15726870, 15.6, inch, intel...",15.6,,
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,-2100,,,4 gb,ddr3 sdram. 4 gb sdram ddr3,,500 gb,,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,15.6 hp 15f009wm amd dual e12100 4b ddr3 ram ...,"[ , 15.6, hp, 15f009wm, amd, dual, e12100, 4b,...",15.6,,
www.isupplyhub.com//157,www.isupplyhub.com//157,asus,intel,i5-3317u,i5,1.7,4 gb,ddr3 sdram. 4 gb ddr3,,,256 gb,2.9 pounds,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,as ux31axb52 13.3inch ultrabook 1.7 hz intel ...,"[ , as, ux31axb52, 13.3inch, ultrabook, 1.7, h...",13.3,,


In [55]:
# Extract the cpu model

# Improve the laptop model

# Use CPU frequency

# During Inference: Parition by screen size / brand and then do the blocking (During the inference)

# Use Py-entity matching and try it

# Remove any extracted words


In [75]:
# Remove any unncessary words
# all_tokens = []
# for i in range(len(x2_dev)):
#     record = x2_dev.iloc[i]
#     all_tokens.extend(list(record.new_title_tokens))

In [74]:
from collections import Counter

# Counter((all_tokens))

In [73]:
# Get all alpha only words
all_alpa_words = [i for i in (all_tokens) if i.isalpha()]
# set(all_alpa_words)

In [72]:
words_to_remove = [
    'aluminum',
    'america',
    'and',
    'at',
    'audio',
    'audiophile',
    'backlight',
    'beats',
    'bluetooth',
    'bluray',
    'brand',
    'builtin',
    'burner',
    'buy.net',
    'canada',
    'card',
    'certified',
    'clarinet',
    'comfyview',
    'comparison',
    'computer',
    'computers',
    'cool',
    'dimm',
    'display',
    'downgrade',
    'drive',
    'dvd',
    'dvdrw',
    'dvdwriter',
    'edition',
    'end',
    'for',
    'french',
    'g',
    'gb',
    'high',
    'home',
    'internationalaccessories',
    'keyboard',
    'lan',
    'portable',
    'premium',
    'price',
    'professional',
    'refurbished',
    'revolve',
    'screen',
    'slot',
    'special',
    'supermulti',
    'switching',
    'technology',
    'touch',
    'ultrabase',
    'us',
    'voice',
    'vology',
    'webcam',
    'dualcore'
]

words_to_replace = {'hewlettpackard': 'hp'}

In [None]:
# Brands laptop model regex

In [6]:
{'acer',
 'apple',
 'asus',
 'chuwi',
 'dell',
 'fujitsu',
 'google',
 'hp',
 'huawei',
 'lenovo',
 'lg',
 'mediacom',
 'microsoft',
 'msi',
 'razer',
 'samsung',
 'toshiba',
 'vero',
 'xiaomi'}

{'acer',
 'apple',
 'asus',
 'chuwi',
 'dell',
 'fujitsu',
 'google',
 'hp',
 'huawei',
 'lenovo',
 'lg',
 'mediacom',
 'microsoft',
 'msi',
 'razer',
 'samsung',
 'toshiba',
 'vero',
 'xiaomi'}

In [15]:
grouped = x2_dev.groupby(['brand'])
output = []
for g in grouped.groups.keys():
    output.append(x2_dev.loc[grouped.groups[g]])
print("Number of groups", len(output))

Number of groups 5


In [16]:
acer_title = list(output[0].title)
acer_title

['amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers accessories',
 'amazon.com acer aspire e1-572-6870 15.6 inch laptop intel i5 4200u 1.6ghz processor, 4gb ram, 500gb hard drive, windows 8 clarinet black laptop computers computers accessories',
 'amazon.com acer aspire nx.mg7aa.005 e1-771-6496 17.3-inch laptop computers accessories',
 'amazon.com acer aspire v3-772g-9460 17.3 led notebook intel core i7-4702mq 2.20 ghz 12gb ddr3 1tb hdd 120gb ssd blu-ray reader dvd-writer nvidia geforce gtx 760m windows 8 black laptop computers computers accessories',
 'amazon.com acer aspire v3-572pg-767j 15.6-inch touchscreen laptop platinum silver computers accessories',
 'amazon.com acer aspire v3-572-78s3 15.6-inch laptop platinum silver computers accessories',
 'amazon.com acer aspire e3-111-c5gl 11.6-inch laptop cool silver computers accessories',
 'amazon.com acer aspire v3-111p-43bc 11.6-inch touchscreen laptop cool silver computers accessories',
 'amazo

In [17]:
acer_regex = [r'\sv.-.....-?....?', r'\se.-.....-?....?']
c = 0
for r in acer_regex:
    cr = re.compile(r)
    for t in acer_title:
        if re.search(cr, t):
            print(re.search(cr, t).group(), t)
            c+=1
print()
print(c)

 v7-582pg-6479 amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers accessories
 v3-772g-9460 amazon.com acer aspire v3-772g-9460 17.3 led notebook intel core i7-4702mq 2.20 ghz 12gb ddr3 1tb hdd 120gb ssd blu-ray reader dvd-writer nvidia geforce gtx 760m windows 8 black laptop computers computers accessories
 v3-572pg-767j amazon.com acer aspire v3-572pg-767j 15.6-inch touchscreen laptop platinum silver computers accessories
 v3-572-78s3  amazon.com acer aspire v3-572-78s3 15.6-inch laptop platinum silver computers accessories
 v3-111p-43bc amazon.com acer aspire v3-111p-43bc 11.6-inch touchscreen laptop cool silver computers accessories
 v7-482pg-5842 amazon.com acer aspire v7-482pg-5842 14-inch touchscreen ultrabook cool steel computers accessories
 v3-111p-43bc acer aspire v3-111p-43bc 11.6 touchscreen notebook computer, intel pentium n3530 quad-core 2.16ghz, 4gb ram, 500gb hdd, windows 8.1, cool silver
 v3-572-78s3  acer aspire v3-572-78s3 15.6

In [57]:
c = 0
for r in acer_regex:
    cr = re.compile(r)
    for t in acer_title:
        if re.search(cr, t):
            print(re.search(cr, t).group(), t)
            c+=1
print()
print(c)

 v7-582pg-6479  acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers accessories
 v3-772g-9460  acer aspire v3-772g-9460 17.3 led notebook intel core i7-4702mq 2.20 ghz 12gb ddr3 1tb hdd 120gb ssd blu-ray reader dvd-writer nvidia geforce gtx 760m windows 8 black laptop computers computers accessories
 v3-572pg-767j  acer aspire v3-572pg-767j 15.6-inch touchscreen laptop platinum silver computers accessories
 v3-572-78s3   acer aspire v3-572-78s3 15.6-inch laptop platinum silver computers accessories
 v3-111p-43bc  acer aspire v3-111p-43bc 11.6-inch touchscreen laptop cool silver computers accessories
 v7-482pg-5842  acer aspire v7-482pg-5842 14-inch touchscreen ultrabook cool steel computers accessories
 v3-111p-43bc acer aspire v3-111p-43bc 11.6 touchscreen notebook computer, intel pentium n3530 quad-core 2.16ghz, 4gb ram, 500gb hdd, windows 8.1, cool silver
 v3-572-78s3  acer aspire v3-572-78s3 15.6 notebook computer, intel core i7-4510u 2ghz, 8gb ram, 1tb h

In [62]:
asus_title = list(output[1].title)
asus_title


[' asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor, 4gb ddr3, 256gb ssd, windows 7 professional silver aluminum laptop computers computers accessories',
 ' asus ux31a ux31a-xb52 13.3-inch laptop silver aluminum laptop computers computers accessories']

In [63]:
c = 0
regex = [r'\sux...-.....']
for r in regex:
    cr = re.compile(r)
    for t in asus_title:
        if re.search(cr, t):
            print(re.search(cr, t).group(), t)
            c+=1
print()
print(c)

 ux31a-xb52   asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor, 4gb ddr3, 256gb ssd, windows 7 professional silver aluminum laptop computers computers accessories
 ux31a-xb52   asus ux31a ux31a-xb52 13.3-inch laptop silver aluminum laptop computers computers accessories

2


In [91]:
hp = ((list(output[3].title)))
list(output[3].title)


[' 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 certified refurbished computers accessories',
 ' hp 15-p030nr 15.6-inch special edition laptop with beats audio computers accessories',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 64-bit - 4 gb ram - 500 gb hdd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 32-bit - 4 gb ram - 320 gb hdd',
 'hp elitebook revolve 810 g2 tablet - 11.6 - core i3 4010u - windows 7 pro 64-bit windows 8.1 pro downgrade - 4 gb ram - 128 gb ssd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 64-bit - 8 gb ram - 320 gb hdd',
 'hp elitebook revolve 810 g2 tablet - 11.6 - core i3 4010u - windows 7 pro 64-bit windows 8.1 pro downgrade - 4 gb ram - 128 gb ssd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro - 8 gb ram - 180 gb ssd',
 'hp elitebook revolve 810 g1 tablet - 11.6 - core i5 3437u - windows 8 pro 64-bit - 4 gb ram - 128 gb ssd',
 'hp elitebook folio

In [90]:
lenovo = (list(output[4].title))
print(len(lenovo))
# lenovo

234


In [116]:
c = 0
regex = [r'\sx\d{3}\s?tablet?\s?\d{0,4}', r'\sx\d{3}\s?laptop?\s?\d{0,4}', r'\sx\d{3}\s?\d{0,4}',r'\sx\d{1}\scarbon\s\d{4}', r'\sx\d{1}\scarbon touch\s\d{4}']
for t in lenovo:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
            print(re.search(cr, t).group(), "___", t[:50])
            c+=1
            found = True
            break
    if not found:
        print(t)
    
print()
print(c)

 x230 3435 ___ lenovo thinkpad x230 34352jf tablet pc - 12.5 - in
 x201 tablet 3093 ___  lenovo 3093b51 thinkpad x201 tablet 3093 - conver
 x201 tablet 3093 ___  lenovo 3093b65 thinkpad x201 tablet 3093 - conver
 x230 laptop  ___  lenovo thinkpad x230 laptop - 2325t11 - 12.5 ultr
 x1 carbon 3444 ___ lenovo thinkpad x1 carbon 3444 - 14 - core i7 3667
 x230 tablet 3435 ___ lenovo thinkpad x230 tablet 3435 - 12.5 - core i5 
 x230 tablet 3437 ___ lenovo thinkpad x230 tablet 3437 - 12.5 - core i5 
 x230 2320 ___ lenovo thinkpad x230 2320 - 12.5 - core i5 3210m -
 x230 tablet 3435 ___ lenovo thinkpad x230 tablet 3435 - 12.5 - core i5 
 x230 ___ lenovo thinkpad x230t 3435-22u tablet pc - 3rd gen
 x130 ___  lenovo thinkpad x130e 2338 - 11.6 - core i3 2367m
 x1 carbon 3460 ___ lenovo x1 carbon 346058u 14 led ultrabook - intel 
 x230 3435 ___ lenovo x230 34352ju tablet pc - 12.5 - in-plane sw
 x130 ___ lenovo thinkpad x130e 2339 - 11.6 - core i3 2367m 
 x230 2324 ___ lenovo thinkpad x230 2324 - 

In [97]:
print(len(hp))
hp

37


[' 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 certified refurbished computers accessories',
 ' hp 15-p030nr 15.6-inch special edition laptop with beats audio computers accessories',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 64-bit - 4 gb ram - 500 gb hdd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 32-bit - 4 gb ram - 320 gb hdd',
 'hp elitebook revolve 810 g2 tablet - 11.6 - core i3 4010u - windows 7 pro 64-bit windows 8.1 pro downgrade - 4 gb ram - 128 gb ssd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro 64-bit - 8 gb ram - 320 gb hdd',
 'hp elitebook revolve 810 g2 tablet - 11.6 - core i3 4010u - windows 7 pro 64-bit windows 8.1 pro downgrade - 4 gb ram - 128 gb ssd',
 'hp elitebook folio 9470m - 14 - core i5 3427u - windows 7 pro - 8 gb ram - 180 gb ssd',
 'hp elitebook revolve 810 g1 tablet - 11.6 - core i5 3437u - windows 8 pro 64-bit - 4 gb ram - 128 gb ssd',
 'hp elitebook folio

In [109]:
c = 0
regex = [r'\sfolio\s?\d{4}.', r'\selitebook\s?\d{4}.', r'\s\d{2}-.{4,6}']
for t in hp:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
            print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)    
print()
print(c)

 15-f009wm
 15-p030nr
 folio 9470m
 folio 9470m
 64-bit wi
 folio 9470m
 64-bit wi
 folio 9470m
 64-bit - 
 folio 9470m
 folio 9470m
 elitebook 2170p
 15-p030nr
 15-series
 15-series
 15-d053cl
 15-p030nr
 15-d053cl
 elitebook 8560p
 64-bit - 
 64-bit - 
 elitebook 8460p
 elitebook 2170p
 folio 9470m
 elitebook 8560p
 folio 9470m
 elitebook 8570p
 elitebook 2570p
 elitebook 8570p
 elitebook 8560p
 elitebook 8560p
 elitebook 2760p
 elitebook 2570p
 elitebook 8460p
 elitebook 2760p
 elitebook 8570p
 elitebook 2170p

37


In [102]:
dell = ((list(output[2].title)))
print(len(dell))
dell

9


[' dell inspiron i5547-5780slv 15.6-inch laptop computers accessories',
 ' dell inspiron m731r 18-inch hd notebook amd quad-core a8-5545m 1.7ghz processor, 8gb ram, 1tb hard drive, windows 8.1 indigo blue computers accessories',
 ' dell inspiron i5547-15001slv 15.6-inch touchscreen laptop computers accessories',
 ' dell inspiron i5547-3753slv 15.6-inch laptop 1.7 ghz intel core i5-4210u processor, 6gb ddr3, 1tb hdd, windows 8.1 silver computers accessories',
 ' dell inspiron i5547-3751slv 16-inch touchscreen laptop 1.70 ghz intel core i5-4210u processor, 6gb memory, 1tb hard drive, win 8.1 computers accessories',
 ' dell inspiron i5547-3751slv 16-inch touchscreen laptop 1.70 ghz intel core i5-4210u processor, 6gb memory, 1tb hard drive, win 8.1 computers accessories',
 ' dell inspiron i5547-15001slv 15.6-inch touchscreen laptop computers accessories',
 ' dell inspiron i5547-5780slv 15.6-inch laptop computers accessories',
 ' dell inspiron m731r 18-inch hd notebook amd quad-core a8-5545

In [108]:
c = 0
regex = [r'\s[nmi]\d{3,4}(-\d{4})?']
for t in dell:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
            print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)    
print()
print(c)

 i5547-5780
 m731
 i5547-1500
 i5547-3753
 i5547-3751
 i5547-3751
 i5547-1500
 i5547-5780
 m731

9


In [125]:
# Read the X2 dataset and see what else can we get from just 
x3_org = pd.read_csv('../data/sigmod/X3.csv')

# Current cleaning output
x3_dev = clean_laptops_dataset(x3_org)

In [127]:
grouped = x3_dev.groupby(['brand'])
output = []
for g in grouped.groups.keys():
    output.append(x3_dev.loc[grouped.groups[g]])
print("Number of groups", len(output))

Number of groups 5


In [128]:
output[0]

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
source7__1731,source7__1731,acer,intel,,i3,1.90 ghz,6,ddr3 sdram,,500,0,4.30 lbs,9.6 in. 13.4 in,acer aspire m5-481t-33226g52mtss 14 led ultrab...,acer aspire m5481t33226g52mtss 14 led ultraboo...,"[acer, aspire, m5481t33226g52mtss, 14, led, ul...",14,aspire m5481t33226g52mtss
source7__1655,source7__1655,acer,intel,,celeron,1 ghz,4,ddr3l sdram,,500,0,3.04 lbs,8.1 in. 11.4 in,acer aspire v5-132p-10194g50nss 11.6 touchscre...,acer aspire v5132p10194g50nss 11.6 touchscreen...,"[acer, aspire, v5132p10194g50nss, 11.6, touchs...",11.6,aspire
source1__130,source1__130,acer,amd,acer aspire e5-571-588m 15.6 laptop with core ...,i5,acer aspire e5-571-588m 15.6 laptop with core ...,4,acer aspire e5-571-588m 15.6 laptop with core ...,,0,0,,,acer aspire e5-571-588m 15.6 laptop on sale fo...,acer aspire e5571588m 15.6 laptop on sale for ...,"[acer, aspire, e5571588, m, 15.6, laptop, on, ...",15.6,aspire
source1__112,source1__112,acer,intel,acer aspire 5742-6838 15.6 notebook with core ...,i5,0 mhz. acer aspire 5742-6838 15.6 notebook wit...,4,acer aspire 5742-6838 15.6 notebook with core ...,,0,0,,,acer aspire 5742-6838 15.6 notebook on sale fo...,acer aspire 57426838 15.6 notebook on sale for...,"[acer, aspire, 57426838, 15.6, notebook, on, s...",15.6,aspire 57426838
source6__885,source6__885,acer,intel,i3-3227u,i3,1.90 ghz,0,ddr3 sdram. ddr3 sdram,,500,0,4.30 lb,9.6 . 0.8,acer aspire m5-481t-33226g52mtss 14 led ultrab...,acer aspire m5481t33226g52mtss 14 led ultraboo...,"[acer, aspire, m5481t33226g52mtss, 14, led, ul...",14,aspire m5481t33226g52mtss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
source2__406,source2__406,acer,intel,n2930. battery information battery chemistry l...,celeron,1.83 ghz. battery information battery chemistr...,4,ddr3 sdram. ddr3l sdram,,500,0,48.96 oz,8.3 . 0.9 . battery information battery chemis...,best acer aspire v3-111p-c6lc-intel celeron n2...,best acer aspire v3111pc6lcintel celeron n2930...,"[best, acer, aspire, v3111pc6lcintel, celeron,...",15.6,aspire v3111pc6lcintel
source2__93,source2__93,acer,intel,i5-3230m. battery information battery chemistr...,i5,2.60 ghz. battery information battery chemistr...,6,ddr3 sdram. ddr3 sdram,,500,0,112.8 oz,10.8 . 1.4 . battery information battery chemi...,best acer aspire e1-771-53236g50mnii-6gb-500gb...,best acer aspire e177153236g50mnii6gb500gb hdd...,"[best, acer, aspire, e177153236g50mnii6gb500, ...",15.6,aspire
source2__110,source2__110,acer,amd,a4-6210. product name aspire e5-521-435w a4-62...,a-series,1.80 ghz. product name aspire e5-521-435w a4-6...,4,ddr3 sdram. ddr3l sdram,,500,0,,10.1 . 1.2 . 1 . 1.19 . product name aspire e5...,best acer aspire e5-521-435w-amd a-series a4-6...,best acer aspire e5521435wamd aseries a46210 1...,"[best, acer, aspire, e5521435wamd, aseries, a4...",15.6,aspire
source2__492,source2__492,acer,intel,i3-3227u. battery information battery chemistr...,i3,1.90 ghz. battery information battery chemistr...,6,ddr3 sdram. ddr3 sdram,,500,0,68.8 oz,9.6 . 0.8 . battery information battery chemis...,best acer aspire m5-481t-33226g52mtss-i3-3227u...,best acer aspire m5481t33226g52mtssi33227u6gb5...,"[best, acer, aspire, m5481t33226g52mtssi33227u...",15.6,aspire m5481t33226g52mtssi33227u6gb500


In [129]:
acer_title = list(output[0].title)
acer_title

['acer aspire m5-481t-33226g52mtss 14 led ultrabook - intel core i3 i3-3227u 1.90 ghz 6 gb ram - 500 gb hdd - 20 gb ssd - dvd-writer - intel hd 4000 - windows 7 home premium 64-bit - 1366 x 768 display - bluetooth price comparison at buy.net',
 'acer aspire v5-132p-10194g50nss 11.6 touchscreen led notebook - intel celeron 1019y 1 ghz - silver 4 gb ram - 500 gb hdd - intel hd graphics - windows 8 64-bit - 1366 x 768 display - bluetooth price comparison at buy.net',
 'acer aspire e5-571-588m 15.6 laptop on sale for 390.99',
 'acer aspire 5742-6838 15.6 notebook on sale for 838.43',
 'acer aspire m5-481t-33226g52mtss 14 led ultrabook - intel core i3 i3-3227u 1.90 ghz - nxm26aa009 - laptops notebooks - acer laptops notebooks - thenerds.net',
 'acer aspire e1-731-20204g50mnii 17.3 led notebook - intel pentium 2020m 2.40 ghz - nxmgaaa004 - laptops notebooks - acer laptops notebooks - thenerds.net',
 'acer aspire p3-171-3322y4g12as ultrabook tablet - 11.6 - in-plane switching ips technology -

In [267]:
regex = [r'\s[esrvmpa]\d-?\s?.....-?....?', r'\s[esrvma]?\d-....', r'\s?acer\saspire\s\d{4}-\d{4}', r'\s?acer\sextensa\s.{0,2}\d{4}\s?\d{0,4}', r'\s?acer\saspire\sas\d{4}-?\s?\d{4}',
        r'\s?acer\saspire\s\d{4}-?\s\d{4}']
c = 0
for t in acer_title:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
#             print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)  
        pass
print(c)

63


In [143]:
asus_title = list(output[1].title)
asus_title

['asus zenbook ux301la-dh51t 13.3 touchscreen ultrabook - intel core i5 i5-4200u 1.60 ghz - blue 8 gb ram - 256 gb ssd - intel hd 4400 - windows 8 64-bit - 2560 x 1440 display - bluetooth price comparison at buy.net',
 'asus ux31a-xb52 13.3 laptop on sale for 1414.95',
 'asus zenbook prime ux31a 13.3 ultrabook on sale for 749.99',
 'asus zenbook ux21e 11 6 core i5 2467m 128gb ssd 4gb ram microsoft office inclu ebay',
 'new asus zenbook prime ux31a xb52 13 3 fhd ultrabook i5 3317u 4gb 256gb ssd 886227257386 ebay',
 'open box asus zenbook ux21e dh52 i5 2467m 1 60ghz 4gb 128gb ssd 11 6 win 7 884840974178 ebay',
 'best asus zenbook prime ux31a-xb52-core i5-3317u 1.70 ghz-4gb ram-256gb ssd-win 7 pro-13.3 ultrabook-silver aluminum topendelectronics uk',
 'best asus zenbook ux301la-dh51t intel core i5-4200u-1.6ghz-13.3 touchscreen ultrabook topendelectronics uk']

In [163]:
c = 0
regex = [r'\sux...-.....', r'\sux.{3,5}-?.....?'] # There is a problem here
c = 0
for t in asus_title:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
#             print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)  
        pass
print(c)

8


In [164]:
dell = ((list(output[2].title)))
print(len(dell))
dell

11


['dell silver 15.6 inspiron 15 5547 laptop pc with intel core i5-4210u processor, 6gb memory, touchscreen, 1tb hard drive and windows 8.1 computers walmart.com',
 'dell inspiron 15 7537 15 6 touch screen laptop i7 4500u 1 8ghz 8gb 1tb 1080p bt ebay',
 'dell inspiron 17r i5735 17 3 hd a8 5545m 1 7ghz 8gb ram 1tb hdd win 8 1 laptop ebay',
 'dell inspiron 15 7000 7537 i7 4500u 8gb 1tb 1080p touch bklit bt dell wty ebay',
 'brand new dell inspiron 15 7000 7537 i7 4500u 8gb 1tb 1080p webcam backlit bt w8 ebay',
 'dell inspiron 17r 5735 laptop quad core a8 5545m 1tb 8gb ddr3 windows 8 1 ebay',
 'dell inspiron 15 5547 core i5 4th gen 6gb 1tb win 8 64 bit ebay',
 'dell inspiron 15 7537 8gb intel r core tm i7 4500u cpu 1 80ghz windows 8 ebay',
 'dell inspiron 15 7000 7537 i7 4500u 8gb 1tb hybrid 1080p touch gt750m wcam w8 ebay',
 'dell inspiron 17 3 win8 1 m731r amd quad core a8 5545m 1 7ghz 8gb 1tb laptop ebay',
 'dell inspiron 15 i5547-3751slv 15.6 i5547-3751slv b h']

In [183]:
c = 0
regex = [r'\s[nmi]\d{3,4}(-\d{4})?', r'\sinspiron\s15?\s?\d{4}', r'\sinspiron\s17?.?\s?\d{4}']
words = ['inspiron', '15', '17', '   ', '  ']
for t in dell:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
#             print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)    
print()
print(c)


11


In [184]:
lenovo = (list(output[4].title))
print(len(lenovo))
# lenovo

56


In [186]:
c = 0
regex = [r'\sx\d{3}\s?tablet?\s?\d{0,4}', r'\sx\d{3}\s?laptop?\s?\d{0,4}', r'\sx\d{3}\s?\d{0,4}',r'\sx\d{1}\scarbon\s\d{4}', r'\sx\d{1}\scarbon touch\s\d{4}']
for t in lenovo:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
#             print(re.search(cr, t).group(), "___", t[:50])
            c+=1
            found = True
            break
    if not found:
        print(t)
    
print()
print(c)

miniprice.ca - lenovo thinkpad 3448c5u 14 led ultrabook - intel core i7 i7-3667u 2 ghz - black - 1600 x 900 hd display - 8 gb ram - 128 gb ssd - intel hd 4000 graphics - bluetooth - webcam - finger print reader 3448c5u 
lenovo thinkpad x1 carbon touch ultrabook i7 3667u 8gb ram 240gb ssd 3444cuu 887619432923 ebay
lenovo thinkpad x1 carbon ultrabook i7 3667u 2ghz 14 8gb 240gb ssd 3448 2d9 02 ebay
lenovo thinkpad x1 carbon ultrabook i7 3667u 2ghz 14 8gb 240gb ssd 3448 2d9 ebay

52


In [187]:
hp = ((list(output[3].title)))
list(output[3].title)



['hp elitebook 8560p intel core i7 quad 2.2ghz 128gb ssd 15.6-inch laptop computer refurbished overstock.com shopping - the best deals on laptops',
 'hp 8540w notebook pc refurbished overstock.com shopping - the best deals on laptops',
 'hp elitebook 8560p intel core i7 quad core 2.2ghz 750gb 15.6-inch laptop computer refurbished overstock.com shopping - the best deals on laptops',
 'hp hewlett-packard elitebook 8440p 14 notebook 2.6 ghz intel core i7-620m, 4 gb ddr3, 250 gb hdd, dvdrw, windows 7 professional, led backlight professional - 700443660570 price comparison at buy.net',
 'hp elitebook revolve 810 g2 tablet - 11.6 - core i5 4310u - windows 8.1 pro 64-bit - 4 gb ram - 128 gb ssd',
 'hp 15-f009wm 15.6 notebook pc on sale for 229.22',
 'hp 15-g070nr 15.6 laptop on sale for 229.00',
 'hp elitebook revolve 810 g2 tablet pc - 11.6 - wireless lan - intel core i5 i5-4310u 2 ghz - j8u39utaba - laptops notebooks - hewlett-packard laptops notebooks - thenerds.net',
 ' hp 15-r150nr 15.6-

In [228]:
c = 0
regex = [r'\sfolio\s?\d{4}.', r'\selitebook\s?-?\d{3,4}.',
         r'\s\d{2}-.{4,6}', r'hp\s?\d{4}.', r'\spavilion\s?..\s?.{5}',
         r'\s?compaq\s?.{5}', r'\s?hp\s?15\s?[a-z]\d{3}[a-z]{1,2}',  r'\shp\s?12\s?.{5}',
         r'\s?elitebook\srevolve\s?\d{3}\s?', '\s.\d{3}[pgmwm][pgmwm]?']
for t in hp:
    found = False
    for r in regex: 
        cr = re.compile(r)
        if re.search(cr, t):
#             print(re.search(cr, t).group())
            c+=1
            found = True
            break
    if not found:
        print(t)  
        pass
print()
print(c)

hp 15 series laptop brand new 500 gb ebay
new hp 15 6 laptop 500gb hdd 4gb memory dual core windows 8 hdmi webcam wifi ebay

197


In [7]:
# Clean X4
x4_org = pd.read_csv('../data/sigmod/X4.csv')


In [31]:

def clean_products_dataset(x_org):
    replace_words = {
        'professional': 'pro',
        'data traveler': 'datatraveler',
        ' hx ': 'hyperx',
        'generation': 'g',
        ' micro sd': ' microsd',
        ' extrem ': 'extreme',
        ' classe ': 'class'
    }

    remove_words = ['tesco', 'direct', 'accessoires' ,'montres', 'bracelets', 'connects' ]
    
    all_brands = ['lexar', 'sony', 'sandisk', 'pny', 'kingston', 'samsung',
       'intenso', 'toshiba', 'transcend']
    
    brand_lines = {'lexar': {'stick': ['xqd', 'xqd pro', 'platinum'],
                             'flash': ['jumpdrive'],
                             'mobile': [],
                             'tv': []},
                   'sony': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'sandisk': {'stick': ['extreme pro', 'ultra plus', 'ultra'], 
                               'flash': ['cruzer glide', 'cruzer edge', 'cruzer fit', 'cruzer'],
                               'mobile': [], 'tv': []},
                   'pny': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'kingston': 
                   {
                       'stick': ['ultimate', 'hyperx'],
                       'flash': ['datatraveler', 'hyperx savage', 'hyperx'],
                       'mobile': [], 'tv': []
                   },
                   'samsung': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'intenso': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'toshiba': {'stick': ['exceria pro', 'exceria'],
                               'flash': [], 'mobile': [], 'tv': []},
                   'transcend': {'stick': [], 'flash': [], 'mobile': [], 'tv': []}}
    
    model_regex = {'lexar': {
                         'stick': [r'\s\d{3,4}x'],
                         'flash': [r'\sv\d\d.?', r'\sp\d\d.?', r'\sc\d\d.?', r'\ss\d\d.?'],
                         'mobile': [],
                         'tv': []
                    },
               'sony': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'sandisk': {'stick': [r'\s\d{4}x', r'x\d{2}', ],
                           'flash': [], 'mobile': [], 'tv': []},
               'pny': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   
               'kingston': {'stick': [],
                            'flash': [r'dt\d\d\d(g\d)?', r'\sg\d', r'101', r'g\d'],
                            'mobile': [], 'tv': []},
                   
               'samsung': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'intenso': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'toshiba': {'stick': [r'[umn]\d{2,4}.?'], 'flash': [r'[umn]\d{2,4}.?'], 'mobile': [], 'tv': []},
               'transcend': {'stick': [], 'flash': [], 'mobile': [], 'tv': []}}
    
    class_regex = [r'class\s?\d{1,2}\s', r'\sc\d{1,2}\s']
    
    spacy.cli.download("en_core_web_sm")

    x4_dev = convert_numbers_to_strings(x_org, ['price']).copy(deep=True)
    x4_dev.set_index('instance_id', inplace=True)

    def get_type(record):
        name = record['name'].lower()
        
        for b, d in brand_lines.items():
            for w in d['stick']:
                if w in name:
                    return 'stick'
            
            for w in d['flash']:
                if w in name:
                    return 'flash'

        if pd.isna(record['size']):
            if 'tv' in name:
                return 'tv'
            return 'mobile'

        flash_keywords = ['usb', 'drive', 'flashdisk', 'cruzer']
        memory_stick_keywords = ['card', 'stick', 'sd', 'microsd', 'hc', 'class', 'speicherkarte']  # Add variants here

        is_flash = False
        is_memory = False

        for w in flash_keywords:
            if w in name:
                is_flash = True
                break

        for w in memory_stick_keywords:
            if w in name:
                is_memory = True
                break

        if is_flash:
            return 'flash'

        if is_memory:
            return 'stick'

        return 'stick'

    with open('../data/sigmod/translations_lookup_all.json') as fin:
        variants = json.load(fin)

    with open('../data/sigmod/langs_dict.json') as fin:
        json.load(fin)

    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+')  # Why it doesn't work
    x4_dev.replace({r'[^\x00-\x7F]+': ''}, regex=True, inplace=True)

    for column in x4_dev.columns:
        if column == 'instance_id':
            continue
        x4_dev[column] = x4_dev[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex,
                                                                                                   ' ')

    x4_dev['product_type'] = x4_dev.apply(get_type, axis=1)
    x4_dev.drop('price', inplace=True, axis=1)
    x4_dev['size'] = x4_dev['size'].str.lower().str.replace(' ', '')
    x4_dev['size'] = x4_dev['size'].where(x4_dev['size'].notnull(), 0)
    
    
    def get_line(record):
        name = record['name'].lower()
        brand = record['brand']
        product_type = record['product_type'] 

        for w in brand_lines[brand][product_type]:
            if w in name:
                return w

        return None

    def get_model_number(record):
        name = record['name'].lower()
        brand = record['brand']
        product_type = record['product_type'] 
        
        for t in model_regex[brand][product_type]:
            cr = re.compile(t)
            if re.search(cr, name):
                return re.search(cr, name).group()
        
        return None
    
    x4_dev['line'] = x4_dev.apply(get_line, axis=1)
    x4_dev['model'] = x4_dev.apply(get_model_number, axis=1)

    # Remove unwanted words from the name
    for i in range(len(x4_dev)):
        record = x4_dev.iloc[i]

        name = record['name']

        # remove unnecessary characters
        basic_punct = '-/\*_,:;/()®™'
        punct_to_space = str.maketrans(basic_punct, ' ' * len(basic_punct))  # map punctuation to space
        name = name.translate(punct_to_space)

        # remove brand
        name = name.replace(record['brand'], '')

        # remove size

        if record.product_type in ['flash', 'stick']:
            name = re.sub('\d\d\d\s?gb', '', name, 6)
            name = re.sub('\d\d\s?gb', '', name, 6)
            name = re.sub('\d\s?gb', '', name, 6)

        tokens = name.split(' ')
        for wd, wdtl in variants.items():
            while wd in tokens:
                tokens.remove(wd)
            for wdt in wdtl:
                while wdt in tokens:
                    tokens.remove(wdt)

        unneeded_words = ['mmoire', 'speicherkarte', 'flashgeheugenkaart', 'flash', 'stick', 'speed', 'high']
        for w in unneeded_words:
            while w in tokens:
                tokens.remove(w)
        x4_dev.iloc[i]['name'] = ' '.join(tokens)

    for column in x4_dev.columns:
        if column == 'instance_id':
            continue
        x4_dev[column] = x4_dev[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex,
                                                                                                   ' ')

    return x4_dev

In [32]:
x4_dev = clean_products_dataset(x4_org)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [33]:
x4_dev.head()

Unnamed: 0_level_0,name,brand,size,product_type,line,number
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
altosight.com//0,lexar 32gb 1400x 210mb s professional xqd memr...,lexar,32gb,stick,xqd,1400x
altosight.com//25,sony microsdxc uhs-1 u3 128gb memriakrtya adap...,sony,128gb,stick,,
altosight.com//66,"sandisk dual drive, type-c, usb 3.0, 16gb, 130...",sandisk,16gb,flash,,
altosight.com//68,"sandisk dual drive, type-c, usb 3.0, 64gb, 150...",sandisk,64gb,flash,,
altosight.com//94,lexar xqd 32gb x1400 professional xqd kupon pr...,lexar,32gb,stick,xqd,


In [17]:
all_brands = x4_dev.brand.unique()
all_brands

array(['lexar', 'sony', 'sandisk', 'pny', 'kingston', 'samsung',
       'intenso', 'toshiba', 'transcend'], dtype=object)

In [18]:
x4_dev.product_type.unique()

array(['stick', 'flash', 'mobile', 'tv'], dtype=object)

In [None]:
replace_words = {
    'professional': 'pro'
    'data traveler': 'datatraveler'
    ' hx ': 'hyperx',
    'generation': 'g',
    ' micro sd': ' microsd',
    ' extrem ': 'extreme',
    ' classe ': 'class'
}

remove_words = ['tesco', 'direct', 'accessoires' ,'montres', 'bracelets', 'connects' ]

In [19]:
def assign_line(record):
    brand = record['brand']
    product_type = record['product_type']
    
    all_brands = ['lexar', 'sony', 'sandisk', 'pny', 'kingston', 'samsung',
       'intenso', 'toshiba', 'transcend']
    
    brand_lines = {'lexar': {'stick': ['xqd', 'xqd pro', 'platinum'],
                             'flash': ['jumpdrive'],
                             'mobile': [],
                             'tv': []},
                   'sony': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'sandisk': {'stick': ['extreme pro', 'ultra plus', 'ultra'], 
                               'flash': ['cruzer glide', 'cruzer edge', 'cruzer fit', 'cruzer'],
                               'mobile': [], 'tv': []},
                   'pny': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'kingston': 
                   {
                       'stick': ['ultimate', 'hyperx'],
                       'flash': ['datatraveler', 'hyperx savage', 'hyperx'],
                       'mobile': [], 'tv': []
                   },
                   'samsung': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'intenso': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   'toshiba': {'stick': ['exceria pro', 'exceria'],
                               'flash': [], 'mobile': [], 'tv': []},
                   'transcend': {'stick': [], 'flash': [], 'mobile': [], 'tv': []}}
    
    model_regex = {'lexar': {
                         'stick': [r'\s\d{3,4}x'],
                         'flash': [r'\sv\d\d.?', r'\sp\d\d.?', r'\sc\d\d.?', r'\ss\d\d.?'],
                         'mobile': [],
                         'tv': []
                    },
               'sony': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'sandisk': {'stick': [r'\s\d{4}x', r'x\d{2}', ],
                           'flash': [], 'mobile': [], 'tv': []},
               'pny': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
                   
               'kingston': {'stick': [],
                            'flash': [r'dt\d\d\d(g\d)?', r'\sg\d', r'101', r'g\d'],
                            'mobile': [], 'tv': []},
                   
               'samsung': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'intenso': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
               'toshiba': {'stick': [r'[umn]\d{2,4}.?'], 'flash': [r'[umn]\d{2,4}.?'], 'mobile': [], 'tv': []},
               'transcend': {'stick': [], 'flash': [], 'mobile': [], 'tv': []}}
    
    class_regex = [r'class\s?\d{1,2}\s', r['\sc\d{1,2}\s']]
    
    if brand not in all_brands:
        return None
    
    if product_type not in list(brand_lines[brand].keys()):
        return None
    
    # Check in the name for the possible brand lines
    

In [20]:
brand_lines = {}
for b in all_brands: 
    brand_lines[b] = {
        'stick': [],
        'flash': [],
        'mobile': [],
        'tv': []
    }

In [21]:
brand_lines

{'lexar': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'sony': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'sandisk': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'pny': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'kingston': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'samsung': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'intenso': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'toshiba': {'stick': [], 'flash': [], 'mobile': [], 'tv': []},
 'transcend': {'stick': [], 'flash': [], 'mobile': [], 'tv': []}}

In [22]:
x4_dev.groupby('brand').count()

Unnamed: 0_level_0,name,size,product_type
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
intenso,61,61,61
kingston,115,115,115
lexar,163,163,163
pny,29,29,29
samsung,42,12,42
sandisk,171,171,171
sony,99,99,99
toshiba,146,146,146
transcend,9,9,9
