In [289]:
import pandas as pd
import re
import spacy

In [290]:
sp = spacy.load('en_core_web_sm')

In [315]:
x1 = pd.read_csv('../data/sigmod/X1.csv')
x2 = pd.read_csv('../data/sigmod/X2.csv')
x3 = pd.read_csv('../data/sigmod/X3.csv')
x4 = pd.read_csv('../data/sigmod/X4.csv')

In [292]:
len(x1.columns), len(x2.columns), len(x3.columns), len(x4.columns)

(16, 14, 14, 5)

In [293]:
x1.columns # --> convert it to be a validation set 

Index(['cpu_brand', 'battery_chemistry', 'ram_type', 'cpu_frequency',
       'hdd_capacity', 'dimensions_height', 'brand', 'cpu_type', 'title',
       'cpu_cache', 'battery_life', 'dimensions_depth', 'dimensions',
       'display_size', 'instance_id', 'cpu_model'],
      dtype='object')

In [294]:
x2.columns

Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'],
      dtype='object')

In [295]:
x3.columns

Index(['instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
       'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
       'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'],
      dtype='object')

In [296]:
x4.columns

Index(['name', 'price', 'brand', 'size', 'instance_id'], dtype='object')

In [613]:
def formatNumber(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.
        
        Input: - dataframe with nulls as NaN
        
        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), None)
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!
        
        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number
        
        Ouputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df

extra_brands = set(pd.read_csv('laptops.csv').Company.str.lower().unique())
screen_sizes = set(pd.read_csv('laptops.csv').Inches)
screen_sizes = [str(formatNumber(str(s).lower())) for s in screen_sizes]

def preprocess_laptop_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
    
    
    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]
    
    df['new_title'] = df.title
    irrelevant_regex = re.compile(r'[^a-z0-9.\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')
    df['new_title_tokens'] =  df.apply(tokenize_new_tile, axis=1) 

    # Brand assignment
    all_brands = set()
    
    all_brands.update(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    def assign_cpu_brand(record):
        # Search in brand first
        if 'intel' in str(record['cpu_brand']) or 'intel' in str(record['title']) or \
                'intel' in str(record['cpu_model']) or 'intel' in str(record['cpu_type']):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)
        
        for sc in screen_sizes:
            if str(sc) in arr:
                return sc
        
        else:
            return 15.6 # Some relaxation
    df['screen_size'] = df.apply(assign_screen_size, axis=1)
    
    # ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity']).replace(' ', '')
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb', '256gb', '512gb', '2', '4',
                         '6', '8', '10', '12', '16', '32', '64', '128']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        s = str(record['title']).replace(' ', '')  # This will be wrong, please change
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        return 0
    
    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        
        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return re.findall("\d{3,4}gb", s)[0][:-2]
        if re.search("\dtb", s):
            return re.findall("\dtb", s)[0][:-2] + '000'
        if re.search("\d{3,4}gbhdd", s2):
            return re.findall("\d{3,4}gbhdd", s2)[0][:-5]
        if re.search("hdd\d{3,4}gb", s2):
            return re.findall("hdd\d{3,4}gb", s2)[0][3:-2]
        if re.search("hdd\d{1}tb", s2):
            return re.findall("hdd\d{1}tb", s2)[0][3:4] + '000'
        if re.search("\d{1}tbhdd", s2):
            return re.findall("\d{1}tbhdd", s2)[0][0] + '000'
        return 0
    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
    
    def assign_hdd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        

        if re.search("\d{3,4}gb", s):
            return re.findall("\d{3,4}gb", s)[0][:-2]
        if re.search("\dtb", s):
            return re.findall("\dtb", s)[0][:-2] + '000'
        if re.search("\d{3,4}gbssd", s2):
            return re.findall("\d{3,4}gbssd", s2)[0][:-5]
        if re.search("ssd\d{3,4}gb", s2):
            return re.findall("ssd\d{3,4}gb", s2)[0][3:-2]
        if re.search("ssd\d{1}tb", s2):
            return re.findall("ssd\d{1}tb", s2)[0][3:4] + '000'
        if re.search("\d{1}tbssd", s2):
            return re.findall("\d{1}tbssd", s2)[0][0] + '000'
        return 0
            
    df['ssd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
    
    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            should_break = False
            for i in range(2 + brand_index, 5 + brand_index, 1):
                for sc in screen_sizes:
                    if (sc in brand_tokens[i]):
                        should_break = True
                        break
                if should_break:
                    if finish_index == i:
                        finish_index -=1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i 
                else:
                    break
        except:
            brand_index = -1

        if brand_index == -1:
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])        

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)
    
    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['ram_capacity', 'screen_size'])
    # Unit stand. in weight
    
    def assign_cpu_type(record):
    # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom", 
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)
    
    return df

In [614]:
x2 = pd.read_csv('../data/sigmod/X2.csv')
x2.head()

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title
0,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,,1.80 kg,,"Lenovo Thinkpad X230 34352jf Tablet Pc - 12.5""..."
1,www.isupplyhub.com//1256,Acer,1.6 GHz Intel Core i5-4200U. Intel Core I5,,1.6 GHz Intel Core i5-4200U,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...
2,www.isupplyhub.com//326,Acer,1.6 GHz Intel Core i5. Intel Core I5,,1.6 GHz Intel Core i5,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...
3,www.isupplyhub.com//821,HP,,,,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"Amazon.com : 15.6"" HP 15-f009wm Amd Dual-Core ..."
4,www.isupplyhub.com//157,Asus,1.7 GHz Core i5-3317U. Intel,,1.7 GHz Core i5-3317U,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...


In [615]:
x = x2.copy()
x.set_index('instance_id',  inplace=True, drop=False)
x = preprocess_laptop_dataset(x)
x = fill_nulls_with_none(x)
x = convert_numbers_to_strings(x, ['ram_capacity', 'screen_size'])
x.head(8)

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
www.softwarecity.ca//737,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,lenovo thinkpad x230 34352jf tablet pc 12.5 in...,"[lenovo, thinkpad, x230, 34352jf, tablet, pc, ...",12.5,thinkpad x230 34352jf
www.isupplyhub.com//1256,www.isupplyhub.com//1256,acer,intel,,i5,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,,500,0,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch...,amazon.com acer aspire v7582pg6479 15.6inch to...,"[amazon.com, acer, aspire, v7582pg6479, 15.6in...",15.6,aspire v7582pg6479
www.isupplyhub.com//326,www.isupplyhub.com//326,acer,intel,,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,,500,0,5.2 pounds,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch l...,amazon.com acer aspire e15726870 15.6 inch lap...,"[amazon.com, acer, aspire, e15726870, 15.6, in...",15.6,aspire
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,,,,4,ddr3 sdram. 4 gb sdram ddr3,,500,0,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,amazon.com 15.6 hp 15f009wm amd dualcore e1210...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore...",15.6,15f009wm amd
www.isupplyhub.com//157,www.isupplyhub.com//157,asus,intel,,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,,0,256,2.9 pounds,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,amazon.com asus ux31axb52 13.3inch ultrabook 1...,"[amazon.com, asus, ux31axb52, 13.3inch, ultrab...",13.3,ux31axb52
www.isupplyhub.com//985,www.isupplyhub.com//985,lenovo,intel,,i7,intel core i7,0,3 gb,,160,160,,,amazon.com lenovo 3093b51 thinkpad x201 tablet...,amazon.com lenovo 3093b51 thinkpad x201 tablet...,"[amazon.com, lenovo, 3093b51, thinkpad, x201, ...",15.6,3093b51 thinkpad
www.isupplyhub.com//648,www.isupplyhub.com//648,acer,intel,,i5,2.6 ghz core i5 3230m,6,ddr3 sdram. 6 gb ddr3,,500,0,7.1 pounds,16.30 x 10.80 x 1.40 inches,amazon.com acer aspire nx.mg7aa.005 e1-771-649...,amazon.com acer aspire nx.mg7aa.005 e17716496 ...,"[amazon.com, acer, aspire, nx.mg7aa.005, e1771...",17.3,aspire nx.mg7aa.005
www.isupplyhub.com//669,www.isupplyhub.com//669,acer,intel,,i7,,2,12 gb ddr3,,1000,120,16 pounds,,amazon.com acer aspire v3-772g-9460 17.3 led n...,amazon.com acer aspire v3772g9460 17.3 led not...,"[amazon.com, acer, aspire, v3772g9460, 17.3, l...",12.0,aspire v3772g9460


In [599]:
def assign_hdd_capacity(record):
    s = str(record['hdd_capacity']).replace(' ', '')
    s2 = str(record['title'].replace(' ', ''))
    print(s)
    print(s2)
    print()
    if re.search("\d{3,4}gb", s):
        return re.findall("\d{3,4}gb", s)[0][:-2]
    if re.search("\dtb", s):
        return re.findall("\dtb", s)[0][:-2] + '000'
    if re.search("\d{3,4}gbhdd", s2):
        return re.findall("\d{3,4}gbhdd", s2)[0][:-5]
    if re.search("hdd\d{3,4}gb", s2):
        print (re.findall("hdd\d{3,4}gb", s2)[0])
        return re.findall("hdd\d{3,4}gb", s2)[0][3:-2]
    if re.search("hdd\d{1}tb", s2):
        return re.findall("hdd\d{1}tb", s2)[0][3:4] + '000'
    if re.search("\d{1}tbhdd", s2):
        return re.findall("\d{1}tbhdd", s2)[0][0] + '000'
    return 0

In [600]:
re.findall("\d\d\dgb", str(x.iloc[0, :]['hdd_capacity']).replace(' ', ''))

['320gb']

In [601]:
x.iloc[:6, ].apply(assign_hdd_capacity, axis=1)

320gb
lenovothinkpadx23034352jftabletpc-12.5-in-planeswitchingipstechnology-wirelesslan-intelcorei5i5-3320m2.60ghz-black-4gbram-320gbhdd-windows7professional64-bit-convertible-1366x768multi-touchscreendisplayledbacklight-bluetooth-frenchkeyboard-34352jf-softwarecity.ca-canada

500gbmechanicalharddrive
amazon.comaceraspirev7-582pg-647915.6-inchtouchscreenultrabookcoolsteelcomputersaccessories

500gbmechanicalharddrive
amazon.comaceraspiree1-572-687015.6inchlaptopinteli54200u1.6ghzprocessor4gbram500gbharddrivewindows8clarinetblacklaptopcomputerscomputersaccessories

500gb
amazon.com15.6hp15-f009wmamddual-coree1-21004gbddr3ram500gbhdwebcamwindows8.1certifiedrefurbishedcomputersaccessories

256mb
amazon.comasusux31a-xb5213.3-inchultrabook1.7ghzintelcorei5-3317uprocessor4gbddr3256gbssdwindows7professionalsilveraluminumlaptopcomputerscomputersaccessories

160gb
amazon.comlenovo3093b51thinkpadx201tablet3093-convertible-corei7640lm2.13ghz-winxptabletpc2005-3gbram-160gbssd-noopticaldrive-12.1in

instance_id
www.softwarecity.ca//737    320
www.isupplyhub.com//1256    500
www.isupplyhub.com//326     500
www.isupplyhub.com//821     500
www.isupplyhub.com//157       0
www.isupplyhub.com//985     160
dtype: object

In [416]:
def find_cpu_type(record):
    # Find the cpu type
    cpu_list = ["i5", "i3", "i7", "atom", 
                "pentium", "celeron", "a-series",
                "e-series", "aseries", "eseries",
                "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]
    
    for cpu in cpu_list:
        if record['cpu_type'] is not None and cpu in record['cpu_type']:
            return cpu
        if cpu in record['title']:
            return cpu
        if record['cpu_model'] is not None and cpu in record['cpu_model']:
            return cpu
        if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
            return cpu
        
        if re.search("e-[0-9]{3}", record['title']):
            return re.findall("e-[0-9]{3}", record['title'])[0]
                
        if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
            return re.findall("e-[0-9]{3}", record['cpu_model'])[0]
        
x['cpu_type'] = x.apply(find_cpu_type, axis=1)
x[x['cpu_type'].isna()]

Unnamed: 0_level_0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,new_title,new_title_tokens,screen_size,model
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
www.isupplyhub.com//821,www.isupplyhub.com//821,hp,amd,,,,4,ddr3 sdram. 4 gb sdram ddr3,,500 gb,,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,amazon.com 15.6 hp 15f009wm amd dualcore e1210...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore...",15.6,15f009wm amd


In [409]:
x.cpu_type.unique()

array(['i5', None, 'i7', 'a8', 'celeron', 'pentium', 'i3', 'a-series',
       'a4', 'e-series', 'e-300', 'e-450', 'e-350'], dtype=object)

In [344]:
# preprocess_laptop_dataset(x2.copy(deep=True)).head(20)

In [193]:
x2_dev = preprocess_laptop_dataset(x2.copy(deep=True))

In [80]:
x2_dev['tokens'] = x2_dev.title.apply(sp)

In [81]:
x2_dev.loc[:,['tokens']]

Unnamed: 0,tokens
0,"(lenovo, thinkpad, x230, 34352jf, tablet, pc, ..."
1,"(amazon.com, acer, aspire, v7, -, 582pg-6479, ..."
2,"(amazon.com, acer, aspire, e1, -, 572, -, 6870..."
3,"(amazon.com, 15.6, hp, 15, -, f009wm, amd, dua..."
4,"(amazon.com, asus, ux31a, -, xb52, 13.3, -, in..."
...,...
338,"(lenovo, thinkpad, x230, 2320, -, 12.5, -, cor..."
339,"(lenovo, thinkpad, x230, 2325, -, 12.5, -, cor..."
340,"(lenovo, thinkpad, x230, tablet, 3438, -, 12.5..."
341,"(lenovo, thinkpad, x230, 2324, -, 12.5, -, cor..."


In [109]:
x2_dev['new_title'] = x2_dev.title
irrelevant_regex = re.compile(r'[^a-z0-9.\s]')
multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
x2_dev['new_title'] = x2_dev.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')

In [110]:
x2_dev['model'] = pd.Series("" * len(x2_dev))

In [187]:
def get_laptop_model(record):
    brand_tokens = [w.text for w in sp(record['new_title'])]
    
    print(record['new_title'])
    
    screen_sizes = set(pd.read_csv('laptops.csv').Inches)
    screen_sizes = [str(s).lower() for s in screen_sizes]
    
    try:
        brand_index = brand_tokens.index(record['brand'])
        finish_index = brand_index + 1
        should_break = False
        for i in range(2 + brand_index, 5 + brand_index, 1):
            for sc in screen_sizes:
                if (sc in brand_tokens[i]):
                    should_break = True
                    break
            if should_break:
                break
            if not (brand_tokens[i].isalpha()):
                finish_index = i 
            else:
                break
                
    except:
        brand_index = -1
        
    if brand_index == -1:
        return ""
    
    return ' '.join(brand_tokens[brand_index+1:finish_index+1])
    

In [189]:
x2_dev

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title,tokens,new_title,model
0,www.softwarecity.ca//737,lenovo,intel,i5-3320m,dual-core 2 core . core i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320 gb,-999,1.80 kg,-999,lenovo thinkpad x230 34352jf tablet pc - 12.5 ...,"(lenovo, thinkpad, x230, 34352jf, tablet, pc, ...",lenovo thinkpad x230 34352jf tablet pc 12.5 in...,thinkpad x230 34352jf
1,www.isupplyhub.com//1256,acer,intel,-999,1.6 ghz intel core i5-4200u,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,-999,500 gb mechanical hard drive,-999,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch...,"(amazon.com, acer, aspire, v7, -, 582pg-6479, ...",amazon.com acer aspire v7582pg6479 15.6inch to...,aspire v7582pg6479
2,www.isupplyhub.com//326,acer,intel,-999,1.6 ghz intel core i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,-999,500 gb mechanical hard drive,-999,5.2 pounds,15.02 x 10.08 x 1 inches,amazon.com acer aspire e1-572-6870 15.6 inch l...,"(amazon.com, acer, aspire, e1, -, 572, -, 6870...",amazon.com acer aspire e15726870 15.6 inch lap...,aspire e15726870
3,www.isupplyhub.com//821,hp,amd,-999,-999,-999,4,ddr3 sdram. 4 gb sdram ddr3,-999,500 gb,-999,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-...,"(amazon.com, 15.6, hp, 15, -, f009wm, amd, dua...",amazon.com 15.6 hp 15f009wm amd dualcore e1210...,15f009wm
4,www.isupplyhub.com//157,asus,intel,-999,1.7 ghz core i5-3317u,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,-999,256 mb,-999,2.9 pounds,8.80 x 0.70 x 12.80 inches,amazon.com asus ux31a-xb52 13.3-inch ultrabook...,"(amazon.com, asus, ux31a, -, xb52, 13.3, -, in...",amazon.com asus ux31axb52 13.3inch ultrabook 1...,ux31axb52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,www.vology.com//873,lenovo,intel,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,4,4 gb ddr3 slots qty 2 empty slots 1 max ram su...,4 gb ddr3 slots qty 2 empty slots 1 max ram su...,180 gb ssd. 180 gb ssd. lenovo thinkpad x230 2...,180 gb ssd. 180 gb ssd,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,lenovo thinkpad x230 2320 - 12.5 - core i5 332...,"(lenovo, thinkpad, x230, 2320, -, 12.5, -, cor...",lenovo thinkpad x230 2320 12.5 core i5 3320m w...,thinkpad x230 2320
339,www.vology.com//823,lenovo,intel,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,4,4 gb ddr3 slots qty 2 max ram supported 16 gb ...,4 gb ddr3 slots qty 2 max ram supported 16 gb ...,500 gb hdd 7200 rpm. 500 gb hdd 7200 rpm. leno...,500 gb hdd 7200 rpm. 500 gb hdd 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,lenovo thinkpad x230 2325 - 12.5 - core i5 332...,"(lenovo, thinkpad, x230, 2325, -, 12.5, -, cor...",lenovo thinkpad x230 2325 12.5 core i5 3320m w...,thinkpad x230 2325
340,www.vology.com//2723,lenovo,intel,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,4,form factor so dimm 204-pin technology ddr3 sd...,form factor so dimm 204-pin technology ddr3 sd...,500 gb hdd 7200 rpm. 500 gb hdd 7200 rpm. leno...,500 gb hdd 7200 rpm. 500 gb hdd 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,lenovo thinkpad x230 tablet 3438 - 12.5 - core...,"(lenovo, thinkpad, x230, tablet, 3438, -, 12.5...",lenovo thinkpad x230 tablet 3438 12.5 core i5 ...,thinkpad x230
341,www.vology.com//1349,lenovo,intel,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,intel core i5 3rd gen 3320m 2.6 ghz. intel cor...,4,form factor so dimm 204-pin technology ddr3 sd...,form factor so dimm 204-pin technology ddr3 sd...,320 gb hdd 7200 rpm. 320 gb hdd 7200 rpm. leno...,320 gb hdd 7200 rpm. 320 gb hdd 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,lenovo thinkpad x230 2324 - 12.5 - core i5 332...,"(lenovo, thinkpad, x230, 2324, -, 12.5, -, cor...",lenovo thinkpad x230 2324 12.5 core i5 3320m w...,thinkpad x230 2324


In [276]:
x4 = pd.read_csv('../data/sigmod/X4.csv')
x4.set_index('instance_id', inplace=True)
x4.columns

Index(['name', 'price', 'brand', 'size'], dtype='object')

In [277]:
x4.head()

Unnamed: 0_level_0,name,price,brand,size
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altosight.com//0,Lexar 32GB 1400x 210MB/s professional XQD memó...,31990.0,LEXAR,32 GB
altosight.com//25,Sony microSDXC UHS-1 U3 128GB memóriakártya + ...,59990.0,SONY,128 GB
altosight.com//66,"SANDISK DUAL DRIVE, TYPE-C, USB 3.0, 16GB, 130...",6790.0,SANDISK,16 GB
altosight.com//68,"SanDisk Dual Drive, TYPE-C, USB 3.0, 64GB, 150...",14190.0,SANDISK,64 GB
altosight.com//94,Lexar XQD 32GB X1400 PROFESSIONAL XQD +KUPON P...,329.0,LEXAR,32 GB


In [279]:
def preprocess_products_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9.\-\s]')
    multispace_regex = re.compile(r'\s\s+')

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
    
    return df



In [284]:
x4.info

<bound method DataFrame.info of                                                                    name  \
instance_id                                                               
altosight.com//0      Lexar 32GB 1400x 210MB/s professional XQD memó...   
altosight.com//25     Sony microSDXC UHS-1 U3 128GB memóriakártya + ...   
altosight.com//66     SANDISK DUAL DRIVE, TYPE-C, USB 3.0, 16GB, 130...   
altosight.com//68     SanDisk Dual Drive, TYPE-C, USB 3.0, 64GB, 150...   
altosight.com//94     Lexar XQD 32GB X1400 PROFESSIONAL XQD +KUPON P...   
...                                                                 ...   
altosight.com//13815                 Pendrive LEXAR Jumpdrive S25 64 GB   
altosight.com//13832              Pendrive SANDISK Extreme GO 3.1 256GB   
altosight.com//13932        Tarjeta de Memoria SD  SONY 16GB EXPERIENCE   
altosight.com//13944    Tarjeta de Memoria SANDISK Extreme Pro SDHC 8GB   
altosight.com//13950                Tarjeta de Memoria SANDISK SDHC 

In [285]:
x4_dev = convert_numbers_to_strings(x4, ['price'])
x4_dev = preprocess_laptop_dataset(x4_dev.copy(deep=True))

In [286]:
x4_dev.head()

Unnamed: 0_level_0,name,price,brand,size
instance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altosight.com//0,lexar 32gb 1400x 210mb s professional xqd mem ...,31990,lexar,32 gb
altosight.com//25,sony microsdxc uhs-1 u3 128gb mem riak rtya ad...,59990,sony,128 gb
altosight.com//66,sandisk dual drive type-c usb 3.0 16gb 130 mb s,6790,sandisk,16 gb
altosight.com//68,sandisk dual drive type-c usb 3.0 64gb 150 mb s,14190,sandisk,64 gb
altosight.com//94,lexar xqd 32gb x1400 professional xqd kupon pr...,329,lexar,32 gb


In [288]:
list(x4_dev.name.sample(30))

['tesco direct lexar jumpdrive c20m 32gb usb 3.0 3.1 gen 1 type-a type-c black silver',
 'sandisk ultra plus class 10 sd memory card - 64 gb',
 'cl usb intenso cl amp eacute usb 8gb intenso flashdrive premium line 3.0 - blister aluminium mk935883071 ',
 'sony usm128gqx 128 gb ram usb flash drive',
 'pam ov karta kingston sdhc 16gb uhs-i u1 90r 45w ',
 'sandisk extreme 16 gb usb-flash-laufwerk usb 3.0 bis zu 245 mb sek',
 'lexar jumpdrive c20m 128 gb micro-usb flash drive lexj4 ljdc20m-128bbeu',
 'carte m moire sdxc sandisk ultra plus 64 go classe 10',
 'tesco direct kingston technology sdxc uhs-i u3 sda3 128gb uhs class 3 memory',
 'kingston ultimate - flash-speicherkarte - 16 gb - uhs class 1 ',
 'tesco direct lexar jumpdrive s70 16gb usb memory flash drive - burgundy small blister',
 'toshiba transmemory u202 8gb usb stick usb 2.0 aqua',
 'lexar xqd 32gb 210mb s 1400x',
 'sandisk dual drive type-c usb 3.0 16gb 130 mb s',
 'kingston carte microsd action camera uhs-i u3 pour gopro dron

In [324]:
# Exploring back X2 and X3
x2

Unnamed: 0,instance_id,brand,cpu_brand,cpu_model,cpu_type,cpu_frequency,ram_capacity,ram_type,ram_frequency,hdd_capacity,ssd_capacity,weight,dimensions,title
0,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,,1.80 kg,,"Lenovo Thinkpad X230 34352jf Tablet Pc - 12.5""..."
1,www.isupplyhub.com//1256,Acer,1.6 GHz Intel Core i5-4200U. Intel Core I5,,1.6 GHz Intel Core i5-4200U,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...
2,www.isupplyhub.com//326,Acer,1.6 GHz Intel Core i5. Intel Core I5,,1.6 GHz Intel Core i5,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...
3,www.isupplyhub.com//821,HP,,,,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"Amazon.com : 15.6"" HP 15-f009wm Amd Dual-Core ..."
4,www.isupplyhub.com//157,Asus,1.7 GHz Core i5-3317U. Intel,,1.7 GHz Core i5-3317U,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,www.vology.com//873,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,180 GB SSD. 180 GB SSD. Lenovo ThinkPad X230 2...,180 GB SSD. 180 GB SSD,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2320 - 12.5"" - Core i5 33..."
339,www.vology.com//823,Lenovo ThinkPad X230 2325 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2325 - 12.5"" - Core i5 33..."
340,www.vology.com//2723,Lenovo ThinkPad X230 Tablet 3438 - 12.5 '' - C...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,"Lenovo ThinkPad X230 Tablet 3438 - 12.5"" - Cor..."
341,www.vology.com//1349,Lenovo ThinkPad X230 2324 - 12.5 '' - Core i5 ...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm. ...,320 GB HDD / 7200 rpm. 320 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2324 - 12.5"" - Core i5 33..."
