In [23]:
import sys
import py_entitymatching as em
import pandas as pd
import os
import re
import spacy

In [24]:
sp = spacy.load('en_core_web_sm')

In [25]:
def formatNumber(num):
    num = float(num)
    if num % 1 == 0:
        return int(num)
    else:
        return num
def fill_nulls_with_none(df):
    """ Fills nulls in a dataframe with None.
        This is required for the Dedupe package to work properly.
        
        Input: - dataframe with nulls as NaN
        
        Output: - new dataframe with nulls as None
    """
    new_df = df.copy()
    for col in df.columns:
        new_df[col] = new_df[col].where(new_df[col].notnull(), '-1')
    return new_df

def convert_numbers_to_strings(df, cols_to_convert, remove_point_zero=True):
    """ Convert number types to strings in a dataframe.
        This is convoluted as need to keep NoneTypes as NoneTypes for what comes next!
        
        Inputs: - df -> dataframe to convert number types
                - cols_to_convert -> list of columns to convert
                - remove_point_zero -> bool to say whether you want '.0' removed from number
        
        Ouputs: - dataframe with converted number types
    """
    new_df = df.copy()
    for col in cols_to_convert:
        if remove_point_zero:
            new_df[col] = new_df[col].apply(lambda x: str(x).replace('.0','')\
                                            if not isinstance(x, type(None)) else x)
        else:
            new_df[col] = new_df[col].apply(lambda x: str(x)\
                                            if not isinstance(x, type(None)) else x)
    return new_df

extra_brands = set(pd.read_csv('laptops.csv').Company.str.lower().unique())
screen_sizes = set(pd.read_csv('laptops.csv').Inches)
screen_sizes = [str(formatNumber(str(s).lower())) for s in screen_sizes]

def preprocess_laptop_dataset(df):
    # Alpha numeric
    irrelevant_regex = re.compile(r'[^a-z0-9,.\-\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

    for column in df.columns:
        if column == 'instance_id':
            continue
        df[column] = df[column].str.lower().str.replace(irrelevant_regex, ' ').str.replace(multispace_regex, ' ')
    
    
    def tokenize_new_tile(record):
        return [w.text for w in sp(record['new_title'])]
    
    df['new_title'] = df.title
    irrelevant_regex = re.compile(r'[^a-z0-9.\s]')
    multispace_regex = re.compile(r'\s\s+') # Why it doesn't work
    df['new_title'] = df.new_title.str.lower().str.replace(irrelevant_regex, '').str.replace(multispace_regex, ' ')
    df['new_title_tokens'] =  df.apply(tokenize_new_tile, axis=1) 

    # Brand assignment
    all_brands = set()
    
    all_brands.update(extra_brands)

    def assign_brand(record):
        # Search in brand first
        if record['brand'] in all_brands:
            return record['brand']
        # then in the title
        for el in all_brands:
            if el in record['title']:
                return el
        return "NNN"

    df['brand'] = df.apply(assign_brand, axis=1)

    # cpu brand
    def assign_cpu_brand(record):
        # Search in brand first
        if 'intel' in str(record['cpu_brand']) or 'intel' in str(record['title']) or \
                'intel' in str(record['cpu_model']) or 'intel' in str(record['cpu_type']):
            return 'intel'
        return 'amd'

    df['cpu_brand'] = df.apply(assign_cpu_brand, axis=1)

    def assign_screen_size(record):
        brand_tokens = record['new_title_tokens']
        arr = []
        for t in brand_tokens:
            s = t.replace('inch', '')
            s = s.replace('in', '')
            arr.append(s)
        
        for sc in screen_sizes:
            if str(sc) in arr:
                return str(sc)
        
        else:
            return str(15.6) # Some relaxation
    df['screen_size'] = df.apply(assign_screen_size, axis=1)
    
    # ram capacity
    def assign_ram_capacity(record):
        s = str(record['ram_capacity']).replace(' ', '')
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb', '256gb', '512gb', '2', '4',
                         '6', '8', '10', '12', '16', '32', '64', '128']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        s = str(record['title']).replace(' ', '')  # This will be wrong, please change
        possible_vals = ['2gb', '4gb', '6gb', '8gb', '10gb', '12gb', '16gb',
                         '32gb', '64gb', '128gb']
        for val in possible_vals:
            if val in s:
                return int(val.replace('gb', ''))

        return 0
    
    def assign_hdd_capacity(record):
        s = str(record['hdd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        
        if 'ssd' in s:
            return 0

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbhdd", s2):
            return int(re.findall("\d{3,4}gbhdd", s2)[0][:-5])
        if re.search("hdd\d{3,4}gb", s2):
            return int(re.findall("hdd\d{3,4}gb", s2)[0][3:-2])
        if re.search("hdd\d{1}tb", s2):
            return int(re.findall("hdd\d{1}tb", s2)[0][3:4] + '000')
        if re.search("\d{1}tbhdd", s2):
            return int(re.findall("\d{1}tbhdd", s2)[0][0] + '000')
        return 0
    df['hdd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
                                
    def assign_hdd_capacity(record):
        s = str(record['ssd_capacity']).replace(' ', '')
        s2 = str(record['title'].replace(' ', ''))
        

        if re.search("\d{3,4}gb", s):
            return int(re.findall("\d{3,4}gb", s)[0][:-2])
        if re.search("\dtb", s):
            return int(re.findall("\dtb", s)[0][:-2] + '000')
        if re.search("\d{3,4}gbssd", s2):
            return int(re.findall("\d{3,4}gbssd", s2)[0][:-5])
        if re.search("ssd\d{3,4}gb", s2):
            return int(re.findall("ssd\d{3,4}gb", s2)[0][3:-2])
        if re.search("ssd\d{1}tb", s2):
            return int(re.findall("ssd\d{1}tb", s2)[0][3:4] + '000')
        if re.search("\d{1}tbssd", s2):
            return int(re.findall("\d{1}tbssd", s2)[0][0] + '000')
        return 0
            
    df['ssd_capacity'] = df.apply(assign_hdd_capacity, axis=1)
    
    def assign_laptop_model(record):
        brand_tokens = record['new_title_tokens']
        try:
            brand_index = brand_tokens.index(str(record['brand']))
            finish_index = brand_index + 2
            should_break = False
            for i in range(2 + brand_index, 5 + brand_index, 1):
                for sc in screen_sizes:
                    if (sc in brand_tokens[i]):
                        should_break = True
                        break
                if should_break:
                    if finish_index == i:
                        finish_index -=1
                    break
                if not (brand_tokens[i].isalpha()):
                    finish_index = i 
                else:
                    break
        except:
            brand_index = -1

        if brand_index == -1:
            return None

        return ' '.join(brand_tokens[brand_index+1:finish_index+1])        

    df['model'] = df.apply(assign_laptop_model, axis=1)
    df['ram_capacity'] = df.apply(assign_ram_capacity, axis=1)
    
    df = fill_nulls_with_none(df)
    df = convert_numbers_to_strings(df, ['screen_size'])
    # Unit stand. in weight
    
    def assign_cpu_type(record):
    # Find the cpu type
        cpu_list = ["i5", "i3", "i7", "atom", 
                    "pentium", "celeron", "a-series",
                    "e-series", "aseries", "eseries",
                    "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9"]

        for cpu in cpu_list:
            if record['cpu_type'] is not None and cpu in record['cpu_type']:
                return cpu
            if cpu in record['title']:
                return cpu
            if record['cpu_model'] is not None and cpu in record['cpu_model']:
                return cpu
            if record['cpu_frequency'] is not None and  cpu in record['cpu_frequency']:
                return cpu

            if re.search("e-[0-9]{3}", record['title']):
                return re.findall("e-[0-9]{3}", record['title'])[0]

            if record['cpu_model'] is not None and re.search("e-[0-9]{3}", record['cpu_model']):
                return re.findall("e-[0-9]{3}", record['cpu_model'])[0]

    df['cpu_type'] = df.apply(assign_cpu_type, axis=1)
    
    return df

In [26]:
# read the data
left_x2 = preprocess_laptop_dataset(pd.read_csv('../data/sigmod/X2.csv'))
right_x2 = preprocess_laptop_dataset(pd.read_csv('../data/sigmod/X2.csv'))
y2 = pd.read_csv('../data/sigmod/Y2.csv')

In [27]:
left_new_columns = []
right_new_columns = []
for col in list(left_x2.columns):
    right_new_columns.append('right_' + col)
    left_new_columns.append('left_' + col)
    
left_x2.columns = left_new_columns
right_x2.columns = right_new_columns

left_x2.head()

Unnamed: 0,left_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,left_ram_frequency,left_hdd_capacity,left_ssd_capacity,left_weight,left_dimensions,left_title,left_new_title,left_new_title_tokens,left_screen_size,left_model
0,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...,lenovo thinkpad x230 34352jf tablet pc 12.5 inplane switching ips technology wireless lan intel ...,"[lenovo, thinkpad, x230, 34352jf, tablet, pc, 12.5, inplane, switching, ips, technology, wireles...",12.5,thinkpad x230 34352jf
1,www.isupplyhub.com//1256,acer,intel,-1,i5,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,-1,500,0,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers access...,amazon.com acer aspire v7582pg6479 15.6inch touchscreen ultrabook cool steel computers accessories,"[amazon.com, acer, aspire, v7582pg6479, 15.6inch, touchscreen, ultrabook, cool, steel, computers...",15.6,aspire v7582pg6479
2,www.isupplyhub.com//326,acer,intel,-1,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,-1,500,0,5.2 pounds,15.02 x 10.08 x 1 inches,"amazon.com acer aspire e1-572-6870 15.6 inch laptop intel i5 4200u 1.6ghz processor, 4gb ram, 50...",amazon.com acer aspire e15726870 15.6 inch laptop intel i5 4200u 1.6ghz processor 4gb ram 500gb ...,"[amazon.com, acer, aspire, e15726870, 15.6, inch, laptop, intel, i5, 4200u, 1.6ghz, processor, 4...",15.6,aspire
3,www.isupplyhub.com//821,hp,amd,-1,,-1,4,ddr3 sdram. 4 gb sdram ddr3,-1,500,0,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 cert...,amazon.com 15.6 hp 15f009wm amd dualcore e12100 4gb ddr3 ram 500gb hd webcam windows 8.1 certifi...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore, e12100, 4, gb, ddr3, ram, 500, gb, hd, webcam, w...",15.6,15f009wm amd
4,www.isupplyhub.com//157,asus,intel,-1,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,-1,0,256,2.9 pounds,8.80 x 0.70 x 12.80 inches,"amazon.com asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor, 4gb ddr3, ...",amazon.com asus ux31axb52 13.3inch ultrabook 1.7 ghz intel core i53317u processor 4gb ddr3 256gb...,"[amazon.com, asus, ux31axb52, 13.3inch, ultrabook, 1.7, ghz, intel, core, i53317u, processor, 4,...",13.3,ux31axb52


In [28]:
right_x2.head()

Unnamed: 0,right_instance_id,right_brand,right_cpu_brand,right_cpu_model,right_cpu_type,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title,right_new_title,right_new_title_tokens,right_screen_size,right_model
0,www.softwarecity.ca//737,lenovo,intel,i5-3320m,i5,2.60 ghz,4,ddr3 sdram. ddr3-1600 pc3-12800. ddr3 sdram,ddr3-1600 pc3-12800,320,0,1.80 kg,-1,lenovo thinkpad x230 34352jf tablet pc - 12.5 - in-plane switching ips technology - wireless lan...,lenovo thinkpad x230 34352jf tablet pc 12.5 inplane switching ips technology wireless lan intel ...,"[lenovo, thinkpad, x230, 34352jf, tablet, pc, 12.5, inplane, switching, ips, technology, wireles...",12.5,thinkpad x230 34352jf
1,www.isupplyhub.com//1256,acer,intel,-1,i5,1.6 ghz intel core i5-4200u,8,ddr3 sdram. 8 gb ddr3l sdram,-1,500,0,4.8 pounds,15.02 x 10.08 x 0.90 inches,amazon.com acer aspire v7-582pg-6479 15.6-inch touchscreen ultrabook cool steel computers access...,amazon.com acer aspire v7582pg6479 15.6inch touchscreen ultrabook cool steel computers accessories,"[amazon.com, acer, aspire, v7582pg6479, 15.6inch, touchscreen, ultrabook, cool, steel, computers...",15.6,aspire v7582pg6479
2,www.isupplyhub.com//326,acer,intel,-1,i5,1.6 ghz intel core i5,4,ddr3 sdram. 4 gb ddr3-sdram,-1,500,0,5.2 pounds,15.02 x 10.08 x 1 inches,"amazon.com acer aspire e1-572-6870 15.6 inch laptop intel i5 4200u 1.6ghz processor, 4gb ram, 50...",amazon.com acer aspire e15726870 15.6 inch laptop intel i5 4200u 1.6ghz processor 4gb ram 500gb ...,"[amazon.com, acer, aspire, e15726870, 15.6, inch, laptop, intel, i5, 4200u, 1.6ghz, processor, 4...",15.6,aspire
3,www.isupplyhub.com//821,hp,amd,-1,,-1,4,ddr3 sdram. 4 gb sdram ddr3,-1,500,0,4.8 pounds,15.18 x 0.89 x 10.16 inches,amazon.com 15.6 hp 15-f009wm amd dual-core e1-2100 4gb ddr3 ram 500gb hd webcam windows 8.1 cert...,amazon.com 15.6 hp 15f009wm amd dualcore e12100 4gb ddr3 ram 500gb hd webcam windows 8.1 certifi...,"[amazon.com, 15.6, hp, 15f009wm, amd, dualcore, e12100, 4, gb, ddr3, ram, 500, gb, hd, webcam, w...",15.6,15f009wm amd
4,www.isupplyhub.com//157,asus,intel,-1,i5,1.7 ghz core i5-3317u,4,ddr3 sdram. 4 gb ddr3,-1,0,256,2.9 pounds,8.80 x 0.70 x 12.80 inches,"amazon.com asus ux31a-xb52 13.3-inch ultrabook 1.7 ghz intel core i5-3317u processor, 4gb ddr3, ...",amazon.com asus ux31axb52 13.3inch ultrabook 1.7 ghz intel core i53317u processor 4gb ddr3 256gb...,"[amazon.com, asus, ux31axb52, 13.3inch, ultrabook, 1.7, ghz, intel, core, i53317u, processor, 4,...",13.3,ux31axb52


In [29]:
len(y2)

58653

In [30]:
y2.head()

Unnamed: 0,left_instance_id,right_instance_id,label
0,www.flexshopper.com//1098,www.amazon.com//1389,1
1,www.amazon.com//291,www.amazon.com//1081,1
2,buy.net//634,www.amazon.com//1014,1
3,www.amazon.com//2395,buy.net//393,1
4,www.flexshopper.com//2173,buy.net//634,1


In [31]:
x2_train = pd.merge(y2, left_x2, on='left_instance_id',how='left')

In [32]:
x2_train = pd.merge(x2_train, right_x2, on='right_instance_id',how='left')

In [33]:
x2_train.head()

Unnamed: 0,left_instance_id,right_instance_id,label,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title,right_new_title,right_new_title_tokens,right_screen_size,right_model
0,www.flexshopper.com//1098,www.amazon.com//1389,1,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,-1,500,0,-1,-1,amazon.com brand new acer america corp. acer aspire e1-572-34014g50mnkk 15.6 led notebook - inte...,amazon.com brand new acer america corp. acer aspire e157234014g50mnkk 15.6 led notebook intel co...,"[amazon.com, brand, new, acer, america, corp, ., acer, aspire, e157234014g50mnkk, 15.6, led, not...",15.6,america corp
1,www.amazon.com//291,www.amazon.com//1081,1,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,-1,3500,0,13 pounds,15 x 10.10 x 0.50 inches,amazon.com acer aspire e1-572-34014g50mnrr 15.6 led notebook intel core i3-4010u 1.70 ghz 4gb dd...,amazon.com acer aspire e157234014g50mnrr 15.6 led notebook intel core i34010u 1.70 ghz 4gb ddr3 ...,"[amazon.com, acer, aspire, e157234014g50mnrr, 15.6, led, notebook, intel, core, i34010u, 1.70, g...",15.6,aspire
2,buy.net//634,www.amazon.com//1014,1,acer,intel,-1,i3,1.70 ghz,4,ddr3l sdram,...,-1,500,0,5.2 pounds,15 x 10.10 x 1 inches,amazon.com acer aspire nx.mhfaa.002 e1-572-6484 15.6-inch laptop computers accessories,amazon.com acer aspire nx.mhfaa.002 e15726484 15.6inch laptop computers accessories,"[amazon.com, acer, aspire, nx.mhfaa.002, e15726484, 15.6inch, laptop, computers, accessories]",15.6,aspire nx.mhfaa.002
3,www.amazon.com//2395,buy.net//393,1,acer,intel,intel core i3,i3,intel core i3,4,4 gb,...,-1,500,0,5.18 lbs,10.1 in. 15 in,acer aspire e1-572-34014g50mnrr 15.6 led notebook - intel core i3 i3-4010u 1.70 ghz - red 4 gb r...,acer aspire e157234014g50mnrr 15.6 led notebook intel core i3 i34010u 1.70 ghz red 4 gb ram 500 ...,"[acer, aspire, e157234014g50mnrr, 15.6, led, notebook, intel, core, i3, i34010u, 1.70, ghz, red,...",15.6,aspire
4,www.flexshopper.com//2173,buy.net//634,1,acer,intel,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,i3,intel core i3 4th gen 4010u 1.7 ghz 3 mb cache,4,4 gb ddr3l,...,-1,500,0,5.18 lbs,10.1 in. 15 in,acer aspire e1-572-34014g50mnkk 15.6 led notebook - intel core i3 i3-4010u 1.70 ghz - black 4 gb...,acer aspire e157234014g50mnkk 15.6 led notebook intel core i3 i34010u 1.70 ghz black 4 gb ram 50...,"[acer, aspire, e157234014g50mnkk, 15.6, led, notebook, intel, core, i3, i34010u, 1.70, ghz, blac...",15.6,aspire


In [34]:
len(right_x2), len(left_x2), len(x2_train)

(343, 343, 58653)

In [35]:
# Save the output merged dataset
x2_train['id'] = pd.Series(np.arange(len(x2_train)))
x2_train.to_csv('../data/sigmod/x2_train.csv', index=False)

In [12]:
# Prepare the test dataset
def df_crossjoin(df1, df2, **kwargs):
    """
    Make a cross join (cartesian product) between two dataframes by using a constant temporary key.
    Also sets a MultiIndex which is the cartesian product of the indices of the input dataframes.
    See: https://github.com/pydata/pandas/issues/5401
    :param df1 dataframe 1
    :param df1 dataframe 2
    :param kwargs keyword arguments that will be passed to pd.merge()
    :return cross join of df1 and df2
    """
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1

    res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    df1.drop('_tmpkey', axis=1, inplace=True)
    df2.drop('_tmpkey', axis=1, inplace=True)

    return res

def same_key_pair(record):
    return record['left_instance_id'] == record['right_instance_id']

In [13]:
x2_test = df_crossjoin(left_x2, right_x2)
mask = x2_test.apply(same_key_pair, axis=1)
x2_test = x2_test[~mask]

In [14]:
len(x2_test)

117306

In [26]:
mask.sum()

343

In [28]:
x2_test['id'] = pd.Series(np.arange(len(x2_test)))
x2_test.to_csv("../data/sigmod/x2_test.csv", index=False)

In [18]:
x2_train.head()

Unnamed: 0,left_instance_id,right_instance_id,label,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,...,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title,id
0,www.flexshopper.com//1098,www.amazon.com//1389,1,,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,4 GB DDR3L,4 GB DDR3L,...,Intel Core i3,4 GB,4 GB,,500 GB,,,,Amazon.com : Brand New Acer America Corp. Acer...,0
1,www.amazon.com//291,www.amazon.com//1081,1,,Intel Core i3,Intel Core i3,Intel Core i3,Intel Core i3,4 GB,4 GB,...,,,,,,,13 pounds,15 x 10.10 x 0.50 inches,Amazon.com : Acer Aspire E1-572-34014G50Mnrr 1...,1
2,buy.net//634,www.amazon.com//1014,1,,Intel Core i3,,Intel Core i3,1.70 GHz,4 GB,DDR3L SDRAM,...,1.7 GHz Core i3-4010U,4 GB DDR3L SDRAM,DDR3 SDRAM. 4 GB DDR3L SDRAM,,500 GB,,5.2 pounds,15 x 10.10 x 1 inches,Amazon.com : Acer Aspire NX.MHFAA.002;E1-572-6...,2
3,www.amazon.com//2395,buy.net//393,1,Acer,Intel Core i3,Intel Core i3,Intel Core i3,Intel Core i3,4 GB,4 GB,...,1.70 GHz,4 GB,DDR3L SDRAM,,500 GB,,5.18 lbs,10.1 in. 15 in,"Acer Aspire E1-572-34014G50Mnrr 15.6"" LED Note...",3
4,www.flexshopper.com//2173,buy.net//634,1,,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,Intel Core i3 ( 4th Gen ) 4010U / 1.7 GHz / 3 ...,4 GB DDR3L,4 GB DDR3L,...,1.70 GHz,4 GB,DDR3L SDRAM,,500 GB,,5.18 lbs,10.1 in. 15 in,"Acer Aspire E1-572-34014G50Mnkk 15.6"" LED Note...",4


In [14]:
x2_train.columns

NameError: name 'x2_train' is not defined

In [29]:
x2_test.columns

Index(['left_instance_id', 'left_brand', 'left_cpu_brand', 'left_cpu_model',
       'left_cpu_type', 'left_cpu_frequency', 'left_ram_capacity',
       'left_ram_type', 'left_ram_frequency', 'left_hdd_capacity',
       'left_ssd_capacity', 'left_weight', 'left_dimensions', 'left_title',
       'right_instance_id', 'right_brand', 'right_cpu_brand',
       'right_cpu_model', 'right_cpu_type', 'right_cpu_frequency',
       'right_ram_capacity', 'right_ram_type', 'right_ram_frequency',
       'right_hdd_capacity', 'right_ssd_capacity', 'right_weight',
       'right_dimensions', 'right_title', 'id'],
      dtype='object')

In [30]:
x2_test

Unnamed: 0,left_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,left_ram_frequency,left_hdd_capacity,...,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title,id
1,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...,1.0
2,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...,2.0
3,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"Amazon.com : 15.6"" HP 15-f009wm Amd Dual-Core ...",3.0
4,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...,4.0
5,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,Intel Core i7,3 GB,3 GB,,160 GB,,,,Amazon.com : Lenovo 3093B51 ThinkPad X201 Tabl...,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117643,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2320 - 12.5"" - Core i5 33...",
117644,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,180 GB SSD. 180 GB SSD. Lenovo ThinkPad X230 2...,180 GB SSD. 180 GB SSD,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2320 - 12.5"" - Core i5 33...",
117645,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2325 - 12.5"" - Core i5 33...",
117646,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,"Lenovo ThinkPad X230 Tablet 3438 - 12.5"" - Cor...",


In [31]:
x2_test

Unnamed: 0,left_instance_id,left_brand,left_cpu_brand,left_cpu_model,left_cpu_type,left_cpu_frequency,left_ram_capacity,left_ram_type,left_ram_frequency,left_hdd_capacity,...,right_cpu_frequency,right_ram_capacity,right_ram_type,right_ram_frequency,right_hdd_capacity,right_ssd_capacity,right_weight,right_dimensions,right_title,id
1,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.6 GHz Intel Core i5-4200U,8 GB DDR3L SDRAM,DDR3 SDRAM. 8 GB DDR3L SDRAM,,500 GB mechanical_hard_drive,,4.8 pounds,15.02 x 10.08 x 0.90 inches,Amazon.com : Acer Aspire V7-582PG-6479 15.6-In...,1.0
2,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.6 GHz Intel Core i5,4 GB DDR3-SDRAM,DDR3 SDRAM. 4 GB DDR3-SDRAM,,500 GB mechanical_hard_drive,,5.2 pounds,15.02 x 10.08 x 1 inches,Amazon.com : Acer Aspire E1-572-6870 15.6 Inch...,2.0
3,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,,4 GB SDRAM DDR3,DDR3 SDRAM. 4 GB SDRAM DDR3,,500 GB,,4.8 pounds,15.18 x 0.89 x 10.16 inches,"Amazon.com : 15.6"" HP 15-f009wm Amd Dual-Core ...",3.0
4,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,1.7 GHz Core i5-3317U,4 GB DDR3,DDR3 SDRAM. 4 GB DDR3,,256 MB,,2.9 pounds,8.80 x 0.70 x 12.80 inches,Amazon.com : ASUS UX31A-XB52 13.3-Inch Ultrabo...,4.0
5,www.softwarecity.ca//737,Lenovo,Intel. i5-3320M,i5-3320M,Dual-core ( 2 Core ). Core i5,2.60 GHz,,DDR3 SDRAM. DDR3-1600/PC3-12800. DDR3 SDRAM,DDR3-1600/PC3-12800,320 GB,...,Intel Core i7,3 GB,3 GB,,160 GB,,,,Amazon.com : Lenovo 3093B51 ThinkPad X201 Tabl...,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117643,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,Technology DDR3 SDRAM Form Factor SO DIMM 204-...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2320 - 12.5"" - Core i5 33...",
117644,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,4 GB DDR3 Slots Qty 2 Empty Slots 1 Max RAM Su...,180 GB SSD. 180 GB SSD. Lenovo ThinkPad X230 2...,180 GB SSD. 180 GB SSD,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2320 - 12.5"" - Core i5 33...",
117645,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,4 GB DDR3 Slots Qty 2 Max RAM Supported 16 GB ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,3.3 lbs 3.3 lbs,8.1 in. 12 in x 8.1 in x 1 in. 1 in. 12 in,"Lenovo ThinkPad X230 2325 - 12.5"" - Core i5 33...",
117646,www.vology.com//3017,Lenovo ThinkPad X230 2320 - 12.5 '' - Core i7 ...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Intel Core i7 ( 3rd Gen ) 3520M / 2.9 GHz. Int...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,Empty Slots 1 Slots Qty 2 Max RAM Supported 16...,256 GB SSD - Self Encrypting Drive. 256 GB SSD...,...,Intel Core i5 ( 3rd Gen ) 3320M / 2.6 GHz. Int...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,Form Factor SO DIMM 204-pin Technology DDR3 SD...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm. ...,500 GB HDD / 7200 rpm. 500 GB HDD / 7200 rpm,4 lbs 4 lbs,9 in. 12 in x 9 in x 1.2 in. 1.2 in. 12 in,"Lenovo ThinkPad X230 Tablet 3438 - 12.5"" - Cor...",


In [7]:
g = pd.read_csv("../data/sigmod/Y2.csv")
g.head()

Unnamed: 0,left_instance_id,right_instance_id,label
0,www.flexshopper.com//1098,www.amazon.com//1389,1
1,www.amazon.com//291,www.amazon.com//1081,1
2,buy.net//634,www.amazon.com//1014,1
3,www.amazon.com//2395,buy.net//393,1
4,www.flexshopper.com//2173,buy.net//634,1


In [8]:
g.columns = ['ltable_instance_id', 'rtable_instance_id', 'label']

In [9]:
g.head()

Unnamed: 0,ltable_instance_id,rtable_instance_id,label
0,www.flexshopper.com//1098,www.amazon.com//1389,1
1,www.amazon.com//291,www.amazon.com//1081,1
2,buy.net//634,www.amazon.com//1014,1
3,www.amazon.com//2395,buy.net//393,1
4,www.flexshopper.com//2173,buy.net//634,1


In [12]:
g['id'] = pd.Series(np.arange(len(g)))

In [13]:
g.to_csv("Y2_g.csv", index=False)