## Data merging
This is a continuation of the last notebook. We will be merging all the features and labels into one CSV file for easier processing.

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re

from urllib.parse import urlparse

# pandas, dask
import pandas as pd
import dask
import dask.dataframe as dd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# sklearn
from sklearn.preprocessing import LabelBinarizer

# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))
from features import extract_features_from_df, extract_features_from_html

# this styling is purely my preference
# less chartjunk
# sns.set() re-applies the default plotting context (font_scale=1), so it has
# to run first; calling it after set_context() would undo the context below.
sns.set(style='ticks', palette='Set2')
# the rc key is 'lines.linewidth'; seaborn silently ignores 'line.linewidth'
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [3]:
# Temporary directory for dask's larger-than-memory operations.
# A literal '~' is not expanded by Python's file APIs, so resolve it to the
# real home directory and make sure the folder exists before dask needs it.
partd_dir = os.path.expanduser('~/partd/')
os.makedirs(partd_dir, exist_ok=True)

# dask >= 0.18 sets options through dask.config.set; older releases only
# provide dask.set_options, so use whichever this installation offers.
set_dask_options = dask.config.set if hasattr(dask, 'config') else dask.set_options
set_dask_options(temporary_directory=partd_dir)

<dask.context.set_options at 0x7fbfeaf307b8>

In [4]:
# Lazily open the three interim CSV collections: base features, one-hot
# encoded columns and frequency columns (one dask frame per glob pattern).
feats, oh, freqs = map(dd.read_csv, [
    '../data/ecommerce-new/interim/feats-*.csv',
    '../data/ecommerce-new/interim/one-hot-*.csv',
    '../data/ecommerce-new/interim/freqs-*.csv',
])

In [6]:
# Join the three feature frames on their shared (url, path) key and write the
# combined result out as CSV.
merge_keys = ['url', 'path']
merged_feats = feats.merge(oh, on=merge_keys).merge(freqs, on=merge_keys)
merged_feats.to_csv('../data/ecommerce-new/features/feats-*.csv')

In [7]:
# Re-read the merged features from disk, attach the labels on (url, path)
# and write the combined dataset out as CSV.
feats = dd.read_csv('../data/ecommerce-new/features/feats-*.csv')
labels = dd.read_csv('../data/ecommerce-new/labels/labels-*.csv')
labelled = feats.merge(labels, on=['url', 'path'])
labelled.to_csv('../data/ecommerce-new/final/data-*.csv')

In [27]:
# Tidy the merged dataset: discard the stray 'Unnamed' columns, turn the two
# node-level flags into integers and put the columns into a fixed order.
feats = dd.read_csv('../data/ecommerce-new/final/data-*.csv')

# presumably index columns left behind by the earlier CSV round-trips
stray_index_cols = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0_x',
    'Unnamed: 0_y',
    'Unnamed: 0.1.1',
]
feats = feats.drop(stray_index_cols, axis='columns')

flag_cols = ['has_text', 'has_id']
feats[flag_cols] = feats[flag_cols].astype(int)

column_order = [
    # identifiers
    'url',
    'path',
    # node-level features
    'depth',
    'sibling_pos',
    'no_classes',
    'has_id',
    'no_children',
    'has_text',
    # descendant aggregates, levels 1-5
    'descendant1_no_nodes',
    'descendant1_no_children_avg',
    'descendant1_has_id_avg',
    'descendant1_no_classes_avg',
    'descendant1_has_text_avg',
    'descendant2_no_nodes',
    'descendant2_no_children_avg',
    'descendant2_has_id_avg',
    'descendant2_no_classes_avg',
    'descendant2_has_text_avg',
    'descendant3_no_nodes',
    'descendant3_no_children_avg',
    'descendant3_has_id_avg',
    'descendant3_no_classes_avg',
    'descendant3_has_text_avg',
    'descendant4_no_nodes',
    'descendant4_no_children_avg',
    'descendant4_has_id_avg',
    'descendant4_no_classes_avg',
    'descendant4_has_text_avg',
    'descendant5_no_nodes',
    'descendant5_no_children_avg',
    'descendant5_has_id_avg',
    'descendant5_no_classes_avg',
    'descendant5_has_text_avg',
    # ancestor features, levels 1-5
    'ancestor1_depth',
    'ancestor1_sibling_pos',
    'ancestor1_no_classes',
    'ancestor1_has_id',
    'ancestor1_no_children',
    'ancestor1_has_text',
    'ancestor2_depth',
    'ancestor2_sibling_pos',
    'ancestor2_no_classes',
    'ancestor2_has_id',
    'ancestor2_no_children',
    'ancestor2_has_text',
    'ancestor3_depth',
    'ancestor3_sibling_pos',
    'ancestor3_no_classes',
    'ancestor3_has_id',
    'ancestor3_no_children',
    'ancestor3_has_text',
    'ancestor4_depth',
    'ancestor4_sibling_pos',
    'ancestor4_no_classes',
    'ancestor4_has_id',
    'ancestor4_no_children',
    'ancestor4_has_text',
    'ancestor5_depth',
    'ancestor5_sibling_pos',
    'ancestor5_no_classes',
    'ancestor5_has_id',
    'ancestor5_no_children',
    'ancestor5_has_text',
    # tag_* columns for the node itself
    'tag_html',
    'tag_head',
    'tag_meta',
    'tag_title',
    'tag_link',
    'tag_html_comment',
    'tag_script',
    'tag_body',
    'tag_noscript',
    'tag_div',
    'tag_ul',
    'tag_li',
    'tag_a',
    'tag_i',
    'tag_span',
    'tag_nav',
    'tag_img',
    'tag_form',
    'tag_input',
    'tag_button',
    'tag_section',
    'tag_ol',
    'tag_iframe',
    'tag_small',
    'tag_h1',
    'tag_h2',
    'tag_p',
    'tag_s',
    'tag_sup',
    'tag_strong',
    'tag_footer',
    'tag_h4',
    'tag_label',
    'tag_header',
    'tag_figure',
    'tag_style',
    'tag_b',
    'tag_select',
    'tag_option',
    'tag_br',
    'tag_fieldset',
    'tag_noindex',
    'tag_table',
    'tag_tbody',
    'tag_tr',
    'tag_td',
    'tag_dl',
    'tag_dt',
    'tag_dd',
    'tag_caption',
    'tag_h3',
    'tag_aside',
    'tag_map',
    'tag_area',
    'tag_figcaption',
    'tag_th',
    'tag_em',
    'tag_textarea',
    'tag_address',
    'tag_h5',
    'tag_mktz-div',
    'tag_omni-container',
    'tag_omni-ribb-cent',
    'tag_omni-interact',
    'tag_h6',
    'tag_font',
    'tag_hr',
    'tag_u',
    'tag_object',
    'tag_param',
    'tag_legend',
    'tag_optgroup',
    'tag_svg',
    'tag_defs',
    'tag_g',
    'tag_path',
    'tag_text',
    'tag_tspan',
    'tag_article',
    'tag_animatetransform',
    'tag_del',
    'tag_cat-game-intro',
    'tag_thead',
    'tag_desc',
    'tag_pre',
    'tag_canvas',
    'tag_ins',
    'tag_circle',
    'tag_main',
    'tag_line',
    'tag_rect',
    'tag_like',
    'tag_time',
    'tag_video',
    'tag_center',
    'tag_querylogoperation',
    'tag_colgroup',
    'tag_col',
    'tag_string',
    # ancestor1_tag_* columns
    'ancestor1_tag_',
    'ancestor1_tag_html',
    'ancestor1_tag_head',
    'ancestor1_tag_body',
    'ancestor1_tag_div',
    'ancestor1_tag_ul',
    'ancestor1_tag_li',
    'ancestor1_tag_a',
    'ancestor1_tag_nav',
    'ancestor1_tag_form',
    'ancestor1_tag_button',
    'ancestor1_tag_span',
    'ancestor1_tag_section',
    'ancestor1_tag_ol',
    'ancestor1_tag_h1',
    'ancestor1_tag_h2',
    'ancestor1_tag_p',
    'ancestor1_tag_s',
    'ancestor1_tag_footer',
    'ancestor1_tag_h4',
    'ancestor1_tag_header',
    'ancestor1_tag_strong',
    'ancestor1_tag_select',
    'ancestor1_tag_fieldset',
    'ancestor1_tag_noindex',
    'ancestor1_tag_table',
    'ancestor1_tag_tbody',
    'ancestor1_tag_tr',
    'ancestor1_tag_td',
    'ancestor1_tag_dl',
    'ancestor1_tag_dt',
    'ancestor1_tag_dd',
    'ancestor1_tag_label',
    'ancestor1_tag_h3',
    'ancestor1_tag_small',
    'ancestor1_tag_aside',
    'ancestor1_tag_map',
    'ancestor1_tag_b',
    'ancestor1_tag_figure',
    'ancestor1_tag_figcaption',
    'ancestor1_tag_em',
    'ancestor1_tag_address',
    'ancestor1_tag_mktz-div',
    'ancestor1_tag_omni-container',
    'ancestor1_tag_omni-ribb-cent',
    'ancestor1_tag_omni-interact',
    'ancestor1_tag_font',
    'ancestor1_tag_th',
    'ancestor1_tag_object',
    'ancestor1_tag_optgroup',
    'ancestor1_tag_svg',
    'ancestor1_tag_defs',
    'ancestor1_tag_g',
    'ancestor1_tag_text',
    'ancestor1_tag_article',
    'ancestor1_tag_i',
    'ancestor1_tag_path',
    'ancestor1_tag_thead',
    'ancestor1_tag_del',
    'ancestor1_tag_ins',
    'ancestor1_tag_caption',
    'ancestor1_tag_h5',
    'ancestor1_tag_main',
    'ancestor1_tag_h6',
    'ancestor1_tag_center',
    'ancestor1_tag_querylogoperation',
    'ancestor1_tag_colgroup',
    'ancestor1_tag_legend',
    'ancestor1_tag_string',
    # ancestor2_tag_* columns
    'ancestor2_tag_',
    'ancestor2_tag_html',
    'ancestor2_tag_body',
    'ancestor2_tag_div',
    'ancestor2_tag_ul',
    'ancestor2_tag_li',
    'ancestor2_tag_nav',
    'ancestor2_tag_a',
    'ancestor2_tag_section',
    'ancestor2_tag_ol',
    'ancestor2_tag_p',
    'ancestor2_tag_footer',
    'ancestor2_tag_form',
    'ancestor2_tag_header',
    'ancestor2_tag_span',
    'ancestor2_tag_h2',
    'ancestor2_tag_fieldset',
    'ancestor2_tag_noindex',
    'ancestor2_tag_table',
    'ancestor2_tag_tbody',
    'ancestor2_tag_tr',
    'ancestor2_tag_td',
    'ancestor2_tag_dl',
    'ancestor2_tag_dt',
    'ancestor2_tag_dd',
    'ancestor2_tag_h3',
    'ancestor2_tag_button',
    'ancestor2_tag_aside',
    'ancestor2_tag_figure',
    'ancestor2_tag_figcaption',
    'ancestor2_tag_mktz-div',
    'ancestor2_tag_omni-container',
    'ancestor2_tag_omni-ribb-cent',
    'ancestor2_tag_omni-interact',
    'ancestor2_tag_h4',
    'ancestor2_tag_strong',
    'ancestor2_tag_select',
    'ancestor2_tag_svg',
    'ancestor2_tag_g',
    'ancestor2_tag_article',
    'ancestor2_tag_small',
    'ancestor2_tag_label',
    'ancestor2_tag_h1',
    'ancestor2_tag_thead',
    'ancestor2_tag_ins',
    'ancestor2_tag_i',
    'ancestor2_tag_address',
    'ancestor2_tag_main',
    'ancestor2_tag_center',
    'ancestor2_tag_querylogoperation',
    'ancestor2_tag_string',
    'ancestor2_tag_h5',
    # ancestor3_tag_* columns
    'ancestor3_tag_',
    'ancestor3_tag_html',
    'ancestor3_tag_body',
    'ancestor3_tag_div',
    'ancestor3_tag_ul',
    'ancestor3_tag_nav',
    'ancestor3_tag_li',
    'ancestor3_tag_section',
    'ancestor3_tag_footer',
    'ancestor3_tag_form',
    'ancestor3_tag_header',
    'ancestor3_tag_span',
    'ancestor3_tag_a',
    'ancestor3_tag_fieldset',
    'ancestor3_tag_noindex',
    'ancestor3_tag_table',
    'ancestor3_tag_tbody',
    'ancestor3_tag_tr',
    'ancestor3_tag_td',
    'ancestor3_tag_dl',
    'ancestor3_tag_dd',
    'ancestor3_tag_aside',
    'ancestor3_tag_figure',
    'ancestor3_tag_figcaption',
    'ancestor3_tag_h3',
    'ancestor3_tag_p',
    'ancestor3_tag_mktz-div',
    'ancestor3_tag_omni-container',
    'ancestor3_tag_omni-ribb-cent',
    'ancestor3_tag_ol',
    'ancestor3_tag_svg',
    'ancestor3_tag_article',
    'ancestor3_tag_thead',
    'ancestor3_tag_label',
    'ancestor3_tag_ins',
    'ancestor3_tag_address',
    'ancestor3_tag_main',
    'ancestor3_tag_h2',
    'ancestor3_tag_center',
    'ancestor3_tag_querylogoperation',
    'ancestor3_tag_button',
    'ancestor3_tag_string',
    # ancestor4_tag_* columns
    'ancestor4_tag_',
    'ancestor4_tag_html',
    'ancestor4_tag_body',
    'ancestor4_tag_div',
    'ancestor4_tag_nav',
    'ancestor4_tag_ul',
    'ancestor4_tag_section',
    'ancestor4_tag_footer',
    'ancestor4_tag_form',
    'ancestor4_tag_header',
    'ancestor4_tag_li',
    'ancestor4_tag_span',
    'ancestor4_tag_fieldset',
    'ancestor4_tag_noindex',
    'ancestor4_tag_table',
    'ancestor4_tag_tbody',
    'ancestor4_tag_tr',
    'ancestor4_tag_td',
    'ancestor4_tag_dl',
    'ancestor4_tag_dd',
    'ancestor4_tag_a',
    'ancestor4_tag_figure',
    'ancestor4_tag_h3',
    'ancestor4_tag_figcaption',
    'ancestor4_tag_mktz-div',
    'ancestor4_tag_omni-container',
    'ancestor4_tag_article',
    'ancestor4_tag_ol',
    'ancestor4_tag_p',
    'ancestor4_tag_main',
    'ancestor4_tag_aside',
    'ancestor4_tag_label',
    'ancestor4_tag_center',
    'ancestor4_tag_querylogoperation',
    'ancestor4_tag_button',
    # ancestor5_tag_* columns
    'ancestor5_tag_',
    'ancestor5_tag_html',
    'ancestor5_tag_body',
    'ancestor5_tag_div',
    'ancestor5_tag_nav',
    'ancestor5_tag_section',
    'ancestor5_tag_footer',
    'ancestor5_tag_header',
    'ancestor5_tag_ul',
    'ancestor5_tag_li',
    'ancestor5_tag_span',
    'ancestor5_tag_form',
    'ancestor5_tag_fieldset',
    'ancestor5_tag_noindex',
    'ancestor5_tag_table',
    'ancestor5_tag_tbody',
    'ancestor5_tag_tr',
    'ancestor5_tag_td',
    'ancestor5_tag_dl',
    'ancestor5_tag_figure',
    'ancestor5_tag_figcaption',
    'ancestor5_tag_mktz-div',
    'ancestor5_tag_article',
    'ancestor5_tag_a',
    'ancestor5_tag_dd',
    'ancestor5_tag_ol',
    'ancestor5_tag_main',
    'ancestor5_tag_aside',
    'ancestor5_tag_label',
    'ancestor5_tag_center',
    'ancestor5_tag_p',
    'ancestor5_tag_querylogoperation',
    'ancestor5_tag_button',
    # descendant1_tags_* columns
    'descendant1_tags_a',
    'descendant1_tags_address',
    'descendant1_tags_animatetransform',
    'descendant1_tags_area',
    'descendant1_tags_article',
    'descendant1_tags_aside',
    'descendant1_tags_b',
    'descendant1_tags_body',
    'descendant1_tags_br',
    'descendant1_tags_button',
    'descendant1_tags_canvas',
    'descendant1_tags_caption',
    'descendant1_tags_cat-game-intro',
    'descendant1_tags_center',
    'descendant1_tags_circle',
    'descendant1_tags_col',
    'descendant1_tags_colgroup',
    'descendant1_tags_dd',
    'descendant1_tags_defs',
    'descendant1_tags_del',
    'descendant1_tags_desc',
    'descendant1_tags_div',
    'descendant1_tags_dl',
    'descendant1_tags_dt',
    'descendant1_tags_em',
    'descendant1_tags_fieldset',
    'descendant1_tags_figcaption',
    'descendant1_tags_figure',
    'descendant1_tags_font',
    'descendant1_tags_footer',
    'descendant1_tags_form',
    'descendant1_tags_g',
    'descendant1_tags_h1',
    'descendant1_tags_h2',
    'descendant1_tags_h3',
    'descendant1_tags_h4',
    'descendant1_tags_h5',
    'descendant1_tags_h6',
    'descendant1_tags_head',
    'descendant1_tags_header',
    'descendant1_tags_hr',
    'descendant1_tags_html_comment',
    'descendant1_tags_i',
    'descendant1_tags_iframe',
    'descendant1_tags_img',
    'descendant1_tags_input',
    'descendant1_tags_ins',
    'descendant1_tags_label',
    'descendant1_tags_legend',
    'descendant1_tags_li',
    'descendant1_tags_like',
    'descendant1_tags_line',
    'descendant1_tags_link',
    'descendant1_tags_main',
    'descendant1_tags_map',
    'descendant1_tags_meta',
    'descendant1_tags_mktz-div',
    'descendant1_tags_nav',
    'descendant1_tags_noindex',
    'descendant1_tags_noscript',
    'descendant1_tags_object',
    'descendant1_tags_ol',
    'descendant1_tags_omni-container',
    'descendant1_tags_omni-interact',
    'descendant1_tags_omni-ribb-cent',
    'descendant1_tags_optgroup',
    'descendant1_tags_option',
    'descendant1_tags_p',
    'descendant1_tags_param',
    'descendant1_tags_path',
    'descendant1_tags_pre',
    'descendant1_tags_querylogoperation',
    'descendant1_tags_rect',
    'descendant1_tags_s',
    'descendant1_tags_script',
    'descendant1_tags_section',
    'descendant1_tags_select',
    'descendant1_tags_small',
    'descendant1_tags_span',
    'descendant1_tags_string',
    'descendant1_tags_strong',
    'descendant1_tags_style',
    'descendant1_tags_sup',
    'descendant1_tags_svg',
    'descendant1_tags_table',
    'descendant1_tags_tbody',
    'descendant1_tags_td',
    'descendant1_tags_text',
    'descendant1_tags_textarea',
    'descendant1_tags_th',
    'descendant1_tags_thead',
    'descendant1_tags_time',
    'descendant1_tags_title',
    'descendant1_tags_tr',
    'descendant1_tags_tspan',
    'descendant1_tags_u',
    'descendant1_tags_ul',
    'descendant1_tags_video',
    # label columns
    'detail_description_label',
    'detail_image_label',
    'detail_price_label',
    'detail_title_label',
    'list_image_label',
    'list_price_label',
    'list_title_label',
]
feats = feats[column_order]

In [None]:
# Write the cleaned, reordered frame to disk as CSV.
output_pattern = '../data/ecommerce-new/final/dom-full-*.csv'
feats.to_csv(output_pattern)