## Data processing
This is the second iteration of [5-data-preparation](5-data-preparation.ipynb). We will be trying to use the `dask` library to extract the features. In order to do this we will be leveraging dask's parallelization ability by using the already defined functions for extracting fetures with `pandas`. We will have a two step tprocess:

1. Use dask's `apply` and `map_partitions` together with pandas feature extraction to extract features which are *trivially paralellizable*
2. Use dask's other methods to implement one-hot-encoding, frequency counting or normalization or any other kind of precedure that depends on the entire dataset

This method could be employed for future work.

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re

from urllib.parse import urlparse

# pandas
import pandas as pd
import dask.dataframe as dd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# skelearn
from sklearn.preprocessing import LabelBinarizer

# local imports
sys.path.append(os.path.join(os.getcwd(), "src"))
from features import extract_features_from_df, extract_features_from_html

# this styling is purely my preference
# less chartjunk
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})
sns.set(style='ticks', palette='Set2')

In [2]:
# read some data
df = pd.read_csv('../data/ecommerce-new/raw.csv', nrows=100)
df.head()

Unnamed: 0,html,url
0,"<!DOCTYPE html><html lang=""ro"" class=""""><head>...",https://www.emag.ro/resigilate/placi_video/c?r...
1,"<!DOCTYPE html><html xml:lang=""ro"" lang=""ro"" c...",https://www.emag.ro/resigilate/ventilatoare-pc...
2,"<!DOCTYPE html><html xmlns:og=""http://ogp.me/n...",https://www.olx.ro/auto-masini-moto-ambarcatiu...
3,"<!DOCTYPE html><html lang=""ro"" class=""""><head>...",https://www.emag.ro/resigilate
4,"<!DOCTYPE html><html xml:lang=""ro"" lang=""ro"" c...","https://www.emag.ro/label/pret,intre-200-si-50..."


In [3]:
# extract the features
feats = extract_features_from_df(df, 5, 5)
feats.head()

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,...,ancestor5_depth,ancestor5_sibling_pos,ancestor5_tag,ancestor5_no_classes,ancestor5_has_id,ancestor5_no_children,ancestor5_has_text,ancestor5_classes,path,url
0,1,0,html,0,False,2,False,[],2,83.0,...,0,0,,0,False,0,False,[],/html,https://www.emag.ro/resigilate/placi_video/c?r...
1,2,0,head,0,False,38,False,[],38,0.0,...,0,0,,0,False,0,False,[],/html/head,https://www.emag.ro/resigilate/placi_video/c?r...
2,3,0,meta,0,False,0,False,[],0,0.0,...,0,0,,0,False,0,False,[],/html/head/meta[1],https://www.emag.ro/resigilate/placi_video/c?r...
3,3,1,title,0,False,0,True,[],0,0.0,...,0,0,,0,False,0,False,[],/html/head/title,https://www.emag.ro/resigilate/placi_video/c?r...
4,3,2,meta,0,False,0,False,[],0,0.0,...,0,0,,0,False,0,False,[],/html/head/meta[2],https://www.emag.ro/resigilate/placi_video/c?r...


In [4]:
# check the columns for out candidates
feats.columns

Index(['depth', 'sibling_pos', 'tag', 'no_classes', 'has_id', 'no_children',
       'has_text', 'classes', 'descendant1_no_nodes',
       'descendant1_no_children_avg', 'descendant1_has_id_avg',
       'descendant1_no_classes_avg', 'descendant1_has_text_avg',
       'descendant1_classes', 'descendant1_tags', 'descendant2_no_nodes',
       'descendant2_no_children_avg', 'descendant2_has_id_avg',
       'descendant2_no_classes_avg', 'descendant2_has_text_avg',
       'descendant2_classes', 'descendant2_tags', 'descendant3_no_nodes',
       'descendant3_no_children_avg', 'descendant3_has_id_avg',
       'descendant3_no_classes_avg', 'descendant3_has_text_avg',
       'descendant3_classes', 'descendant3_tags', 'descendant4_no_nodes',
       'descendant4_no_children_avg', 'descendant4_has_id_avg',
       'descendant4_no_classes_avg', 'descendant4_has_text_avg',
       'descendant4_classes', 'descendant4_tags', 'descendant5_no_nodes',
       'descendant5_no_children_avg', 'descendant5_has_id

In [5]:
# make a dask dataframe out of it 
feats_ddf = dd.from_pandas(feats, chunksize=1000)
feats_ddf

Unnamed: 0_level_0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,descendant1_has_text_avg,descendant1_classes,descendant1_tags,descendant2_no_nodes,descendant2_no_children_avg,descendant2_has_id_avg,descendant2_no_classes_avg,descendant2_has_text_avg,descendant2_classes,descendant2_tags,descendant3_no_nodes,descendant3_no_children_avg,descendant3_has_id_avg,descendant3_no_classes_avg,descendant3_has_text_avg,descendant3_classes,descendant3_tags,descendant4_no_nodes,descendant4_no_children_avg,descendant4_has_id_avg,descendant4_no_classes_avg,descendant4_has_text_avg,descendant4_classes,descendant4_tags,descendant5_no_nodes,descendant5_no_children_avg,descendant5_has_id_avg,descendant5_no_classes_avg,descendant5_has_text_avg,descendant5_classes,descendant5_tags,ancestor1_depth,ancestor1_sibling_pos,ancestor1_tag,ancestor1_no_classes,ancestor1_has_id,ancestor1_no_children,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes,ancestor3_depth,ancestor3_sibling_pos,ancestor3_tag,ancestor3_no_classes,ancestor3_has_id,ancestor3_no_children,ancestor3_has_text,ancestor3_classes,ancestor4_depth,ancestor4_sibling_pos,ancestor4_tag,ancestor4_no_classes,ancestor4_has_id,ancestor4_no_children,ancestor4_has_text,ancestor4_classes,ancestor5_depth,ancestor5_sibling_pos,ancestor5_tag,ancestor5_no_classes,ancestor5_has_id,ancestor5_no_children,ancestor5_has_text,ancestor5_classes,path,url
npartitions=163,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1
0,int64,int64,object,int64,bool,int64,bool,object,int64,float64,float64,float64,float64,object,object,int64,float64,float64,float64,float64,object,object,int64,float64,float64,float64,float64,object,object,int64,float64,float64,float64,float64,object,object,int64,float64,float64,float64,float64,object,object,int64,int64,object,int64,bool,int64,bool,object,int64,int64,object,int64,bool,int64,bool,object,int64,int64,object,int64,bool,int64,bool,object,int64,int64,object,int64,bool,int64,bool,object,int64,int64,object,int64,bool,int64,bool,object,object,object
1000,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162000,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162499,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


### One-hot encoding
Explore the posibility of one-hot encoding features

In [6]:
# try to categorize the tag names to be able to one-hot-encode
tag_cats = feats_ddf.categorize(columns=['tag'])
one_hot = dd.get_dummies(data=tag_cats.loc[:, 'tag'], prefix='tag').compute()

In [7]:
# SUCCESS!
one_hot.head()

Unnamed: 0,tag_html,tag_head,tag_meta,tag_title,tag_link,tag_html_comment,tag_script,tag_body,tag_noscript,tag_div,...,tag_em,tag_textarea,tag_address,tag_h5,tag_mktz-div,tag_omni-container,tag_omni-ribb-cent,tag_omni-interact,tag_h6,tag_font
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Frequency vectors
Experiment with obtaining frequency vectors from the lists of decendant tags

In [8]:
# compute all the tags available 
avail_tags = feats_ddf['descendant2_tags'].to_bag().map(set).fold(lambda x, y: x | y).compute()

In [9]:
def count_values(lst, values):
    """Given an iterable of values and one of keys, return the count of 
    the keys in the list(with 0 as default)"""
    count_dict = {val: 0 for val in values} # for overwriting with values
    for elem in lst:
        count_dict[elem] += 1
    return count_dict
    
count_values([1, 1, 2, 1], [1, 2, 3])

{1: 3, 2: 1, 3: 0}

In [10]:
# get the counts, and rename them with a prefix
freqcol_names = {tag_name: int for tag_name in avail_tags}

# returninga series from func creates more columns(this is a LIFE SAVER!)
freq_dicts = feats_ddf['descendant2_tags'].apply(lambda x: pd.Series(count_values(x, avail_tags)), meta=freqcol_names)
freq_dicts.loc[0, :].compute()

Unnamed: 0,a,address,area,aside,b,br,button,caption,dd,div,...,style,sup,table,tbody,td,textarea,th,title,tr,ul
0,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,1,0,0


In [11]:
count_values(feats.loc[0, 'descendant2_tags'], avail_tags)

{'a': 0,
 'address': 0,
 'area': 0,
 'aside': 0,
 'b': 0,
 'br': 0,
 'button': 0,
 'caption': 0,
 'dd': 0,
 'div': 4,
 'dl': 0,
 'dt': 0,
 'em': 0,
 'fieldset': 0,
 'figcaption': 0,
 'figure': 0,
 'font': 0,
 'footer': 0,
 'form': 0,
 'h1': 0,
 'h2': 0,
 'h3': 0,
 'h4': 0,
 'h5': 0,
 'h6': 0,
 'header': 0,
 'html_comment': 4,
 'i': 0,
 'iframe': 0,
 'img': 1,
 'input': 107,
 'label': 0,
 'li': 0,
 'link': 12,
 'map': 0,
 'meta': 15,
 'mktz-div': 0,
 'nav': 0,
 'noindex': 0,
 'noscript': 1,
 'ol': 0,
 'omni-container': 0,
 'omni-interact': 0,
 'omni-ribb-cent': 0,
 'option': 0,
 'p': 0,
 's': 0,
 'script': 21,
 'section': 0,
 'select': 0,
 'small': 0,
 'span': 0,
 'strong': 0,
 'style': 0,
 'sup': 0,
 'table': 0,
 'tbody': 0,
 'td': 0,
 'textarea': 0,
 'th': 0,
 'title': 1,
 'tr': 0,
 'ul': 0}

### Implementation 
Now that we have our proof-of-concept implementations, we can do some more generic implementations to use with our data

In [12]:
def freq_vect_series(ser):
    """Given a series whose elements are python lists, return
    a dataframe where each record is the frequency vector for a certain
    element in the list. The columns will be prefixed with the series name
    
    Returns a dask datagrame."""
    # reduce all to a single set of tags
    avail_tags = ser.to_bag().fold(lambda a, b: a | set(b), set.union, initial=set()).compute()  
    # compute the frequencies of the given tags, pass the index as an argument to concat it to the dict
    # to preserv it 
    freqcol_names = {tag_name: int for tag_name in avail_tags}
    freqs = ser.apply(lambda x: pd.Series(count_values(x, avail_tags)), meta=freqcol_names)
    
    # rename the columns to be prefixed with the name of the series
    col_renames = {col_name: ser.name + '_' + col_name for col_name in avail_tags}
    return freqs.rename(columns=col_renames)
    

In [14]:
freq_vect = freq_vect_series(feats_ddf['descendant2_tags'])
freq_vect.head()

Unnamed: 0,descendant2_tags_a,descendant2_tags_address,descendant2_tags_area,descendant2_tags_aside,descendant2_tags_b,descendant2_tags_br,descendant2_tags_button,descendant2_tags_caption,descendant2_tags_dd,descendant2_tags_div,...,descendant2_tags_style,descendant2_tags_sup,descendant2_tags_table,descendant2_tags_tbody,descendant2_tags_td,descendant2_tags_textarea,descendant2_tags_th,descendant2_tags_title,descendant2_tags_tr,descendant2_tags_ul
0,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# full dataframe implementation
def freq_vect_dataframe(ddf):
    """Given a dataframe of columns with python lists compute
    the merged dataframe of the frequency vectors returned
    by freq_vect_series."""
    ddfs = [freq_vect_series(ddf.loc[:, col_name]) for col_name in ddf.columns.tolist()]
    # basically compute all the frequency dataframes and returned the one-by-one merge result
    result = ddfs[0]
    for ddf in ddfs[1:]:
        result.assign(**{col_name: col_name for col_name in ddf.columns.tolist()})
    return result

In [16]:
freq_ddf = freq_vect_dataframe(feats_ddf.loc[:, ['descendant1_tags', 'descendant2_tags']])
freq_ddf.head()

Unnamed: 0,descendant1_tags_a,descendant1_tags_address,descendant1_tags_area,descendant1_tags_aside,descendant1_tags_b,descendant1_tags_body,descendant1_tags_br,descendant1_tags_button,descendant1_tags_caption,descendant1_tags_dd,...,descendant1_tags_style,descendant1_tags_sup,descendant1_tags_table,descendant1_tags_tbody,descendant1_tags_td,descendant1_tags_textarea,descendant1_tags_th,descendant1_tags_title,descendant1_tags_tr,descendant1_tags_ul
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
def one_hot_dataframe(ddf):
    """Given a dask dataframe encode its columns using one-hot. Every new column will
    be prefixed with the original name.
    
    Returns a dask dataframe."""
    tag_cats = ddf.categorize()  # converted to categoricals
    one_hot = dd.get_dummies(data=tag_cats, prefix=tag_cats.columns.tolist())
    return one_hot

one_hot_dataframe(feats_ddf.loc[:, ['tag', 'ancestor1_tag']]).head()

Unnamed: 0,tag_html,tag_head,tag_meta,tag_title,tag_link,tag_html_comment,tag_script,tag_body,tag_noscript,tag_div,...,ancestor1_tag_b,ancestor1_tag_figure,ancestor1_tag_figcaption,ancestor1_tag_em,ancestor1_tag_address,ancestor1_tag_mktz-div,ancestor1_tag_omni-container,ancestor1_tag_omni-ribb-cent,ancestor1_tag_omni-interact,ancestor1_tag_font
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# WRAPPING IT ALL UP
def extract_features_from_ddf(ddf, depth, height):
    """Given a dask dataframe of the raw data, return the dask dataset containing all the
    extracted features and dropping the redundant ones."""
    feat_ddf = ddf.map_partitions(lambda df: extract_features_from_df(df, depth, height),
                                   meta=extract_features_from_df(pd.DataFrame(), depth, height)).clear_divisions()
    feat_ddf = feat_ddf.categorize(['url', 'path'])
    columns = feat_ddf.columns.tolist()  # used for filtering
    
    # one hot encoding
    one_hot_cols = list(filter(lambda col: re.match(r'.*tag$', col), columns))
    one_hot_ddf = one_hot_dataframe(feat_ddf.loc[:, one_hot_cols])

    # frequency vects
    freq_cols = list(filter(lambda col: re.match(r'descend.*tags$', col), columns))
    freq_ddf = freq_vect_dataframe(feat_ddf.loc[:, freq_cols])
    
    # drop redundant cols
    classes_cols =  list(filter(lambda col: re.match(r'^((descendant|ancestor)[0-9]+_)?classes$', col), columns))
    feat_ddf = feat_ddf.drop(one_hot_cols + freq_cols + classes_cols, axis='columns')
    return one_hot_ddf, freq_ddf, feat_ddf
    
oh, freqs, feats = extract_features_from_ddf(dd.from_pandas(df.iloc[:20], chunksize=10), 5, 5)

In [20]:
oh.compute()

Unnamed: 0,tag_html,tag_head,tag_meta,tag_title,tag_link,tag_html_comment,tag_script,tag_body,tag_noscript,tag_div,...,ancestor5_tag_table,ancestor5_tag_tbody,ancestor5_tag_tr,ancestor5_tag_td,ancestor5_tag_dl,ancestor5_tag_figure,ancestor5_tag_figcaption,ancestor5_tag_mktz-div,path,url
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html,https://www.emag.ro/resigilate/placi_video/c?r...
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head,https://www.emag.ro/resigilate/placi_video/c?r...
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[1],https://www.emag.ro/resigilate/placi_video/c?r...
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/title,https://www.emag.ro/resigilate/placi_video/c?r...
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[2],https://www.emag.ro/resigilate/placi_video/c?r...
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[3],https://www.emag.ro/resigilate/placi_video/c?r...
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[4],https://www.emag.ro/resigilate/placi_video/c?r...
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/link[1],https://www.emag.ro/resigilate/placi_video/c?r...
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[5],https://www.emag.ro/resigilate/placi_video/c?r...
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[6],https://www.emag.ro/resigilate/placi_video/c?r...


In [21]:
freqs.compute()

Unnamed: 0,descendant1_tags_a,descendant1_tags_address,descendant1_tags_area,descendant1_tags_aside,descendant1_tags_b,descendant1_tags_body,descendant1_tags_br,descendant1_tags_button,descendant1_tags_caption,descendant1_tags_dd,...,descendant1_tags_table,descendant1_tags_tbody,descendant1_tags_td,descendant1_tags_textarea,descendant1_tags_th,descendant1_tags_title,descendant1_tags_tr,descendant1_tags_ul,path,url
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,/html,https://www.emag.ro/resigilate/placi_video/c?r...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,/html/head,https://www.emag.ro/resigilate/placi_video/c?r...
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[1],https://www.emag.ro/resigilate/placi_video/c?r...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/title,https://www.emag.ro/resigilate/placi_video/c?r...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[2],https://www.emag.ro/resigilate/placi_video/c?r...
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[3],https://www.emag.ro/resigilate/placi_video/c?r...
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[4],https://www.emag.ro/resigilate/placi_video/c?r...
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/link[1],https://www.emag.ro/resigilate/placi_video/c?r...
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[5],https://www.emag.ro/resigilate/placi_video/c?r...
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,/html/head/meta[6],https://www.emag.ro/resigilate/placi_video/c?r...


In [22]:
feats.compute()

Unnamed: 0,depth,sibling_pos,no_classes,has_id,no_children,has_text,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,...,ancestor4_no_children,ancestor4_has_text,ancestor5_depth,ancestor5_sibling_pos,ancestor5_no_classes,ancestor5_has_id,ancestor5_no_children,ancestor5_has_text,path,url
0,1,0,0,False,2,False,2,83.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html,https://www.emag.ro/resigilate/placi_video/c?r...
1,2,0,0,False,38,False,38,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head,https://www.emag.ro/resigilate/placi_video/c?r...
2,3,0,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[1],https://www.emag.ro/resigilate/placi_video/c?r...
3,3,1,0,False,0,True,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/title,https://www.emag.ro/resigilate/placi_video/c?r...
4,3,2,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[2],https://www.emag.ro/resigilate/placi_video/c?r...
5,3,3,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[3],https://www.emag.ro/resigilate/placi_video/c?r...
6,3,4,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[4],https://www.emag.ro/resigilate/placi_video/c?r...
7,3,5,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/link[1],https://www.emag.ro/resigilate/placi_video/c?r...
8,3,6,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[5],https://www.emag.ro/resigilate/placi_video/c?r...
9,3,7,0,False,0,False,0,0.0,0.0,0.0,...,0,False,0,0,0,False,0,False,/html/head/meta[6],https://www.emag.ro/resigilate/placi_video/c?r...


In [38]:
feats.merge(oh, on=['url' ,'path']).merge(freqs, on=['url', 'path']).compute()

Unnamed: 0,depth,sibling_pos,no_classes,has_id,no_children,has_text,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,...,descendant1_tags_style,descendant1_tags_sup,descendant1_tags_table,descendant1_tags_tbody,descendant1_tags_td,descendant1_tags_textarea,descendant1_tags_th,descendant1_tags_title,descendant1_tags_tr,descendant1_tags_ul
0,1,0,1,False,2,False,2,32.5,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,False,43,False,43,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,3,0,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,False,0,True,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,3,5,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,3,6,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,3,7,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,3,9,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,3,11,0,False,0,True,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,3,15,0,False,0,False,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
