## DOM feature test
This document serves as a sanity check for the implementation of the DOM feature extractor.

In [1]:
%matplotlib inline

import sys, os

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# make the project's `features` module importable by adding ../src to the
# module search path; the path is built from the current working directory,
# so this assumes the kernel runs from the notebook's own folder — TODO confirm
sys.path.append(os.path.join(os.getcwd(), "../src"))
from features import extract_features_from_html, extract_features_from_df

# this styling is purely my preference
# less chartjunk
# Everything is configured in ONE sns.set call on purpose: sns.set re-applies
# the plotting context with its defaults (font_scale=1), so a set_context call
# issued before it would be silently discarded.
# The matplotlib rc key for line width is 'lines.linewidth' (plural 'lines').
sns.set(context='notebook', style='ticks', palette='Set2',
        font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
# read the crawled pages (one row per page: raw html + url) and preview them
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written into the CSV — confirm whether it is needed
data_path = '../data/ecommerce.csv'
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,html,url
0,0,"<!DOCTYPE html>\n<html lang=""ro"">\n<head>\n ...",https://marketplace-leads.emag.ro/?utm_source=...
1,1,"<!DOCTYPE html>\n<html xml:lang=""ro"" lang=""ro""...",https://www.emag.ro/
2,2,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",https://www.emag.ro/user/login?ref=hdr_login_btn
3,3,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",https://www.emag.ro/user/login?ref=hdr_signup_btn
4,4,"<!DOCTYPE html>\n<html xml:lang=""ro"" lang=""ro""...",https://www.emag.ro/cart/products?ref=cart


In [3]:
# run the extractor on the html of the first page only
# NOTE(review): the two trailing 2s presumably set how many descendant and
# ancestor levels are summarised (the result has descendant1/2 and
# ancestor1/2 column groups) — confirm the argument order in features.py
first_page_html = df['html'][0]
feats = extract_features_from_html(first_page_html, 2, 2)

# peek at a handful of rows from the middle of the table
feats.iloc[100:105]

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,...,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes
100,3,1,html_comment,0,False,0,True,[],0,0.0,...,False,[],1,0,html,0,False,2,False,[]
101,3,2,script,0,False,0,True,[],0,0.0,...,False,[],1,0,html,0,False,2,False,[]
102,3,3,html_comment,0,False,0,True,[],0,0.0,...,False,[],1,0,html,0,False,2,False,[]
103,3,4,div,1,False,0,False,[baloons],0,0.0,...,False,[],1,0,html,0,False,2,False,[]
104,3,5,div,2,False,1,False,"[container, leads-container]",1,2.0,...,False,[],1,0,html,0,False,2,False,[]


In [4]:
# ancestor features: keep every column whose name contains 'ancestor'
is_ancestor_col = ['ancestor' in str(col) for col in feats.columns]
ancestor_feats = feats.loc[:, is_ancestor_col]

# show the last rows
ancestor_feats.tail(5)

Unnamed: 0,ancestor1_depth,ancestor1_sibling_pos,ancestor1_tag,ancestor1_no_classes,ancestor1_has_id,ancestor1_no_children,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes
201,7,1,ul,0,False,3,False,[],6,1,div,1,False,2,False,[leads-step-text]
202,7,1,ul,0,False,3,False,[],6,1,div,1,False,2,False,[leads-step-text]
203,2,1,body,0,False,11,False,[],1,0,html,0,False,2,False,[]
204,2,1,body,0,False,11,False,[],1,0,html,0,False,2,False,[]
205,2,1,body,0,False,11,False,[],1,0,html,0,False,2,False,[]


In [5]:
# descendant features: keep every column whose name contains 'descendant'
is_descendant_col = ['descendant' in str(col) for col in feats.columns]
descendant_feats = feats.loc[:, is_descendant_col]

# show the first rows
descendant_feats.head(5)

Unnamed: 0,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,descendant1_has_text_avg,descendant1_classes,descendant1_tags,descendant2_no_nodes,descendant2_no_children_avg,descendant2_has_id_avg,descendant2_no_classes_avg,descendant2_has_text_avg,descendant2_classes,descendant2_tags
0,2,53.5,0.0,0.0,0.0,[],"[head, body]",107,0.028037,0.0,0.046729,0.616822,"[baloons, container, leads-container, leads-st...","[meta, script, title, meta, meta, html_comment..."
1,96,0.0,0.0,0.0,0.625,[],"[meta, script, title, meta, meta, html_comment...",0,0.0,0.0,0.0,0.0,[],[]
2,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
3,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
4,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]


In [None]:
# node-level features: whatever is left once the descendant and ancestor
# columns selected above are removed
non_node_feats = [*descendant_feats.columns, *ancestor_feats.columns]
node_feats = feats.drop(columns=non_node_feats)

# same slice of rows that was previewed for the full feature table
node_feats.iloc[100:105]

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes
100,3,1,html_comment,0,False,0,True,[]
101,3,2,script,0,False,0,True,[]
102,3,3,html_comment,0,False,0,True,[]
103,3,4,div,1,False,0,False,[baloons]
104,3,5,div,2,False,1,False,"[container, leads-container]"


In [None]:
# run the DataFrame-level extractor on a small slice of pages (rows 1..19),
# with the same two trailing arguments as the single-page call
pages_subset = df.iloc[1:20]
extract_features_from_df(pages_subset, 2, 2)

Unnamed: 0,depth,sibling_pos,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,...,ancestor2_depth,ancestor2_sibling_pos,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes,url,domain
0,1,0,html,1,False,2,False,[lang_ro],2,33.0,...,0,0,,0,False,0,False,[],https://www.emag.ro/,www.emag.ro
1,2,0,head,0,False,46,False,[],46,0.0,...,0,0,,0,False,0,False,[],https://www.emag.ro/,www.emag.ro
2,3,0,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
3,3,1,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
4,3,2,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
5,3,3,html_comment,0,False,0,True,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
6,3,4,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
7,3,5,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
8,3,6,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
9,3,7,meta,0,False,0,False,[],0,0.0,...,1,0,html,1,False,2,False,[lang_ro],https://www.emag.ro/,www.emag.ro
