## DOM feature test
This notebook serves as a sanity check for the implementation of the DOM feature extractor (`extract_features_from_html`).

In [1]:
%matplotlib inline

import sys, os

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# add the library path
sys.path.append(os.path.join(os.getcwd(), "src"))
from features import extract_features_from_html

# this styling is purely my preference
# less chartjunk
# NOTE: sns.set() re-applies the default plotting context (font_scale=1), so it
# has to run *before* set_context(); in the opposite order the font_scale and
# rc overrides below are thrown away.
sns.set(style='ticks', palette='Set2')
# the matplotlib rc key is 'lines.linewidth' (plural); seaborn silently ignores
# unknown context keys such as 'line.linewidth'
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
# read the CSV into a dataframe and preview the first rows
# (the preview below shows an `html` and a `url` column)
df = pd.read_csv(filepath_or_buffer='../data/2017-07-21-13:41:49-ecommerce-ro.csv')
df.head(n=5)

Unnamed: 0,html,url
0,<!DOCTYPE html>\r\n<html>\r\n <head>\r\n <...,https://brx.ro/
1,<!DOCTYPE html>\n<html>\n <head>\n <link r...,https://brx.ro/adauga-anunt
2,<!DOCTYPE html>\n<html>\n <head>\n <link r...,https://brx.ro/anunturi/categorie/auto-moto-si...
3,<!DOCTYPE html>\n<html>\n <head>\n <link r...,https://brx.ro/anunturi/categorie/imobiliare
4,<!DOCTYPE html>\n<html>\n <head>\n <link r...,https://brx.ro/anunturi/categorie/electronice-...


In [3]:
# run the extractor on the html of the first row of the dataset
# NOTE(review): the two positional 2s presumably control how many descendant /
# ancestor levels are extracted (the result has *1_ and *2_ columns) -- confirm
# against the signature in `features`
feats = extract_features_from_html(df.loc[0, 'html'], 2, 2)
# show a small positional window (rows 100..104) of the result
feats.iloc[100:105]

Unnamed: 0,depth,tag,no_classes,has_id,no_children,has_text,classes,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,...,ancestor1_no_children,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes
100,8,a,0,False,2,False,[],2,0.5,0.0,...,1,False,[category-item],6,div,2,False,20,False,"[left, homecategories]"
101,9,div,1,False,1,False,[category-icon],1,0.0,0.0,...,2,False,[],7,li,1,False,1,False,[category-item]
102,10,img,1,False,0,False,[img-responsive],0,0.0,0.0,...,1,False,[category-icon],8,a,0,False,2,False,[]
103,9,span,1,False,0,True,[category-title],0,0.0,0.0,...,2,False,[],7,li,1,False,1,False,[category-item]
104,7,comment,0,False,0,True,[],0,0.0,0.0,...,20,False,"[left, homecategories]",5,div,3,False,5,False,"[container_12, clearfix, home]"


In [4]:
# ancestor features: keep every column whose label contains 'ancestor'
ancestor_feats = feats.loc[:, ['ancestor' in str(label) for label in feats.columns]]
ancestor_feats.tail(n=5)

Unnamed: 0,ancestor1_depth,ancestor1_tag,ancestor1_no_classes,ancestor1_has_id,ancestor1_no_children,ancestor1_has_text,ancestor1_classes,ancestor2_depth,ancestor2_tag,ancestor2_no_classes,ancestor2_has_id,ancestor2_no_children,ancestor2_has_text,ancestor2_classes
246,6,p,0,False,1,False,[],5,div,1,False,4,False,[form]
247,5,div,1,False,4,False,[form],4,div,2,False,5,False,"[envelope, popup]"
248,4,div,2,False,5,False,"[envelope, popup]",3,div,2,False,2,False,"[invisible, facebookLike]"
249,4,div,2,False,5,False,"[envelope, popup]",3,div,2,False,2,False,"[invisible, facebookLike]"
250,2,body,1,False,9,False,[homepage],1,html,0,False,2,False,[]


In [5]:
# descendant features: keep every column whose label contains 'descendant'
descendant_feats = feats.loc[:, ['descendant' in str(label) for label in feats.columns]]
descendant_feats.head(n=5)

Unnamed: 0,descendant1_no_nodes,descendant1_no_children_avg,descendant1_has_id_avg,descendant1_no_classes_avg,descendant1_has_text_avg,descendant1_classes,descendant1_tags,descendant2_no_nodes,descendant2_no_children_avg,descendant2_has_id_avg,descendant2_no_classes_avg,descendant2_has_text_avg,descendant2_classes,descendant2_tags
0,2,19.5,0.0,0.5,0.0,[homepage],"[head, body]",39,0.179487,0.153846,0.153846,0.230769,"[standardwidth, home, standardwidth, loader, i...","[link, meta, title, meta, comment, link, link,..."
1,30,0.0,0.2,0.0,0.166667,[],"[link, meta, title, meta, comment, link, link,...",0,0.0,0.0,0.0,0.0,[],[]
2,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
3,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]
4,0,0.0,0.0,0.0,0.0,[],[],0,0.0,0.0,0.0,0.0,[],[]


In [6]:
# node-level ("normal") features: whatever is left once the descendant and
# ancestor columns selected above are removed
non_node_feats = descendant_feats.columns.tolist() + ancestor_feats.columns.tolist()
node_feats = feats.drop(non_node_feats, axis=1)
# same positional window (rows 100..104) as the full feature table above
node_feats.iloc[100:105]

Unnamed: 0,depth,tag,no_classes,has_id,no_children,has_text,classes
100,8,a,0,False,2,False,[]
101,9,div,1,False,1,False,[category-icon]
102,10,img,1,False,0,False,[img-responsive]
103,9,span,1,False,0,True,[category-title]
104,7,comment,0,False,0,True,[]
