In [5]:
import sqlite3
import pandas as pd

In [12]:
# Load every annotation row that has not been processed yet (is_computed = 0).
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the annotation database before running.
db_file_name = "/Users/kalsi/Documents/html_tag_annotator/flask_app/published_date_data.sqlite"

conn = sqlite3.connect(db_file_name)
try:
    cur = conn.cursor()
    cur.execute("select url, html, selected_contents, selected_contents_text, is_computed, is_annotable from annotated_data where is_computed=0")
    data = cur.fetchall()
finally:
    # Release the database handle even when the query itself fails.
    conn.close()

url_meta_df = pd.DataFrame(data, columns=['url', 'html', 'selected_contents', 'selected_contents_text', 'is_computed', 'is_annotable'])

# Tag name of the annotated element: the run of characters right after the first
# '<', stopping at whitespace, '>' or '/'. This also handles tags without
# attributes (e.g. "<time>...") where a plain whitespace split would keep the
# '>' and the following text. Rows with no match get NaN instead of raising.
url_meta_df['tag_name'] = url_meta_df['selected_contents'].str.extract(r'<\s*([^\s<>/]+)', expand=False)
print(url_meta_df.shape)
url_meta_df.head()


(4, 7)


Unnamed: 0,url,html,selected_contents,selected_contents_text,is_computed,is_annotable,tag_name
0,https://eng.uber.com/charon/,"<html lang=""en-US"" id=""html"" class="" td-md-is-...","<time class=""entry-date updated td-module-date...","April 29, 2021",0,0,time
1,https://www.gojek.io/blog/tips-to-secure-ios-app,"<html lang=""en""><head><style type=""text/css"">....","<time datetime=""1618292424000"" class="" custom_...","APR 13, 2021",0,0,time
2,https://ai.googleblog.com/2021/04/flexible-sca...,"<html class=""v2 detail-page"" dir=""ltr"" itemsco...","<span class=""publishdate custom_selected"" item...","Thursday, April 29, 2021",0,0,span
3,https://developer.amazon.com/blogs,"<html lang=""en_US"" class=""a-ws a-js a-audio a-...","<p class=""dex-blog-post-summary-date custom_se...","April 27, 2021",0,0,p


In [14]:
import requests, re
from bs4 import BeautifulSoup
from bs4.element import NavigableString

In [15]:
def pre_cleaning(soup):
    """Strip non-content elements from ``soup`` in place.

    Every ``script``, ``footer``, ``nav`` and ``head`` element returned by
    calling ``soup`` with that list of names is decomposed. Nothing is returned.
    """
    unwanted_nodes = soup(["script", "footer", "nav", "head"])
    for node in unwanted_nodes:
        node.decompose()

# Feature Engineering

Generating features for each of the `HTML` pages

## Properties
1. Tag name
2. Parent tag
3. Child tag(s)
4. Siblings
5. Text length
6. Number count: how many digits and digit groups the text contains
7. Depth of the tag from the root
8. Number of children
9. Position of the node among its siblings
10. Length of the class attribute text
11. Number of classes (number of space-delimited words in `class`)

In [16]:
def clean_text(text):
    """Normalise text for comparison: lower-case, ASCII-only, no whitespace.

    Falsy input (``None`` or ``''``) yields ``''``. Otherwise the text is
    lower-cased, every non-ASCII character is dropped, and newlines, carriage
    returns, tabs and spaces are removed.
    """
    if not text:
        return ''
    ascii_only = text.lower().encode('ascii', errors='ignore').decode("utf-8")
    # Mapping a code point to None makes translate() delete that character.
    removal_table = {ord(ch): None for ch in '\n\r\t '}
    return ascii_only.translate(removal_table)

def get_tag_depth(tag):
    """Count element siblings that precede ``tag`` and each of its ancestors.

    Starting at ``tag`` and following ``.parent`` links, the named, non-string
    previous siblings of every visited node are added up. The walk stops at a
    node named ``body`` (which is not counted) or when no parent is left.

    Note: despite the name, this is not the nesting level of the tag; a chain
    of first children yields 0 however deep it is.
    """
    preceding_total = 0
    node = tag
    while node and node.name != 'body':
        preceding_total += sum(
            1
            for sibling in node.previous_siblings
            if type(sibling) != NavigableString and sibling.name is not None
        )
        node = node.parent
    return preceding_total

def get_siblings(tag):
    """Return the names of the element siblings before and after ``tag``.

    Returns a ``(prev_names, next_names)`` pair of lists built from
    ``tag.previous_siblings`` and ``tag.next_siblings`` respectively, in the
    order those iterators yield them. Plain strings and nodes without a name
    are skipped.
    """
    def _element_names(nodes):
        # Keep only named nodes that are not plain NavigableString instances.
        return [
            node.name
            for node in nodes
            if type(node) != NavigableString and node.name is not None
        ]

    before = _element_names(tag.previous_siblings)
    after = _element_names(tag.next_siblings)
    return before, after

def get_total_ancestors(tag):
    """Count the ``.parent`` steps from ``tag`` up to the ``html`` element.

    Args:
        tag: a node exposing ``.name`` and ``.parent``.

    Returns:
        int: 0 when ``tag`` itself is named ``html``; otherwise the number of
        parent links followed until a node named ``html`` is reached. If the
        tree has no ``html`` node (fragment or detached element), the walk
        stops at the topmost node instead of failing on a missing parent.
    """
    level = 0
    # The parent check keeps rootless trees from running off the top of the tree.
    while tag.name != 'html' and tag.parent is not None:
        tag = tag.parent
        level += 1
    return level

def get_core_tag_attr(tag):
    """Collect the basic structural features of ``tag``.

    Returns a dict with the tag's own name, its parent's name (``None`` when
    the parent is missing or falsy), the result of ``get_total_ancestors`` and
    the result of ``get_tag_depth``.
    """
    parent = tag.parent
    return {
        'tag_name': tag.name,
        'parent_tag_name': parent.name if parent else None,
        'ancestors_count': get_total_ancestors(tag),
        'tag_depth': get_tag_depth(tag),
    }

def get_childrens_attr(tag):
    """Report how many elements ``tag.find_all()`` returns for ``tag``.

    Args:
        tag: a node exposing ``find_all()``.

    Returns:
        dict: ``{'children_count': <int>}``.

    NOTE(review): ``find_all()`` is called with no arguments, so with
    BeautifulSoup's default search this presumably counts every nested
    descendant, not only direct children; confirm this is the intended feature.
    """
    return {'children_count': len(tag.find_all())}

def get_sibling_attr(tag):
    """Describe where ``tag`` sits among its element siblings.

    Args:
        tag: node passed straight to ``get_siblings``.

    Returns:
        dict with
        - ``left_siblings``: names of the element siblings before the tag,
        - ``right_siblings``: names of the element siblings after the tag,
        - ``tag_position_among_siblings``: 1-based position, i.e. the number of
          left siblings plus one.
    """
    left_siblings, right_siblings = get_siblings(tag)
    return {
        'left_siblings': left_siblings,
        'right_siblings': right_siblings,
        'tag_position_among_siblings': len(left_siblings) + 1,
    }

def get_tag_text_attr(tag):
    """Derive text-based features from ``tag.text``.

    Returns a dict holding the length of the stripped text, the number of
    digit runs, the number of individual digits, the raw text itself and the
    length of the text after ``clean_text`` normalisation.
    """
    raw_text = tag.text
    digit_runs = re.findall(r'\d+', raw_text)  # e.g. ['21', '2019']
    single_digits = re.findall(r'\d', raw_text)
    return {
        'text_length': len(raw_text.strip()),
        'total_numerical_groups': len(digit_runs),
        'total_numbericals': len(single_digits),
        'text': raw_text,
        'clean_text_length': len(clean_text(raw_text)),
    }

def get_class_related_attr(tag):
    """
    Summarise the ``class`` attribute of ``tag``.

    Args:
        tag: object whose ``get('class')`` yields the class value: a list of
            class names, a single whitespace-separated string, or ``None``.

    Returns:
        dict with ``total_classes`` (number of class names) and ``class_text``
        (the names joined by single spaces, or ``None`` when there are none).
    """
    classes = tag.get('class')
    if isinstance(classes, str):
        # A plain string is one attribute value, not a list of names. Split it
        # so it is not counted and joined character by character.
        classes = classes.split()
    if not classes:
        return {'total_classes': 0, 'class_text': None}
    return {'total_classes': len(classes), 'class_text': ' '.join(classes)}

def fetch_tag_features(tag):
    """Build the complete feature dict for one tag.

    Returns ``None`` for plain ``NavigableString`` nodes and for the ``html``
    and ``body`` elements. For every other tag the dicts produced by the
    individual extractors are merged into one; on a key clash the extractor
    listed later wins.
    """
    if type(tag) == NavigableString or tag.name in ('html', 'body'):
        return None
    extractors = (
        get_core_tag_attr,
        get_childrens_attr,
        get_sibling_attr,
        get_tag_text_attr,
        get_class_related_attr,
    )
    features = {}
    for extract in extractors:
        features.update(extract(tag))
    return features

In [17]:
def feature_transform_df(url_meta_df, timeout=30):
    """Download each annotated page and turn its tags into labelled feature rows.

    For every row of ``url_meta_df`` the URL is fetched, the page is parsed and
    pre-cleaned, and each tag whose normalised text equals the normalised
    ``selected_contents_text`` is treated as ground truth. One feature dict per
    tag (see ``fetch_tag_features``) is emitted with two extra keys:
    ``is_date`` (whether the tag is among the ground-truth tags) and ``url``.

    Args:
        url_meta_df: DataFrame with at least the columns ``url`` and
            ``selected_contents_text``.
        timeout: seconds passed to ``requests.get`` so that an unresponsive
            host cannot stall the whole run.

    Returns:
        tuple: ``(urls_with_no_annotation, df)``.
        ``urls_with_no_annotation`` lists URLs where no tag matched the
        annotated text or where processing failed. ``df`` is the concatenation
        of the per-page feature frames, most recently processed page first, or
        ``None`` when no page produced features.
    """
    page_frames = []
    urls_with_no_annotation = []
    for _, row in url_meta_df.iterrows():
        # Bind the URL before the try block so the handler can always report it.
        url = row['url']
        print(url)
        try:
            res = requests.get(url, timeout=timeout)
            # NOTE(review): no parser is named here, so the tree may depend on
            # which parser libraries are installed; confirm before pinning one.
            soup = BeautifulSoup(res.text)
            pre_cleaning(soup)
            all_tags = soup.find_all()
            annotated_text = clean_text(row['selected_contents_text'])
            ground_truth_tags = [
                tag for tag in all_tags if clean_text(tag.text) == annotated_text
            ]
            if not ground_truth_tags:
                urls_with_no_annotation.append(url)
                continue
            all_data = []
            for tag in all_tags:
                data = fetch_tag_features(tag)
                if data:
                    data['is_date'] = tag in ground_truth_tags
                    data['url'] = url
                    all_data.append(data)
            page_frames.append(pd.DataFrame(all_data))
        except Exception as exc:
            # Best effort: one bad page must not abort the run, but say why it
            # was skipped. KeyboardInterrupt / SystemExit still propagate.
            print('  skipped: {!r}'.format(exc))
            urls_with_no_annotation.append(url)
    # A single concat replaces growing the frame inside the loop. Reversing the
    # list keeps the row order of the previous prepend-style concatenation.
    df = pd.concat(page_frames[::-1]) if page_frames else None
    return urls_with_no_annotation, df

In [18]:
# Fetch every annotated URL and build the per-tag feature table; URLs that
# failed or had no matching tag are returned separately.
urls_with_no_annotation, df = feature_transform_df(url_meta_df)

https://eng.uber.com/charon/
https://www.gojek.io/blog/tips-to-secure-ios-app
https://ai.googleblog.com/2021/04/flexible-scalable-differentiable.html
https://developer.amazon.com/blogs


In [20]:
# (rows, columns) of the combined feature table.
df.shape

(2512, 17)

In [19]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,tag_name,parent_tag_name,ancestors_count,tag_depth,children_count,left_siblings,right_siblings,tag_position_among_siblings,text_length,total_numerical_groups,total_numbericals,text,clean_text_length,total_classes,class_text,is_date,url
0,div,body,2,0,536,[],[noscript],1,7675,57,133,\n\n\n\n\n\n\n\n\n\n\n\n\nDeveloper Console\n\...,4872,0,,False,https://developer.amazon.com/blogs
1,header,div,3,0,27,[],"[div, div, section]",1,1695,29,59,\n\n\n\n\n\n\n\n\n\n\n\nDeveloper Console\n\nS...,1019,2,dpHeader dphThemedark,False,https://developer.amazon.com/blogs
2,div,header,4,0,21,[],[div],1,1648,29,59,\n\n\n\n\n\n\n\n\n\n\nDeveloper Console\n\nSig...,997,2,dphTopBar clearfix,False,https://developer.amazon.com/blogs
3,div,div,5,0,5,[],[div],1,0,0,0,\n\n\n\n\n\n\n\n,0,1,dphLeft,False,https://developer.amazon.com/blogs
4,div,div,6,0,2,[],[a],1,0,0,0,\n\n\n,0,1,dphMenuToggle,False,https://developer.amazon.com/blogs


In [21]:
# Keep only the first row for each distinct combination of the feature columns
# listed below.
# NOTE(review): 'is_date' and 'url' are not part of the key, so a positive row
# that follows an otherwise identical negative row is dropped; confirm this is
# intended.
df = df.drop_duplicates(subset=[
    'children_count', 'class_text',
    'parent_tag_name', 'tag_depth', 'tag_name',
    'tag_position_among_siblings', 'text_length', 'total_classes',
    'total_numbericals', 'total_numerical_groups',
])
df.shape

(1899, 17)

In [22]:
# Class balance of the label column.
df['is_date'].value_counts()

False    1893
True        6
Name: is_date, dtype: int64

In [12]:
# Show the first two rows of the feature table.
df.head(2)

Unnamed: 0,tag_name,parent_tag_name,ancestors_count,tag_depth,children_count,left_siblings,right_siblings,tag_position_among_siblings,text_length,total_numerical_groups,total_numbericals,text,clean_text_length,total_classes,class_text,is_date,url
0,div,body,2,0,536,[],[noscript],1,7675,57,133,\n\n\n\n\n\n\n\n\n\n\n\n\nDeveloper Console\n\...,4872,0,,False,https://developer.amazon.com/blogs
1,header,div,3,0,27,[],"[div, div, section]",1,1695,29,59,\n\n\n\n\n\n\n\n\n\n\n\nDeveloper Console\n\nS...,1019,2,dpHeader dphThemedark,False,https://developer.amazon.com/blogs


In [52]:
# Write the feature table to CSV without the index column.
df.to_csv('published_date_date_v1.csv', index=False)