In [10]:
import requests, re
from bs4 import BeautifulSoup
from bs4.element import NavigableString
import pandas as pd

In [11]:
class HtmlEncoder():
    """Download a web page and describe each of its HTML tags as a row of features.

    The page is fetched and parsed as soon as the object is created.
    ``transform()`` then builds one feature dict per tag and stores the result
    as a pandas DataFrame, which ``get_html_df()`` returns.
    """

    def __init__(self, url, timeout=30):
        """
        Args:
            url: address of the page to download.
            timeout: seconds passed to ``requests.get`` as its ``timeout``;
                without one a stalled server would block forever.
                ``None`` disables the timeout.
        """
        self.url = url
        # Must be set before get_soup(), which performs the download.
        self.timeout = timeout
        self.soup = self.get_soup()
        self.__df = None

    def get_soup(self):
        """Fetch ``self.url``, parse it and return the pre-cleaned soup."""
        res = self.__read_html()
        # NOTE(review): no parser is named, so BeautifulSoup picks one from
        # whatever is installed; the parse tree (and therefore the features)
        # may differ between environments.
        soup = BeautifulSoup(res.text)
        return self.__pre_cleaning(soup)

    def __pre_cleaning(self, soup):
        """Remove every <script>, <footer>, <nav> and <head> element from ``soup``."""
        for unwanted in soup(["script", "footer", "nav", "head"]):
            unwanted.decompose()
        return soup

    def __read_html(self):
        """Return the ``requests`` response for ``self.url``."""
        return requests.get(self.url, timeout=self.timeout)

    def __clean_text(self, text):
        """Lower-case ``text``, drop non-ASCII characters and strip newlines, tabs and spaces.

        Returns an empty string for empty or ``None`` input.
        """
        if not text:
            return ''
        text = text.lower()
        text = text.encode('ascii', errors='ignore').decode("utf-8")
        for unwanted in ('\n', '\r', '\t', ' '):
            text = text.replace(unwanted, '')
        return text

    @staticmethod
    def __is_element(node):
        """True for nodes that are not plain NavigableStrings and carry a tag name."""
        return type(node) != NavigableString and node.name is not None

    def __get_tag_depth(self, tag):
        """Sum the preceding element siblings of ``tag`` and of each of its ancestors.

        The walk stops at the tag named 'body' (not counted) or when the parent
        chain ends. Despite the name, this is not a nesting depth.
        """
        count = 0
        while tag and tag.name != 'body':
            count += sum(1 for t in tag.previous_siblings if self.__is_element(t))
            tag = tag.parent
        return count

    def __get_siblings(self, tag):
        """Return ``(previous, next)`` lists of the tag names of ``tag``'s element siblings."""
        prev_siblings = [t.name for t in tag.previous_siblings if self.__is_element(t)]
        next_siblings = [t.name for t in tag.next_siblings if self.__is_element(t)]
        return prev_siblings, next_siblings

    def __get_total_ancestors(self, tag):
        """Count the parent links followed from ``tag`` until a tag named 'html' or the end of the chain."""
        level = 0
        while tag and tag.name != 'html':
            tag = tag.parent
            level += 1
        return level

    def __get_core_tag_attr(self, tag):
        """Return the tag name, parent tag name, ancestor count and 'tag_depth' of ``tag``."""
        parent_tag_name = tag.parent.name if tag.parent else None
        return {'tag_name': tag.name,
                'parent_tag_name': parent_tag_name,
                'ancestors_count': self.__get_total_ancestors(tag),
                'tag_depth': self.__get_tag_depth(tag)}

    def __get_childrens_attr(self, tag):
        """Return how many tags ``tag.find_all()`` yields.

        ``find_all()`` without arguments searches recursively, so this counts
        all descendant tags rather than only direct children.
        """
        return {'children_count': len(tag.find_all())}

    def __get_sibling_attr(self, tag):
        """Return the left/right sibling tag names and the 1-based position of ``tag`` among them."""
        left_siblings, right_siblings = self.__get_siblings(tag)
        return {'left_siblings': left_siblings,
                'right_siblings': right_siblings,
                'tag_position_among_siblings': len(left_siblings) + 1}

    def __get_tag_text_attr(self, tag):
        """Return text-based features: raw text, its lengths and digit counts."""
        text = tag.text
        return {'text_length': len(text.strip()),
                # runs of consecutive digits, e.g. '21 Nov 2019' -> ['21', '2019']
                'total_numerical_groups': len(re.findall(r'\d+', text)),
                # individual digit characters
                'total_numbericals': len(re.findall(r'\d', text)),
                'text': text,
                'clean_text_length': len(self.__clean_text(text))}

    def __get_class_related_attr(self, tag):
        """Return the number of CSS classes on ``tag`` and the classes joined by spaces.

        ``class_text`` is ``None`` and ``total_classes`` is 0 when the tag has
        no class attribute.
        """
        classes = tag.get('class')
        if not classes:
            return {'total_classes': 0, 'class_text': None}
        return {'total_classes': len(classes), 'class_text': ' '.join(classes)}

    def get_featurization_functions(self):
        """Return the feature extractors in the order their keys are merged."""
        return [self.__get_core_tag_attr,
                self.__get_childrens_attr,
                self.__get_sibling_attr,
                self.__get_tag_text_attr,
                self.__get_class_related_attr]

    def featurize_tag(self, tag):
        """Return the merged feature dict for ``tag``.

        Returns ``None`` for plain NavigableStrings and for the <html> and
        <body> tags, which are not featurised.
        """
        if type(tag) == NavigableString or tag.name in ['html', 'body']:
            return None
        data = {}
        for func in self.get_featurization_functions():
            data.update(func(tag))
        return data

    def __set_html_df(self, df):
        self.__df = df

    def get_html_df(self):
        """Return the DataFrame built by ``transform()`` (``None`` before it has run)."""
        return self.__df

    def transform(self):
        """Featurise every tag in the soup and store the rows as a DataFrame."""
        all_data = []
        for tag in self.soup.find_all():
            data = self.featurize_tag(tag)
            if data:
                all_data.append(data)
        self.__set_html_df(pd.DataFrame(all_data))

### ML model

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

import numpy as np
import json

In [13]:
# Pre-trained artefacts used by ExtractPubDateFromHTML below.
# NOTE(review): presumably the fitted LabelBinarizer encoders and the
# RandomForestClassifier imported above -- confirm against the training notebook.
# joblib files are pickle-based, so only load files from a trusted source.
tag_name_encoder, parent_tag_name_encoder, random_forest_model = (
    load(artefact_path)
    for artefact_path in (
        'tag_name_encoder.joblib',
        'parent_tag_name_encoder.joblib',
        'rf_ml_model.joblib',
    )
)



In [14]:
class ExtractPubDateFromHTML():
    """Score the tags of a featurised page by how likely they hold the publication date.

    Relies on the module-level ``tag_name_encoder``, ``parent_tag_name_encoder``
    and ``random_forest_model`` objects being loaded before the methods run.
    """

    def __init__(self, df):
        """
        Args:
            df: DataFrame with one row per HTML tag, containing at least the
                columns listed in ``columns_to_consider``.
        """
        self.df = df
        # Columns kept for scoring; 'text' is carried along for the output only.
        self.columns_to_consider = ['tag_name', 'parent_tag_name', 'ancestors_count', 'children_count',
                                    'tag_depth','tag_position_among_siblings', 'text_length','clean_text_length',
                                    'total_classes', 'total_numbericals', 'total_numerical_groups', 'text']
        # Only tags (and parents) with these names are considered candidates.
        self.is_date_tag_names = ['b', 'p', 'div', 'a', 'date', 'time', 'span', 'strong', 'tr', 'td',
                                  'em', 'chron', 'h6', 'small', 'ul', 'li']
        self.is_date_tag_parent_names = ['p', 'div', 'date', 'span', 'header', 'body', 'strong', 'a', 'table', 'tr',
                                    'td', 'article', 'time', 'em', 'small', 'ul', 'figcaption', 'li']
        # Inclusive bounds on 'text_length' for a candidate tag.
        self.min_text_length = 8
        self.max_text_length = 140
        # Probability at or above which a tag is flagged as containing a date.
        self.threshold = 0.31

    def encode_ground_truth(self, x):
        """return 1 if input is True else 0"""
        return 1 if x else 0

    def decode_ground_truth(self, x):
        """return True if input is 1 else False"""
        return bool(x)

    def preprocess_and_cleaning(self):
        """Select candidate tags and build the model's feature matrix.

        ``self.df`` is left untouched, so this method (and
        ``fetch_published_date``) can be called repeatedly.

        Returns:
            (featurised_data, html_text_df): the feature matrix, i.e. encoded
            tag name, encoded parent tag name and the remaining numeric
            columns stacked side by side, plus a one-column DataFrame with
            the matching ``text`` values, indexed 0..n-1. When no tag
            qualifies, the matrix is an empty array and the frame has no rows.
        """
        frame = self.df
        if frame.empty:
            return np.empty((0, 0)), pd.DataFrame(columns=['text'])

        is_candidate = (
            frame['tag_name'].isin(self.is_date_tag_names)
            & frame['parent_tag_name'].isin(self.is_date_tag_parent_names)
            & frame['text'].notna()
            & (frame['text_length'] >= self.min_text_length)
            & (frame['text_length'] <= self.max_text_length)
            & (frame['total_numerical_groups'] > 0)
        )
        candidates = frame.loc[is_candidate, self.columns_to_consider].reset_index(drop=True)

        # Explicit copy: callers add columns to this frame.
        html_text_df = candidates[['text']].copy()
        if candidates.empty:
            return np.empty((0, 0)), html_text_df

        tag_name_encoded = tag_name_encoder.transform(candidates['tag_name'])
        parent_tag_name_encoded = parent_tag_name_encoder.transform(candidates['parent_tag_name'])
        numeric_features = candidates.drop(columns=['text', 'tag_name', 'parent_tag_name'])

        featurised_data = np.hstack([tag_name_encoded, parent_tag_name_encoded, numeric_features.values])
        return featurised_data, html_text_df

    def fetch_published_date(self):
        """Return the candidate texts ranked by predicted probability of being the date.

        Returns:
            DataFrame with columns ``text``, ``is_date_present_proba`` (second
            column of ``predict_proba``) and ``is_date_present`` (probability
            >= ``self.threshold``), sorted by probability, highest first.
            The frame has no rows when the page offers no candidate tag.
        """
        featurised_data, html_text_df = self.preprocess_and_cleaning()
        if html_text_df.empty:
            # Nothing to score; predict_proba would reject an empty matrix.
            return html_text_df.assign(is_date_present_proba=np.empty(0, dtype=float),
                                       is_date_present=np.empty(0, dtype=bool))

        probabilities = np.asarray(random_forest_model.predict_proba(featurised_data))[:, 1]
        scored = html_text_df.assign(is_date_present_proba=probabilities,
                                     is_date_present=probabilities >= self.threshold)
        return scored.sort_values(by=['is_date_present_proba'], ascending=False)

In [15]:
class ExtractPublishedDate():
    """Thin front-end that delegates date extraction to ExtractPubDateFromHTML."""

    def __init__(self, url, df):
        """
        Args:
            url: accepted for the caller's convenience; not used here.
            df: per-tag feature DataFrame handed to ExtractPubDateFromHTML.
        """
        self.ml_model = ExtractPubDateFromHTML(df=df)

    def extract_published_date(self):
        """Return whatever the wrapped model's ``fetch_published_date()`` returns."""
        ranked_candidates = self.ml_model.fetch_published_date()
        return ranked_candidates

In [20]:
# Page used for the demo run in the next cell.
url = "https://zerodha.tech/blog/a-lesson-in-niche-business-dsls-at-scale/"
print(url)

https://zerodha.tech/blog/a-lesson-in-niche-business-dsls-at-scale/


In [21]:
# Download and featurise the page, one row per HTML tag.
html_encoder = HtmlEncoder(url)
html_encoder.transform()
page_features = html_encoder.get_html_df()

# Rank the candidate tags; `tmp` is sorted by predicted probability, best first.
pub_date_extractor = ExtractPublishedDate(url, page_features)
tmp = pub_date_extractor.extract_published_date()

# Text of the top-ranked tag.
tmp['text'].iloc[0]

'24 Nov 2020'

In [22]:
# Every candidate tag with its predicted probability, highest first.
tmp

Unnamed: 0,text,is_date_present_proba,is_date_present
3,24 Nov 2020,0.758755,True
5,Rules in Veto v2 are now written in plain Go b...,0.028571,False
6,These are backed by two small deployments whic...,0.028571,False
1,Rohan Verma\n(Software Engineer)24 Nov 2020Hom...,0.017934,False
0,A lesson in creating and using niche business ...,0.010791,False
2,Rohan Verma\n(Software Engineer)24 Nov 2020Hom...,0.008147,False
4,govaluate:\n1319773 907 ns/op\ngo-plugins:\n27...,0.0,False


In [None]:
# Number of candidate tags at/above vs. below the decision threshold.
tmp['is_date_present'].value_counts()

# Extensions
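1. Use named-entity recognition (NER) to detect date expressions in the page text.
2. Render pages with a headless-browser tool such as Puppeteer or Selenium before parsing, so that JavaScript-generated content is included.
3. Use more training data.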