# Requirements

In [48]:
import os
import sys
import pandas as pd
from premailer import transform
import requests
from pymongo import MongoClient
from datetime import datetime
from urllib.parse import urlsplit
import re
from bs4 import BeautifulSoup
from bs4.element import Tag
from cssutils import parseStyle

# Load page URLs from the sample JSON

In [None]:
def wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath):
    """Convert a file with one JSON object per line into a single JSON array.

    The output consists of '[', the non-blank input lines separated by ',',
    and a closing ']'. Lines are written unchanged (including their line
    terminator), so for inputs without blank lines the result is the same
    as before.

    Args:
        inputFilePath: Path of the line-delimited JSON file to read.
        outputFilePath: Path of the JSON array file to write (overwritten).

    Raises:
        OSError: If either file cannot be opened.
    """
    # Context managers guarantee both handles are closed even if reading or
    # writing fails half-way through.
    with open(inputFilePath, 'r') as inputFile, \
            open(outputFilePath, 'w') as outputFile:
        outputFile.write('[')

        isFirstObject = True
        for tableLineJsonObject in inputFile:
            # Blank lines (e.g. a trailing empty line) carry no object;
            # emitting them would produce ",," or ",]" and break the JSON.
            if not tableLineJsonObject.strip():
                continue
            if not isFirstObject:
                outputFile.write(',')
            outputFile.write(tableLineJsonObject)
            isFirstObject = False

        outputFile.write(']')

In [None]:
# Build the JSON-array version of the sample once; an existing output file
# is reused as-is and not regenerated.
dataDirectory = os.path.join('..', 'data')
inputFilePath = os.path.join(dataDirectory, 'sample.json')
outputFilePath = os.path.join(dataDirectory, 'sample_fixed.json')
fixedSampleExists = os.path.isfile(outputFilePath)
if not fixedSampleExists:
    wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath)

In [None]:
# Load the fixed sample and collect the distinct URLs of all pages that
# contain at least one table whose tableType is 'RELATION'.
sample = pd.read_json(os.path.join('..', 'data', 'sample_fixed.json'))
urls = sample.loc[sample['tableType'] == 'RELATION', 'url'].unique()
print(f'Amount of unique web pages: {len(urls)}')

# Crawl

In [None]:
def getBaseUrl(url):
    """Return '<scheme>://<netloc>/' for the given URL.

    Path, query and fragment are dropped. Missing components are rendered
    as empty strings.
    """
    parts = urlsplit(url)
    return f"{parts.scheme}://{parts.netloc}/"

def crawl(url):
    """Download a page and return its HTML with the CSS inlined.

    The page is fetched with a 10 second timeout and passed through
    premailer's `transform`, using the page's base URL (scheme + host) as
    `base_url`.

    Args:
        url: Address of the page to download.

    Returns:
        The HTML string produced by `transform`.
    """
    response = requests.get(url, timeout=10)
    premailerOptions = {
        'exclude_pseudoclasses': True,
        'include_star_selectors': True,
        'disable_validation': True,
        'cssutils_logging_level': 'CRITICAL',
        'base_url': getBaseUrl(url),
    }
    return transform(str(response.text), **premailerOptions)

In [None]:
def sequenceCrawl():
    """Crawl every URL in the global `urls` and store the result in MongoDB.

    For each URL the inlined HTML is fetched via `crawl`, stripped of
    newline and tab characters and inserted into the `wtcPages` collection
    of the `bob` database as `{'url': ..., 'html': ...}`.

    A failure for a single URL is reported on stdout and the crawl
    continues with the next URL. A KeyboardInterrupt while a URL is being
    processed stops the crawl early.
    """
    # The context manager closes the connection on every exit path,
    # including an interrupt raised outside the per-URL try block.
    with MongoClient() as client:
        wtcPages = client.bob.wtcPages
        for index, url in enumerate(urls):
            print('Progress: ' + str(index + 1) + ' URL: ' + url)
            try:
                inlineHTML = crawl(url)
                inlineHTML = inlineHTML.replace('\n', '')
                inlineHTML = inlineHTML.replace('\t', '')
                wtcPages.insert_one({'url': url, 'html': inlineHTML})
            except KeyboardInterrupt:
                break
            except Exception as error:
                # Only ordinary errors are treated as "skip this URL";
                # SystemExit and similar are no longer swallowed.
                print(type(error), " occured.")

In [None]:
# Run the sequential crawl and print the wall-clock time before and after.
print(f'Begin time: {datetime.now()}')
sequenceCrawl()
print(f'End time: {datetime.now()}')

# Extract tables

In [8]:
# Load every document of bob.wtcPages into the `pages` DataFrame.
# The context manager closes the connection even if the query or the
# DataFrame construction raises.
with MongoClient() as client:
    db = client.bob
    wtcPages = db.wtcPages
    cursor = wtcPages.find({})
    pages = pd.DataFrame(list(cursor))

In [56]:
# Matches tag names that start with 'h' or 'H' followed by a digit
# (h1, H2, ...). Raw string so that '\d' is not an invalid string escape.
HEADLINE_PATTERN = re.compile(r'(h|H)\d')

# One entry per annotation button added to each table row: the label that
# is passed to the click handler and the colour class of the button.
LABEL_CONTROLS = [
    {
        'label': 'Header',
        'color': 'light-blue'
    }, {
        'label': 'Data',
        'color': 'lime'
    }, {
        'label': 'Other',
        'color': 'orange'
    }
]

def extractPageTitle(soup):
    """Return the text of the first `h1` element, or 'N/A' if there is none."""
    firstLevelHeadings = soup.select('h1')
    if not firstLevelHeadings:
        return 'N/A'
    return firstLevelHeadings[0].text

def extractTableTitle(table):
    """Return the text of the nearest preceding sibling headline of `table`.

    The previous siblings are scanned in the order `previous_siblings`
    yields them; the first one whose tag name matches HEADLINE_PATTERN
    wins. Returns 'N/A' when no sibling matches.
    """
    for node in table.previous_siblings:
        # Nodes without a tag name (e.g. text nodes) cannot be headlines.
        if node is None or node.name is None:
            continue
        if HEADLINE_PATTERN.match(node.name):
            return node.text
    return 'N/A'

def addLabelControls(row, rowIndex, soup):
    """Insert a leading `th` cell with one label button per LABEL_CONTROLS entry.

    Each button is an `a` tag whose text is the label and whose `onClick`
    attribute calls `annotate(<rowIndex>, "<label>");`.

    Args:
        row: The row element that receives the new first child.
        rowIndex: Index of the row, embedded in the click handler.
        soup: Object used to create the new tags via `new_tag`.
    """
    controlsCell = soup.new_tag('th', attrs={'class': 'flex space-evenly'})
    for control in LABEL_CONTROLS:
        label = control['label']
        buttonAttrs = {
            'class': 'labelButton waves-effect waves-light btn-small ' + control['color'],
            'onClick': f'annotate({rowIndex}, "{label}");',
        }
        button = soup.new_tag('a', attrs=buttonAttrs)
        button.string = label
        controlsCell.append(button)
    row.insert(0, controlsCell)
    
def tagRow(row, rowIndex, soup, isHead=False):
    """Annotate a row with its label and index and add the label buttons.

    Sets `data-label` to 'Header' or 'Data' depending on `isHead`, sets
    `data-row-index` to `rowIndex`, then prepends the label controls.
    """
    if isHead:
        initialLabel = 'Header'
    else:
        initialLabel = 'Data'
    row['data-label'] = initialLabel
    row['data-row-index'] = rowIndex
    addLabelControls(row, rowIndex, soup)
    
def isHeaderRow(row):
    """Return True if `row` looks like a header row.

    A row counts as a header when its parent is a `thead`, or when every
    element child of the row is a `th`. Children without a tag name
    (text nodes such as the whitespace between cells) are ignored, so
    indentation inside the row no longer hides an all-`th` row. A row
    without any element children still counts as a header, as before.
    """
    parent = row.parent
    if parent is not None and parent.name == 'thead':
        return True
    # Only real tags have a non-None name; strings and comments are skipped.
    cells = [
        child for child in row.contents
        if getattr(child, 'name', None) is not None
    ]
    return all(cell.name == 'th' for cell in cells)

def tagRows(table, soup):
    """Tag every `tr` of `table` and return the initial label of each row.

    Rows are processed in the order returned by `table.find_all('tr')`;
    the row's position in that list is used as its row index.

    Returns:
        A list with 'Header' or 'Data' for each row, in row order.
    """
    labels = []
    for position, tableRow in enumerate(table.find_all('tr')):
        # Decide before tagging: tagging adds an extra `th` to the row.
        headerFlag = isHeaderRow(tableRow)
        tagRow(tableRow, position, soup, headerFlag)
        labels.append('Header' if headerFlag else 'Data')
    return labels

def removeTableWidthLimitation(table):
    """Force `width` and `font-size` to 100% in the table's inline style.

    Tables without a `style` attribute are left untouched.
    """
    if table.has_attr('style'):
        declarations = parseStyle(table['style'])
        for propertyName in ('width', 'font-size'):
            declarations[propertyName] = '100%'
        table['style'] = declarations.cssText
        
def extractTableInformation(table, pageID, tableIndex, pageTitle, soup, url):
    """Build the record that describes one table.

    The untouched markup is captured first; afterwards the table is tagged
    in place (row labels, label buttons, width reset) and captured again.

    Returns:
        A dict with the keys 'pageID', 'tableIndex', 'pageTitle', 'url',
        'html' (original markup), 'taggedHtml' (markup after tagging),
        'annotations' (initial row labels) and 'tableTitle'.
    """
    # Must happen before tagRows mutates the table.
    originalHtml = table.prettify()

    rowLabels = tagRows(table, soup)
    removeTableWidthLimitation(table)
    taggedHtml = table.prettify()

    return {
        'pageID': pageID,
        'tableIndex': tableIndex,
        'pageTitle': pageTitle,
        'url': url,
        'html': originalHtml,
        'taggedHtml': taggedHtml,
        'annotations': rowLabels,
        'tableTitle': extractTableTitle(table),
    }

def hasNestedTable(table):
    """Return True if `table.select('table')` finds at least one element."""
    nestedTables = table.select('table')
    return bool(nestedTables)

def extractTables(page):
    """Extract the sampled tables of one crawled page.

    The page HTML is parsed, tables that contain another table are
    dropped, and the remaining tables are looked up by the `tableNum`
    values that the global `sample` lists for the page's URL. Table
    numbers beyond the number of remaining tables are skipped.

    Args:
        page: Mapping with the keys '_id', 'url' and 'html'.

    Returns:
        A list of table records as built by `extractTableInformation`.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whatever
    # is installed and warns, and different parsers can build different
    # trees (and therefore different table indices). lxml is what the
    # automatic choice resolves to when it is installed, which it should be
    # here because premailer depends on it.
    soup = BeautifulSoup(page['html'], 'lxml')
    pageTitle = extractPageTitle(soup)

    tableTags = [
        table for table in soup.select('table')
        if not hasNestedTable(table)
    ]
    tableIDs = sample.loc[sample['url'] == page['url'], 'tableNum'].values

    return [
        # int() converts the numpy integer to a plain Python int.
        extractTableInformation(
            tableTags[tableID], page['_id'], int(tableID),
            pageTitle, soup, page['url'])
        for tableID in tableIDs
        if tableID < len(tableTags)
    ]

In [58]:
# Extract the tables of every crawled page and store them in bob.wtcTables.
# The context manager closes the connection even if extraction or an
# insert fails part-way through.
with MongoClient() as client:
    db = client.bob
    wtcTables = db.wtcTables
    for extractedTables in pages.apply(extractTables, axis='columns').values:
        # insert_many rejects an empty list, so pages without tables are skipped.
        if len(extractedTables) > 0:
            wtcTables.insert_many(extractedTables)

# Select 300 tables for labeling

In [60]:
# Load every document of bob.wtcTables into the `tables` DataFrame.
# The context manager closes the connection even if the query or the
# DataFrame construction raises.
with MongoClient() as client:
    db = client.bob
    wtcTables = db.wtcTables
    cursor = wtcTables.find({})
    tables = pd.DataFrame(list(cursor))

In [63]:
# Draw 300 tables at random and split them into two disjoint sets of 150.
# The halves must come from the sample, not from `tables` itself: taking
# head/tail of `tables` ignores the random draw and can overlap when fewer
# than 300 tables exist. Note that sample(n=300) raises a ValueError if
# `tables` has fewer than 300 rows.
labelTables = tables.sample(n=300)
labelTables1 = labelTables.head(150)
labelTables2 = labelTables.tail(150)

In [64]:
# Store the two label sets in their own collections of the bob database.
# The context manager closes the connection even if an insert fails.
with MongoClient() as client:
    db = client.bob
    labelTables1Collection = db.wtcLabelTables1
    labelTables1Collection.insert_many(labelTables1.to_dict('records'))
    labelTables2Collection = db.wtcLabelTables2
    labelTables2Collection.insert_many(labelTables2.to_dict('records'))