# Table Header Detection

## Requirements
- Conda
- MongoDB instance
- PyMongo (the install commands are included, commented out, in the first code cell)

In [64]:
# import sys
# !conda install --yes --prefix {sys.prefix} pymongo
# !conda install --yes --prefix {sys.prefix} premailer
# pip install premailer

import os
import json
import re
import pandas as pd
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from cssutils import parseStyle
from premailer import Premailer
import time
from dateutil.parser import parse
import math

## Loading the seed data into mongoDB
- initial dataset [Wikipedia TabEL dataset](http://websail-fe.cs.northwestern.edu/TabEL/)
- the dataset lacks some styling information
- we're crawling the wikipedia pages on our own
  - that should be feasible since we have to use labeled data only (both for training & testing)
  - we're taking the TabEL dataset page IDs as a starting point, since we know that each of these pages should contain at least one relational table

Each line of the TabEL dataset contains one JSON object representing a single table. However, the JSON objects are not contained within a JSON array. We need to wrap the single tables into an array first before we can parse the file as a whole.

In [4]:
def wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath):
    """Convert a file with one JSON object per line into a single JSON array file.

    Args:
        inputFilePath: Path of the line-delimited JSON input file.
        outputFilePath: Path of the JSON array file to create (overwritten if present).

    Blank lines in the input are skipped so they cannot produce empty array
    elements, and both files are handled as UTF-8 regardless of the platform
    default encoding.
    """
    # The context managers close both files even if reading or writing fails.
    with open(inputFilePath, 'r', encoding='utf-8') as inputFile, \
            open(outputFilePath, 'w', encoding='utf-8') as outputFile:
        outputFile.write('[')
        isFirstObject = True
        for tableLineJsonObject in inputFile:
            tableJson = tableLineJsonObject.strip()
            if not tableJson:
                continue
            # Separator goes *before* every object but the first, so the
            # array never ends with a trailing comma.
            if not isFirstObject:
                outputFile.write(',\n')
            outputFile.write(tableJson)
            isFirstObject = False
        outputFile.write(']')

Check if TabEL dataset has been transformed into an array before. If not, we want to parse it now.

In [12]:
# Raw TabEL dump (one JSON object per line) and its array-wrapped copy.
inputFilePath = os.path.join('data', 'wikipedia_0_5000.json')
outputFilePath = os.path.join('data', 'wikipedia_0_5000_fixed.json')
# Convert only when the wrapped copy does not exist yet.
if not os.path.isfile(outputFilePath):
    wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath)

Parse JSON Array

In [83]:
# Load the array-wrapped TabEL file into a DataFrame and preview its first rows.
tabEL = pd.read_json(os.path.join('data', 'wikipedia_0_5000_fixed.json'))
tabEL.head()

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId
0,10000032-1,4,11,1,[1],0.535975,10000032,Mid Antrim (Northern Ireland Parliament consti...,Members of Parliament,Members of Parliament,"[[{'cellID': -1, 'textTokens': [], 'text': '',...","[[{'cellID': -1, 'textTokens': [], 'text': 'El...",1
1,1000006-1,4,21,1,[],0.856769,1000006,Römer (crater),Satellite craters,Satellite craters,"[[{'cellID': -1, 'textTokens': [], 'text': 'A'...","[[{'cellID': -1, 'textTokens': [], 'text': 'Rö...",1
2,10000088-1,2,1,3,[],0.318258,10000088,Whispermoon,,Track listing,"[[{'cellID': -1, 'textTokens': [], 'text': 'Al...","[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...",1
3,10000218-1,2,6,1,[],0.553872,10000218,Khalsa Diwan Society Vancouver,First executive committee,First executive committee,"[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ti...",1
4,10000228-1,2,7,1,[1],0.951118,10000228,Julien Leparoux,Year-end charts,Year-end charts,"[[{'cellID': -1, 'textTokens': [], 'text': 'Na...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ch...",1


Get 1000 unique page IDs and fetch the HTML content for each of them.

In [101]:
# Several TabEL tables can belong to the same page, so de-duplicate the page IDs
# before sampling; otherwise one page could be crawled (and its tables stored) twice.
uniquePageIDs = tabEL[['pgId']].drop_duplicates()
# Never ask for more rows than exist, which would make `sample` raise.
pageIDSample = uniquePageIDs.sample(n=min(1000, len(uniquePageIDs)))

## Crawl the wikipedia pages and fetch all occurring tables
We use the page IDs from the TabEL dataset and crawl the Wikipedia HTML. One page might include multiple tables. We only extract HTML tables with the class `wikitable`. The styles from the CSS files are converted into inline styles.

In [129]:
# Root URL of the English Wikipedia; used for the page requests and passed to Premailer.
BASE_URL = 'https://en.wikipedia.org/'
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
wikipediaCSSFilePath = os.path.join('data', 'wikipedia.css')
# Shared Premailer instance that `crawl` uses to transform the downloaded HTML.
instance = Premailer(base_url=BASE_URL)

def crawl(tabEL, timeout=30):
    """Download one Wikipedia page and return its HTML after the Premailer transform.

    Args:
        tabEL: Row-like object exposing the page ID under the key 'pgId'.
        timeout: Seconds to wait for the server before `requests` gives up.
            Without a timeout a single stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The page HTML as returned by `instance.transform` (CSS inlined).
    """
    payload = {'curid': str(tabEL['pgId'])}
    response = requests.get(BASE_URL, params=payload, timeout=timeout)
    # `response.text` is already a str, so no extra conversion is needed.
    return instance.transform(response.text)

In [None]:
# Crawl every sampled page; `crawl` is called once per row (axis='columns').
pageIDSample['HTML'] = pageIDSample.apply(crawl, axis='columns')

In [206]:
# Remove newline and tab characters from the crawled HTML.
pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\n', '')
pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\t', '')

Since crawling is time-consuming, we first store the crawled data in a file and in the database.

In [134]:
# The `with` block closes the connection even if the insert fails.
with MongoClient() as client:
    db = client.bob
    pages = db.pages
    pages.insert_many(pageIDSample.to_dict('records'))
# Additionally keep a file copy of the crawl result.
pageIDSample.to_json(os.path.join("data", "crawled.json"))

<pymongo.results.InsertManyResult at 0x1e7bd5948>

In [107]:
# The `with` block closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    pages = db.pages
    cursor = pages.find({})
    # Materialise the cursor while the connection is still open.
    pageIDSample = pd.DataFrame(list(cursor))

In [108]:
# Preview the first rows of the crawl data reloaded from the database.
pageIDSample.head()

Unnamed: 0,HTML,_id,pgId
0,"<!DOCTYPE html><html class=""client-nojs"" lang=...",5cf04ad51ae12a78ed913549,10041828
1,"<!DOCTYPE html><html class=""client-nojs"" lang=...",5cf04ad51ae12a78ed91354a,10086127
2,"<!DOCTYPE html><html class=""client-nojs"" lang=...",5cf04ad51ae12a78ed91354b,1008145
3,"<!DOCTYPE html><html class=""client-nojs"" lang=...",5cf04ad51ae12a78ed91354c,1012548
4,"<!DOCTYPE html><html class=""client-nojs"" lang=...",5cf04ad51ae12a78ed91354d,10128185


Now we extract the tables along with some metadata. Each row is assigned a unique ID (its index within the table) and a label (`Header` if the row consists of `th` cells only or is contained within a `thead`, otherwise `Data`).

In [159]:
# Matches tag names that start with "h"/"H" followed by a digit (h1, H2, ...).
# Raw string: "\d" in a plain string literal is an invalid escape sequence.
HEADLINE_PATTERN = re.compile(r'(h|H)\d')

# Buttons rendered in front of every table row by `addLabelControls`:
# 'label' is the annotation value, 'color' is appended to the button's CSS classes.
LABEL_CONTROLS = [
    {
        'label': 'Header',
        'color': 'light-blue'
    }, {
        'label': 'Data',
        'color': 'lime'
    }, {
        'label': 'Other',
        'color': 'orange'
    }
]

def extractPageTitle(soup):
    """Return the text of the first <h1> element, or 'N/A' if the page has none."""
    for headline in soup.select('h1'):
        # Only the first match is of interest.
        return headline.text
    return 'N/A'

def extractTableTitle(table):
    """Return the text of the closest preceding sibling headline of `table`.

    Walks `table.previous_siblings` and picks the first node whose tag name
    matches HEADLINE_PATTERN; returns 'N/A' when no such sibling exists.
    """
    headlineTexts = (
        node.text
        for node in table.previous_siblings
        if node is not None
        and node.name is not None
        and HEADLINE_PATTERN.match(node.name)
    )
    # The generator is lazy, so iteration stops at the first headline found.
    return next(headlineTexts, 'N/A')

def addLabelControls(row, rowIndex, soup):
    """Prepend a <th> cell holding one annotation button per LABEL_CONTROLS entry.

    Args:
        row: The row element that receives the new first cell.
        rowIndex: Index of the row; embedded in each button's onClick handler.
        soup: Object whose `new_tag` is used to create the new elements.
    """
    controlCell = soup.new_tag('th', attrs={'class': 'flex space-evenly'})
    for control in LABEL_CONTROLS:
        label = control['label']
        buttonAttrs = {
            'class': 'labelButton waves-effect waves-light btn-small ' + control['color'],
            'onClick': 'annotate(' + str(rowIndex) + ', "' + label + '");',
        }
        button = soup.new_tag('a', attrs=buttonAttrs)
        button.string = label
        controlCell.append(button)
    # The control cell becomes the first child of the row.
    row.insert(0, controlCell)
    
def tagRow(row, rowIndex, soup, isHead=False):
    """Store the default label and row index on `row` and add its label buttons."""
    if isHead:
        row['data-label'] = 'Header'
    else:
        row['data-label'] = 'Data'
    row['data-row-index'] = rowIndex
    addLabelControls(row, rowIndex, soup)
    
def isHeaderRow(row):
    """Return True if `row` consists of <th> cells only or lives inside a <thead>.

    Only element children are counted: text nodes (whitespace) and comments
    have no tag name and previously made an all-<th> row look like it had
    extra, non-header children. A row without any <th> is not treated as a
    header on the basis of its cells (an empty row used to qualify vacuously).
    """
    thTags = row.find_all('th', recursive=False)
    # Same "has a tag name" test that extractTableTitle applies to siblings.
    childTags = [child for child in row.contents if getattr(child, 'name', None) is not None]
    hasOnlyHeaderCells = len(thTags) > 0 and len(childTags) == len(thTags)
    return hasOnlyHeaderCells or row.parent.name == 'thead'

def tagRows(table, soup):
    """Tag every <tr> of `table` and return the default label of each row in order."""
    def labelAndTag(rowIndex, row):
        # The header check must run before tagging, because tagging inserts
        # an additional cell into the row.
        isHeader = isHeaderRow(row)
        tagRow(row, rowIndex, soup, isHeader)
        return 'Header' if isHeader else 'Data'

    return [labelAndTag(rowIndex, row) for rowIndex, row in enumerate(table.find_all('tr'))]

def removeTableWidthLimitation(table):
    """Force width and font size to 100% on tables that carry an inline style.

    Tables without a `style` attribute are left untouched.
    """
    if table.has_attr('style'):
        declaration = parseStyle(table['style'])
        for propertyName in ('width', 'font-size'):
            declaration[propertyName] = '100%'
        table['style'] = declaration.cssText
        
def extractTableInformation(table, pageID, tableIndex, pageTitle, soup):
    """Build the record stored for one table.

    The untouched markup is captured first; afterwards the table is tagged
    in place (row labels, label buttons, width override) and captured again.

    Returns:
        dict with the keys 'pageID', 'tableIndex', 'pageTitle', 'html',
        'taggedHtml', 'annotations' and 'tableTitle'.
    """
    # Snapshot before any mutation of the table.
    untaggedHtml = table.prettify()
    rowLabels = tagRows(table, soup)
    removeTableWidthLimitation(table)
    return {
        'pageID': pageID,
        'tableIndex': tableIndex,
        'pageTitle': pageTitle,
        'html': untaggedHtml,
        'taggedHtml': table.prettify(),
        'annotations': rowLabels,
        'tableTitle': extractTableTitle(table),
    }

def hasNestedTable(table):
    """Return True if `table` contains at least one nested <table> element."""
    for _nested in table.select('table'):
        return True
    return False

def extractTables(page):
    """Extract every non-nested `.wikitable` of a crawled page.

    Args:
        page: Row-like object with the keys 'HTML' (page markup) and 'pgId'.

    Returns:
        List of table records as built by `extractTableInformation`. Tables
        that contain nested tables are skipped; `tableIndex` still reflects
        the position among all `.wikitable` elements of the page.
    """
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whatever parser is installed, so results vary between environments.
    # lxml is what the automatic choice resolves to when it is installed,
    # and it is already a dependency of premailer.
    soup = BeautifulSoup(page['HTML'], 'lxml')
    pageTitle = extractPageTitle(soup)
    return [
        extractTableInformation(table, page['pgId'], tableIndex, pageTitle, soup)
        for tableIndex, table in enumerate(soup.select('.wikitable'))
        if not hasNestedTable(table)
    ]

In [160]:
# The `with` block closes the connection even if extraction or an insert fails.
with MongoClient() as client:
    db = client.bob
    tables = db.tables
    for extractedTables in pageIDSample.apply(extractTables, axis='columns').values:
        # insert_many rejects an empty list, so pages without tables are skipped.
        if len(extractedTables) > 0:
            tables.insert_many(extractedTables)

ERROR	Property: Invalid value for "CSS Level 2.1" property: 1000 [1:1: width]


The data can now get labeled using the provided [labeling tool](https://github.com/RichStone/web-tables-header-detection/tree/master/Labeling%20Tool).

# Feature Extraction

In [12]:
# The `with` block closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    # Separate name for the collection so that `tables` only ever refers to the DataFrame.
    tablesCollection = db.tables
    cursor = tablesCollection.find({})
    # Materialise the cursor while the connection is still open.
    tables = pd.DataFrame(list(cursor))

In [53]:
# Preview the first rows of the tables loaded from the database.
tables.head()

Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml
0,5cf28fdb1ae12a2691e6c562,1559651000000.0,"[Header, Header, Header, Data]","[{'row': 0, 'cell': 0, 'isMerged': True, 'isCe...","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo..."
1,5cf28fdb1ae12a2691e6c563,1559651000000.0,"[Header, Data]","[{'row': 0, 'cell': 0, 'isMerged': False, 'isC...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
2,5cf28fdb1ae12a2691e6c564,1559651000000.0,"[Header, Data]","[{'row': 0, 'cell': 0, 'isMerged': False, 'isC...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
3,5cf28fdb1ae12a2691e6c565,1559651000000.0,"[Header, Data, Data, Data, Data]","[{'row': 0, 'cell': 0, 'isMerged': False, 'isC...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce..."
4,5cf28fdb1ae12a2691e6c566,1559651000000.0,"[Header, Data, Data, Data, Data]","[{'row': 0, 'cell': 0, 'isMerged': False, 'isC...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce..."


In [112]:
# Both thresholds are compared against getContentLength(), which counts
# whitespace-separated tokens of the cell text (not characters).
SHORT_TEXT_THRESHOLD = 20  # isShortText: at most this many tokens
LONG_TEXT_THRESHOLD = 40  # isLongText: more than this many tokens

def isInt(value):
    """Return True if `int(value)` succeeds, False if it raises ValueError."""
    try:
        int(value)
    except ValueError:
        return False
    else:
        return True
    
def getRowSpan(cell):
    """Return the cell's `rowspan` as int, or 1 if it is missing or not an integer."""
    if not cell.has_attr('rowspan'):
        return 1
    rawSpan = cell['rowspan']
    return int(rawSpan) if isInt(rawSpan) else 1
    
def getColSpan(cell):
    """Return the cell's `colspan` as int, or 1 if it is missing or not an integer."""
    if not cell.has_attr('colspan'):
        return 1
    rawSpan = cell['colspan']
    return int(rawSpan) if isInt(rawSpan) else 1

def isMerged(cell):
    """Return True if the cell spans more than one column or more than one row."""
    # Column span is checked first; `any` stops as soon as one span exceeds 1.
    return any(getSpan(cell) > 1 for getSpan in (getColSpan, getRowSpan))

def isCenterAligned(cell, style):
    """Return True if the cell is centered via its `align` attribute or inline style.

    Args:
        cell: Cell element; its `align` attribute is compared to 'center'.
        style: Parsed inline style of the cell, or None if it has none.
    """
    if cell.has_attr('align') and cell['align'] == 'center':
        return True
    if style is None:
        return False
    return 'text-align' in style and style['text-align'] == 'center'

def isThOrInTHead(cell):
    """Return True if the cell is a <th> or the parent of its row is a <thead>."""
    enclosingSection = cell.parent.parent
    if cell.name == 'th':
        return True
    return enclosingSection.name == 'thead'

def extractLayoutFeatures(cell, style):
    """Collect the layout-related boolean features of one cell."""
    merged = isMerged(cell)
    centered = isCenterAligned(cell, style)
    headerCell = isThOrInTHead(cell)
    return {
        'isMerged': merged,
        'isCenterAligned': centered,
        'isTHOrInTHead': headerCell,
    }

def isBold(cell, style):
    """Return True if the cell is styled bold or contains a <b>/<strong> element.

    Args:
        cell: Cell element that is searched for <b> and <strong>.
        style: Parsed inline style of the cell, or None if it has none.

    Besides the keyword `bold`, numeric font weights of 700 and above are
    recognised, since 700 is the numeric equivalent of `bold`.
    """
    if style is not None:
        fontWeight = str(style['font-weight'] or '').strip().lower()
        if fontWeight == 'bold' or (fontWeight.isdigit() and int(fontWeight) >= 700):
            return True
        # Kept for backward compatibility: `font-style: bold` is not valid CSS,
        # but this function has always counted it as bold.
        if style['font-style'] == 'bold':
            return True
    return bool(cell.find('b') or cell.find('strong'))

def isItalic(cell, style):
    """Return True if the cell is styled italic or contains an <i>/<em> element.

    Args:
        cell: Cell element that is searched for <i> and <em>.
        style: Parsed inline style of the cell, or None if it has none.

    Mirrors `isBold`: both the presentational and the semantic tag are
    checked, and the inline `font-style` is honoured (the `style` argument
    was previously ignored).
    """
    if style is not None:
        fontStyle = str(style['font-style'] or '').strip().lower()
        if fontStyle in ('italic', 'oblique'):
            return True
    return bool(cell.find('i') or cell.find('em'))

def isUnderlined(cell, style):
    """Return True if the cell contains a <u> element or is underlined via inline style.

    Args:
        cell: Cell element that is searched for <u>.
        style: Parsed inline style of the cell, or None if it has none.

    Always returns a real bool. The result is later converted with `int()`,
    which fails on the element object that `cell.find('u')` returns. The
    former `font-style == 'bold'` comparison was unrelated to underlining
    and has been dropped.
    """
    if cell.find('u'):
        return True
    if style is None:
        return False
    # `text-decoration` may list several values, e.g. "underline dotted".
    decoration = str(style['text-decoration'] or '').lower()
    return 'underline' in decoration.split()

def isColored(cell, style):
    """Return True if the inline style sets `background-color` or `color`.

    `cell` is accepted for a uniform signature but is not inspected.
    """
    if style is None:
        return False
    return any(propertyName in style for propertyName in ('background-color', 'color'))

def extractStyleFeatures(cell, style):
    """Collect the font-style related boolean features of one cell.

    NOTE(review): `isColored` is defined in this file but is not part of this
    feature set — confirm whether that is intended.
    """
    checks = (
        ('isBold', isBold),
        ('isItalic', isItalic),
        ('isUnderlined', isUnderlined),
    )
    return {featureName: check(cell, style) for featureName, check in checks}

def getCellStyle(cell):
    """Return the parsed inline style of the cell, or None if it has no `style` attribute."""
    if cell.has_attr('style'):
        return parseStyle(cell['style'])
    return None

def getContentLength(cell):
    """Return the number of whitespace-separated tokens in the cell text.

    `str.split()` without arguments already treats any run of whitespace as
    a single separator, so the former regex normalisation (written with the
    invalid escape sequence '\\s' in a non-raw string) is not needed.
    """
    return len(cell.get_text().split())

def isEmpty(cell):
    """Return True if the cell text contains no tokens at all."""
    tokenCount = getContentLength(cell)
    return not tokenCount

def isText(cell):
    """Return True if the cell text consists of alphabetic characters only.

    Whitespace is ignored: the stored table HTML comes from `prettify()`,
    which surrounds every text node with newlines and indentation, so
    calling `isalpha()` on the raw text was practically always False.
    """
    return ''.join(cell.get_text().split()).isalpha()

def isNumeric(cell):
    """Return True if the cell text consists of digits only.

    Whitespace is ignored: the stored table HTML comes from `prettify()`,
    which surrounds every text node with newlines and indentation, so
    calling `isdigit()` on the raw text was practically always False.
    """
    return ''.join(cell.get_text().split()).isdigit()

def isDate(cell):
    """Return True if `parse` accepts the cell text (non-fuzzy), False otherwise."""
    try:
        parse(cell.get_text(), fuzzy=False)
    except (ValueError, OverflowError):
        return False
    else:
        return True
    
def isShortText(cell):
    """Return True if the cell holds at most SHORT_TEXT_THRESHOLD tokens."""
    tokenCount = getContentLength(cell)
    return tokenCount <= SHORT_TEXT_THRESHOLD

def isLongText(cell):
    """Return True if the cell holds more than LONG_TEXT_THRESHOLD tokens."""
    tokenCount = getContentLength(cell)
    return tokenCount > LONG_TEXT_THRESHOLD

def isTotal(cell):
    """Return True if the cell text is the word "total" (case-insensitive).

    Surrounding whitespace is stripped first: the stored table HTML comes
    from `prettify()`, which wraps every text node in newlines and
    indentation, so the exact comparison could never match.
    """
    return cell.get_text().strip().lower() == 'total'

def extractValueFeatures(cell):
    """Collect the content-related boolean features of one cell."""
    checks = (
        ('isEmpty', isEmpty),
        ('isText', isText),
        ('isNumeric', isNumeric),
        ('isDate', isDate),
        ('isShortText', isShortText),
        ('isLongText', isLongText),
        ('isTotal', isTotal),
    )
    return {featureName: check(cell) for featureName, check in checks}

def mapDictBoolValuesToInt(dictionary):
    """Return a copy of `dictionary` with every value converted via `int()`."""
    converted = {}
    for name, flag in dictionary.items():
        converted[name] = int(flag)
    return converted

def applyColSpanFactor(dictionary, colSpan):
    """Return a copy of `dictionary` with every value multiplied by `colSpan`."""
    scaled = {}
    for name, count in dictionary.items():
        scaled[name] = count * colSpan
    return scaled

def extractCellFeatures(cell, startRowIndex):
    """Return one feature dict per table row that `cell` occupies.

    Args:
        cell: The cell element to analyse.
        startRowIndex: Index of the row in which the cell starts.

    Returns:
        A list with `getRowSpan(cell)` dicts. Each holds the cell's features
        as ints plus 'row' (the covered row index) and 'colCount' (always 1,
        the cell's contribution to that row's cell count). Column spans are
        not factored in: `applyColSpanFactor` is not called here.
    """
    style = getCellStyle(cell)
    flags = {}
    flags.update(extractLayoutFeatures(cell, style))
    flags.update(extractStyleFeatures(cell, style))
    flags.update(extractValueFeatures(cell))
    counts = mapDictBoolValuesToInt(flags)
    # A cell with rowspan n contributes its features to n consecutive rows.
    return [
        {**counts, 'row': rowIndex, 'colCount': 1}
        for rowIndex in range(startRowIndex, startRowIndex + getRowSpan(cell))
    ]

def merge(featuresA, featuresB):
    """Add two feature dicts key by key; a key missing on one side counts as 0."""
    combined = {}
    for key in set(featuresA) | set(featuresB):
        combined[key] = featuresA.get(key, 0) + featuresB.get(key, 0)
    return combined

def stringifyDictKeys(dictionary):
    """Return a copy of `dictionary` whose keys are converted with `str()`."""
    stringified = {}
    for key, value in dictionary.items():
        stringified[str(key)] = value
    return stringified

def extractFeatures(table):
    """Aggregate the cell features of a table into per-row feature counts.

    Args:
        table: Row-like object whose 'html' entry holds the table markup.

    Returns:
        dict mapping the row index (as str) to a dict of feature counts.
        'colCount' is the number of cells that occupy the row: cells starting
        in it plus cells reaching into it through a rowspan.

    Changes compared to the previous implementation:
      * 'colCount' is no longer counted twice for cells starting in a row
        (each cell already contributes 1 via `extractCellFeatures`).
      * Rows without any cell get an all-zero entry instead of raising KeyError.
      * Rowspans reaching past the last <tr> no longer create phantom rows.
      * The HTML parser is named explicitly.
    """
    # lxml is what BeautifulSoup's automatic choice resolves to when it is
    # installed; naming it avoids the warning and environment-dependent results.
    soup = BeautifulSoup(table['html'], 'lxml')
    rows = soup.select('tr')
    rowCount = len(rows)
    rowsFeatures = {}
    for rowIndex, row in enumerate(rows):
        for cell in row.children:
            # Skip text nodes and comments between the cells.
            if type(cell) is not Tag:
                continue
            for rowFeatures in extractCellFeatures(cell, rowIndex):
                rowFeatureIndex = rowFeatures.pop('row')
                if rowFeatureIndex >= rowCount:
                    # The rowspan claims rows that do not exist in the table.
                    continue
                rowsFeatures[rowFeatureIndex] = merge(
                    rowsFeatures.get(rowFeatureIndex, {}), rowFeatures
                )
    # Give every <tr> an entry with the same set of keys, in row order.
    featureKeys = set().union(*rowsFeatures.values()) | {'colCount'}
    completeFeatures = {}
    for rowIndex in range(rowCount):
        rowFeatures = rowsFeatures.get(rowIndex, {})
        for featureKey in featureKeys:
            rowFeatures.setdefault(featureKey, 0)
        completeFeatures[rowIndex] = rowFeatures
    return stringifyDictKeys(completeFeatures)

In [113]:
# Compute the per-row feature counts for every table and preview the result.
tables['features'] = tables.apply(extractFeatures, axis='columns')
tables.head()

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: In

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: S

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: I

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error in Property: background: 
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value:  
ERROR	CSSStyleDeclaration: Syntax Error 

ERROR	CSSStyleDeclaration: Unexpected token, ignoring upto '"'. [1:18: "]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CS

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
INFO	CSSStyleDeclaration: Stripped standalone semicolon: ;
ERROR	CSSStyleDeclaration: Unexpected token, ignoring upto '"'. [1:18: "]
ERROR	CSSStyleDeclaration: Unexpected token, ignoring upto '"'. [1:18: "]
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content 

ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, 

ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Prop

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value:

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	Property

Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml
0,5cf28fdb1ae12a2691e6c562,1559651000000.0,"[Header, Header, Header, Data]","{'0': {'isBold': 0, 'isItalic': 0, 'isShortTex...","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo..."
1,5cf28fdb1ae12a2691e6c563,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 3, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
2,5cf28fdb1ae12a2691e6c564,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 6, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
3,5cf28fdb1ae12a2691e6c565,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 11, 'isCe...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce..."
4,5cf28fdb1ae12a2691e6c566,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 9, 'isCen...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce..."


In [114]:
# The `with` block closes the connection even if a write fails.
with MongoClient() as client:
    db = client.bob
    tablesCollection = db.tables
    # The records were loaded from this collection and still carry their `_id`;
    # inserting them again would hit a duplicate-key error. Replace each stored
    # document instead (upsert also covers a collection that was emptied).
    for tableRecord in tables.to_dict('records'):
        tablesCollection.replace_one({'_id': tableRecord['_id']}, tableRecord, upsert=True)

## Logarithmic Binning

In [68]:
# The `with` block closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    # Separate name for the collection so that `tables` only ever refers to the DataFrame.
    tablesCollection = db.tables
    cursor = tablesCollection.find({})
    # Materialise the cursor while the connection is still open.
    tables = pd.DataFrame(list(cursor))
tables.head()

Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml
0,5cf28fdb1ae12a2691e6c562,1559651000000.0,"[Header, Header, Header, Data]","{'0': {'isBold': 0, 'isItalic': 0, 'isShortTex...","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo..."
1,5cf28fdb1ae12a2691e6c563,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 3, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
2,5cf28fdb1ae12a2691e6c564,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 6, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
3,5cf28fdb1ae12a2691e6c565,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 11, 'isCe...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce..."
4,5cf28fdb1ae12a2691e6c566,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 9, 'isCen...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce..."


In [117]:
def calcA(c, r):
    """Return the logarithmic bin index ``a`` for a feature count.

    Parameters
    ----------
    c : number
        How many cells of the row exhibit the feature.
    r : number
        Total number of cells (columns) in the row.

    Returns
    -------
    int
        * ``0`` when no cell has the feature (``c == 0``),
        * ``r`` when every cell has it (``c == r``),
        * ``floor(log2(c) + 1)`` when at most half of the cells have it,
        * ``-floor(log2(r - c) + 1)`` when more than half (but not all) have it.

    Raises
    ------
    ValueError
        If ``c`` does not lie in ``0..r``.

    Notes
    -----
    The upper-half bins are negated so that "almost none" and "almost all"
    no longer share a bin (previously 1 of 8 and 7 of 8 both mapped to 1).
    The result is only compared for equality, so the sign is just a marker.
    NOTE(review): the sign marker follows the superscript-minus bins of the
    logarithmic binning scheme as recalled from the literature - confirm.
    """
    if (c == 0):
        return 0
    if (c == r):
        return r
    if not (0 < c < r):
        # Without this guard math.log2 fails with an opaque "math domain error".
        raise ValueError(
            'feature count c={!r} must lie between 0 and r={!r}'.format(c, r)
        )
    if (c > r / 2.0):
        # Bin measured from the "all cells" end; the sign keeps it distinct
        # from the mirror-image bin measured from the "no cells" end.
        return -math.floor(math.log2(r - c) + 1)
    return math.floor(math.log2(c) + 1)

def calcB(c, r):
    """Return the logarithmic bin index ``b``: ``floor(log2(r))``.

    ``r`` is the value the bin is derived from (callers pass the row's
    ``colCount``). ``c`` is accepted but not used by this function.
    ``math.log2`` raises ``ValueError`` when ``r`` is not positive.
    """
    widthLog = math.log2(r)
    return math.floor(widthLog)

def isInSameBin(rowA, rowB, featureKey):
    """Tell whether two rows share the same (a, b) bin for one feature.

    Each row is indexed with ``featureKey`` and with ``'colCount'``; those two
    values are passed to ``calcB`` and ``calcA``. The rows are in the same bin
    only if both the ``calcB`` results and the ``calcA`` results are equal.
    ``calcA`` is evaluated only when the ``calcB`` results already agree.
    """
    def binPart(calc, row):
        # Apply one of the bin functions to a row's feature count and width.
        return calc(row[featureKey], row['colCount'])

    if binPart(calcB, rowA) != binPart(calcB, rowB):
        return False
    return binPart(calcA, rowA) == binPart(calcA, rowB)

def logBinTable(table):
    """Compare every row of a table with its predecessor, feature by feature.

    ``table['features']`` is read as a mapping from stringified row index to
    that row's feature mapping. The row keyed ``'0'`` is skipped because it
    has no predecessor. For each remaining row a dict is built that maps each
    feature key (except ``'colCount'``) to the result of ``isInSameBin`` for
    that row and the row keyed with the previous index.

    Returns the list of those dicts, in the iteration order of
    ``table['features']``.
    """
    featuresByRow = table['features']
    comparisons = []
    for index, current in featuresByRow.items():
        if index != '0':
            previousKey = str(int(index) - 1)
            comparisons.append({
                key: isInSameBin(current, featuresByRow[previousKey], key)
                for key in current
                if key != 'colCount'
            })
    return comparisons

In [118]:
# Run logBinTable once per table (i.e. per DataFrame row) and store the
# resulting list of per-row comparisons in a new `logBin` column.
logBinPerTable = tables.apply(logBinTable, axis=1)
tables['logBin'] = logBinPerTable
tables.head()

Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml,logBin
0,5cf28fdb1ae12a2691e6c562,1559651000000.0,"[Header, Header, Header, Data]","{'0': {'isBold': 0, 'isItalic': 0, 'isShortTex...","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo...","[{'isBold': True, 'isItalic': True, 'isShortTe..."
1,5cf28fdb1ae12a2691e6c563,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 3, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","[{'isItalic': True, 'isShortText': True, 'isCe..."
2,5cf28fdb1ae12a2691e6c564,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 6, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","[{'isItalic': True, 'isShortText': True, 'isCe..."
3,5cf28fdb1ae12a2691e6c565,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 11, 'isCe...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce...","[{'isItalic': True, 'isShortText': True, 'isCe..."
4,5cf28fdb1ae12a2691e6c566,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 9, 'isCen...","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce...","[{'isItalic': True, 'isShortText': True, 'isCe..."


In [119]:
# Persist the tables, now including the `logBin` column, back to MongoDB.
# `tables` was loaded from this very collection, so every record already
# carries the `_id` of a stored document; a plain insert_many would fail with
# a duplicate-key error. Replace each stored document by `_id` instead
# (upsert=True still inserts it when it is not stored yet). Records without
# an `_id` are inserted as before.
# The context manager closes the client even when a write fails.
with MongoClient() as client:
    db = client.bob
    tablesCollection = db.tables
    for record in tables.to_dict('records'):
        if '_id' in record:
            tablesCollection.replace_one({'_id': record['_id']}, record, upsert=True)
        else:
            tablesCollection.insert_one(record)