# Table Header Detection

## Requirements
- Conda
- MongoDB instance
- PyMongo (the install commands are included, commented out, in the first code cell)

In [2]:
# import sys
# !conda install --yes --prefix {sys.prefix} pymongo
# !conda install --yes --prefix {sys.prefix} premailer
# pip install premailer

import os
import json
import re
import pandas as pd
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from cssutils import parseStyle
from premailer import Premailer
import time
from dateutil.parser import parse
import math

## Loading the seed data into mongoDB
- initial dataset [Wikipedia TabEL dataset](http://websail-fe.cs.northwestern.edu/TabEL/)
- the dataset lacks some styling information
- therefore we crawl the Wikipedia pages ourselves
  - that should be feasible since we only need labeled data (both for training & testing)
  - we take the page IDs of the TabEL dataset as a starting point, since we know that each of these pages should contain at least one relational table

Each line of the TabEL dataset contains one JSON object representing a single table. However, the JSON objects are not contained within a JSON array. We need to wrap the single tables into an array first before we can parse the file as a whole.

In [4]:
def wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath):
    """Convert a file with one JSON object per line into a single JSON array.

    Parameters:
        inputFilePath: path of the line-delimited JSON file to read.
        outputFilePath: path of the JSON array file to write (overwritten).

    Blank lines (e.g. a trailing empty line) are skipped so that they cannot
    produce a dangling comma, which would make the array invalid JSON.
    An empty input file results in an empty array ("[]").
    """
    # Context managers guarantee both files are closed even if reading or
    # writing fails halfway. JSON is UTF-8 by specification, so do not rely
    # on the platform's default encoding.
    with open(inputFilePath, 'r', encoding='utf-8') as inputFile, \
            open(outputFilePath, 'w', encoding='utf-8') as outputFile:
        outputFile.write('[')
        isFirstObject = True
        for tableLineJsonObject in inputFile:
            jsonObject = tableLineJsonObject.strip()
            if not jsonObject:
                continue
            if not isFirstObject:
                outputFile.write(',\n')
            outputFile.write(jsonObject)
            isFirstObject = False
        outputFile.write(']')

Check if TabEL dataset has been transformed into an array before. If not, we want to parse it now.

In [12]:
# Paths of the raw (one JSON object per line) and the converted (JSON array) file.
dataDirectory = 'data'
inputFilePath = os.path.join(dataDirectory, 'wikipedia_0_5000.json')
outputFilePath = os.path.join(dataDirectory, 'wikipedia_0_5000_fixed.json')

# The conversion only has to run once; skip it when the converted file exists.
needsConversion = not os.path.isfile(outputFilePath)
if needsConversion:
    wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath)

Parse JSON Array

In [3]:
# Load the converted TabEL file (a JSON array of table objects) into a DataFrame.
fixedTabELFilePath = os.path.join('data', 'wikipedia_0_5000_fixed.json')
tabEL = pd.read_json(fixedTabELFilePath)
tabEL.head()

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId
0,10000032-1,4,11,1,[1],0.535975,10000032,Mid Antrim (Northern Ireland Parliament consti...,Members of Parliament,Members of Parliament,"[[{'cellID': -1, 'textTokens': [], 'text': '',...","[[{'cellID': -1, 'textTokens': [], 'text': 'El...",1
1,1000006-1,4,21,1,[],0.856769,1000006,Römer (crater),Satellite craters,Satellite craters,"[[{'cellID': -1, 'textTokens': [], 'text': 'A'...","[[{'cellID': -1, 'textTokens': [], 'text': 'Rö...",1
2,10000088-1,2,1,3,[],0.318258,10000088,Whispermoon,,Track listing,"[[{'cellID': -1, 'textTokens': [], 'text': 'Al...","[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...",1
3,10000218-1,2,6,1,[],0.553872,10000218,Khalsa Diwan Society Vancouver,First executive committee,First executive committee,"[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ti...",1
4,10000228-1,2,7,1,[1],0.951118,10000228,Julien Leparoux,Year-end charts,Year-end charts,"[[{'cellID': -1, 'textTokens': [], 'text': 'Na...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ch...",1


Get 1000 unique page IDs and fetch the HTML content for each of them.

In [22]:
# One row per distinct Wikipedia page ID that occurs in the TabEL tables.
distinctPageIDs = tabEL['pgId'].unique()
uniquePageIDs = pd.DataFrame({'pgId': distinctPageIDs})
print('Number of rows: ' + str(len(uniquePageIDs)))

Number of rows: 2719


In [45]:
def pickRandomSample(sampleSize=1000, randomState=None):
    """Draw a random sample of page IDs from `uniquePageIDs`.

    Parameters:
        sampleSize: number of page IDs to draw (default 1000).
        randomState: optional seed passed to `DataFrame.sample`; pass a fixed
            value to make the sample reproducible across kernel restarts.

    Returns:
        A new DataFrame with the sampled rows and a fresh 0..n-1 index.
    """
    # The sample has to be returned: a variable assigned inside the function
    # is local and is not visible to the cells that follow.
    return (
        uniquePageIDs
        .sample(n=sampleSize, random_state=randomState)
        .reset_index(drop=True)
    )

pageIDSample = pickRandomSample()
pageIDSample.head()

Unnamed: 0,pgId
0,1000141
1,10126252
2,1011342
3,10025676
4,1013210


## Crawl the wikipedia pages and fetch all occurring tables
We use the page IDs from the TabEL dataset and crawl the Wikipedia HTML. One page might include multiple tables. We only extract HTML tables with the class `wikitable`. The styles from the CSS file are converted into inline styles on the HTML elements.

In [58]:
# Base URL used both for crawling and for resolving relative URLs in Premailer.
BASE_URL = 'https://en.wikipedia.org'

# Local copy of the Wikipedia stylesheet; both names are kept because they
# were defined (with the same value) before.
wikipediaCSSFilePath = os.path.join('data', 'wikipedia.css')
cssFilePath = wikipediaCSSFilePath

# Read the stylesheet once; the context manager closes the file handle again.
with open(cssFilePath, 'r') as cssFile:
    css = cssFile.read()

# Premailer instance that inlines the stylesheet rules into the crawled HTML.
instance = Premailer(
    base_url=BASE_URL,
    exclude_pseudoclasses=False,
    include_star_selectors=True,
    disable_validation=True,
    css_text=css,
    allow_network=False,
    cssutils_logging_level='CRITICAL'
)

def crawl(tabEL):
    """Download one Wikipedia page and inline the stylesheet into its HTML.

    Parameters:
        tabEL: a row of the page ID sample; its 'pgId' value is sent as the
            `curid` query parameter and its `name` (the row index) is printed
            as a progress indicator.

    Returns:
        The page HTML as returned by `instance.transform`.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    print(tabEL.name)
    payload = { 'curid': str(tabEL['pgId']) }
    # Without a timeout a single stalled connection blocks the whole crawl
    # indefinitely; 30 seconds is generous for a single page.
    response = requests.get(BASE_URL, params=payload, timeout=30)
    htmlWithInlineCSS = instance.transform(str(response.text))
    return htmlWithInlineCSS

In [59]:
# Crawl every sampled page, then drop newlines and tabs from the HTML.
crawledHtml = pageIDSample.apply(crawl, axis='columns')
for whitespaceCharacter in ('\n', '\t'):
    crawledHtml = crawledHtml.str.replace(whitespaceCharacter, '')
pageIDSample['HTML'] = crawledHtml

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

Since crawling is time-consuming, we first store the crawled data in a file and in the database.

In [60]:
# Persist the crawled pages in the `pages` collection of the `bob` database.
# The context manager closes the connection even if the insert fails.
with MongoClient() as client:
    db = client.bob
    pages = db.pages
    pages.insert_many(pageIDSample.to_dict('records'))

# Additionally keep a file copy of the crawl result.
pageIDSample.to_json(os.path.join("data", "crawled.json"))

In [2]:
# Reload the crawled pages from the `pages` collection into a DataFrame.
# The context manager closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    pages = db.pages
    cursor = pages.find({})
    # Materialise the cursor while the connection is still open.
    pageIDSample = pd.DataFrame(list(cursor))

In [61]:
# Show the first rows of the crawled pages.
pageIDSample.head()

Unnamed: 0,pgId,HTML
0,1000141,"<!DOCTYPE html><html class=""client-nojs"" lang=..."
1,10126252,"<!DOCTYPE html><html class=""client-nojs"" lang=..."
2,1011342,"<!DOCTYPE html><html class=""client-nojs"" lang=..."
3,10025676,"<!DOCTYPE html><html class=""client-nojs"" lang=..."
4,1013210,"<!DOCTYPE html><html class=""client-nojs"" lang=..."


Now we extract the tables along with some metadata. Each row is assigned a unique ID (the index of the row within the table) and an initial label: `Header` if the row consists of `th` cells only or is contained within a `thead`, otherwise `Data`.

In [62]:
# Matches tag names that start with "h"/"H" followed by a digit (h1, h2, ...).
# A raw string is required: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
HEADLINE_PATTERN = re.compile(r'(h|H)\d')

# Buttons rendered in front of every table row: the label that a click
# assigns to the row and the colour class of the button.
LABEL_CONTROLS = [
    {
        'label': 'Header',
        'color': 'light-blue'
    }, {
        'label': 'Data',
        'color': 'lime'
    }, {
        'label': 'Other',
        'color': 'orange'
    }
]

def extractPageTitle(soup):
    """Return the text of the first `h1` element of the page, or 'N/A' if there is none."""
    h1Tags = soup.select('h1')
    if len(h1Tags) > 0:
        return h1Tags[0].text
    return 'N/A'

def extractTableTitle(table):
    """Return the text of the closest preceding sibling headline of the table.

    Walks over `table.previous_siblings` and returns the text of the first
    sibling whose tag name matches HEADLINE_PATTERN; returns 'N/A' when no
    such sibling exists.
    """
    for node in table.previous_siblings:
        # Skip siblings without a tag name (e.g. text between elements).
        if node is None or node.name is None:
            continue
        if HEADLINE_PATTERN.match(node.name):
            return node.text
    return 'N/A'

def addLabelControls(row, rowIndex, soup):
    """Prepend a `th` cell holding one labelling button per LABEL_CONTROLS entry.

    Each button calls the JavaScript function `annotate` with the row index
    and the button's label when it is clicked.
    """
    controlsCell = soup.new_tag('th', attrs={'class': 'flex space-evenly'})
    for control in LABEL_CONTROLS:
        buttonAttributes = {
            'class': 'labelButton waves-effect waves-light btn-small ' + control['color'],
            'onClick': 'annotate(' + str(rowIndex) + ', "' + control['label'] + '");',
        }
        button = soup.new_tag('a', attrs=buttonAttributes)
        button.string = control['label']
        controlsCell.append(button)
    # The controls become the first cell of the row.
    row.insert(0, controlsCell)
    
def tagRow(row, rowIndex, soup, isHead=False):
    """Annotate a row with its initial label and index and add the label buttons."""
    if isHead:
        initialLabel = 'Header'
    else:
        initialLabel = 'Data'
    row['data-label'] = initialLabel
    row['data-row-index'] = rowIndex
    addLabelControls(row, rowIndex, soup)
    
def isHeaderRow(row):
    """Heuristically decide whether a table row is a header row.

    A row counts as a header when it lies inside a `thead`, or when it has at
    least one cell and all of its direct child elements are `th` cells.

    Only element children are counted. Text nodes between the cells (e.g.
    whitespace) are ignored, and a row without any cells is not treated as a
    header unless it is inside a `thead`.
    """
    headerCells = row.find_all('th', recursive=False)
    # `True` matches every tag but no text nodes.
    childElements = row.find_all(True, recursive=False)
    hasOnlyHeaderCells = len(childElements) > 0 and len(childElements) == len(headerCells)
    return hasOnlyHeaderCells or row.parent.name == 'thead'

def tagRows(table, soup):
    """Tag every `tr` of the table and return the initial labels in row order.

    Returns:
        A list with 'Header' or 'Data' for each row of the table.
    """
    labels = []
    for position, tableRow in enumerate(table.find_all('tr')):
        # Classify first: tagRow() inserts an additional `th` into the row.
        headerFlag = isHeaderRow(tableRow)
        tagRow(tableRow, position, soup, headerFlag)
        labels.append('Header' if headerFlag else 'Data')
    return labels

def removeTableWidthLimitation(table):
    """Set `width` and `font-size` of the table's inline style to 100%.

    Tables without a `style` attribute are left untouched.
    """
    if table.has_attr('style'):
        declarations = parseStyle(table['style'])
        for propertyName in ('width', 'font-size'):
            declarations[propertyName] = '100%'
        table['style'] = declarations.cssText
        
def extractTableInformation(table, pageID, tableIndex, pageTitle, soup):
    """Collect the HTML, the initial row labels and metadata of one table.

    The untouched HTML is captured before the rows are tagged, because the
    tagging modifies the table in place.

    Returns:
        A dict with the keys 'pageID', 'tableIndex', 'pageTitle', 'html',
        'taggedHtml', 'annotations' and 'tableTitle'.
    """
    untouchedHtml = table.prettify()
    rowLabels = tagRows(table, soup)
    removeTableWidthLimitation(table)
    taggedHtml = table.prettify()
    return {
        'pageID': pageID,
        'tableIndex': tableIndex,
        'pageTitle': pageTitle,
        'html': untouchedHtml,
        'taggedHtml': taggedHtml,
        'annotations': rowLabels,
        'tableTitle': extractTableTitle(table),
    }

def hasNestedTable(table):
    """Return True if the table contains at least one other `table` element."""
    nestedTables = table.select('table')
    return bool(nestedTables)

def extractTables(page):
    """Extract all `wikitable` tables without nested tables from a crawled page.

    Parameters:
        page: a row with the page HTML in 'HTML' and the page ID in 'pgId'.

    Returns:
        A list with one information dict per extracted table. `tableIndex`
        is the position among all `.wikitable` elements of the page, so
        skipped tables leave gaps in the numbering.
    """
    # Name the parser explicitly: otherwise Beautiful Soup warns and picks
    # whichever parser happens to be installed, which can change the parse
    # tree between machines. lxml is available because premailer depends on it.
    soup = BeautifulSoup(page['HTML'], 'lxml')
    pageTitle = extractPageTitle(soup)
    wikiTables = soup.select('.wikitable')
    extractedTables = []
    for tableIndex, table in enumerate(wikiTables):
        if hasNestedTable(table):
            continue
        extractedTable = extractTableInformation(table, page['pgId'], tableIndex, pageTitle, soup)
        extractedTables.append(extractedTable)
    return extractedTables

In [63]:
# Extract the tables of every crawled page and store them in the `tables`
# collection. The context manager closes the connection even on failure.
with MongoClient() as client:
    db = client.bob
    tables = db.tables
    for extractedTables in pageIDSample.apply(extractTables, axis='columns').values:
        # insert_many rejects empty lists, so skip pages without tables.
        if len(extractedTables) > 0:
            tables.insert_many(extractedTables)

The data can now get labeled using the provided [labeling tool](https://github.com/RichStone/web-tables-header-detection/tree/master/Labeling%20Tool).

# Feature Extraction

In [23]:
# Load all stored tables into a DataFrame. The collection gets its own name
# so that `tables` only ever refers to the DataFrame; the context manager
# closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    tablesCollection = db.tables
    cursor = tablesCollection.find({})
    # Materialise the cursor while the connection is still open.
    tables = pd.DataFrame(list(cursor))

In [24]:
# Show the first rows of the loaded tables.
tables.head()

Unnamed: 0,_id,annotations,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml
0,5cf836ef1ae12a2caf23946f,"[Header, Header, Header, Data]","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo..."
1,5cf836ef1ae12a2caf239470,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
2,5cf836ef1ae12a2caf239471,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
3,5cf836ef1ae12a2caf239472,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce..."
4,5cf836ef1ae12a2caf239473,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce..."


In [25]:
# Thresholds compared against getContentLength(), i.e. the number of
# whitespace-separated tokens of a cell: a cell is "short" with at most
# SHORT_TEXT_THRESHOLD tokens and "long" with more than LONG_TEXT_THRESHOLD.
SHORT_TEXT_THRESHOLD = 20
LONG_TEXT_THRESHOLD = 40

def isInt(value):
    """Return True if `int(value)` succeeds, False otherwise.

    Values of an unsupported type (e.g. None or a list) yield False as well
    instead of raising a TypeError.
    """
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True
    
def getRowSpan(cell):
    """Return the number of rows a cell spans (at least 1).

    A missing or non-integer `rowspan` attribute yields 1. Values below 1
    (e.g. `rowspan="0"`) are clamped to 1 as well: the result is used as the
    length of a row range, and an empty range would drop the cell entirely.
    """
    if not cell.has_attr('rowspan'):
        return 1
    try:
        span = int(cell['rowspan'])
    except (TypeError, ValueError):
        return 1
    return max(span, 1)
    
def getColSpan(cell):
    """Return the number of columns a cell spans (at least 1).

    A missing or non-integer `colspan` attribute yields 1; values below 1
    are clamped to 1, since a cell always occupies at least one column.
    """
    if not cell.has_attr('colspan'):
        return 1
    try:
        span = int(cell['colspan'])
    except (TypeError, ValueError):
        return 1
    return max(span, 1)

def isMerged(cell):
    """Return True if the cell spans more than one column or more than one row."""
    if getColSpan(cell) > 1:
        return True
    return getRowSpan(cell) > 1

def isCenterAligned(cell, style):
    """Return True if the cell is centred via its `align` attribute or its inline style.

    `style` is the parsed inline style of the cell, or None if it has none.
    """
    if cell.has_attr('align') and cell['align'] == 'center':
        return True
    if style is None:
        return False
    return 'text-align' in style and style['text-align'] == 'center'

def isThOrInTHead(cell):
    """Return True if the cell is a `th` or if the parent of its row is a `thead`."""
    rowContainer = cell.parent.parent
    if cell.name == 'th':
        return True
    return rowContainer.name == 'thead'

def extractLayoutFeatures(cell, style):
    """Return the layout-related boolean features of a cell as a dict."""
    layoutFeatures = {}
    layoutFeatures['isMerged'] = isMerged(cell)
    layoutFeatures['isCenterAligned'] = isCenterAligned(cell, style)
    layoutFeatures['isTHOrInTHead'] = isThOrInTHead(cell)
    return layoutFeatures

def isBold(cell, style):
    """Return True if the cell is styled bold or contains a `b` or `strong` element.

    The inline style counts as bold when its `font-weight` or its
    `font-style` equals 'bold'. `style` may be None.
    """
    if style is not None:
        if style['font-weight'] == 'bold' or style['font-style'] == 'bold':
            return True
    if cell.find('b'):
        return True
    return bool(cell.find('strong'))

def isItalic(cell, style):
    """Return True if the cell is styled italic or contains an `i` or `em` element.

    In line with isBold(), the inline style is taken into account as well:
    a `font-style` of 'italic' or 'oblique' counts as italic. `style` may be
    None when the cell has no inline style.
    """
    if style is not None and style['font-style'] in ('italic', 'oblique'):
        return True
    return bool(cell.find('i') or cell.find('em'))

def isUnderlined(cell, style):
    """Return True if the cell contains a `u` element or is styled underlined.

    Always returns a real bool: the result is converted with int() later on,
    which fails for the element object that `cell.find('u')` returns.
    The inline style counts as underlined when its `text-decoration` value
    contains 'underline' (this also covers values such as 'underline dotted').
    `style` may be None when the cell has no inline style.
    """
    if cell.find('u'):
        return True
    if style is None:
        return False
    return 'underline' in (style['text-decoration'] or '')

def isColored(cell, style):
    """Return True if the inline style defines `background-color` or `color`.

    `cell` is not inspected; a cell without inline style (`style` is None)
    is not colored.
    """
    if style is None:
        return False
    return 'background-color' in style or 'color' in style

def extractStyleFeatures(cell, style):
    """Return the font-style-related boolean features of a cell as a dict."""
    styleFeatures = {}
    styleFeatures['isBold'] = isBold(cell, style)
    styleFeatures['isItalic'] = isItalic(cell, style)
    styleFeatures['isUnderlined'] = isUnderlined(cell, style)
    return styleFeatures

def getCellStyle(cell):
    """Return the parsed inline style of the cell, or None if it has no `style` attribute."""
    if not cell.has_attr('style'):
        return None
    return parseStyle(cell['style'])

def getContentLength(cell):
    """Return the number of whitespace-separated tokens in the cell text."""
    # str.split() without arguments already splits on runs of whitespace and
    # ignores leading/trailing whitespace, so no regex normalisation is
    # needed (the former '\s+' pattern was an invalid escape sequence in a
    # non-raw string).
    return len(cell.get_text().split())

def isEmpty(cell):
    """Return True if the cell text contains no tokens at all."""
    tokenCount = getContentLength(cell)
    return not tokenCount

def isText(cell):
    """Return True if the cell text, ignoring surrounding whitespace, is purely alphabetic.

    The stored table HTML is prettified, which puts newlines and indentation
    around the cell text; without stripping, the check would fail for
    practically every cell.
    """
    return cell.get_text().strip().isalpha()

def isNumeric(cell):
    """Return True if the cell text, ignoring surrounding whitespace, consists of digits only.

    The stored table HTML is prettified, which puts newlines and indentation
    around the cell text; without stripping, the check would fail for
    practically every cell.
    """
    return cell.get_text().strip().isdigit()

def isDate(cell):
    """Return True if the cell text can be parsed as a date by `dateutil` (non-fuzzy)."""
    cellText = cell.get_text()
    try:
        parse(cellText, fuzzy=False)
    except (ValueError, OverflowError):
        return False
    else:
        return True
    
def isShortText(cell):
    """Return True if the cell has at most SHORT_TEXT_THRESHOLD tokens."""
    tokenCount = getContentLength(cell)
    return tokenCount <= SHORT_TEXT_THRESHOLD

def isLongText(cell):
    """Return True if the cell has more than LONG_TEXT_THRESHOLD tokens."""
    tokenCount = getContentLength(cell)
    return tokenCount > LONG_TEXT_THRESHOLD

def isTotal(cell):
    """Return True if the cell text is the word 'total' (case-insensitive).

    Surrounding whitespace is ignored: the stored table HTML is prettified,
    so the cell text is wrapped in newlines and indentation.
    """
    return cell.get_text().strip().lower() == 'total'

def extractValueFeatures(cell):
    """Return the content-related boolean features of a cell as a dict."""
    valueChecks = (
        ('isEmpty', isEmpty),
        ('isText', isText),
        ('isNumeric', isNumeric),
        ('isDate', isDate),
        ('isShortText', isShortText),
        ('isLongText', isLongText),
        ('isTotal', isTotal),
    )
    return { featureName: check(cell) for featureName, check in valueChecks }

def mapDictBoolValuesToInt(dictionary):
    """Return a copy of the dict in which every value is converted with int()."""
    converted = {}
    for key, value in dictionary.items():
        converted[key] = int(value)
    return converted

def applyColSpanFactor(dictionary, colSpan):
    """Return a copy of the dict in which every value is multiplied by `colSpan`."""
    scaled = {}
    for key, value in dictionary.items():
        scaled[key] = value * colSpan
    return scaled

def extractCellFeatures(cell, startRowIndex):
    """Compute the 0/1 features of a cell for every row the cell spans.

    Returns:
        A list with one feature dict per spanned row. Every dict carries the
        row index under 'row' and a 'colCount' of 1.
    """
    style = getCellStyle(cell)
    flags = {}
    flags.update(extractLayoutFeatures(cell, style))
    flags.update(extractStyleFeatures(cell, style))
    flags.update(extractValueFeatures(cell))
    counts = mapDictBoolValuesToInt(flags)
    # Weighting by column span is currently disabled:
    # counts = applyColSpanFactor(counts, getColSpan(cell))
    spannedRows = range(startRowIndex, startRowIndex + getRowSpan(cell))
    return [dict(counts, row=rowIndex, colCount=1) for rowIndex in spannedRows]

def merge(featuresA, featuresB):
    """Add two feature dicts key by key; a key missing on one side counts as 0."""
    summed = {}
    for key in set(featuresA) | set(featuresB):
        summed[key] = featuresA.get(key, 0) + featuresB.get(key, 0)
    return summed

def stringifyDictKeys(dictionary):
    """Return a copy of the dict whose keys are converted with str()."""
    stringified = {}
    for key, value in dictionary.items():
        stringified[str(key)] = value
    return stringified

def extractFeatures(table):
    """Aggregate the cell features of a table per row.

    Parameters:
        table: a record whose 'html' entry holds the HTML of one table.

    Returns:
        A dict that maps the row index (as string) to the summed cell
        features of that row; 'colCount' holds the number of cells that
        contributed to the row.
    """
    # Name the parser explicitly: otherwise Beautiful Soup warns and picks
    # whichever parser happens to be installed. lxml is available because
    # premailer depends on it.
    soup = BeautifulSoup(table['html'], 'lxml')
    rows = soup.select('tr')
    rowsFeatures = {}
    for rowIndex, row in enumerate(rows):
        for cell in row.children:
            # Skip text nodes between the cells.
            if not isinstance(cell, Tag):
                continue
            for rowFeatures in extractCellFeatures(cell, rowIndex):
                rowFeatureIndex = rowFeatures.pop('row')
                # A rowspan may reach beyond the last row of the table; do
                # not create features for rows that do not exist.
                if rowFeatureIndex >= len(rows):
                    continue
                merged = merge(rowsFeatures.get(rowFeatureIndex, {}), rowFeatures)
                rowsFeatures[rowFeatureIndex] = merged
    # String keys are used so that the result can be stored as a document.
    rowsFeatures = stringifyDictKeys(rowsFeatures)
    return rowsFeatures

In [26]:
# Compute the per-row features of every table. The cell output below shows
# the messages logged while the inline styles are parsed.
tables['features'] = tables.apply(extractFeatures, axis='columns')
tables.head()

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: In

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	No content to parse.
ERROR	PropertyValue

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value:

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value:

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	CSSStyleDeclaration: Unexpected token, ignoring upto '"'. [1:18: "]
ERROR	CSSStyleDeclaration: Unexpected token, ignoring upto '"'. [1:18: "]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 

ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%2221%22 height=%229%22 viewBox=%220 0 21 9%22%3E %3Cpath d=%22M14.5 5l-4 4-4-4zm0-1l-4-4-4 4z%22/%3E %3C/svg%3E") [1:1: background-image]
ERROR	Property: Invalid value for "CSS Level 2.1" property: linear-gradient(transparent, transparent), url("data:im

ERROR	Property: No property value found: background-color: [1:17: :]
ERROR	CSSStyleDeclaration: Syntax Error in Property: background-color:
ERROR	Property: No property value found: background-color: [1:17: :]
ERROR	CSSStyleDeclaration: Syntax Error in Property: background-color:
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '#', 1, 12)
ERROR	No content to parse.
ERROR	PropertyValue: Unknown syntax or no value: #
ERROR	CSSStyleDeclaration: Syntax Error in Property: background:#
ERROR	PropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CS

INFO	CSSStyleDeclaration: Stripped standalone semicolon: ;
INFO	CSSStyleDeclaration: Stripped standalone semicolon: ;
INFO	CSSStyleDeclaration: Stripped standalone semicolon: ;
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:56: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:43: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:57: vertical-align]
ERROR	Propert

ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 54)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 22)
ERROR	PropertyValue: Unknown syntax or no value: border-top:1px solid darkgray
ERROR	CSSStyleDeclaration: Syntax Error in Property: border-top:border-top:1px solid darkgray
ERROR	PropertyValue: No match: ('CHAR', ':', 1, 39)
ERROR	PropertyValue: Unknown syntax or no value:

ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: auto [1:1: text-align]
ERRO

Unnamed: 0,_id,annotations,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml,features
0,5cf836ef1ae12a2caf23946f,"[Header, Header, Header, Data]","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo...","{'0': {'isEmpty': 0, 'isBold': 0, 'colCount': ..."
1,5cf836ef1ae12a2caf239470,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","{'0': {'isBold': 0, 'colCount': 3, 'isTHOrInTH..."
2,5cf836ef1ae12a2caf239471,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","{'0': {'isBold': 0, 'colCount': 6, 'isTHOrInTH..."
3,5cf836ef1ae12a2caf239472,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce...","{'0': {'isBold': 0, 'colCount': 11, 'isTHOrInT..."
4,5cf836ef1ae12a2caf239473,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce...","{'0': {'isBold': 0, 'colCount': 9, 'isTHOrInTH..."


In [28]:
# Write the tables including their features back to the `tables` collection.
# The records were loaded from this collection and therefore carry an `_id`;
# inserting them again would fail with duplicate keys. Instead, each existing
# document is updated (fields not present in the DataFrame are kept) and
# created only if it does not exist yet.
with MongoClient() as client:
    db = client.bob
    tablesCollection = db.tables
    for tableRecord in tables.to_dict('records'):
        if '_id' in tableRecord:
            # `_id` is immutable and must not be part of the `$set` document.
            recordID = tableRecord.pop('_id')
            tablesCollection.update_one(
                { '_id': recordID },
                { '$set': tableRecord },
                upsert=True
            )
        else:
            tablesCollection.insert_one(tableRecord)

## Logarithmic Binning

In [5]:
# Load all stored tables into a DataFrame. The collection gets its own name
# so that `tables` only ever refers to the DataFrame; the context manager
# closes the connection even if the query fails.
with MongoClient() as client:
    db = client.bob
    tablesCollection = db.tables
    cursor = tablesCollection.find({})
    # Materialise the cursor while the connection is still open.
    tables = pd.DataFrame(list(cursor))
tables.head()

Unnamed: 0,_id,annotatedAt,annotations,features,html,logBin,pageID,pageTitle,tableIndex,tableTitle,taggedHtml
0,5cf28fdb1ae12a2691e6c562,1559651000000.0,"[Header, Header, Header, Data]","{'0': {'isBold': 0, 'isItalic': 0, 'isShortTex...","<table class=""wikitable floatright"" style=""flo...","[{'isBold': True, 'isItalic': True, 'isShortTe...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo..."
1,5cf28fdb1ae12a2691e6c563,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 3, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...","[{'isItalic': True, 'isShortText': True, 'isCe...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
2,5cf28fdb1ae12a2691e6c564,1559651000000.0,"[Header, Data]","{'0': {'isItalic': 0, 'isShortText': 6, 'isCen...","<table class=""wikitable"">\n <tbody>\n <tr>\n ...","[{'isItalic': True, 'isShortText': True, 'isCe...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat..."
3,5cf28fdb1ae12a2691e6c565,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 11, 'isCe...","<table class=""wikitable"" style=""text-align:cen...","[{'isItalic': True, 'isShortText': True, 'isCe...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce..."
4,5cf28fdb1ae12a2691e6c566,1559651000000.0,"[Header, Data, Data, Data, Data]","{'0': {'isItalic': 0, 'isShortText': 9, 'isCen...","<table class=""wikitable"" style=""text-align:cen...","[{'isItalic': True, 'isShortText': True, 'isCe...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce..."


In [27]:
def calcA(c, r):
    """Return the `a` component of the logarithmic bin for a count `c` out of `r`.

    In this notebook `c` is a per-row feature count and `r` is that row's
    'colCount' value.

    Returns:
        0 if `c` is 0, `r` if `c` equals `r`; otherwise
        floor(log2(d) + 1), where `d` is `r - c` when `c` is more than half
        of `r` and `c` itself otherwise.

    Raises:
        ValueError: propagated from math.log2 when `d` is not positive
            (for example when `c` is larger than `r`).
    """
    if c == 0:
        return 0
    if c == r:
        return r
    # Counts above the midpoint are measured by their distance to `r`,
    # counts at or below it by their distance to 0.
    distance = r - c if c > r / 2.0 else c
    return math.floor(math.log2(distance) + 1)

def calcB(c, r):
    """Return the `b` component of the logarithmic bin: floor(log2(r)).

    `c` is accepted but not used in the computation. In this notebook `r`
    is a row's 'colCount' value.

    Raises:
        ValueError: propagated from math.log2 when `r` is not positive.
    """
    magnitude = math.log2(r)
    return math.floor(magnitude)

def isInSameBin(rowA, rowB, featureKey):
    """Tell whether two rows fall into the same logarithmic bin for one feature.

    Both rows must provide `featureKey` and 'colCount' entries. The rows
    match when their calcB values are equal and their calcA values are equal.

    Returns:
        True if both bin components agree, False otherwise.
    """
    # Compare the `b` components first; calcA is only evaluated when they agree.
    widthBinA = calcB(rowA[featureKey], rowA['colCount'])
    widthBinB = calcB(rowB[featureKey], rowB['colCount'])
    if widthBinA != widthBinB:
        return False
    countBinA = calcA(rowA[featureKey], rowA['colCount'])
    countBinB = calcA(rowB[featureKey], rowB['colCount'])
    return countBinA == countBinB

def logBinTable(table):
    """Compute the logarithmic bins for every row of one table.

    `table['features']` must map a row index to a mapping of feature values
    that includes a 'colCount' entry. The input mappings are not modified;
    each row is copied before 'colCount' is removed from the copy.

    Returns:
        A dict keyed by row index. Each value maps every feature key except
        'colCount' to {'a': calcA(value, colCount), 'b': calcB(value, colCount)}.

    Raises:
        KeyError: if a row has no 'colCount' entry.
    """
    binsByRow = {}
    for rowIndex, rowFeatures in table['features'].items():
        # Work on a copy so popping 'colCount' leaves the stored features intact.
        featureCounts = dict(rowFeatures)
        colCount = featureCounts.pop('colCount')
        rowBins = {}
        for featureKey, count in featureCounts.items():
            rowBins[featureKey] = {
                'a': calcA(count, colCount),
                'b': calcB(count, colCount)
            }
        binsByRow[rowIndex] = rowBins
    return binsByRow

In [28]:
# Run logBinTable once per DataFrame row (axis=1 passes each row to the
# function) and store the resulting per-row bins in the 'logBin' column.
logBinsPerTable = tables.apply(logBinTable, axis=1)
tables['logBin'] = logBinsPerTable
tables.head()

Unnamed: 0,_id,annotations,html,pageID,pageTitle,tableIndex,tableTitle,taggedHtml,features,logBin
0,5cf836ef1ae12a2caf23946f,"[Header, Header, Header, Data]","<table class=""wikitable floatright"" style=""flo...",10041828,Memories & Dust,0,,"<table class=""wikitable floatright"" style=""flo...","{'0': {'isEmpty': 0, 'isBold': 0, 'colCount': ...","{'0': {'isEmpty': {'a': 0, 'b': 0}, 'isBold': ..."
1,5cf836ef1ae12a2caf239470,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10041828,Memories & Dust,1,Charts[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","{'0': {'isBold': 0, 'colCount': 3, 'isTHOrInTH...","{'0': {'isBold': {'a': 0, 'b': 1}, 'isTHOrInTH..."
2,5cf836ef1ae12a2caf239471,"[Header, Data]","<table class=""wikitable"">\n <tbody>\n <tr>\n ...",10086127,Sant Esteve de Palautordera,0,Demography[edit],"<table class=""wikitable"">\n <tbody>\n <tr dat...","{'0': {'isBold': 0, 'colCount': 6, 'isTHOrInTH...","{'0': {'isBold': {'a': 0, 'b': 2}, 'isTHOrInTH..."
3,5cf836ef1ae12a2caf239472,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,0,Euro 2000[edit],"<table class=""wikitable"" style=""text-align: ce...","{'0': {'isBold': 0, 'colCount': 11, 'isTHOrInT...","{'0': {'isBold': {'a': 0, 'b': 3}, 'isTHOrInTH..."
4,5cf836ef1ae12a2caf239473,"[Header, Data, Data, Data, Data]","<table class=""wikitable"" style=""text-align:cen...",1008145,Slovenia national football team,1,2002 World Cup[edit],"<table class=""wikitable"" style=""text-align: ce...","{'0': {'isBold': 0, 'colCount': 9, 'isTHOrInTH...","{'0': {'isBold': {'a': 0, 'b': 3}, 'isTHOrInTH..."


In [29]:
# Write the tables, now including the 'logBin' column, back to `bob.tables`.
client = MongoClient()
try:
    db = client.bob
    tablesCollection = db.tables
    for record in tables.to_dict('records'):
        if '_id' in record:
            # Records that were read from this collection still carry their
            # `_id`; inserting them again would fail with a duplicate-key
            # error. Upserting by `_id` replaces the stored document if it
            # exists and inserts it otherwise, so re-running this cell is safe.
            tablesCollection.replace_one({'_id': record['_id']}, record, upsert=True)
        else:
            tablesCollection.insert_one(record)
finally:
    # Release the connection even if a write fails.
    client.close()