# Table Header Detection

## Requirements
- Conda or pip
- MongoDB instance
- PyMongo (will be installed by the notebook)

In [1]:
#import sys
#!conda install --yes --prefix {sys.prefix} pymongo
#!conda install --yes --prefix {sys.prefix} premailer

#!{sys.executable} -m pip install numpy --upgrade
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install cssutils
#!{sys.executable} -m pip install premailer
#!{sys.executable} -m pip install python-crfsuite

import os
import json
import re
import pandas as pd
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from cssutils import parseStyle
from premailer import Premailer
import time
from dateutil.parser import parse
from datetime import datetime
import math
import pycrfsuite
from sklearn.model_selection import train_test_split
from collections import Counter
from functools import reduce

## Loading the seed data into mongoDB
- initial dataset [Wikipedia TabEL dataset](http://websail-fe.cs.northwestern.edu/TabEL/)
- dataset is lacking of some styling information
- we're crawling the wikipedia pages on our own
  - that should be feasible since we have to use labeled data only (both for training & testing)
  - we're taking the TabEL dataset pageID's as starting point, since we know that there should be at least one relational table

Each line of the TabEL dataset contains one JSON object representing a single table. However, the JSON objects are not contained within a JSON array. We need to wrap the single tables into an array first before we can parse the file as a whole.

In [None]:
def wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath):
    inputFile = open(inputFilePath, 'r')
    outputFile = open(outputFilePath, 'w')

    outputFile.write('[')

    previousLine = False
    for tableLineJsonObject in inputFile:
        if (previousLine):
            outputFile.write(previousLine + ',')
        previousLine = tableLineJsonObject
    if (previousLine):
        outputFile.write(previousLine)

    outputFile.write(']')

    inputFile.close()
    outputFile.close()

Check if TabEL dataset has been transformed into an array before. If not, we want to parse it now.

In [None]:
inputFilePath = os.path.join('data', 'wikipedia_0_50000.json')
outputFilePath = os.path.join('data', 'wikipedia_0_50000_fixed.json')
if not os.path.isfile(outputFilePath):
    wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath)

Parse JSON Array

In [None]:
tabEL = pd.read_json(os.path.join('data', 'wikipedia_0_50000_fixed.json'))
tabEL.head()

Get 1000 unique page IDs and fetch the HTML content for it. (Update: We skip selecting only 1000 here, since we want a broader selection of pages/ tables.)

In [None]:
uniquePageIDs = pd.DataFrame(tabEL['pgId'].unique(), columns=['pgId'])
print('Number of pages: ' + str(uniquePageIDs.shape[0]))

In [None]:
def pickRandomSample():
    pageIDSample = uniquePageIDs.sample(n=1000)
    pageIDSample.reset_index(inplace=True)
    pageIDSample.drop(axis='columns', labels='index', inplace=True)

# pickRandomSample()

pageIDSample = uniquePageIDs
pageIDSample.head()

## Crawl the wikipedia pages and fetch all occurring tables
We use the pageID's from the TabEL dataset and crawl the wikipedia html. One page might include multiple tables. We only extract HTML tables with the class `wikitable`. The style from the CSS file gets parsed into inline style.

In [None]:
BASE_URL = 'https://en.wikipedia.org'
wikipediaCSSFilePath = os.path.join('data', 'wikipedia.css')
cssFilePath = os.path.join('data', 'wikipedia.css')
cssFile = open(cssFilePath, 'r')
css = cssFile.read()
instance = Premailer(
    base_url=BASE_URL,
    exclude_pseudoclasses=False,
    include_star_selectors=True,
    disable_validation=True,
    css_text=css,
    allow_network=False,
    cssutils_logging_level='CRITICAL'
)

def crawl(tabEL):
    print(tabEL.name)
    payload = { 'curid': str(tabEL['pgId']) }
    html = requests.get(BASE_URL, params=payload).text
    htmlWithInlineCSS = instance.transform(str(html))
    return htmlWithInlineCSS

In [None]:
client = MongoClient()
db = client.bob
pages = db.pages

def batchCrawl():
    pageIDSample['HTML'] = pageIDSample.apply(crawl, axis='columns')
    pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\n', '')
    pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\t', '')
    pages.insert_many(pageIDSample.to_dict('records'))
    # pageIDSample.to_json(os.path.join("data", "crawled.json"))
    client.close()

def sequenceCrawl():
    for index, row in pageIDSample.iterrows():
        inlineHTML = crawl(row)
        inlineHTML = inlineHTML.replace('\n', '')
        inlineHTML = inlineHTML.replace('\t', '')
        row['HTML'] = inlineHTML
        pages.insert_one(row.to_dict())
    client.close()

print('Begin time: ' + str(datetime.now()))
sequenceCrawl()

In [None]:
client = MongoClient()
db = client.bob
pages = db.pages
pages.insert_many(pageIDSample.to_dict('records'))
# pageIDSample.to_json(os.path.join("data", "crawled.json"))
client.close()

In [None]:
client = MongoClient()
db = client.bob
pages = db.pages
cursor = pages.find({})
pageIDSample = pd.DataFrame(list(cursor))
client.close()

In [None]:
pageIDSample.head()

Now we extract the tables along with some metadata. For each row we assign an unique ID (the index of the row within the table) and a tag (whether the row includes `th-tags` only or is contained within a `thead`)

In [None]:
HEADLINE_PATTERN = re.compile('(h|H)\d')
LABEL_CONTROLS = [
    {
        'label': 'Header',
        'color': 'light-blue'
    }, {
        'label': 'Data',
        'color': 'lime'
    }, {
        'label': 'Other',
        'color': 'orange'
    }
];

def extractPageTitle(soup):
    headlines = soup.select('h1')
    return headlines[0].text if len(headlines) > 0 else 'N/A'

def extractTableTitle(table):
    for sibling in table.previous_siblings:
        if (sibling is not None and sibling.name is not None and HEADLINE_PATTERN.match(sibling.name)):
            return sibling.text
    return 'N/A'

def addLabelControls(row, rowIndex, soup):
    labelControlTag = soup.new_tag(
        'th',
        attrs={
            'class': 'flex space-evenly'
        }
    )
    for labelControl in LABEL_CONTROLS:
        labelControlButton = soup.new_tag(
            'a',
            attrs={
                'class': 'labelButton waves-effect waves-light btn-small ' + labelControl['color'],
                'onClick': 'annotate(' + str(rowIndex) + ', "' + labelControl['label'] + '");',
            }
        )
        labelControlButton.string = labelControl['label']
        labelControlTag.append(labelControlButton)
    row.insert(0, labelControlTag)
    
def tagRow(row, rowIndex, soup, isHead=False):
    row['data-label'] = 'Header' if isHead else 'Data'
    row['data-row-index'] = rowIndex
    addLabelControls(row, rowIndex, soup)
    
def isHeaderRow(row):
    thTags = row.find_all('th', recursive=False)
    childCount = len(row.contents)
    return childCount == len(thTags) or row.parent.name == 'thead'

def tagRows(table, soup):
    rows = table.find_all('tr')
    annotations = []
    for rowIndex, row in enumerate(rows):
        isHeader = isHeaderRow(row)
        tagRow(row, rowIndex, soup, isHeader)  
        annotations.append('Header' if isHeader else 'Data')
    return annotations

def removeTableWidthLimitation(table):
    if not table.has_attr('style'):
        return
    tableStyle = parseStyle(table['style'])
    tableStyle['width'] = '100%'
    tableStyle['font-size'] = '100%'
    table['style'] = tableStyle.cssText
        
def extractTableInformation(table, pageID, tableIndex, pageTitle, soup):
    extractedInformation = {
        'pageID': pageID,
        'tableIndex': tableIndex,
        'pageTitle': pageTitle
    }
    extractedInformation['html'] = table.prettify()
    annotations = tagRows(table, soup)
    removeTableWidthLimitation(table)
    extractedInformation['taggedHtml'] = table.prettify()
    extractedInformation['annotations'] = annotations
    extractedInformation['tableTitle'] = extractTableTitle(table)
    return extractedInformation

def hasNestedTable(table):
    return len(table.select('table')) > 0

def extractTables(page):
    soup = BeautifulSoup(page['HTML'])
    pageTitle = extractPageTitle(soup)
    wikiTables = soup.select('table.wikitable')
    extractedTables = []
    for tableIndex, table in enumerate(wikiTables):
        if hasNestedTable(table):
            continue
        extractedTable = extractTableInformation(table, page['pgId'], tableIndex, pageTitle, soup)
        extractedTables.append(extractedTable)
    return extractedTables

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
for extractedTables in pageIDSample.apply(extractTables, axis='columns').values:
    if len(extractedTables) > 0:
        tables.insert_many(extractedTables)
client.close()

The data can now get labeled using the provided [labeling tool](https://github.com/RichStone/web-tables-header-detection/tree/master/Labeling%20Tool). However, we may use the feature extraction to enhance the table selection process.

# Feature Extraction

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({})
tables = pd.DataFrame(list(cursor))
client.close()

In [None]:
tables.head()

In [None]:
SHORT_TEXT_THRESHOLD = 20
LONG_TEXT_THRESHOLD = 255

def isInt(value):
    try: 
        int(value)
        return True
    except ValueError:
        return False
    
def getRowSpan(cell):
    if cell.has_attr('rowspan') and isInt(cell['rowspan']):
        return int(cell['rowspan'])
    return 1
    
def getColSpan(cell):
    if cell.has_attr('colspan') and isInt(cell['colspan']):
        return int(cell['colspan'])
    return 1

def isMerged(cell):
    return (
        getColSpan(cell) > 1 or
        getRowSpan(cell) > 1
    )

def isCenterAligned(cell, style):
    return (
        (cell.has_attr('align') and cell['align'] == 'center') or
        (style is not None and 'text-align' in style and style['text-align'] == 'center')
    )

def isThOrInTHead(cell):
    row = cell.parent
    rowParent = row.parent
    return (
        cell.name == 'th' or
        rowParent.name == 'thead'
    )

def extractLayoutFeatures(cell, style):
    return {
        'hasColSpan': getColSpan(cell) > 1,
        'hasRowSpan': getRowSpan(cell) > 1,
        'isMerged': isMerged(cell),
        'isCenterAligned': isCenterAligned(cell, style),
        'isTHOrInTHead': isThOrInTHead(cell)
    }

def isCompletlyBold(cell, style):
    return style is not None and (
            style['font-weight'] == 'bold' or 
            style['font-style'] == 'bold')

def isPartiallyBold(cell, style):
    return bool(cell.find('b') or
        cell.find('strong'))

def isBold(cell, style):
    return isCompletlyBold(cell, style) or isPartiallyBold(cell, style)

def isItalic(cell, style):
    return bool(cell.find('i'))

def isUnderlined(cell, style):
    return bool(
        cell.find('u') or
        style is not None and (
            style['text-decoration'] == 'underline' or
            style['font-style'] == 'bold'
        )
    )

def isColored(cell, style):
    return (
        style is not None and (
            'background-color' in style or
            'color' in style
        )
    )

def extractStyleFeatures(cell, style):
    return {
        'isCompletlyBold': isCompletlyBold(cell, style),
        'isPartiallyBold': isPartiallyBold(cell, style),
        'isBold': isBold(cell, style),
        'isItalic': isItalic(cell, style),
        'isUnderlined': isUnderlined(cell, style)
    }

def getCellStyle(cell):
    return parseStyle(cell['style']) if cell.has_attr('style') else None

def getContentLength(cell):
    return len(re.sub('\s+',' ', cell.get_text()).split())

def isEmpty(cell):
    return getContentLength(cell) == 0

def isText(cell):
    return cell.get_text().isalpha()

def isNumeric(cell):
    return cell.get_text().isdigit()

def isDate(cell):
    try: 
        parse(cell.get_text(), fuzzy=False)
        return True
    except (ValueError, OverflowError):
        return False
    
def isShortText(cell):
    return getContentLength(cell) <= SHORT_TEXT_THRESHOLD

def isLongText(cell):
    return getContentLength(cell) > LONG_TEXT_THRESHOLD

def isTotal(cell):
    return cell.get_text().lower() == 'total'

def extractValueFeatures(cell):
    return {
        'isEmpty': isEmpty(cell),
        'isText': isText(cell),
        'isNumeric': isNumeric(cell),
        'isDate': isDate(cell),
        'isShortText': isShortText(cell),
        'isLongText': isLongText(cell),
        'isTotal': isTotal(cell)
    }

def mapDictBoolValuesToInt(dictionary):
    return { key: int(value) for key, value in dictionary.items() }

def applyColSpanFactor(dictionary, colSpan):
    return { key: value * colSpan for key, value in dictionary.items() }

def merge(featuresA, featuresB):
    return { k: featuresA.get(k, 0) + featuresB.get(k, 0) for k in set(featuresA) | set(featuresB) }

def stringifyDictKeys(dictionary):
    return { str(key): value for key, value in dictionary.items() }

def numNormalisedCols(row):
    numCols = 0
    for cell in row.children:
        if type(cell) is Tag:
            numCols += getColSpan(cell)
    return numCols
            
def getSimilarity(feature, cell, neighbour, suffix):
    similarity = {}
    similarity[feature + 'A' + suffix] = cell[feature] and neighbour[feature]
    similarity[feature + 'B' + suffix] = cell[feature] and not neighbour[feature]
    return similarity
    
def extractSimilarityFeatures(cell, neighbour, suffix, featureNames):
    similarityFeatures = {}
    for feature in featureNames:
        similarityFeatures = {
            **getSimilarity(feature, cell, neighbour, suffix),
            **similarityFeatures
        }
    return similarityFeatures   
    
def getRowSimilarityFeatures(normalizedFeatureTable, rowIndex, row):
    newRow = []
    numRows = len(normalizedFeatureTable)
    for cellIndex, cell in enumerate(row):
        features = cell
        featureNames = [key for key in features]
        if rowIndex > 0:
            features = {
                **extractSimilarityFeatures(cell, normalizedFeatureTable[rowIndex - 1][cellIndex], 'u', featureNames), 
                **features
                }
        if rowIndex < numRows - 1:
            features = {
                **extractSimilarityFeatures(cell, normalizedFeatureTable[rowIndex + 1][cellIndex], 'l', featureNames),
                **features
            }
        intCellFeatures = mapDictBoolValuesToInt(features)
        newRow.append(intCellFeatures)
        return newRow
    
def addSimilarityFeatures(normalizedFeatureTable):
    normalizedTableWithSimilarity = []
    for rowIndex, row in enumerate(normalizedFeatureTable):
        newRow = getRowSimilarityFeatures(normalizedFeatureTable, rowIndex, row)
        normalizedTableWithSimilarity.append(newRow)        
    return normalizedTableWithSimilarity
    
def cleanOfEmptyCells(table):
    lastEmptyCellIndex = len(table[0])
    for row in table:
        for cellIndex,cell in enumerate(row):
            if cell == 'empty cell':
                lastEmptyCellIndex = min(lastEmptyCellIndex, cellIndex)
    newTable =[]
    for row in table:
        newTable.append(row[:lastEmptyCellIndex])
    return newTable

def getBoolCellFeatures(cell):
    cellStyle = getCellStyle(cell)
    boolCellFeatures = {
        **extractLayoutFeatures(cell, cellStyle),
        **extractStyleFeatures(cell, cellStyle),
        **extractValueFeatures(cell)
    }
    boolCellFeatures['colCount'] = 1
    return boolCellFeatures

def fillNormalizedFeatureTable(rows, numRows, numCols):
    normalizedTable = [['empty cell' for i in range(numCols)] for j in range(numRows)]
    for rowIndex, row in enumerate(rows):
        cellIndex = 0
        for cell in row.children:
            if type(cell) is not Tag:
                continue
            boolCellFeatures = getBoolCellFeatures(cell)
            # normalize
            colSpan = getColSpan(cell)
            rowSpan = getRowSpan(cell)
            # find the position of the cell
            while cellIndex < numCols and normalizedTable[rowIndex][cellIndex] != 'empty cell':
                cellIndex += 1
            for rIndex in range(rowIndex, min(rowIndex + rowSpan, numRows)):
                for cIndex in range(cellIndex, min(cellIndex + colSpan, numCols)):
                    normalizedTable[rIndex][cIndex] = boolCellFeatures
            cellIndex += colSpan
    return normalizedTable
        
def normalizedFeatureTable(table):
    soup = BeautifulSoup(table['html'])
    rows = soup.select('tr')
    # initialize normalized feature table
    numRows = len(rows)
    numCols = numNormalisedCols(rows[0])
    normalizedTable = fillNormalizedFeatureTable(rows, numRows, numCols)
    normalizedTable = cleanOfEmptyCells(normalizedTable)
    return addSimilarityFeatures(normalizedTable)    
            
def extractFeatures(table):
    featureTable = normalizedFeatureTable(table)
    rowFeatureTable = {}
    for rowIndex, row in enumerate(featureTable):
        # count how often every feature is true in a row
        rowFeatures = {}
        for cellFeatures in row:
            rowFeatures = merge(rowFeatures, cellFeatures)
        rowFeatureTable[rowIndex] = rowFeatures
    rowFeatureTable = stringifyDictKeys(rowFeatureTable)
    return rowFeatureTable

In [None]:
tables['features'] = tables.apply(extractFeatures, axis='columns')
tables.head()

In [None]:
client = MongoClient()
db = client.bob
tablesCollection = db.tables
dictTables = tables.to_dict('records')
for table in dictTables:
    tablesCollection.replace_one({'_id': table['_id']}, table, True)
client.close()

## Select Tables for Labeling
70% of all tables are considered 'simple'. We don't want to spend too much time labeling only simple tables. We wan't to have more interesting tables and are therefor making a thoughtful selection of tables.
Goal: 1000 Tables in total, where ideally 250 are randomly selected, 250 have no header, 250 have at least one merged cell and 250 tables that do have bold cells which are not located in the header.

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({})
tables = pd.DataFrame(list(cursor))
client.close()

In [None]:
tables = tables.loc[lambda tables: tables['features'].apply(lambda features: len(features) > 1)]
print('Amount of tables with row count greater 1: ' + str(tables.shape[0]))

In [None]:
def hasTableAHeader(tableFeatures):
    for rowIndex, rowFeatures in tableFeatures.items():  
        if (rowFeatures['isTHOrInTHead'] > 0):
            return True
    return False

tablesWithNoHeader = tables.loc[lambda tables: tables['features'].apply(lambda features: not hasTableAHeader(features))]
print('Amount of tables without a header: ' + str(tablesWithNoHeader.shape[0]))

In [None]:
def hasBoldStyleOutsideHead(tableFeatures):
    for rowIndex, rowFeatures in tableFeatures.items():  
        if (rowFeatures['isBold'] > 0 and rowFeatures['isTHOrInTHead'] == 0):
            return True
    return False

tablesWithBoldStyleOutsideHead = tables.loc[lambda tables: tables['features'].apply(hasBoldStyleOutsideHead)]
print('Amount of tables with bold styles outside of header: ' + str(tablesWithBoldStyleOutsideHead.shape[0]))

In [None]:
def hasMergedCellsOutsideHead(tableFeatures):
    for rowIndex, rowFeatures in tableFeatures.items():  
        if (rowFeatures['isMerged'] > 0 and rowFeatures['isTHOrInTHead'] == 0):
            return True
    return False

tablesWithMergedCellsOutsideHead = tables.loc[lambda tables: tables['features'].apply(lambda features: hasMergedCellsOutsideHead(features))]
print('Amount of tables with merged cells outside of header: ' + str(tablesWithMergedCellsOutsideHead.shape[0]))

In [None]:
labelTables = tablesWithNoHeader.sample(n=250)

reducedTablesWithMergedCellsOutsideHead = tablesWithMergedCellsOutsideHead.loc[~tablesWithMergedCellsOutsideHead.index.isin(list(labelTables.index))]
labelTables = pd.concat([labelTables, reducedTablesWithMergedCellsOutsideHead.sample(n=250)])

reducedTablesWithBoldStyleOutsideHead = tablesWithBoldStyleOutsideHead.loc[~tablesWithBoldStyleOutsideHead.index.isin(list(labelTables.index))]
labelTables = pd.concat([labelTables, reducedTablesWithBoldStyleOutsideHead.sample(n=250)])

remainingTables = tables.loc[~tables.index.isin(list(labelTables.index))]
labelTables = pd.concat([labelTables, remainingTables.sample(n=250)])
labelTables = labelTables.sample(frac=1).reset_index(drop=True) #shuffle
labelTables.shape[0]

In [None]:
labelTables1 = labelTables.head(500)
labelTables2 = labelTables.tail(500)

In [None]:
client = MongoClient()
db = client.bob
labelTables1Collection = db.labelTables1
labelTables1Collection.insert_many(labelTables1.to_dict('records'))
labelTables2Collection = db.labelTables2
labelTables2Collection.insert_many(labelTables2.to_dict('records'))
client.close()

## Logarithmic Binning

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({})
tables = pd.DataFrame(list(cursor))
client.close()
tables.head()

In [None]:
def calcA(c, r):
    if (c == 0):
        return 0
    if (c == r):
        return r
    if (c > r / 2.0):
        return math.floor(math.log2(r - c) + 1)
    return math.floor(math.log2(c) + 1)

def calcB(c, r):
    return math.floor(math.log2(r))

def isInSameBin(rowA, rowB, featureKey):
    return (
        calcB(rowA[featureKey], rowA['colCount']) == calcB(rowB[featureKey], rowB['colCount']) and 
        calcA(rowA[featureKey], rowA['colCount']) == calcA(rowB[featureKey], rowB['colCount'])
    )

def logBinTable(table):
    if len(table['features']) == 0:
        return []
    logBins = {}
    for rowIndex, row in table['features'].items():
        logBin = dict(row)
        colCount = logBin.pop('colCount')
        logBin = { 
            featureKey: { 
                'a': calcA(feature, colCount),
                'b': calcB(feature, colCount)
            } for featureKey, feature in logBin.items() 
        }
        logBins[rowIndex] = logBin
    return logBins

In [None]:
tables['logBin'] = tables.apply(logBinTable, axis='columns')
tables.head()

In [None]:
client = MongoClient()
db = client.bob
tablesCollection = db.tables
dictTables = tables.to_dict('records')
for table in dictTables:
    tablesCollection.replace_one({'_id': table['_id']}, table, True)
client.close()

# Conditional Random Fields

In [50]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
tables = pd.DataFrame(list(cursor))
client.close()
print('Number of labeled tables: ' + str(len(tables)))
tables.head()

Number of labeled tables: 975


Unnamed: 0,_id,annotatedAt,annotations,features,html,logBin,pageID,pageTitle,predictions,skipped,tableIndex,tableTitle,taggedHtml
0,5d020c18e7ee03eab39ab0ee,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027342.0,1996 CONCACAF Gold Cup,,,1.0,Venues[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
1,5d020c18e7ee03eab39ab0fa,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...","{'0': {'isDateBl': {'a': 0, 'b': 3}, 'isMerged...",1027385.0,1998 CONCACAF Gold Cup,,,4.0,Group C[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
2,5d020c18e7ee03eab39ab111,1560430000000.0,"[Data, Data, Data, Data, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable"" sty...","{'0': {'isDateBl': {'a': 0, 'b': 0}, 'isMerged...",1027435.0,Tomb of the Unknown Soldier (Warsaw),,,0.0,Battles currently featured on the stone tablet...,"<table bgcolor=""#f8f9fa"" class=""wikitable"" sty..."
3,5d020c18e7ee03eab39ab112,1560430000000.0,"[Header, Data, Other, Data, Other, Data, Other...","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027443.0,Groovie Goolies,,,0.0,Episodes[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."
4,5d020c18e7ee03eab39ab11b,1560430000000.0,"[Header, Data, Other]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027471.0,List of The Flintstones episodes,,,1.0,Pilot (1959)[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."


Split labled data into training and test set using ratio 70/30 and store the ids of the tables of each set afterwards to make different test runs comparable.

In [51]:
tableIDs = tables['_id'].astype(str)
trainTables, testTables = train_test_split(tableIDs, test_size=0.3)
print('Amount of training tables: ' + str(len(trainTables)))
print('Amount of testing tables: ' + str(len(testTables)))

Amount of training tables: 682
Amount of testing tables: 293


In [52]:
trainTables.to_json(os.path.join('..', 'data', 'train.json'))
testTables.to_json(os.path.join('..', 'data', 'test.json'))

In [53]:
def getLogBin(tables):
    return [list(binDictionary.values()) for binDictionary in list(tables['logBin'].values)]

def getAnnotations(tables):
    return list(tables['annotations'].values)

Add normalized rowIndex to logBin features for better comparison to random forests

In [54]:
def addRowIndex(tableFeatures):
    for table in tableFeatures:
        for rowIndex, row in enumerate(table):
            row['normalizedRowIndex'] = rowIndex / len(table)
    return tableFeatures

# Train

In [55]:
trainTables = pd.DataFrame(json.load(open(os.path.join('..', 'data', 'train.json'), 'r')).values(), columns=['_id'])
testTables = pd.DataFrame(json.load(open(os.path.join('..', 'data', 'test.json'), 'r')).values(), columns=['_id'])
trainTables = trainTables['_id'].apply(lambda tableID: tables.loc[tables['_id'].astype(str) == tableID].iloc[0])
testTables = testTables['_id'].apply(lambda tableID: tables.loc[tables['_id'].astype(str) == tableID].iloc[0])

In [56]:
trainFeatures = getLogBin(trainTables)
trainLabels = getAnnotations(trainTables)

In [57]:
trainFeatures = addRowIndex(trainFeatures)

In [58]:
trainer = pycrfsuite.Trainer(verbose=False)

for features, labels in zip(trainFeatures, trainLabels):
    trainer.append(features, labels)

# params copied from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.train('../data/everything.crfsuite')

# Test

In [59]:
testFeatures = getLogBin(testTables)
testLabels = getAnnotations(testTables)

In [60]:
testFeatures = addRowIndex(testFeatures)

In [61]:
tagger = pycrfsuite.Tagger()
tagger.open('../data/everything.crfsuite')
predictions = [tagger.tag(features) for features in testFeatures]

In [62]:
tables = db.tables
tables.find
for tableId, prediction in zip(list(testTables['_id'].values), predictions):
    tables.update_one({'_id': tableId}, {'$set': {'predictions': prediction}})

# Evaluation

In [63]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"Header": 0, "Data": 1, "Other": 2}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in predictions for tag in row])
truths = np.array([labels[tag] for row in testLabels for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["Header", "Data", "Other"]))


              precision    recall  f1-score   support

      Header       0.93      0.95      0.94       329
        Data       0.99      0.99      0.99      4523
       Other       0.82      0.75      0.78       237

   micro avg       0.98      0.98      0.98      5089
   macro avg       0.91      0.90      0.90      5089
weighted avg       0.98      0.98      0.98      5089



## Number of rows per cell type

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
tables = pd.DataFrame(list(cursor))
client.close()
tables.head()

In [None]:
totalCounter = Counter([item for sublist in list(tables['annotations'].values) for item in sublist])
trainCounter = Counter([item for sublist in list(trainTables['annotations'].values) for item in sublist])
testCounter = Counter([item for sublist in list(testTables['annotations'].values) for item in sublist])

In [None]:
def getRatio(counter):
    total = sum(list(counter.values()))
    for annotation in counter:
        print(annotation + ': ' + str(counter[annotation] * 100.0 / total))
    print()
    
print('Total')
getRatio(totalCounter)
print('Train')
getRatio(trainCounter)
print('Test')
getRatio(testCounter)

In [None]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({"predictions" : {"$exists" : True}})
predictedTables = pd.DataFrame(list(cursor))
client.close()
predictedTables.head()

In [None]:
def getWronglyPredicted(table):
    incorrect = []
    for rowIndex, label in enumerate(table['annotations']):
        if(label != table['predictions'][rowIndex]):
            incorrect.append({
                'annotated': label,
                'predicted': table['predictions'][rowIndex]
            })
    return incorrect

def getCorrectlyPredictedCount(table):
    return len(table['annotations']) - table['predictedIncorrectlyCount']
    
def isWholeTableCorrectlyPredicted(table):
    return table['predictedCorrectlyCount'] == len(table['annotations'])

def getIncorrectlyPredictedCount(table):
    return len(table['wronglyPredicted'])

In [None]:
predictedTables['wronglyPredicted'] = predictedTables.apply(getWronglyPredicted, axis='columns')
predictedTables['predictedIncorrectlyCount'] = predictedTables.apply(getIncorrectlyPredictedCount, axis='columns')
predictedTables['predictedCorrectlyCount'] = predictedTables.apply(getCorrectlyPredictedCount, axis='columns')
predictedTables['predictedCorrectly'] = predictedTables.apply(isWholeTableCorrectlyPredicted, axis='columns')

In [None]:
correctlyPredictedTables = predictedTables.loc[predictedTables['predictedCorrectly']].shape[0]
print('Correctly predicted table count: ' + str(correctlyPredictedTables))

In [None]:
incorrectPredictedTables = predictedTables.loc[~predictedTables['predictedCorrectly']]
incorrectPredictedTables.reset_index(inplace=True)
incorrectPredictedTableCount = incorrectPredictedTables.shape[0]
print('Incorrect predicted table count: ' + str(incorrectPredictedTableCount))

In [None]:
def labelBars(plot):
    for p in plot.patches:
        plot.annotate(
            np.round(p.get_height(), decimals=2),
            (
                p.get_x() + p.get_width() / 2.,
                p.get_height()
            ),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points'
     )

In [None]:
numberOfIncorrectRowsPerTablePlot = incorrectPredictedTables['predictedIncorrectlyCount'].value_counts().plot(
    kind='bar', 
    figsize=(20, 10)
)
labelBars(numberOfIncorrectRowsPerTablePlot)
numberOfIncorrectRowsPerTablePlot.set(
   xlabel='Amount of incorrectly labeled rows per table',
    ylabel='Amount of tables',
    title='Incorrectly labled rows per table'
)

In [None]:
def getRowCount(table):
    return len(table['annotations'])

rowCountPerTablePlot = incorrectPredictedTables.apply(getRowCount, axis='columns').value_counts().plot(
    kind='bar', 
    figsize=(20, 10)
)
labelBars(rowCountPerTablePlot)
rowCountPerTablePlot.set(
   xlabel='Table size (total table row count)',
    ylabel='Amount of tables',
    title='Dependency between table size (total row count) and prediction correctness'
)

In [None]:
wronglyPredicted = list(incorrectPredictedTables['wronglyPredicted'])
wronglyPredicted = [item for sublist in wronglyPredicted for item in sublist]
wronglyPredicted = pd.DataFrame(wronglyPredicted)
print('Count of which row type got predicted incorrectly:')
wronglyPredicted.groupby('annotated').count()

In [None]:
print('Table ids of incorrectly predicted tables')
pd.set_option('display.max_colwidth', -1)
incorrectPredictedTables[['_id', 'predictions']]

## Possible reasons for incorrect labeling (1):
- background color not taken into account enough -> to less examples where background color indicates Header or to many example where a colored cell is not a Header cell
- It's a legend and marked as data while we would label it as 'Other' -> taking into account the occurrence of characters like '=' ':' could help?
- merged cell doesn't seem to be a good indicator that cell should be 'Other' instead of 'Data'
- group header mistaken as real header
- maybe the tables (with many rows) in the test set had no header and therefor the size was more important? (need to check if row/col count is taken as feature)
- 'bold' style doesn't indicate if it's a header for sure -> tr/thead feature is more important -> if tr/thead is missing, but cell is bold the row still gets marked as 'Data' instead of 'Header' -> introduce feature accross whole row for bold too

# Random Forests

In [34]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
tables = pd.DataFrame(list(cursor))
client.close()

In [35]:
trainTables = pd.DataFrame(json.load(open(os.path.join('..', 'data', 'train.json'), 'r')).values(), columns=['_id'])
testTables = pd.DataFrame(json.load(open(os.path.join('..', 'data', 'test.json'), 'r')).values(), columns=['_id'])
trainTables = trainTables['_id'].apply(lambda tableID: tables.loc[tables['_id'].astype(str) == tableID].iloc[0])
testTables = testTables['_id'].apply(lambda tableID: tables.loc[tables['_id'].astype(str) == tableID].iloc[0])


## Feature preperation

In [36]:
# random forests expect float values so logarithmic binning doesn't work
def normalizeRow(rowDict):
    colCount = rowDict['colCount']
    rowDict.pop('colCount')
    for feature in rowDict:
        rowDict[feature] = rowDict[feature]/colCount
    return rowDict
 
def flattenRowFeatures(tables):
    tableFeatures = [tableFeatures for tableFeatures in tables["features"]]
    rowFeatures = []
    # reduce features to one table with features per row
    for table in tableFeatures:  
        for rowKey in table:
            rowDict = table[rowKey]
            rowDict = normalizeRow(rowDict)
            rowDict['normalizedRowIndex'] = int(rowKey)/len(table)
            rowFeatures.append(rowDict)

    rowFeatures = pd.DataFrame(rowFeatures)
    return rowFeatures
    
def removeSimilarityFeatures(rowFeatures):
    # clean of features comparing neighbouring rows (because they are NaN in first and last row)
    columnsToDrop = []
    for columnName in rowFeatures:
        if columnName[-2:] == "Al" or columnName[-2:] == "Au" or columnName[-2:] == "Bl" or columnName[-2:] == "Bu":
            columnsToDrop.append(columnName)
    rowFeatures = rowFeatures.drop(columns = columnsToDrop)
    return rowFeatures

def cleanNanFeatures(rowFeatures):
    for column in rowFeatures:
        for value in column:
            if value == "NaN":
                value = -1
    return rowFeatures
    
def getRandomForestFeatures(tables):
    rowFeatures = flattenRowFeatures(tables)
    rowFeatures = cleanNaNFeatures(rowFeatures)
    return rowFeatures

def flattenAnnotations(tables):
    tableLables = [tableAnnotations for tableAnnotations in tables["annotations"]]
    rowLables = reduce(list.__add__, tableLables)
    labels = {"Header": 0, "Data": 1, "Other": 2}
    rowLables = [labels[lable] for lable in rowLables]
    return rowLables

## Train

In [37]:
X_train = getRandomForestFeatures(trainTables)
X_test = getRandomForestFeatures(testTables)
y_train = flattenAnnotations(trainTables)
y_test = flattenAnnotations(testTables)

In [38]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(X_train, y_train);

## Test

In [39]:
predictions = rf.predict(X_test)
absolutePredictions = [round(p) for p in predictions]

## Evaluate

In [40]:
import numpy as np
from sklearn.metrics import classification_report

# Print out the classification report
print(classification_report(
    y_test, absolutePredictions,
    target_names=["Header", "Data", "Other"]))

              precision    recall  f1-score   support

      Header       0.95      0.96      0.95       318
        Data       0.99      0.98      0.99      3718
       Other       0.78      0.86      0.82       256

   micro avg       0.97      0.97      0.97      4292
   macro avg       0.91      0.93      0.92      4292
weighted avg       0.97      0.97      0.97      4292



In [41]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

feature_list = list(X_train.columns)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('{:20} Importance: {}'.format(*pair)) for pair in feature_importances];

normalizedRowIndex   Importance: 0.5
isMerged             Importance: 0.13
isTHOrInTHead        Importance: 0.12
isDate               Importance: 0.09
isShortText          Importance: 0.09
isBold               Importance: 0.03
isCenterAligned      Importance: 0.02
isEmpty              Importance: 0.01
isItalic             Importance: 0.01
isLongText           Importance: 0.0
isNumeric            Importance: 0.0
isText               Importance: 0.0
isTotal              Importance: 0.0
isUnderlined         Importance: 0.0
