# Table Header Detection

## Requirements
- Conda or pip
- MongoDB instance
- PyMongo (will be installed by the notebook)

In [2]:
#import sys
#!conda install --yes --prefix {sys.prefix} pymongo
#!conda install --yes --prefix {sys.prefix} premailer

#!{sys.executable} -m pip install numpy --upgrade
#!{sys.executable} -m pip install pandas
#!{sys.executable} -m pip install cssutils
#!{sys.executable} -m pip install premailer
#!{sys.executable} -m pip install python-crfsuite

import os
import json
import re
import pandas as pd
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from cssutils import parseStyle
from premailer import Premailer
import time
from dateutil.parser import parse
import math
import pycrfsuite
from sklearn.model_selection import train_test_split


## Loading the seed data into mongoDB
- initial dataset [Wikipedia TabEL dataset](http://websail-fe.cs.northwestern.edu/TabEL/)
- dataset is lacking of some styling information
- we're crawling the wikipedia pages on our own
  - that should be feasible since we have to use labeled data only (both for training & testing)
  - we're taking the TabEL dataset pageID's as starting point, since we know that there should be at least one relational table

Each line of the TabEL dataset contains one JSON object representing a single table. However, the JSON objects are not contained within a JSON array. We need to wrap the single tables into an array first before we can parse the file as a whole.

In [4]:
def wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath):
    inputFile = open(inputFilePath, 'r')
    outputFile = open(outputFilePath, 'w')

    outputFile.write('[')

    previousLine = False
    for tableLineJsonObject in inputFile:
        if (previousLine):
            outputFile.write(previousLine + ',')
        previousLine = tableLineJsonObject
    if (previousLine):
        outputFile.write(previousLine)

    outputFile.write(']')

    inputFile.close()
    outputFile.close()

Check if TabEL dataset has been transformed into an array before. If not, we want to parse it now.

In [12]:
inputFilePath = os.path.join('data', 'wikipedia_0_5000.json')
outputFilePath = os.path.join('data', 'wikipedia_0_5000_fixed.json')
if not os.path.isfile(outputFilePath):
    wrapJSONObjectLineIntoTable(inputFilePath, outputFilePath)

Parse JSON Array

In [83]:
tabEL = pd.read_json(os.path.join('data', 'wikipedia_0_5000_fixed.json'))
tabEL.head()

Unnamed: 0,_id,numCols,numDataRows,numHeaderRows,numericColumns,order,pgId,pgTitle,sectionTitle,tableCaption,tableData,tableHeaders,tableId
0,10000032-1,4,11,1,[1],0.535975,10000032,Mid Antrim (Northern Ireland Parliament consti...,Members of Parliament,Members of Parliament,"[[{'cellID': -1, 'textTokens': [], 'text': '',...","[[{'cellID': -1, 'textTokens': [], 'text': 'El...",1
1,1000006-1,4,21,1,[],0.856769,1000006,Römer (crater),Satellite craters,Satellite craters,"[[{'cellID': -1, 'textTokens': [], 'text': 'A'...","[[{'cellID': -1, 'textTokens': [], 'text': 'Rö...",1
2,10000088-1,2,1,3,[],0.318258,10000088,Whispermoon,,Track listing,"[[{'cellID': -1, 'textTokens': [], 'text': 'Al...","[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...",1
3,10000218-1,2,6,1,[],0.553872,10000218,Khalsa Diwan Society Vancouver,First executive committee,First executive committee,"[[{'cellID': -1, 'textTokens': [], 'text': 'Pr...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ti...",1
4,10000228-1,2,7,1,[1],0.951118,10000228,Julien Leparoux,Year-end charts,Year-end charts,"[[{'cellID': -1, 'textTokens': [], 'text': 'Na...","[[{'cellID': -1, 'textTokens': [], 'text': 'Ch...",1


Get 1000 unique page IDs and fetch the HTML content for it.

In [101]:
pageIDSample = tabEL[['pgId']].sample(n=1000)

## Crawl the wikipedia pages and fetch all occurring tables
We use the pageID's from the TabEL dataset and crawl the wikipedia html. One page might include multiple tables. We only extract HTML tables with the class `wikitable`. The style from the CSS file gets parsed into inline style.

In [8]:
BASE_URL = 'https://en.wikipedia.org/'
wikipediaCSSFilePath = os.path.join('data', 'wikipedia.css')
instance = Premailer(base_url=BASE_URL)

cssFilePath = os.path.join('data', 'wikipedia.css')
cssFile = open(cssFilePath, 'r')
css = cssFile.read()
style = '<style>' + css + '</style>'

def inlineCSS(html):
    return instance.transform(html.replace('</head>', style + '</head>'))

def crawl(tabEL):
    payload = { 'curid': str(tabEL['pgId']) }
    html = requests.get(BASE_URL, params=payload).text
    htmlWithInlineCSS = inlineCSS(str(html))
    return htmlWithInlineCSS

In [None]:
pageIDSample['HTML'] = pageIDSample.apply(crawl, axis='columns')

In [206]:
pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\n', '')
pageIDSample['HTML'] = pageIDSample['HTML'].str.replace('\t', '')

Since crawling is time expensive we store the data into file and db first.

In [17]:
client = MongoClient()
db = client.bob
pages = db.pages
pages.insert_many(pageIDSample.to_dict('records'))
pageIDSample.to_json(os.path.join("data", "crawled.json"))
client.close()

If we share data files this will load them into the database.

In [9]:
client = MongoClient()
db = client.bob
tables = db.tables
#tables.delete_many({})
input_file_path = '../data/new/tables.json'
file = open(input_file_path, 'r')
uniquePgIds = set()
for line in file:
    jsonTable = json.loads(line)
    oldId = jsonTable['_id']
    jsonTable['_id'] = oldId['$oid']
    
    tables.insert_one(jsonTable)
    
for table in tables:
    

In [2]:
client = MongoClient()
db = client.bob
pages = db.pages
cursor = pages.find({})
pageIDSample = pd.DataFrame(list(cursor))
client.close()

In [18]:
pageIDSample.head()

Unnamed: 0,HTML,pgId
0,"<!DOCTYPE html><html class=""client-nojs"" lang=...",10041828
1,"<!DOCTYPE html><html class=""client-nojs"" lang=...",10086127
2,"<!DOCTYPE html><html class=""client-nojs"" lang=...",1008145
3,"<!DOCTYPE html><html class=""client-nojs"" lang=...",1012548
4,"<!DOCTYPE html><html class=""client-nojs"" lang=...",10128185


Now we extract the tables along with some metadata. For each row we assign an unique ID (the index of the row within the table) and a tag (whether the row includes `th-tags` only or is contained within a `thead`)

In [None]:
HEADLINE_PATTERN = re.compile('(h|H)\d')
LABEL_CONTROLS = [
    {
        'label': 'Header',
        'color': 'light-blue'
    }, {
        'label': 'Data',
        'color': 'lime'
    }, {
        'label': 'Other',
        'color': 'orange'
    }
];

def extractPageTitle(soup):
    headlines = soup.select('h1')
    return headlines[0].text if len(headlines) > 0 else 'N/A'

def extractTableTitle(table):
    for sibling in table.previous_siblings:
        if (sibling is not None and sibling.name is not None and HEADLINE_PATTERN.match(sibling.name)):
            return sibling.text
    return 'N/A'

def addLabelControls(row, rowIndex, soup):
    labelControlTag = soup.new_tag(
        'th',
        attrs={
            'class': 'flex space-evenly'
        }
    )
    for labelControl in LABEL_CONTROLS:
        labelControlButton = soup.new_tag(
            'a',
            attrs={
                'class': 'labelButton waves-effect waves-light btn-small ' + labelControl['color'],
                'onClick': 'annotate(' + str(rowIndex) + ', "' + labelControl['label'] + '");',
            }
        )
        labelControlButton.string = labelControl['label']
        labelControlTag.append(labelControlButton)
    row.insert(0, labelControlTag)
    
def tagRow(row, rowIndex, soup, isHead=False):
    row['data-label'] = 'Header' if isHead else 'Data'
    row['data-row-index'] = rowIndex
    addLabelControls(row, rowIndex, soup)
    
def isHeaderRow(row):
    thTags = row.find_all('th', recursive=False)
    childCount = len(row.contents)
    return childCount == len(thTags) or row.parent.name == 'thead'

def tagRows(table, soup):
    rows = table.find_all('tr')
    annotations = []
    for rowIndex, row in enumerate(rows):
        isHeader = isHeaderRow(row)
        tagRow(row, rowIndex, soup, isHeader)  
        annotations.append('Header' if isHeader else 'Data')
    return annotations

def removeTableWidthLimitation(table):
    if not table.has_attr('style'):
        return
    tableStyle = parseStyle(table['style'])
    tableStyle['width'] = '100%'
    tableStyle['font-size'] = '100%'
    table['style'] = tableStyle.cssText
        
def extractTableInformation(table, pageID, tableIndex, pageTitle, soup):
    extractedInformation = {
        'pageID': pageID,
        'tableIndex': tableIndex,
        'pageTitle': pageTitle
    }
    extractedInformation['html'] = table.prettify()
    annotations = tagRows(table, soup)
    removeTableWidthLimitation(table)
    extractedInformation['taggedHtml'] = table.prettify()
    extractedInformation['annotations'] = annotations
    extractedInformation['tableTitle'] = extractTableTitle(table)
    return extractedInformation

def hasNestedTable(table):
    return len(table.select('table')) > 0

def extractTables(page):
    soup = BeautifulSoup(page['HTML'])
    pageTitle = extractPageTitle(soup)
    wikiTables = soup.select('.wikitable')
    extractedTables = []
    for tableIndex, table in enumerate(wikiTables):
        if hasNestedTable(table):
            continue
        extractedTable = extractTableInformation(table, page['pgId'], tableIndex, pageTitle, soup)
        extractedTables.append(extractedTable)
    return extractedTables

In [20]:
client = MongoClient()
db = client.bob
tables = db.tables
for extractedTables in pageIDSample.apply(extractTables, axis='columns').values:
    if len(extractedTables) > 0:
        tables.insert_many(extractedTables)
client.close()

ERROR	Property: Invalid value for "CSS Level 2.1" property: 1000 [1:1: width]


The data can now get labeled using the provided [labeling tool](https://github.com/RichStone/web-tables-header-detection/tree/master/Labeling%20Tool).

# Feature Extraction

In [12]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({})
tables = pd.DataFrame(list(cursor))
client.close()

In [13]:
tables.head()

Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,skipped,tableIndex,tableTitle,taggedHtml
0,5d020c18e7ee03eab39ab0ee,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isNumeric': 0.0, 'isBold': 0.0, 'isCen...","<table align=""center"" bgcolor=""#f8f9fa"" class=...",1027342.0,1996 CONCACAF Gold Cup,,1.0,Venues[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
1,5d020c18e7ee03eab39ab0fa,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isNumeric': 0.0, 'isBold': 0.0, 'isCen...","<table align=""center"" bgcolor=""#f8f9fa"" class=...",1027385.0,1998 CONCACAF Gold Cup,,4.0,Group C[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
2,5d020c18e7ee03eab39ab111,1560430000000.0,"[Data, Data, Data, Data, Data, Data, Data]","{'0': {'isNumeric': 0.0, 'isBold': 0.0, 'isCen...","<table bgcolor=""#f8f9fa"" class=""wikitable"" sty...",1027435.0,Tomb of the Unknown Soldier (Warsaw),,0.0,Battles currently featured on the stone tablet...,"<table bgcolor=""#f8f9fa"" class=""wikitable"" sty..."
3,5d020c18e7ee03eab39ab112,1560430000000.0,"[Header, Data, Other, Data, Other, Data, Other...","{'0': {'isNumeric': 0.0, 'isBold': 0.0, 'isCen...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...",1027443.0,Groovie Goolies,,0.0,Episodes[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."
4,5d020c18e7ee03eab39ab11b,1560430000000.0,"[Header, Data, Other]","{'0': {'isNumeric': 0.0, 'isBold': 0.0, 'isCen...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...",1027471.0,List of The Flintstones episodes,,1.0,Pilot (1959)[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."


In [14]:
SHORT_TEXT_THRESHOLD = 20
LONG_TEXT_THRESHOLD = 40

def isInt(value):
    try: 
        int(value)
        return True
    except ValueError:
        return False
    
def getRowSpan(cell):
    if cell.has_attr('rowspan') and isInt(cell['rowspan']):
        return int(cell['rowspan'])
    return 1
    
def getColSpan(cell):
    if cell.has_attr('colspan') and isInt(cell['colspan']):
        return int(cell['colspan'])
    return 1

def isMerged(cell):
    return (
        getColSpan(cell) > 1 or
        getRowSpan(cell) > 1
    )

def isCenterAligned(cell, style):
    return (
        (cell.has_attr('align') and cell['align'] == 'center') or
        (style is not None and 'text-align' in style and style['text-align'] == 'center')
    )

def isThOrInTHead(cell):
    row = cell.parent
    rowParent = row.parent
    return (
        cell.name == 'th' or
        rowParent.name == 'thead'
    )

def extractLayoutFeatures(cell, style):
    return {
        'isMerged': isMerged(cell),
        'isCenterAligned': isCenterAligned(cell, style),
        'isTHOrInTHead': isThOrInTHead(cell)
    }

def isBold(cell, style):
    return bool(
        style is not None and (
            style['font-weight'] == 'bold' or 
            style['font-style'] == 'bold'
        ) or
        cell.find('b') or
        cell.find('strong')
    )

def isItalic(cell, style):
    return bool(cell.find('i'))

def isUnderlined(cell, style):
    return (
        cell.find('u') or
        style is not None and (
            style['text-decoration'] == 'underline' or
            style['font-style'] == 'bold'
        )
    )

def isColored(cell, style):
    return (
        style is not None and (
            'background-color' in style or
            'color' in style
        )
    )

def extractStyleFeatures(cell, style):
    return {
        'isBold': isBold(cell, style),
        'isItalic': isItalic(cell, style),
        'isUnderlined': isUnderlined(cell, style)
    }

def getCellStyle(cell):
    return parseStyle(cell['style']) if cell.has_attr('style') else None

def getContentLength(cell):
    return len(re.sub('\s+',' ', cell.get_text()).split())

def isEmpty(cell):
    return getContentLength(cell) == 0

def isText(cell):
    return cell.get_text().isalpha()

def isNumeric(cell):
    return cell.get_text().isdigit()

def isDate(cell):
    try: 
        parse(cell.get_text(), fuzzy=False)
        return True
    except (ValueError, OverflowError):
        return False
    
def isShortText(cell):
    return getContentLength(cell) <= SHORT_TEXT_THRESHOLD

def isLongText(cell):
    return getContentLength(cell) > LONG_TEXT_THRESHOLD

def isTotal(cell):
    return cell.get_text().lower() == 'total'

def extractValueFeatures(cell):
    return {
        'isEmpty': isEmpty(cell),
        'isText': isText(cell),
        'isNumeric': isNumeric(cell),
        'isDate': isDate(cell),
        'isShortText': isShortText(cell),
        'isLongText': isLongText(cell),
        'isTotal': isTotal(cell)
    }

def mapDictBoolValuesToInt(dictionary):
    return { key: int(value) for key, value in dictionary.items() }

def applyColSpanFactor(dictionary, colSpan):
    return { key: value * colSpan for key, value in dictionary.items() }

def merge(featuresA, featuresB):
    return { k: featuresA.get(k, 0) + featuresB.get(k, 0) for k in set(featuresA) | set(featuresB) }

def stringifyDictKeys(dictionary):
    return { str(key): value for key, value in dictionary.items() }

def numNormalisedCols(row):
    numCols = 0
    for cell in row.children:
        if type(cell) is Tag:
            numCols += getColSpan(cell)
    return numCols
            
def getSimilarity(feature, cell, neighbour, suffix):
    similarity = {}
    similarity[feature + 'A' + suffix] = cell[feature] and neighbour[feature]
    similarity[feature + 'B' + suffix] = cell[feature] and not neighbour[feature]
    return similarity
    
def extractSimilarityFeatures(cell, neighbour, suffix):
    similarityFeatures = {
        **getSimilarity('isMerged', cell, neighbour, suffix),
        **getSimilarity('isCenterAligned', cell, neighbour, suffix),
        **getSimilarity('isTHOrInTHead', cell, neighbour, suffix),
        **getSimilarity('isBold', cell, neighbour, suffix),
        **getSimilarity('isItalic', cell, neighbour, suffix),
        **getSimilarity('isUnderlined', cell, neighbour, suffix),
        **getSimilarity('isEmpty', cell, neighbour, suffix),
        **getSimilarity('isText', cell, neighbour, suffix),
        **getSimilarity('isNumeric', cell, neighbour, suffix),
        **getSimilarity('isDate', cell, neighbour, suffix),
        **getSimilarity('isShortText', cell, neighbour, suffix),
        **getSimilarity('isLongText', cell, neighbour, suffix),
        **getSimilarity('isTotal', cell, neighbour, suffix)        
    }
    return similarityFeatures    
    
def addSimilarityFeatures(normalizedFeatureTable):
    nftWithSimilarity = []
    numRows = len(normalizedFeatureTable)
    for rowIndex, row in enumerate(normalizedFeatureTable):
        nftWithSimilarity.append([])
        for cellIndex, cell in enumerate(row):
            features = cell
            if rowIndex > 0:
                features = {
                    **extractSimilarityFeatures(cell, normalizedFeatureTable[rowIndex - 1][cellIndex], 'u'), 
                    **features
                    }
            if rowIndex < numRows - 1:
                features = {
                    **extractSimilarityFeatures(cell, normalizedFeatureTable[rowIndex + 1][cellIndex], 'l'),
                    **features
                }
            intCellFeatures = mapDictBoolValuesToInt(features)
            nftWithSimilarity[-1].append(intCellFeatures)
    return nftWithSimilarity
    
def cleanOfEmptyCells(table):
    lastEmptyCellIndex = len(table[0])
    for row in table:
        for cellIndex,cell in enumerate(row):
            if cell == 'empty cell':
                lastEmptyCellIndex = min(lastEmptyCellIndex, cellIndex)
    newTable =[]
    for row in table:
        newTable.append(row[:lastEmptyCellIndex])
    return newTable

def normalizedFeatureTable(table):
    soup = BeautifulSoup(table['html'])
    rows = soup.select('tr')
    numRows = len(rows)
    numCols = numNormalisedCols(rows[0])
    nft = [['empty cell' for i in range(numCols)] for j in range(numRows)]
    for rowIndex, row in enumerate(rows):
        cellIndex = 0
        for cell in row.children:
            if type(cell) is not Tag:
                continue
            cellStyle = getCellStyle(cell)
            boolCellFeatures = {
                **extractLayoutFeatures(cell, cellStyle),
                **extractStyleFeatures(cell, cellStyle),
                **extractValueFeatures(cell)
            }
            boolCellFeatures['colCount'] = 1
            colSpan = getColSpan(cell)
            rowSpan = getRowSpan(cell)
            # find next empty cell
            while cellIndex < numCols and nft[rowIndex][cellIndex] != 'empty cell':
                cellIndex += 1
            for rIndex in range(rowIndex, min(rowIndex + rowSpan, numRows)):
                for cIndex in range(cellIndex, min(cellIndex + colSpan, numCols)):
                    nft[rIndex][cIndex] = boolCellFeatures
            cellIndex += colSpan
    nft = cleanOfEmptyCells(nft)
    return addSimilarityFeatures(nft)    
        
def isEmptyTable(table):
    soup = BeautifulSoup(table['html'])
    rows = soup.select('tr')
    return len(rows) == 0
    
def extractFeatures(table):
    if isEmptyTable(table):
        print(table['html'])
        return []
    featureTable = normalizedFeatureTable(table)
    rowFeatureTable = {}
    for rowIndex, row in enumerate(featureTable):
        # count how often every feature is true in a row
        rowFeatures = {}
        for cellFeatures in row:
            rowFeatures = merge(rowFeatures, cellFeatures)
        rowFeatureTable[rowIndex] = rowFeatures
    rowFeatureTable = stringifyDictKeys(rowFeatureTable)
    return rowFeatureTable

In [15]:
tables['features'] = tables.apply(extractFeatures, axis='columns')
tables.head()

ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:67: vertical-align]
ERROR	Property: Invalid value for "CSS Level 2.1" property: center [1:67: vertical-align]
ERROR	Property: Invalid value for "CSS Backgrounds and Borders Module Level 3" property: none none style style [1:48: border-style]


Unnamed: 0,_id,annotatedAt,annotations,features,html,pageID,pageTitle,skipped,tableIndex,tableTitle,taggedHtml
0,5d020c18e7ee03eab39ab0ee,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...",1027342.0,1996 CONCACAF Gold Cup,,1.0,Venues[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
1,5d020c18e7ee03eab39ab0fa,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...",1027385.0,1998 CONCACAF Gold Cup,,4.0,Group C[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
2,5d020c18e7ee03eab39ab111,1560430000000.0,"[Data, Data, Data, Data, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable"" sty...",1027435.0,Tomb of the Unknown Soldier (Warsaw),,0.0,Battles currently featured on the stone tablet...,"<table bgcolor=""#f8f9fa"" class=""wikitable"" sty..."
3,5d020c18e7ee03eab39ab112,1560430000000.0,"[Header, Data, Other, Data, Other, Data, Other...","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...",1027443.0,Groovie Goolies,,0.0,Episodes[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."
4,5d020c18e7ee03eab39ab11b,1560430000000.0,"[Header, Data, Other]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...",1027471.0,List of The Flintstones episodes,,1.0,Pilot (1959)[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."


In [16]:
client = MongoClient()
db = client.bob
tablesCollection = db.tables
dictTables = tables.to_dict('records')
for table in dictTables:
    tablesCollection.replace_one({'_id': table['_id']}, table, True)
client.close()

## Logarithmic Binning

In [4]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({})
tables = pd.DataFrame(list(cursor))
client.close()
tables.head()

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused

In [None]:
def calcA(c, r):
    if (c == 0):
        return 0
    if (c == r):
        return r
    if (c > r / 2.0):
        return math.floor(math.log2(r - c) + 1)
    return math.floor(math.log2(c) + 1)

def calcB(c, r):
    return math.floor(math.log2(r))

def isInSameBin(rowA, rowB, featureKey):
    return (
        calcB(rowA[featureKey], rowA['colCount']) == calcB(rowB[featureKey], rowB['colCount']) and 
        calcA(rowA[featureKey], rowA['colCount']) == calcA(rowB[featureKey], rowB['colCount'])
    )

def logBinTable(table):
    if len(table['features']) == 0:
        return []
    logBins = {}
    for rowIndex, row in table['features'].items():
        logBin = dict(row)
        colCount = logBin.pop('colCount')
        logBin = { 
            featureKey: { 
                'a': calcA(feature, colCount),
                'b': calcB(feature, colCount)
            } for featureKey, feature in logBin.items() 
        }
        logBins[rowIndex] = logBin
    return logBins

In [None]:
tables['logBin'] = tables.apply(logBinTable, axis='columns')
tables.head()

In [None]:
client = MongoClient()
db = client.bob
tablesCollection = db.tables
dictTables = tables.to_dict('records')
for table in dictTables:
    tablesCollection.replace_one({'_id': table['_id']}, table, True)
client.close()

## Conditional Random Fields

In [5]:
client = MongoClient()
db = client.bob
tables = db.tables
cursor = tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
tables = pd.DataFrame(list(cursor))
print(len(tables))
client.close()
tables.head()

975


Unnamed: 0,_id,annotatedAt,annotations,features,html,logBin,pageID,pageTitle,skipped,tableIndex,tableTitle,taggedHtml
0,5d020c18e7ee03eab39ab0ee,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027342.0,1996 CONCACAF Gold Cup,,1.0,Venues[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
1,5d020c18e7ee03eab39ab0fa,1560430000000.0,"[Header, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table align=""center"" bgcolor=""#f8f9fa"" class=...","{'0': {'isDateBl': {'a': 0, 'b': 3}, 'isMerged...",1027385.0,1998 CONCACAF Gold Cup,,4.0,Group C[edit],"<table align=""center"" bgcolor=""#f8f9fa"" class=..."
2,5d020c18e7ee03eab39ab111,1560430000000.0,"[Data, Data, Data, Data, Data, Data, Data]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable"" sty...","{'0': {'isDateBl': {'a': 0, 'b': 0}, 'isMerged...",1027435.0,Tomb of the Unknown Soldier (Warsaw),,0.0,Battles currently featured on the stone tablet...,"<table bgcolor=""#f8f9fa"" class=""wikitable"" sty..."
3,5d020c18e7ee03eab39ab112,1560430000000.0,"[Header, Data, Other, Data, Other, Data, Other...","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027443.0,Groovie Goolies,,0.0,Episodes[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."
4,5d020c18e7ee03eab39ab11b,1560430000000.0,"[Header, Data, Other]","{'0': {'isDateBl': 0, 'isMerged': 0, 'isItalic...","<table bgcolor=""#f8f9fa"" class=""wikitable plai...","{'0': {'isDateBl': {'a': 0, 'b': 1}, 'isMerged...",1027471.0,List of The Flintstones episodes,,1.0,Pilot (1959)[edit],"<table bgcolor=""#f8f9fa"" class=""wikitable plai..."


In [20]:
def getCRFFeatures(table):
    tableFeatures = table['logBin']
    tableAnnotations = table['annotations']
    rowFeatures = []
    rowAnnotations = []
    
    for rowIndex in tableFeatures:
        rowFeatures.append(tableFeatures[rowIndex])
        rowAnnotations.append(tableAnnotations[int(rowIndex)])
    tableID = table['_id']
    return (rowFeatures, rowAnnotations, tableID)

tableFeatures = tables.apply(getCRFFeatures, axis='columns')
featureSequence = []
lableSequence = []
for tf in tableFeatures:
    featureSequence.append(tf[0])
    #lableSequence.append(tf[1])
    lableSequence.append((tf[2], tf[1]))

In [21]:
X_train, X_test, y_trainId, y_testId = train_test_split(featureSequence, lableSequence, test_size=0.1)
y_train = [a[1] for a in y_trainId]
y_test = [a[1] for a in y_testId]
testIDs = [a[0] for a in y_trainId]

[['Header', 'Other', 'Data', 'Data', 'Data', 'Other', 'Data', 'Other'], ['Header', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Data', 'Other', 'Other', 'Other'], ['Header', 'Data'], ['Header', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Other'], ['Data', 'Data'], ['Header', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data', 'Data'], ['Header', 'Data', 'Data'], ['Header', 'Data', 'Data', 'Data

In [22]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# params copied from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.train('../data/firstTraining.crfsuite')

In [23]:
tagger = pycrfsuite.Tagger()
tagger.open('../data/firstTraining.crfsuite')
y_pred = [tagger.tag(xseq) for xseq in X_test]


In [27]:
tables = db.tables
tables.find
for tableId, predictions in zip(testIDs, y_pred):
    tables.update_one({'_id': tableId}, {'$set': {'predictions': predictions}})

In [28]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"Header": 0, "Data": 1, "Other": 2}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["Header", "Data", "Other"]))


              precision    recall  f1-score   support

      Header       0.96      0.94      0.95       108
        Data       0.98      0.99      0.98      1288
       Other       0.58      0.41      0.48        46

   micro avg       0.97      0.97      0.97      1442
   macro avg       0.84      0.78      0.81      1442
weighted avg       0.96      0.97      0.96      1442

