# Evaluation

## Requirements

In [None]:
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient

## Evaluation helper methods

In [None]:
# Set the default font size used by every plot created below
plt.rcParams['font.size'] = 24

In [None]:
def labelBars(plot):
    """Write each bar's height (rounded to two decimals) just above the bar.

    Requires numpy to be imported as ``np`` at the top of the notebook.

    Parameters:
        plot: an object exposing ``patches`` (each with ``get_x``,
            ``get_width`` and ``get_height``) and an ``annotate`` method,
            e.g. the axes returned by a pandas bar plot.

    Returns:
        The same ``plot`` object, so calls can be chained.
    """
    for bar in plot.patches:
        height = bar.get_height()
        # Anchor the label at the horizontal centre of the bar, at its top edge.
        anchor = (bar.get_x() + bar.get_width() / 2., height)
        plot.annotate(
            np.round(height, decimals=2),
            anchor,
            ha='center',
            va='center',
            # Shift the text 10 points upwards so it does not overlap the bar.
            xytext=(0, 10),
            textcoords='offset points'
        )
    return plot

In [None]:
# Load every table document that has been annotated and was not skipped
# from the `tables` collection of the `bob` database.
client = MongoClient()
try:
    db = client.bob
    cursor = db.tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
    # Materialise the cursor while the connection is still open.
    tables = pd.DataFrame(list(cursor))
finally:
    # Release the connection even if the query or the conversion fails.
    client.close()
tables.head()

In [None]:
# Resolve the table ids listed in test_wtc.json to their full rows in `tables`.
# The file is opened with a context manager so the handle is closed right away.
with open(os.path.join('..', 'data', 'test_wtc.json'), 'r') as testIdFile:
    testTables = pd.DataFrame(json.load(testIdFile).values(), columns=['_id'])
# Stringify the `_id` column once instead of once per looked-up id.
tableIdStrings = tables['_id'].astype(str)
# `.iloc[0]` raises IndexError when an id from the file is not present in `tables`.
testTables = testTables['_id'].apply(lambda tableID: tables.loc[tableIdStrings == tableID].iloc[0])

In [None]:
# Resolve the table ids listed in train.json to their full rows in `tables`.
# The file is opened with a context manager so the handle is closed right away.
with open(os.path.join('..', 'data', 'train.json'), 'r') as trainIdFile:
    trainTables = pd.DataFrame(json.load(trainIdFile).values(), columns=['_id'])
# Stringify the `_id` column once instead of once per looked-up id.
tableIdStrings = tables['_id'].astype(str)
# `.iloc[0]` raises IndexError when an id from the file is not present in `tables`.
trainTables = trainTables['_id'].apply(lambda tableID: tables.loc[tableIdStrings == tableID].iloc[0])

### Table count per dataset

In [None]:
# Report how many tables each dataset contains
for caption, frame in (
    ('Total tables count: ', tables),
    ('Training tables count: ', trainTables),
    ('Test tables count: ', testTables),
):
    print(caption, len(frame))

### Avg. Table size (row count) per dataset

In [None]:
def getRowCount(tables):
    """Return, for every row of ``tables``, the length of its 'annotations' entry.

    The result comes from a row-wise ``DataFrame.apply`` and is aligned with
    the index of ``tables``.
    """
    def countAnnotations(table):
        return len(table['annotations'])

    return tables.apply(countAnnotations, axis=1)

# Average number of annotated rows per table, rounded to the nearest integer.
# Each line is labelled with the dataset it actually measures.
print('Avg. table size for training set: ', round(getRowCount(trainTables).mean()))
print('Avg. table size for test set: ', round(getRowCount(testTables).mean()))
print('Avg. table size for all tables: ', round(getRowCount(tables).mean()))

## Dataset cell type ratios

In [None]:
# Count how often each annotation label occurs across all tables of a dataset
totalCounter = Counter(
    label for tableLabels in tables['annotations'].values for label in tableLabels
)
trainCounter = Counter(
    label for tableLabels in trainTables['annotations'].values for label in tableLabels
)
testCounter = Counter(
    label for tableLabels in testTables['annotations'].values for label in tableLabels
)

In [None]:
def getRatio(counter, labels=('Other', 'Header', 'Data')):
    """Return the percentage share of each label in ``counter``.

    Parameters:
        counter: mapping of label -> occurrence count (a ``Counter`` in this
            notebook, so labels that never occurred count as 0).
        labels: labels to report, in output order. Defaults to the three
            annotation types used in this notebook.

    Returns:
        A list with one percentage (0-100) per entry of ``labels``. The
        percentages are relative to the sum of *all* counts in ``counter``,
        including labels that are not listed in ``labels``. An empty counter
        yields 0.0 for every label instead of dividing by zero.
    """
    total = sum(counter.values())
    if total == 0:
        return [0.0 for _ in labels]
    return [counter[label] * 100.0 / total for label in labels]
    
# Compare the label distribution of the test, train and complete datasets.
ratios = [getRatio(testCounter), getRatio(trainCounter), getRatio(totalCounter)]
ratioLabels = ['Test', 'Train', 'Total']
# Transpose so that the annotation labels form the index and each dataset is a column.
ratioDf = pd.DataFrame(ratios, index=ratioLabels, columns=['Other', 'Header', 'Data']).T
ratioPlot = ratioDf.plot(kind='barh', figsize=(20, 10))
ratioPlot.set(
    xlabel="Percentage",
    ylabel="Annotation",
    title="Distribution of cell types across datasets"
)

## Correctly predicted tables

In [None]:
# Load every table document that carries a `predictions` field.
client = MongoClient()
try:
    db = client.bob
    # Keep the collection under its own name so the `tables` DataFrame built
    # earlier in the notebook is not overwritten by a pymongo collection.
    tablesCollection = db.tables
    cursor = tablesCollection.find({"predictions" : {"$exists" : True}})
    # Materialise the cursor while the connection is still open.
    predictedTables = pd.DataFrame(list(cursor))
finally:
    # Release the connection even if the query or the conversion fails.
    client.close()
predictedTables.head()

In [None]:
def getWronglyPredicted(table):
    """Collect every position where the prediction differs from the annotation.

    Returns a list of ``{'annotated': ..., 'predicted': ...}`` dicts, one per
    mismatching position, in the order of ``table['annotations']``.
    """
    return [
        {'annotated': annotated, 'predicted': table['predictions'][position]}
        for position, annotated in enumerate(table['annotations'])
        if annotated != table['predictions'][position]
    ]

def getCorrectlyPredictedCount(table):
    """Return the number of annotations minus ``table['predictedIncorrectlyCount']``."""
    totalRows = len(table['annotations'])
    incorrectRows = table['predictedIncorrectlyCount']
    return totalRows - incorrectRows
    
def isWholeTableCorrectlyPredicted(table):
    """Tell whether ``table['predictedCorrectlyCount']`` equals the number of annotations."""
    correctRows = table['predictedCorrectlyCount']
    return correctRows == len(table['annotations'])

def getIncorrectlyPredictedCount(table):
    """Return how many entries ``table['wronglyPredicted']`` holds."""
    mismatches = table['wronglyPredicted']
    return len(mismatches)

In [None]:
# Derive the per-table evaluation columns. The order matters: each helper
# reads the column produced by the step before it.
for columnName, computeColumn in (
    ('wronglyPredicted', getWronglyPredicted),
    ('predictedIncorrectlyCount', getIncorrectlyPredictedCount),
    ('predictedCorrectlyCount', getCorrectlyPredictedCount),
    ('predictedCorrectly', isWholeTableCorrectlyPredicted),
):
    predictedTables[columnName] = predictedTables.apply(computeColumn, axis='columns')

In [None]:
# Number of tables in which every row was predicted correctly
fullyCorrectMask = predictedTables['predictedCorrectly']
correctlyPredictedTables = len(predictedTables[fullyCorrectMask])
print('Correctly predicted table count: ' + str(correctlyPredictedTables))

In [None]:
# Tables with at least one wrongly predicted row. reset_index() renumbers the
# rows from 0 and keeps the previous index as an 'index' column.
hasMistakes = ~predictedTables['predictedCorrectly']
incorrectPredictedTables = predictedTables.loc[hasMistakes].reset_index()
incorrectPredictedTableCount = len(incorrectPredictedTables)
print('Incorrect predicted table count: ' + str(incorrectPredictedTableCount))

## Correctly predicted rows

In [None]:
# Bar chart: for each number of mislabeled rows, how many tables have exactly
# that many mistakes (only tables with at least one mistake are included).
numberOfIncorrectRowsPerTablePlot = incorrectPredictedTables['predictedIncorrectlyCount'].value_counts().plot(
    kind='bar',
    figsize=(20, 10)
)
labelBars(numberOfIncorrectRowsPerTablePlot)
numberOfIncorrectRowsPerTablePlot.set(
    xlabel='Amount of incorrectly labeled rows per table',
    ylabel='Amount of tables',
    title='Incorrectly labeled rows per table'
)

In [None]:
# NOTE: this redefines the DataFrame-level getRowCount from the
# "Avg. Table size" cell; this version works on a single table row.
def getRowCount(table):
    """Return the number of entries in ``table['annotations']``."""
    annotations = table['annotations']
    return len(annotations)

# Bar chart: how many incorrectly predicted tables exist per table size
incorrectTableSizes = incorrectPredictedTables.apply(getRowCount, axis='columns')
tablesPerSize = incorrectTableSizes.value_counts()
rowCountPerTablePlot = tablesPerSize.plot(kind='bar', figsize=(20, 10))
labelBars(rowCountPerTablePlot)
rowCountPerTablePlot.set(
    xlabel='Table size (total table row count)',
    ylabel='Amount of tables',
    title='Dependency between table size (total row count) and prediction correctness'
)

In [None]:
# Flatten the per-table mismatch lists into one frame with a row per mismatch
wronglyPredicted = pd.DataFrame([
    mismatch
    for tableMismatches in incorrectPredictedTables['wronglyPredicted']
    for mismatch in tableMismatches
])
print('Count of which row type got predicted incorrectly:')
wronglyPredicted.groupby('annotated').count()

In [None]:
def getLabeledAsMatrix(tables):
    """Count, per annotated label, how often each label was predicted for it.

    Walks every row of ``tables`` and pairs each entry of 'annotations' with
    the entry of 'predictions' at the same position.

    Returns:
        A nested dict ``{annotated label: {predicted label: count}}``.
    """
    labeledAs = {}
    for _, table in tables.iterrows():
        for position, annotated in enumerate(table['annotations']):
            # Fetch (or create) the per-label tally for this annotation.
            predictionCounts = labeledAs.setdefault(annotated, {})
            predicted = table['predictions'][position]
            predictionCounts[predicted] = predictionCounts.get(predicted, 0) + 1
    return labeledAs

In [None]:
# Confusion matrix for the test set: one row per annotated label, one column
# per predicted label; combinations that never occurred are shown as 0.
labeledAsMatrix = getLabeledAsMatrix(testTables)
labeledAsMatrixPD = pd.DataFrame(
    labeledAsMatrix.values(),
    index=labeledAsMatrix.keys()
).fillna(0)
labeledAsMatrixPD


## More detailed analysis

In [None]:
print('Table ids of incorrectly predicted tables')
# Show full cell contents without truncation. `None` replaces the deprecated
# `-1`, which newer pandas releases no longer accept.
pd.set_option('display.max_colwidth', None)
incorrectPredictedTables[['_id', 'predictions']]

## Possible reasons for incorrect labeling (1):
- background color not taken into account enough -> too few examples where the background color indicates a Header, or too many examples where a colored cell is not a Header cell
- It's a legend and marked as data while we would label it as 'Other' -> taking into account the occurrence of characters like '=' ':' could help?
- merged cell doesn't seem to be a good indicator that cell should be 'Other' instead of 'Data'
- group header mistaken as real header
- maybe the tables (with many rows) in the test set had no header and therefore the size was more important? (need to check whether row/col count is used as a feature)
- 'bold' style doesn't reliably indicate a header -> the tr/thead feature is more important -> if tr/thead is missing but the cell is bold, the row still gets marked as 'Data' instead of 'Header' -> introduce a bold feature across the whole row too