# Requirements

In [None]:
import json
import os

import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report

# Random Forests

In [None]:
# Load every table document that has an "annotatedAt" field and is not
# flagged as skipped into a DataFrame named `tables`.
client = MongoClient()
try:
    db = client.bob
    cursor = db.tables.find({"annotatedAt" : {"$exists" : True}, "skipped": {"$ne": True}})
    # list() drains the cursor while the connection is still open
    tables = pd.DataFrame(list(cursor))
finally:
    # close the connection even when the query or the DataFrame build fails
    client.close()

In [None]:
# Read the table IDs of the train and test split; the context managers make
# sure both JSON files are closed again.
with open(os.path.join('..', 'data', 'train.json'), 'r') as trainFile:
    trainTables = pd.DataFrame(list(json.load(trainFile).values()), columns=['_id'])
with open(os.path.join('..', 'data', 'test.json'), 'r') as testFile:
    testTables = pd.DataFrame(list(json.load(testFile).values()), columns=['_id'])

# Stringify the '_id' column once instead of once per looked-up table ID.
tableIDStrings = tables['_id'].astype(str)

# Replace every ID by the first row of `tables` carrying that ID
# (raises IndexError when an ID has no matching row).
trainTables = trainTables['_id'].apply(lambda tableID: tables.loc[tableIDStrings == tableID].iloc[0])
testTables = testTables['_id'].apply(lambda tableID: tables.loc[tableIDStrings == tableID].iloc[0])

## Feature preparation

In [None]:
# Random forests expect float values, so logarithmic binning doesn't work.
def normalizeRow(rowDict):
    """Return the features of one row divided by the row's column count.

    Parameters
    ----------
    rowDict : dict
        Maps feature names to numeric values and must contain the key
        'colCount'.

    Returns
    -------
    dict
        A new dict with every entry of ``rowDict`` except 'colCount', each
        value divided by ``rowDict['colCount']``. The input dict is not
        modified, so the same rows can be normalized more than once.

    Raises
    ------
    KeyError
        If 'colCount' is missing from ``rowDict``.
    ZeroDivisionError
        Propagated from the division when 'colCount' is a plain ``0``.
    """
    colCount = rowDict['colCount']
    return {
        feature: value / colCount
        for feature, value in rowDict.items()
        if feature != 'colCount'
    }
 
def flattenRowFeatures(tables):
    """Collect the row features of all tables into a single DataFrame.

    Every entry of ``tables["features"]`` is iterated as a mapping from a
    row key (convertible with ``int``) to that row's feature dict. Each
    feature dict is passed through ``normalizeRow`` and extended by
    'normalizedRowIndex', the integer row key divided by the number of rows
    in the table.

    Returns a DataFrame with one line per table row.
    """
    collectedRows = []
    for table in tables["features"]:
        rowTotal = len(table)
        for rowKey in table:
            normalizedRow = normalizeRow(table[rowKey])
            # relative position of the row inside its own table
            normalizedRow['normalizedRowIndex'] = int(rowKey)/rowTotal
            collectedRows.append(normalizedRow)

    return pd.DataFrame(collectedRows)
    
def removeSimilarityFeatures(rowFeatures):
    """Return ``rowFeatures`` without the neighbour-comparison columns.

    A column is dropped when the last two characters of its name are "Al",
    "Au", "Bl" or "Bu". Those features compare neighbouring rows and are
    therefore NaN in the first and last row.
    """
    neighbourSuffixes = ("Al", "Au", "Bl", "Bu")
    neighbourColumns = [
        name for name in rowFeatures if name[-2:] in neighbourSuffixes
    ]
    return rowFeatures.drop(columns = neighbourColumns)

def cleanNaNFeatures(rowFeatures):
    """Return a new DataFrame in which every NaN entry is replaced by -1.

    The frame is rebuilt column by column from plain lists, so the result
    carries a fresh default index. ``np.isnan`` is applied to each single
    value.
    """
    return pd.DataFrame({
        feature: [-1 if np.isnan(value) else value for value in rowFeatures[feature]]
        for feature in rowFeatures
    })
    
def getRandomForestFeatures(tables):
    """Build the feature matrix for the random forest.

    Flattens the row features of ``tables`` with ``flattenRowFeatures`` and
    replaces the NaN entries of the result with ``cleanNaNFeatures``.
    """
    return cleanNaNFeatures(flattenRowFeatures(tables))

def flattenAnnotations(tables):
    """Return the row labels of all tables as one flat list of class codes.

    Parameters
    ----------
    tables : mapping or DataFrame
        ``tables["annotations"]`` is iterated; each of its entries is the
        sequence of label strings of one table.

    Returns
    -------
    list of int
        One code per row in table order: "Header" -> 0, "Data" -> 1,
        "Other" -> 2. Empty when there are no annotations.

    Raises
    ------
    KeyError
        If a label other than "Header", "Data" or "Other" occurs.
    """
    labels = {"Header": 0, "Data": 1, "Other": 2}
    # A nested comprehension needs no `reduce` (which is not a builtin in
    # Python 3), copes with an empty input and runs in linear time.
    return [
        labels[label]
        for tableLabels in tables["annotations"]
        for label in tableLabels
    ]

## Train

In [None]:
# Feature matrices first, label vectors second, each for the train and the
# test split (same evaluation order as four separate assignments).
X_train, X_test = [getRandomForestFeatures(split) for split in (trainTables, testTables)]
y_train, y_test = [flattenAnnotations(split) for split in (trainTables, testTables)]

In [None]:
# Instantiate a classifier with 1000 decision trees. The targets are the
# class codes 0/1/2 for Header/Data/Other, which carry no numeric order, so
# a classifier is used rather than a regressor: averaging Header (0) and
# Other (2) votes in a regression would be rounded to Data (1).
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(X_train, y_train);

## Test

In [None]:
# Predict the test rows and round every prediction to an integer
predictions = rf.predict(X_test)
absolutePredictions = list(map(round, predictions))

## Evaluate

In [None]:
# Print out the classification report.
# `labels` pins the three class codes to their names, so the report keeps
# working when one class is absent from both y_test and the predictions
# (otherwise the number of found classes would not match `target_names`).
print(classification_report(
    y_test, absolutePredictions,
    labels=[0, 1, 2],
    target_names=["Header", "Data", "Other"]))

In [None]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

feature_list = list(X_train.columns)

# Sort by the raw importances, most important first. Rounding happens only
# afterwards so that features whose importances round to the same value
# still appear in their true order.
ranked_features = sorted(zip(feature_list, importances), key = lambda x: x[1], reverse = True)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_features]

# Print out the feature and importances
for pair in feature_importances:
    print('{:20} Importance: {}'.format(*pair))