In [1]:
from time import time
import os
from os.path import join
import numpy as np
import numpy.ma as ma
import json
import sys
import datetime
import configargparse
from utils import str2bool, str_or_none, name2dic, get_valid_types
import copy
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pandas as pd
from tensorboardX import SummaryWriter
from model.torchcrf import CRF

from model import datasets
from model.models_sherlock import FeatureEncoder, SherlockClassifier, build_sherlock
from sklearn.metrics import classification_report

# =============
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import ConcatDataset

In [32]:
# --- Settings for building the evaluation dataset -------------------------
TYPENAME = 'type78'
MAX_COL_COUNT = 15
topic_name = 'num-directstr_thr-0_tn-400'  # may also be None
sherlock_feature_groups = ['char', 'word', 'par', 'rest']
# Other corpora that have been used here: ['webtables1-p1', 'webtables2-p1']
corpus_list = ['csv-sato-p1']

# Encoder fitted on the valid type names; `fit` returns the encoder itself.
valid_types = get_valid_types(TYPENAME)
label_enc = LabelEncoder().fit(valid_types)

# --- One TableFeatures dataset per corpus, concatenated into one dataset ---
whole_corpus = []
for corpus in corpus_list:
    corpus_data = datasets.TableFeatures(
        corpus,
        sherlock_feature_groups,
        topic_feature=topic_name,
        label_enc=label_enc,
        id_filter=None,
        max_col_count=MAX_COL_COUNT,
    )
    whole_corpus.append(corpus_data)

# Release the loop's last reference; the datasets stay alive via `whole_corpus`.
corpus_data = None
val_dataset = ConcatDataset(whole_corpus)

/home/senn/inf-bachpr-21-22-student-SennR-1952135/code/project/sato/tmp/csv-sato-p1_type78_header_valid.pkl pickle file found, loading...
csv-sato-p1_type78_header_valid Load complete. Time 0.04062342643737793
Total data preparation time: 9.199613094329834


In [33]:
# Quick look at the dataset: its length, then the item stored at index 2.
print(len(val_dataset), val_dataset[2], sep='\n')

40000
({'char': tensor([[ 1.0000,  0.0000,  0.3333,  ...,  0.0000, -3.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -3.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]), 'word': tensor([[-0.2313,  0.8445,  0.0347,  ...,  0.0729,  0.1239,  0.2908],
        [-0.3253,  0.0104, -0.1658,  ...,  0.4757,  0.6145,  0.2272],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]), 'par': tensor([[-0.0276,  0.0261, -0.0782,  ..., -0.0204, -0.0388, -0.0471],
        [ 0.0003, -0.0003,  0

In [34]:
# Disabled (together with the commented loop below): container for the
# per-feature-group column names.
# feature_group_cols = {}
# Feature groups handed to `build_sherlock`; same list as in the data-loading cell.
sherlock_feature_groups = ['char', 'word', 'par', 'rest']
# Disabled: for each group, read the second column of
# $BASEPATH/configs/feature_groups/<group>_col.tsv into `feature_group_cols`.
# for f_g in sherlock_feature_groups:
#     feature_group_cols[f_g] = list(pd.read_csv(join(os.environ['BASEPATH'],
#                                           'configs', 'feature_groups', 
#                                           "{}_col.tsv".format(f_g)),
#                                            sep='\t', header=None, 
#                                            index_col=0)[1])

In [35]:
# Reference flags: --model_type=CRF --model_path=CRF+LDA_pre.pt --topic=num-directstr_thr-0_tn-400

# Label space
TYPENAME = 'type78'
valid_types = get_valid_types(TYPENAME)

# Checkpoint and model dimensions
model_path = 'CRF+LDA_pre.pt'
topic_dim = 400
MAX_COL_COUNT = 15

# Batching / runtime
batch_size = 100
n_worker = 4
device = torch.device('cpu')

In [36]:
# Restore the pre-trained column classifier and CRF layer from a single checkpoint.
model_loc = join(os.environ['BASEPATH'], 'model', 'pre_trained_CRF', TYPENAME)
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
loaded_params = torch.load(join(model_loc, model_path), map_location=device)

# Column-wise classifier: build, move to the device, load weights, switch to eval mode.
classifier = build_sherlock(
    sherlock_feature_groups,
    num_classes=len(valid_types),
    topic_dim=topic_dim,
).to(device)
classifier.load_state_dict(loaded_params['col_classifier'])
classifier.eval()

# CRF layer over the columns of a table (one tag per valid type).
model = CRF(len(valid_types), batch_first=True).to(device)
model.load_state_dict(loaded_params['CRF_model'])

# Kept as the last expression so the cell still displays the CRF's repr.
model.eval()

CRF(num_tags=78)

In [37]:
# evaluate and return predictions & true labels for every table in the dataset
def eval_batch(classifier, model, val_dataset, batch_size, device, n_worker, MAX_COL_COUNT=15):
    """Predict column types for all tables in ``val_dataset``.

    Every batch is flattened to one row per column slot, scored by
    ``classifier``, reshaped back to ``(tables, MAX_COL_COUNT, scores)`` and
    decoded with ``model.decode`` using the batch mask.

    Args:
        classifier: callable that takes the dict of per-feature-group tensors
            and returns one score vector per column slot.
        model: object exposing ``decode(emissions, mask)``; the result must be
            a sequence of per-table label sequences (it is concatenated).
        val_dataset: dataset forwarded to ``datasets.generate_batches``.
        batch_size: maximum number of tables per batch.
        device: forwarded to ``datasets.generate_batches``.
        n_worker: forwarded to ``datasets.generate_batches`` as ``n_workers``.
        MAX_COL_COUNT: number of column slots per table.

    Returns:
        Tuple ``(y_true, y_pred)`` of flat lists. ``y_true`` holds the labels
        at positions where the mask equals 1; ``y_pred`` holds the decoded
        labels (assumes ``decode`` emits one label per unmasked position --
        TODO confirm against the CRF implementation).
    """
    # drop_last=False so that a trailing batch smaller than `batch_size` is
    # evaluated as well; the reshapes below use the actual batch size.
    validation = datasets.generate_batches(val_dataset,
                                           batch_size=batch_size,
                                           shuffle=False,
                                           drop_last=False,
                                           device=device,
                                           n_workers=n_worker)
    y_pred, y_true = [], []

    # Pure inference: no autograd bookkeeping needed, whatever the caller does.
    with torch.no_grad():
        for table_batch, label_batch, mask_batch in tqdm(validation):
            # The mask is passed to decode() next to (tables, columns, scores)
            # emissions, so its first dimension is the number of tables.
            n_tables = mask_batch.shape[0]

            # reshape (tables, columns, features) -> (tables * columns, features)
            # into a new dict, leaving the caller's batch untouched
            flat_batch = {f_g: feats.view(n_tables * MAX_COL_COUNT, -1)
                          for f_g, feats in table_batch.items()}

            emissions = classifier(flat_batch).view(n_tables, MAX_COL_COUNT, -1)
            pred = model.decode(emissions, mask_batch)

            labels = label_batch.view(-1).cpu().numpy()
            masks = mask_batch.view(-1).cpu().numpy()

            y_pred.extend(np.concatenate(pred))
            # keep only the labels of real (mask == 1) columns
            y_true.extend(labels[masks == 1])

    return (y_true, y_pred)

In [38]:
# Score the whole validation set without autograd bookkeeping.
# MAX_COL_COUNT comes from the config cell instead of a repeated literal, so the
# reshape inside eval_batch cannot drift from the value used to build the dataset.
# NOTE(review): the recorded run of this cell failed inside default_collate with
# mismatched tensor sizes (16 vs 19) -- that originates in the dataset items,
# not in this call.
with torch.no_grad():
    tt_list, prediction_list = eval_batch(classifier, model, val_dataset, batch_size, device, n_worker,
                                          MAX_COL_COUNT=MAX_COL_COUNT)

0it [00:00, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 80, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 80, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 75, in default_collate
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 75, in <dictcomp>
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/home/senn/virtualenvs/col2type/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 16 and 19 in dimension 1 at /pytorch/aten/src/TH/generic/THTensor.cpp:689


In [None]:
# Number of true labels, then number of predictions (one per line).
print(len(tt_list), len(prediction_list), sep='\n')

In [43]:
# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(y_true=tt_list, y_pred=prediction_list))

              precision    recall  f1-score   support

           0       0.92      0.91      0.92       889
           1       0.72      0.32      0.45        65
           2       0.97      0.94      0.95       199
           3       0.98      0.99      0.99      7230
           4       0.93      0.94      0.94      1535
           5       0.89      0.80      0.84       288
           6       0.93      0.91      0.92      2176
           7       0.96      0.98      0.97        65
           8       1.00      0.89      0.94        54
           9       0.81      0.81      0.81       140
          10       0.88      0.65      0.75        55
          11       0.96      0.97      0.96      4012
          12       0.96      0.97      0.96      5341
          13       0.89      0.91      0.90      1598
          14       0.96      0.84      0.89       101
          15       0.93      0.96      0.95      2223
          16       0.98      0.97      0.97      2702
          17       0.96    

In [15]:
# Map an encoded class id back to its type name.
# Assuming the ids in the report were produced by `label_enc` (it is the encoder
# passed to TableFeatures): LabelEncoder keeps `classes_` in sorted order, so
# index that array. Indexing `valid_types` is only correct if that list happens
# to be sorted the same way.
print(label_enc.classes_[46])

name


In [16]:
# # Write out the true types and predictions for further investigation (currently disabled)
# path_out_true_types = '../../combined/results/true_types'
# path_out_predictions = '../../combined/results/predictions'

# tt_df = pd.DataFrame(columns=['type'], data=tt_list)
# tt_df.to_parquet(join(path_out_true_types, 'sato.parquet'))

# pred_df = pd.DataFrame(columns=['type'], data=prediction_list)
# pred_df.to_parquet(join(path_out_predictions, 'sato_sato.parquet'))