In [1]:
import json
import cv2
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
import torch
import torch.nn.functional as F

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from networks.vgg_lstm import Vgg_Lstm
from networks.vgg_bert_custom import vgg_bert

import re
import nltk
import pickle
# import pypdfium2
# import pytesseract
import numpy as np
import pandas as pd
import torch.nn as nn

from PIL import Image
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from stop_words import get_stop_words
from gensim.models.doc2vec import TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from transformers import AutoTokenizer

In [2]:
feature_length = 1024
device = torch.device('cuda')

model_dbow = Doc2Vec.load("../models/gensim/pdf_split_d2v_gensim_{}_db.mod".format(feature_length))
model_dmm = Doc2Vec.load("../models/gensim/pdf_split_d2v_gensim_{}_dm.mod".format(feature_length))
vectorizer_gensim = ConcatenatedDoc2Vec([model_dbow, model_dmm])

model_lstm = Vgg_Lstm(img_dim=3, embedding_dim=2048, output_dim=2)
if torch.cuda.is_available():
    model_lstm = nn.DataParallel(model_lstm)
model_lstm = model_lstm.to(device)
weight = torch.load('../models/mlp_model/ce_vgg_lstm_50/best_val_loss.pt', map_location=device)
model_lstm.load_state_dict(weight, strict=True)

model_bert = vgg_bert(in_ch=3, out_ch=2)
if torch.cuda.is_available():
    model_bert = nn.DataParallel(model_bert)
model_bert = model_bert.to(device)
weight = torch.load('../models/mlp_model/focal_bert_newdata_75/best_val_loss.pt', map_location=device)
model_bert.load_state_dict(weight, strict=True)

model_lstm.eval()
model_bert.eval()


with open('../models/logreg/logreg_model_1024_dbow_dm_concate.sav', 'rb') as m:
    model_log = pickle.load(m)
bert_tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased", revision="v1")

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.poole

In [3]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

def get_bert_tokenizer(text):
    bert_input = bert_tokenizer(text, padding='max_length', max_length=256,
                               truncation=True, return_tensors="pt")
    _id = bert_input['input_ids']
    _mask = bert_input['attention_mask']
    return _id, _mask

def tag_page(prediction):
    """
    :param prediction: classify prediction array: e.g [1, 0, 0, 1, 0, 0, 1, 0, 1]
    :return: tag page: e.g. [3, 3, 2, 1]
    """
    tag = np.split(prediction, np.argwhere(prediction == 1).flatten())
    tag = [len(tag[i]) for i in range(len(tag)) if len(tag[i])]
    tag = np.array(tag)

    return tag

def cleanText(text):
    text = BeautifulSoup(text, "html.parser").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'\\n', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

def resize_normalize_image(image, img_size=224):
    height, width, _ = image.shape
    if height > width:
        scale = img_size / height
        resized_height = img_size
        resized_width = int(width * scale)
    else:
        scale = img_size / width
        resized_height = int(height * scale)
        resized_width = img_size

    image = cv2.resize(image, (resized_width, resized_height), interpolation=cv2.INTER_LINEAR)

    new_image = np.zeros((img_size, img_size, image.shape[2]))

    offset_w = (img_size - resized_width) // 2
    offset_h = (img_size - resized_height) // 2
    new_image[offset_h:offset_h + resized_height, offset_w:offset_w + resized_width] = image
    return new_image/255.0


def data_loader(df):
    df.fillna('', inplace=True)
    df['text_processed'] = df['text'].apply(cleanText)
    path = df['repaths'].tolist()
    text = df['text_processed'].tolist()
    # labels = df['labels']
    label = df['labels'].tolist()
    return path, text, label

In [4]:
from tqdm import tqdm

class Prediction():
    def __init__(self, path, text, label, data_pct="25", vectorizer_type='gensim', model_type='vgg_bert'):
        self.vectorizer_type = vectorizer_type
        self.model_type = model_type
        self.path = path
        self.text = text
        self.label = label
        self.data_percentage = data_pct
        self.output = []

    def run_batch(self):
        result = []
        labels = []
        for i in tqdm(range(len(self.path))):
            fname = self.path[i]
            image = cv2.imread(fname)
            text = self.text[i]
            predict = self.run_single(image, text)
            result.append(predict)
            labels.append(self.label[i])
        result = np.array(result)
        result = tag_page(result)

        labels = np.array(labels)
        labels = tag_page(labels)
        return result, labels


    def run_single(self, image, text):
        if self.model_type == 'log':
            text = tokenize_text(text)
            gensim_vectors = vectorizer_gensim.infer_vector(text)
            prediction = model_log.predict([gensim_vectors])
            return int(prediction.item())
        elif self.model_type == 'vgg_lstm':
            text = tokenize_text(text)
            gensim_vectors = vectorizer_gensim.infer_vector(text)
            gensim_vectors = torch.from_numpy(gensim_vectors)
            gensim_vectors = torch.unsqueeze(gensim_vectors, dim=0)
            gensim_vectors = torch.unsqueeze(gensim_vectors, dim=0)
            gensim_vectors = gensim_vectors.to(device)
            
            image = resize_normalize_image(image)
            image = torch.from_numpy(image).float()
            image = torch.unsqueeze(image, dim=0)
            image = image.permute(0, 3, 1, 2)
            image = image.to(device)
            
            with torch.no_grad():
                prediction = model_lstm(image, gensim_vectors)
                prediction = F.softmax(prediction, dim=1)
                prediction = torch.argmax(prediction, dim=1)
                
            prediction = prediction.cpu().numpy()
            return int(prediction.item())
        else:
            # print("using bert!")
            b_id, b_mask = get_bert_tokenizer(text)
            b_id = b_id.to(device)
            b_mask = b_mask.to(device)
            
            image = resize_normalize_image(image)
            image = torch.from_numpy(image).float()
            image = torch.unsqueeze(image, dim=0)
            image = image.permute(0, 3, 1, 2)
            image = image.to(device)
            
            with torch.no_grad():
                prediction = model_bert(image, b_id, b_mask)
                prediction = F.softmax(prediction, dim=1)
                prediction = torch.argmax(prediction, dim=1)
                
            prediction = prediction.cpu().numpy()
            return int(prediction.item())


def make_index(split):
    '''Turns a doc length vector like [1,2,1,3,3,5] into a dict with pagenumbers as keys and the set of all 
    pagenumbers in the same document as value.
    This thus is an index which gives for every page its cluster.'''
    l= sum(split)
    pages= list(np.arange(l))
    out = defaultdict(set)
    for block_length in split:
        block= pages[:block_length]
        pages= pages[block_length:]
        for page in block:
            out[page]= set(block)
    return out

#test
# make_index(vb_truth)
#tests

def Bcubed(truth,pred,return_df=False):
    assert sum(truth)==sum(pred)  # same amount of pages
    truth,pred = make_index(truth), make_index(pred)
    if return_df:
        df  ={i:{'size':len(truth[i]),'P':0,'R':0,'F1':0} for i in truth}
        for i in truth:
            df[i]['P']= len(truth[i] & pred[i])/len(pred[i]) 
            df[i]['R']= len(truth[i] & pred[i])/len(truth[i])
            df[i]['F1']= (2*df[i]['P']*df[i]['R'])/(df[i]['P']+df[i]['R'])
        df= pd.DataFrame.from_dict(df, orient='index')
        df.index_name='PageNr'
        return df
    else:
        P = []
        R = []
        F1 = []
        for i in truth:
            P.append(len(truth[i] & pred[i])/len(pred[i]) )
            R.append(len(truth[i] & pred[i])/len(truth[i]))
            F1.append((2*P[i]*R[i])/(P[i]+R[i]))
        return P, R, F1


def MeanBcubed(truth,pred,return_df=False):
    assert sum(truth)==sum(pred)  # same amount of pages
    if return_df:
        return Bcubed(truth,pred).mean()
    else:
        return np.mean(np.array(Bcubed(truth, pred)[0])), np.mean(np.array(Bcubed(truth, pred)[1])), np.mean(np.array(Bcubed(truth, pred)[2]))


In [5]:
path_to_csv = '../data/full_ocred.csv'
whole_df = pd.read_csv(path_to_csv)
whole_df = whole_df.fillna('')
df_list = [whole_df[whole_df['names']==x] for x in list(whole_df['names'].unique())]

rs = []
for df in df_list[:10]:
    path, text, label = data_loader(df)
    model_predict = Prediction(path=path, text=text, label=label, model_type='vgg_bert')
    prediction, labels = model_predict.run_batch()
    rs.append(MeanBcubed(labels, prediction))
    # rs.append(list(model_predict.run_batch()))
print(np.mean(np.array(rs), axis=0))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 211/211 [00:06<00:00, 30.82it/s]
100%|██████████| 2585/2585 [01:23<00:00, 30.89it/s]
100%|██████████| 637/637 [00:22<00:00, 27.80it/s]
100%|██████████| 258/258 [00:08<00:00, 30.97it/s]
100%|██████████| 94/94 [00:02<00:00, 32.47it/s]
100%|██████████| 276/276 [00:09<00:00, 30.29it/s]
100%|██████████| 189/189 [00:05<00:00, 32.75it/s]
100%|██████████| 549/549 [00:18<00:00, 29.32it/s]
100%|██████████| 1110/1110 [00:36<00:00, 30.73it/s]
100%|██████████| 1371/1371 [01:02<00:00, 21.78it/s]


[0.96608545 0.41941058 0.47548609]


# On Corpus

In [7]:
f= open('../corpus1/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json')
truth_corpus=json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../corpus1/TrainTestSet/Trainset/Doclengths_of_the_individual_docs_TRAIN.json'

In [6]:
def fixedpage(truth,docsize=3):
    number_of_blocks= sum(truth)//docsize
    rest = sum(truth) % docsize
    if rest !=0:
        return [docsize for _ in range(number_of_blocks)]+[rest]
    else:
        return [docsize for _ in range(number_of_blocks)]  
    
D ={pdf: MeanBcubed(truth_corpus[pdf], fixedpage(truth_corpus[pdf],6))
   for pdf in truth_corpus}
results= pd.DataFrame.from_dict(D,orient='index')
print(results.describe())
sns.boxplot(data=results[['P','R','F1']]);    

NameError: name 'truth_corpus' is not defined

In [None]:
title='Mean Bcubed P,R,F1 for each document in the corpus1 Train set with fixed median (6) doc. size.'
results[['P','R','F1']].sort_values('R').reset_index().plot( title=title,
                                               figsize=(20,8));

In [None]:
results.plot.scatter(x='R', y='P', figsize=(10,8));