In [6]:
import os
from bs4 import BeautifulSoup
import re
from typing import Optional

In [2]:
data_root = '/home/berkiu/work/datasets/'

In [3]:
pdf_paths = [os.path.join(data_root, 'pdf', x) for x in os.listdir(os.path.join(data_root, 'pdf'))]
npa_paths = [os.path.join(data_root, 'NPA', x) for x in os.listdir(os.path.join(data_root, 'NPA'))]

In [7]:
def send_request_raw(filename: str,
                     parameters: dict = None) -> Optional[dict]:
    
    with open(filename, 'rb') as file:
        files = {'file': (filename, file)}

        r = requests.post("http://0.0.0.0:1231/upload", files=files, data=parameters)

        result = r.content.decode()
        return result

In [5]:
parameters = {"pdf_with_text_layer": "auto",
              "need_pdf_table_analysis": "false",
              "return_format": "html"}

In [6]:
def text_extractor(filepath: str, parameters: dict = parameters) -> str:
    html = send_request_raw(filepath, parameters)
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    text = re.sub("\xa0", "", text)
    text = re.sub(r"id = \d+(\.\d+)* ; type = \w+ *", "", text)
    return text

In [None]:
npa_texts = []

#for path in pdf_paths:
#    res += text_extractor(path)
i = 0
for path in npa_paths:
    npa_texts.append(text_extractor(path))
    if i == 20:
        break
    i += 1

In [None]:
res = ''
for text in npa_texts:
    res += text

In [14]:
with open("res.txt", "w") as text_file:
    text_file.write(res)

In [26]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [27]:
unk_token = "[UNK]"  # token for unknown words
spl_tokens = ["[UNK]", "[SEP]", "[PAD]", "[MASK]", "[CLS]"]  # special tokens

In [28]:
tokenizer = Tokenizer(WordPiece(unk_token = unk_token))
trainer = WordPieceTrainer(special_tokens = spl_tokens)

tokenizer.pre_tokenizer = Whitespace()

In [30]:
files = ["/home/berkiu/work/formunderstanding/res.txt"]

In [31]:
tokenizer.train(files, trainer)






In [36]:
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

In [43]:
tokenizer.save("tokenizer.json")

with open("vocab.txt", "w") as vocab:
    for word in list(tokenizer.get_vocab().keys()):
        vocab.write(f"{word}\n")

In [37]:
out = tokenizer.encode("Заказчик оплатил сумму заказа")

In [40]:
out.tokens

['[CLS]', 'Зак', '##аз', '##чик', 'оплат', '##ил', 'сумму', 'заказа', '[SEP]']

In [39]:
tokenizer.decode(out.ids)

'Зак ##аз ##чик оплат ##ил сумму заказа'