In [776]:
# This notebook is an extension to NLP-PIPE (https://nlp.ailab.lv) text tagger. It allows users to tag multiple files and download them in .json, .csv or .parquet format.
# To tag text:
#     1. Run the first code cell, then choose the tagging parameters and upload one or more .txt files you wish to tag.
#     2. Run the last cell and wait until all files are tagged and the download buttons appear.
#     Then download the tagged files in the desired file format. All files will be placed in one zip file.

In [47]:
import json
import logging
import requests
import base64
import io
import zipfile
import codecs
import pandas as pd

from tqdm import tqdm
from math import ceil
from sys import getsizeof
from IPython.display import display
from datetime import datetime
from multiprocessing.pool import ThreadPool
from ipywidgets import HTML, FileUpload, ToggleButtons, Output, Select, Checkbox, HBox, VBox


# Module-level logger; several functions below also log through the root `logging` module directly.
logger = logging.getLogger(__name__)

# Endpoint that process_text() POSTs every text chunk to.
# NOTE(review): 0.0.0.0 is normally a bind address; presumably the service runs on the same host - confirm.
API_URL = 'http://0.0.0.0:9500/api/nlp'
# Pipeline steps sent as 'steps' with each request; set_param() rebinds this when the widgets change.
param = ["morpho"]

def iter_jsonl(fpath):
    """Yield one parsed JSON value per non-empty line of a JSON Lines file.

    Args:
        fpath: Path of the file to read.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not guaranteed to be UTF-8.
    with open(fpath, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no record.
            if line.strip():
                yield json.loads(line)

# Executes a callable, retrying on failure
def retry(f, n=3):
    """Call ``f`` up to ``n`` times and return its first successful result.

    Args:
        f: Zero-argument callable to execute.
        n: Maximum number of attempts.

    Returns:
        Whatever ``f`` returns, or None if every attempt raised.
    """
    for attempt in range(n):
        try:
            return f()
        # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
        # must propagate so the user can actually stop a running cell.
        except Exception:
            logging.exception('Try failed %s', attempt)
    return None

# Tags text
def process_text(text, steps=None):
    """Send one text chunk to the tagging service and attach the result.

    Args:
        text: Dict whose 'text' entry is a ``(name, content)`` pair.
        steps: Pipeline steps to request. Defaults to the module-level
            ``param`` list when omitted.

    Returns:
        ``(name, record)`` where ``record`` is ``text`` extended with either
        an 'annotation' entry (success) or an 'error' entry (failure).
    """
    if steps is None:
        steps = param
    logger.debug('Process: %s', text)
    name, body = text['text'][0], text['text'][1]
    response = retry(lambda: requests.post(API_URL, json={'steps': steps, 'data': {'text': body}}))

    # Compare against None explicitly: a Response object is falsy for every
    # 4xx/5xx status, which would hide the real status code behind
    # "Empty response".
    if response is None:
        data = {'error': None}
        logging.warning('Empty response %s', text)
    elif response.status_code != 200:
        logging.warning('Non 200 status code %s %s', response.status_code, text)
        data = {'error': response.status_code}
    else:
        try:
            payload = response.json()
        except ValueError:
            # A 200 response with a non-JSON body should fail this chunk,
            # not the whole thread pool.
            logging.warning('Invalid JSON in response %s', text)
            payload = None
        if isinstance(payload, dict) and 'data' in payload:
            data = {'annotation': payload['data']}
        else:
            logging.warning('error %s %s', payload, text)
            # Keep the unexpected payload under 'error' instead of merging it
            # raw, so it cannot overwrite the 'text' entry of the record.
            data = {'error': payload}

    return (name, {**text, **data})

# Processes files for tagging
def process(texts, processes=1, verbose=False, log_each=100):
    """Tag every text chunk with a pool of worker threads.

    Args:
        texts: Sequence of chunk dicts accepted by ``process_text``.
        processes: Number of worker threads.
        verbose: When true, log every tagged record.
        log_each: Log a timing line after every ``log_each`` results.

    Returns:
        List of ``(name, json_string)`` pairs in the same order as ``texts``.
    """
    logging.info('Start processing with %d processes', processes)
    started = datetime.now()
    tagged_texts = []
    done = 0
    with ThreadPool(processes) as pool:
        batch_started = datetime.now()
        # imap (not imap_unordered) yields results in input order. The chunks
        # of a split file are numbered 000_, 001_, ... and are joined back
        # positionally afterwards, so completion order would scramble them.
        results = pool.imap(process_text, texts)
        progress = tqdm(enumerate(results, start=1), total=len(texts), ascii=True, desc="Tagging files")
        for done, (name, record) in progress:
            if done % log_each == 0:
                logging.info('.. %d, took %s', done, datetime.now() - batch_started)
                batch_started = datetime.now()
            if verbose:
                logger.info('%d %s', done, record)
            tagged_texts.append((name, json.dumps(record, ensure_ascii=False)))
    logging.info('Finished processing %d docs, took %s', done, datetime.now() - started)
    return tagged_texts

def set_param(morpho_tokenizer, ner, parser):
    """Rebuild the module-level ``param`` step list from the widget state.

    Args:
        morpho_tokenizer: "morpho" or "tokenizer" to select the base step;
            any other value keeps the current first entry of ``param``.
        ner: Truthy to add the "ner" step.
        parser: Truthy to add the "parser" step (never added for "tokenizer").
    """
    global param
    extra_ner = ["ner"] if ner else []
    extra_parser = ["parser"] if parser else []

    # The tokenizer pipeline is the only one that never takes the parser step.
    if morpho_tokenizer == "tokenizer":
        param = ["tokenizer", *extra_ner]
        return

    base_step = "morpho" if morpho_tokenizer == "morpho" else param[0]
    param = [base_step, *extra_ner, *extra_parser]
        
# Returns widgets for file uploading
def upload_files():
    """Build the upload button, the tagging-option controls and the file list.

    Returns:
        Tuple ``(upload, options, uploaded_fnames)``: the FileUpload widget,
        an HBox holding the toggle buttons and checkboxes, and the Output
        widget that lists the uploaded file names.
    """
    upload = FileUpload(accept='.txt', multiple=True)
    upload.layout.margin = "20px 0px 30px 0px"

    uploaded_fnames = Output()
    uploaded_fnames.layout.height = "150px"
    uploaded_fnames.layout.overflow = "auto auto"

    ner_btn = Checkbox(value=False, description="Named Entity Recognition")
    parser_btn = Checkbox(value=False, description="Dependency Parsing")

    ner_btn.layout.height = "17px"
    ner_btn.layout.margin = "0px"
    parser_btn.layout.margin = "0px"

    checkboxes = VBox([ner_btn, parser_btn])
    checkboxes.layout.height = "50px"

    # NOTE(review): default/display/flex_flow/align_items do not look like
    # ToggleButtons traits and are presumably ignored - confirm before removing.
    toggle = ToggleButtons(
        options=["Morph. analizer", "Tokenizer"],
        description="Choose tagging parameters:",
        default="Morph. analizer",

        display='flex',
        flex_flow='column',
        align_items='stretch',
        style= {'description_width': 'initial'}
    )
    toggle.layout.height = "80px"

    def display_filenames(change):
        # Redraw the list from scratch on every change; without clearing,
        # each upload appends to the previous listing and leaves stale or
        # duplicated names on screen.
        uploaded_fnames.clear_output()
        if change["new"]:
            with uploaded_fnames:
                for i in upload.value:
                    print(i["name"])
    upload.observe(display_filenames, names="value")

    def on_select_change(change):
        # Shared by the toggle and both checkboxes: a checkbox change carries
        # True/False in change["new"] and therefore lands in the else branch.
        if change["new"] == "Tokenizer":
            parser_btn.value = False
            set_param("tokenizer", ner_btn.value, False)
            checkboxes.children = [ner_btn]
        elif change["new"] == "Morph. analizer":
            set_param("morpho", ner_btn.value, parser_btn.value)
            checkboxes.children = [ner_btn, parser_btn]
        else:
            set_param(False, ner_btn.value, parser_btn.value)
    toggle.observe(on_select_change, names="value")
    ner_btn.observe(on_select_change, names="value")
    parser_btn.observe(on_select_change, names="value")

    options = HBox([toggle, checkboxes])
    options.layout.align_items = "flex-end"

    return upload, options, uploaded_fnames

# Splits text into chunks below the size limit. Each chunk ends with the end of a sentence.
def split_text(text, text_size, max_size=100_000):
    """Split ``text`` into consecutive chunks that end on a sentence boundary.

    Sizes are measured with ``sys.getsizeof`` (the in-memory size of the str
    object), not the encoded byte length. Joining the returned chunks gives
    back ``text`` unchanged.

    Args:
        text: The full text to split.
        text_size: Size of ``text`` as measured by the caller; used only to
            decide how many initial slices to cut.
        max_size: Upper bound for ``getsizeof`` of each chunk.

    Returns:
        List of non-empty chunks. A chunk can exceed ``max_size`` only when it
        contains no '.', '?' or '!' to cut at.
    """
    # Guard both divisors: text_size == 0 or a very short text would
    # otherwise give a zero split count or a zero slice step.
    splits = max(1, ceil(text_size / max_size))
    interval = max(1, len(text) // splits)
    parts = [text[i:i + interval] for i in range(0, len(text), interval)]

    def spl_p(part):
        """Return (head, tail): head ends on a terminator and is below max_size."""
        excess, end = [], None
        while True:
            # Position just after the last terminator before `end` (0 if none).
            # "..." needs no separate search: its final '.' is found anyway.
            cut = max(part.rfind(mark, 0, end) for mark in ".?!") + 1

            excess.append(part[cut:])
            part = part[:cut]

            if getsizeof(part) < max_size:
                # Tails were collected back to front; restore reading order.
                return part, "".join(excess[::-1])
            # `part` now ends with a terminator; skip it to reach the previous one.
            end = -1

    excess, new_parts = "", []
    for p in parts:
        part, excess = spl_p(excess + p)
        new_parts.append(part)

    # The remainder can still be several chunks long (long sentences make it
    # grow on every iteration), so keep splitting until it fits.
    while getsizeof(excess) > max_size:
        part, rest = spl_p(excess)
        if not part:
            # No sentence boundary left to cut at; keep the oversized remainder.
            break
        new_parts.append(part)
        excess = rest
    new_parts.append(excess)

    return [x for x in new_parts if x != '']

# Concatenates split files
def concatenate_files(tagged_files):
    """Merge the tagged chunks of each uploaded file back into one record.

    Args:
        tagged_files: List of ``(chunk_name, json_string)`` pairs. Chunk names
            look like '000_name.txt' (split file) or '_name.txt' (unsplit).

    Returns:
        List of ``[name, record]`` pairs, one per original file, in order of
        first appearance. ``record`` is
        ``{'text': full_text, 'annotation': {'sentences': [...], 'text': full_text}}``.
    """
# tagged_files structure:
# [('000_name.txt', '{
#     "text": ["000_name.txt", "Here is text from the file..."],
#     "annotation": {
#         "sentences": [{
#                 "ner": [{...}],                   # for ner
#                 "tokens": [{
#                     "deprel": "obl",              # for parser
#                     "features": "...",            # for morpho
#                     "form": "Here"                # for morpho and tokenizer
#                     "index": 1,                   # for morpho
#                     "lemma": "Here",              # for morpho
#                     "parent": 2,                  # for parser
#                     "pos": "xf",                  # for morpho
#                     "tag": "xf"                   # for morpho
#                     "ufeats": "Foreign=Yes",      # for parser
#                     "upos": "X"                   # for parser
#                  }, {...}]
#         }, {# next sent ...}, {...}],
#      "text": "Here is text from the file..." }
# }'), ('001_name.txt', {...})]

    # Group chunks per original file instead of relying on them being
    # adjacent and already ordered in the input.
    grouped = {}
    for chunk_name, payload in tagged_files:
        prefix, sep, name = chunk_name.partition('_')
        if not sep:
            # No chunk prefix at all: treat the whole string as the file name.
            prefix, name = '', chunk_name
        order = int(prefix) if prefix.isdigit() else 0
        grouped.setdefault(name, []).append((order, json.loads(payload)))

    combined_files = []
    for name, chunks in grouped.items():
        # Stable sort: chunks with equal index keep their arrival order.
        chunks.sort(key=lambda item: item[0])
        text, sent = [], []
        for _, js in chunks:
            text.append(js['text'][1])
            annotation = js.get('annotation')
            if annotation is None:
                # A failed request carries 'error' instead of 'annotation';
                # keep the text and skip the missing sentences.
                logging.warning('No annotation for chunk %s of %s', js['text'][0], name)
                continue
            sent.extend(annotation.get('sentences', []))

        c_text = "".join(text)
        combined_files.append([name, {'text': c_text, 'annotation': {'sentences': sent, 'text': c_text}}])
    return combined_files


def procSentences(sentences, rows):
    """Append one row per token of every sentence to ``rows``.

    Each row is a copy of the token dict without its 'features' entry and with
    a 'sent_ndx' entry holding the sentence's position (an existing 'sent_ndx'
    is kept).

    Args:
        sentences: List of sentence dicts; tokens are read from 'tokens'.
        rows: List to extend in place.

    Returns:
        The same ``rows`` list.
    """
    for i, sentence in enumerate(sentences):
        # A sentence without a 'tokens' entry contributes no rows.
        for token in sentence.get('tokens') or []:
            # Build a new dict so the caller's annotation data is not
            # modified; the same data is also exported as JSON.
            row = {key: value for key, value in token.items() if key != 'features'}
            row.setdefault('sent_ndx', i)
            rows.append(row)
    return rows


def procChunk(chunk, row):
    """Add the token rows of one tagged record to ``row``.

    Records without an 'annotation' entry are reported on stdout and leave
    ``row`` untouched.

    Returns:
        ``row`` itself when there is no annotation, otherwise whatever
        ``procSentences`` returns.
    """
    if 'annotation' in chunk:
        return procSentences(chunk['annotation']['sentences'], row)

    print("Missing data in this chunk!")
    return row


def procChunks(chunks):
    """Collect the token rows of several tagged records into one list.

    Each record is passed to ``procChunk`` together with the rows gathered so
    far, and its return value becomes the new accumulator.
    """
    collected = []
    for record in chunks:
        collected = procChunk(record, collected)
    return collected

# Builds a DataFrame of token rows from one tagged record
def getDF(js):
    """Return a DataFrame with one row per token of the tagged record ``js``.

    The rows are gathered by ``procChunk``, which fills the list passed to it.
    """
    token_rows = []
    procChunk(js, token_rows)
    frame = pd.DataFrame(token_rows)
    return frame

# Prepares tagged files for download
def download_files(tagged_files):
    """Render three download buttons (JSON, CSV, Parquet) for the tagged files.

    Each button is a link whose href is a base64 data URI holding a zip
    archive with one entry per tagged file.

    Args:
        tagged_files: List of ``[name, record]`` pairs as produced by
            ``concatenate_files``.
    """
    def stem(name):
        # Drop only the last extension, whatever its length ('a.txt' -> 'a').
        base, dot, _ext = name.rpartition(".")
        return base if dot and base else name

    def get_zip_file(suffix):
        """Return the base64 text of a zip with every file in ``suffix`` format."""
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED, False) as zip_file:
            if suffix == "json":
                for name, data in tagged_files:
                    zip_file.writestr(f"{stem(name)}.json", json.dumps(data, ensure_ascii=False))
            else:
                for name, data in tagged_files:
                    df = getDF(data)
                    if suffix == "csv":
                        zip_file.writestr(f"{stem(name)}.csv", df.to_csv())
                    else:
                        zip_file.writestr(f"{stem(name)}.parquet", df.to_parquet())

        b64 = base64.b64encode(zip_buffer.getvalue())
        return b64.decode()

    # The payload is a zip archive, so the data URI is declared as
    # application/zip; each link carries a single download attribute.
    html_buttons = '''<html>
    <head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    </head>
    <body>
    <div style="display: flex">
    
    <div style="display: flex; flex-direction: column">
    <a download="tagged_{j_name[0]}_files_{j_name[1]}.zip" href="data:application/zip;base64,{j_payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download JSON</button>
    </a>
    </div>
    
    <div style="display: flex; flex-direction: column">
    <a download="tagged_{c_name[0]}_files_{c_name[1]}.zip" href="data:application/zip;base64,{c_payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download CSV</button>
    </a>
    </div>
    
    <div style="display: flex; flex-direction: column">
    <a download="tagged_{p_name[0]}_files_{p_name[1]}.zip" href="data:application/zip;base64,{p_payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download PARQUET</button>
    </a>
    </div>

    </div>
    </body>
    </html>
    '''
    steps = "_".join(param)
    # Build the JSON archive first so it is serialised before any tabular
    # conversion gets to touch the records.
    json_payload = get_zip_file("json")
    csv_payload = get_zip_file("csv")
    parquet_payload = get_zip_file("parquet")
    html_button = html_buttons.format(
        j_payload = json_payload, j_name = ('json', steps),
        c_payload = csv_payload, c_name = ('csv', steps),
        p_payload = parquet_payload, p_name = ('parquet', steps),
    )
    display(HTML(html_button))
    
def tag_files():
    """Tag every uploaded file and show the download buttons.

    Reads the uploaded files from the module-level ``upload`` widget, splits
    texts whose ``getsizeof`` exceeds 100 000 into numbered chunks, tags all
    chunks, joins them back per file and renders the download links.

    Returns:
        None.
    """
    logging.basicConfig(format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Nothing uploaded: stop here instead of failing later on an empty result list.
    if not upload.value:
        print("No files uploaded. Upload at least one .txt file and run this cell again.")
        return

    chunks = []
    for file in tqdm(upload.value, ascii=True, desc="Preparing files"):
        # utf-8-sig also accepts plain UTF-8 and drops a leading byte-order
        # mark, which would otherwise be sent to the tagger as text.
        text = codecs.decode(bytes(file['content']), encoding="utf-8-sig")
        name = file['name']

        text_size = getsizeof(text)
        if text_size > 100_000:
            s_texts = split_text(text, text_size)
            # Chunk names get a zero-padded index prefix: 000_name, 001_name, ...
            chunks += [{'text': (str(i).zfill(3)+"_"+name, t)} for i, t in enumerate(s_texts)]
        else:
            # Unsplit files get an empty prefix so every name has the '_' separator.
            chunks.append({'text': ("_"+name, text)})

    tagged_files = process(chunks, processes=3, log_each=50)
    tagged_files = concatenate_files(tagged_files)
    download_files(tagged_files)

# Build the widgets and show them: upload button and options on the left, file-name list on the right.
# tag_files() reads the module-level `upload` created here.
upload, options, uploaded_fnames = upload_files()
user_input = HBox([VBox([upload, options]), uploaded_fnames])
display(user_input)

HBox(children=(VBox(children=(FileUpload(value=(), accept='.txt', description='Upload', layout=Layout(margin='…

In [None]:
# Tag the uploaded files and render the download buttons; tag_files() returns nothing, so x is None.
x = tag_files()