## Notebook for converting NER training dataset to SpanCat dataset
Author: Juraj Dedič

### Load Libraries and parameters

In [2]:
import os 
import spacy
import pandas as pd
import re
from spacy import displacy

import random
from spacy.util import minibatch, compounding
from spacy.training import Example
from tqdm import tqdm
from typing import List
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
from spacy.tokens import DocBin

from spacy.tokens import Span

import string

We instantiate a blank spaCy pipeline object and use "sc" as the key under which our labelled spans are stored:

In [5]:
# Key under which the labelled spans are stored.
span_key = "sc"
# Blank English spaCy object; the cells below use its vocab and make_doc.
nlp = spacy.blank("en")

In [117]:
# Load the entity-annotated training docs and convert each one into a
# (text, annotations) tuple in which the entities are expressed as spans.
db_train = DocBin().from_disk(R"v4-dataset/train.spacy")
docs_train = db_train.get_docs(nlp.vocab)
train_data = []

for doc in docs_train:
    # Every entity becomes a (start_char, end_char, label) triple.
    sc = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # Docs without entities keep an empty annotation dict (no "spans" entry).
    # The span key comes from the notebook-level `span_key` setting rather
    # than a second hard-coded "sc" literal.
    spans = {"spans": {span_key: sc}} if sc else {}

    train_data.append((doc.text, spans))


In [11]:
def convert_to_tuple(doc, span_key="sc"):
    """
    Converts doc to a tuple of text, annotations and filename from user_data.

    Parameters:
        doc: object exposing `text`, `ents` (each with `start_char`,
            `end_char` and `label_`) and a dict-like `user_data`.
        span_key: key under which the spans are placed in the annotations.
            Defaults to "sc".

    Returns:
        A `(text, annotations, filename)` tuple where `annotations` is
        `{"spans": {span_key: [(start_char, end_char, label), ...]}}`.
        `filename` is `doc.user_data["filename"]`, or None when the doc
        carries no filename.
    """
    sc = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # The span key is always present; it maps to an empty list when the doc
    # has no entities.
    spans = {"spans": {span_key: sc}}

    # The filename is only metadata, so a doc without one should not abort
    # the conversion with a KeyError.
    filename = doc.user_data.get("filename")

    return (doc.text, spans, filename)

def tuple_to_doc(tuple):
    """
    Builds a doc with spans from a (text, annotations, filename) tuple.

    The text is turned into a doc with the notebook-level `nlp` object, the
    annotations are applied through `Example.from_dict`, and the reference
    doc of that example is returned. The filename element is unpacked but
    not used.
    """
    text, annotations, _filename = tuple
    base_doc = nlp.make_doc(text)
    return Example.from_dict(base_doc, annotations).reference

def generate_span_db_file(input_file, output_file):
    """
    Reads a DocBin file whose docs carry ents and writes a DocBin file whose
    docs carry spans.

    Each doc is passed through `convert_to_tuple` and `tuple_to_doc`. The
    first 10 converted docs are printed as tuples and rendered with displacy
    (style "span") so the result can be inspected visually.
    """
    source_docs = DocBin().from_disk(input_file).get_docs(nlp.vocab)
    span_db = DocBin()

    for position, doc in enumerate(source_docs):
        as_tuple = convert_to_tuple(doc)
        span_doc = tuple_to_doc(as_tuple)

        # Preview only the first ten docs.
        if position < 10:
            print(as_tuple)
            displacy.render(span_doc, style="span")

        span_db.add(span_doc)

    print("Saving to", output_file)
    span_db.to_disk(output_file)

In [15]:
# Dataset split to convert and the directory that contains it.
name = "test"
dataset_dir = "v5-dataset/"
# os.path.join inserts a separator only when one is needed and also copes
# with an empty dataset_dir, where indexing dataset_dir[-1] raises IndexError.
generate_span_db_file(
    os.path.join(dataset_dir, name + ".spacy"),
    os.path.join(dataset_dir, name + "_spancat.spacy"),
)

('oscar kilo foxtrot charlie alpha you are leaving tma praha switch to praha info one two six decimal one naslysenou ', {'spans': {'sc': [(0, 32, 'CALLSIGN')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201024_221711.xml')


('one two six decimal one naslysenou dekuji ', {'spans': {'sc': []}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201024_221711.xml')


('oscar kilo papa mike bravo descend flight level one hundred ', {'spans': {'sc': [(0, 26, 'CALLSIGN'), (27, 34, 'COMMAND'), (35, 59, 'VALUE')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.xml')


('level one hundred oscar kilo papa mike bravo ', {'spans': {'sc': [(0, 17, 'VALUE'), (18, 44, 'CALLSIGN')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.xml')


('oscar kilo kilo echo alfa praha radar identified climb flight level one hundred ifr flight starts now time zero five cleared to destination via flight plan route ', {'spans': {'sc': [(0, 25, 'CALLSIGN'), (38, 54, 'COMMAND'), (55, 79, 'VALUE'), (91, 97, 'VALUE'), (98, 116, 'VALUE'), (117, 143, 'COMMAND'), (144, 161, 'VALUE')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_120512.xml')


('ryanair seven three alpha hotel turn left heading three six zero ', {'spans': {'sc': [(0, 31, 'CALLSIGN'), (32, 49, 'COMMAND'), (50, 64, 'VALUE')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.xml')


('ryanair seven three alpha hotel ', {'spans': {'sc': [(0, 31, 'CALLSIGN')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.xml')


('oscar kilo kilo uniform november proceed direct baltu ', {'spans': {'sc': [(0, 32, 'CALLSIGN'), (33, 47, 'COMMAND'), (48, 53, 'VALUE')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.xml')


('proceed direct baltu oscar kilo kilo uniform november ', {'spans': {'sc': [(0, 14, 'COMMAND'), (15, 20, 'VALUE'), (21, 53, 'CALLSIGN')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.xml')


('eurowings seven alfa bravo turn right heading two one zero cleared ils approach runway two four report established ', {'spans': {'sc': [(0, 26, 'CALLSIGN'), (27, 45, 'COMMAND'), (46, 58, 'VALUE'), (59, 79, 'COMMAND'), (80, 95, 'VALUE'), (96, 114, 'COMMAND')]}}, 'LKPR_RUZYNE_Radar_120_520MHz_20201025_140929.xml')


Saving to v5-dataset/test_spancat.spacy


In [118]:
# Display the converted training example at index 1012.
train_data[1012]

('eurotrans one three juliett push and start approved facing foxtrot and squawk four four zero three ',
 {'spans': {'sc': [(0, 27, 'CALLSIGN')]}})

In [31]:
# Load the converted validation set and render every tenth doc with displacy
# (style "span").
db_spancat = DocBin().from_disk(R"v4-dataset/validation_spancat.spacy")
docs_spancat = db_spancat.get_docs(nlp.vocab)

for doc_index, doc in enumerate(docs_spancat):
    if doc_index % 10 == 0:
        displacy.render(doc, style="span")