In [1]:
import sys
from pathlib import Path

In [2]:
# Make the directory two levels above the current working directory importable
# (presumably the project root that contains the `src` package imported below).
module_path = Path.cwd().parent.parent
# sys.path holds plain strings, so the membership test must use the string form;
# comparing the Path object itself never matches and would re-insert the entry
# every time this cell is re-run.
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

In [3]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from src.loader import TextLoader

In [5]:
# Create the output directory that create_doc() writes into (no-op if it exists).
# Pure-Python equivalent of `mkdir -p data/`, so the cell does not need a POSIX shell.
Path("data").mkdir(parents=True, exist_ok=True)

In [6]:
# Module-level loader instance built with default arguments; create_doc() and the
# split-size computation further down read this global by name.
loader = TextLoader()

In [7]:
def _label_words(words, spans) -> list:
    """Return the ``"<word> <tag>\\n"`` lines for one text.

    ``spans`` is an ordered list of ``(start, end)`` word indices, one per
    discourse. The word at ``start`` is tagged ``DS``, the word at ``end`` is
    tagged ``DE`` and every other emitted word (including words inside a span)
    is tagged ``O``. A bare ``"\\n"`` is appended after any word containing a
    ``"."`` to mark a sentence break. Words located after the end of the last
    span are not emitted. An empty ``spans`` list yields no lines.
    """
    lines = []
    remaining = iter(spans)
    current = next(remaining, None)
    if current is None:
        return lines

    span_start, span_end = current
    for word_ind, word in enumerate(words):
        if word_ind > span_end:
            # Past the current discourse: move on to the next one, or stop if
            # there is none left.
            current = next(remaining, None)
            if current is None:
                break
            span_start, span_end = current

        # A one-word span (start == end) is tagged DS only, since the start
        # check wins.
        if word_ind == span_start:
            tag = "DS"
        elif word_ind == span_end:
            tag = "DE"
        else:
            tag = "O"
        lines.append(f"{word} {tag}\n")

        if "." in word:
            lines.append("\n")

    return lines


def create_doc(
    doc_type: str, offset: int = 0, limit: int = 0, shuffle: bool = True, seed: int = 8888
) -> None:
    """Write ``data/NER_<doc_type>.txt`` with one ``"<word> <tag>"`` line per word.

    Texts come from the module-level ``loader`` via
    ``loader.iterate(offset=..., limit=..., shuffle=..., seed=...)``. Each text
    must expose ``words`` and ``discourses``, and each discourse must expose a
    non-empty ``predictionstring`` whose first and last items are word indices.
    Documents are separated by a ``<DOC>`` line; there is no separator after the
    last document. The ``data/`` directory must already exist.

    Args:
        doc_type: Suffix used in the output file name (e.g. ``"train"``).
        offset: Forwarded to ``loader.iterate``.
        limit: Forwarded to ``loader.iterate``; also shown as the total in the
            progress counter.
        shuffle: Forwarded to ``loader.iterate``.
        seed: Forwarded to ``loader.iterate``.

    Notes:
        * Texts that produce no lines (no discourses or no words) are skipped
          without a separator. Previously a text without discourses raised
          ``IndexError``.
        * Only a trailing blank sentence-break line is removed before the
          separator. Previously the last token line was stripped of its
          newline, which glued it to the separator (``"word O<DOC>"``) whenever
          the last word contained no ``"."``.
        * The file is written as UTF-8 regardless of the platform default.
    """
    output = []
    for text_no, text in enumerate(
        loader.iterate(offset=offset, limit=limit, shuffle=shuffle, seed=seed)
    ):
        print(f"\r{text_no + 1:3} / {limit}", end="")
        spans = [
            (disc.predictionstring[0], disc.predictionstring[-1]) for disc in text.discourses
        ]

        doc_lines = _label_words(text.words, spans)
        if not doc_lines:
            continue

        # Drop the blank sentence-break line directly before the separator,
        # but never touch a real token line.
        if doc_lines[-1] == "\n":
            doc_lines.pop()

        output.extend(doc_lines)
        output.append("<DOC>\n")

    if output:
        output.pop()  # no separator after the final document

    with open(f"data/NER_{doc_type}.txt", "w", encoding="utf-8") as f:
        f.writelines(output)


In [8]:
# Number of items the loader reports; the split cell below uses range(len(loader))
# as the population when deriving the train/dev/test sizes.
len(loader)


15594

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Target proportions: train 0.80, dev 0.15, test 0.05.
# The split is done in two stages: 20% is held out first, then a quarter of
# that hold-out becomes the test part and the rest the dev part.
# Only the *sizes* of the resulting parts are kept.
all_indices = range(len(loader))

train_part, holdout_part = train_test_split(
    all_indices, test_size=0.2, random_state=8888
)
dev_part, test_part = train_test_split(
    holdout_part, test_size=0.25, random_state=8888
)

train_size = len(train_part)
dev_size = len(dev_part)
test_size = len(test_part)

print(f"Train size: {train_size}, Dev size: {dev_size}, Test size: {test_size}")


Train size: 12475, Dev size: 2339, Test size: 780


In [11]:
# Write data/NER_train.txt using offset 0 and a limit of train_size.
# NOTE(review): the train/dev/test calls assume loader.iterate yields the same
# seeded order on every call, so the offset/limit windows do not overlap —
# confirm in src.loader.
create_doc("train", offset=0, limit=train_size)

12475 / 12475

In [12]:
# Write data/NER_dev.txt using the window that starts right after the train window.
# NOTE(review): assumes loader.iterate yields the same seeded order on every call,
# so this window does not overlap the train one — confirm in src.loader.
create_doc("dev", offset=train_size, limit=dev_size)

2339 / 2339

In [13]:
# Write data/NER_test.txt using the window that starts after the train and dev windows.
# NOTE(review): assumes loader.iterate yields the same seeded order on every call,
# so this window does not overlap the other two — confirm in src.loader.
create_doc("test", offset=train_size + dev_size, limit=test_size)

780 / 780