In [1]:
import sys
from pathlib import Path

In [2]:
module_path = Path.cwd().parents[2]
if module_path not in sys.path:
    sys.path.insert(0, str(module_path))

In [3]:
# Show the directory that was resolved above, as a sanity check.
module_path

PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from src.loader import TextLoader
from src.model import DatasetType

In [6]:
# Create the output directory for the generated NER files (relative to the
# notebook's working directory). Pure-Python equivalent of `mkdir -p data/`,
# so the cell does not depend on a POSIX shell being available.
Path("data").mkdir(parents=True, exist_ok=True)

In [7]:
# Project-local loader for the V1_WITH_PREDICTIONSTRING dataset variant.
# It is used as a module-level global by `create_doc` below.
loader = TextLoader(DatasetType.V1_WITH_PREDICTIONSTRING)

In [8]:
# Preview the loader's underlying table (the notebook display truncates the middle rows).
loader.df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,predictionstring
0,423A1CA112E2,1622627660524,7,228,Modern humans today are always on their phone....,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,229,311,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,312,399,Some certain areas in the United States ban ph...,Evidence,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,400,754,When people have phones they know about certai...,Evidence,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,755,882,Driving is one of the way how to get around. P...,Claim,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...
144251,4C471936CD75,1618153340639,2212,3168,if I'm not sure what college I want to attend ...,Evidence,386 387 388 389 390 391 392 393 394 395 396 39...
144252,4C471936CD75,1618153383399,3186,4453,seeking multiple opinions before making a hard...,Evidence,576 577 578 579 580 581 582 583 584 585 586 58...
144253,4C471936CD75,1618024996127,4454,4513,it is better to seek multiple opinions instead...,Position,828 829 830 831 832 833 834 835 836 837 838
144254,4C471936CD75,1618025268756,4514,4860,The impact of asking people to help you make a...,Evidence,839 840 841 842 843 844 845 846 847 848 849 85...


In [9]:
def create_doc(
    doc_type: str, offset: int = 0, limit: int = 0, shuffle: bool = True, seed: int = 8888
) -> None:
    """Write a two-column (token, tag) file marking discourse boundaries.

    Iterates over texts from the module-level ``loader`` and writes
    ``data/NER_{doc_type}.txt``. The ``data/`` directory must already exist.

    Per text, the output contains:

    * a ``-DOCSTART- -X- O O`` line,
    * one ``<token> <tag>`` line per word, where the first word of a discourse
      is tagged ``B-DS``, the last word ``B-DE`` and every other word ``O``,
    * for each word ending in ``.``, an extra ``. O`` line followed by a blank
      line (trailing periods are stripped from the token itself).

    Notes:
        * A discourse consisting of a single word is tagged ``B-DS`` only,
          so the ``B-DS`` and ``B-DE`` counts may differ.
        * Words located after the last discourse of a text are not written.
        * Texts without any discourse are skipped entirely (previously they
          raised ``IndexError``).
        * ``predictionstring`` is assumed to be an indexable sequence of word
          indices sorted in ascending order - TODO confirm against the loader.

    Args:
        doc_type: Suffix of the output file name (e.g. ``"train"``).
        offset: Forwarded to ``loader.iterate``.
        limit: Forwarded to ``loader.iterate``; also shown as the total in the
            progress line.
        shuffle: Forwarded to ``loader.iterate``.
        seed: Forwarded to ``loader.iterate``.

    Raises:
        Exception: If more ``B-DE`` than ``B-DS`` tags have been emitted so far
            (sanity check on the discourse spans).
    """
    output = []
    DS_count = 0
    DE_count = 0
    for text_no, text in enumerate(
        loader.iterate(offset=offset, limit=limit, shuffle=shuffle, seed=seed, purify_text=True, purify_discourses=True)
    ):
        print(f"\r{text_no + 1:3} / {limit}", end="")

        # (first word index, last word index) of every discourse, in loader order.
        # Built eagerly so malformed discourses fail before anything is emitted.
        spans = iter(
            [(disc.predictionstring[0], disc.predictionstring[-1]) for disc in text.discourses]
        )

        first_span = next(spans, None)
        if first_span is None:
            # No discourses: nothing to tag, skip instead of crashing.
            continue
        curr_start, curr_end = first_span

        output.append("-DOCSTART- -X- O O\n")
        for word_ind, word in enumerate(text.words):
            if word_ind > curr_end:
                # Current discourse is finished; advance to the next one or stop
                # emitting words once all discourses are consumed.
                next_span = next(spans, None)
                if next_span is None:
                    break
                curr_start, curr_end = next_span

            stripped = word.rstrip(".")

            # Start takes precedence over end, so a one-word discourse is B-DS only.
            if word_ind == curr_start:
                output.append(f"{stripped} B-DS\n")
                DS_count += 1

            elif word_ind == curr_end:
                output.append(f"{stripped} B-DE\n")
                DE_count += 1

            else:
                output.append(f"{stripped} O\n")

            # An end tag without a preceding start tag indicates inconsistent spans.
            if DE_count > DS_count:
                raise Exception(f"Wut for {word_ind}")

            # A trailing period becomes its own token and closes the sentence
            # (the blank line is the sentence separator).
            if word.endswith("."):
                output.append(". O\n\n")

    # Explicit encoding so non-ASCII text is written identically on every platform.
    with open(f"data/NER_{doc_type}.txt", "w", encoding="utf-8") as f:
        f.writelines(output)


In [10]:
# Total number of items reported by the loader; used below to size the splits.
len(loader)


15590

In [11]:
# Target proportions: train 0.80, dev 0.15, test 0.05.
# train_test_split is used only to derive the subset sizes; the index lists
# themselves are not used afterwards.
all_indices = range(len(loader))
train_part, holdout_part = train_test_split(
    all_indices, test_size=0.2, random_state=8888
)
# A quarter of the 20% hold-out becomes the test subset, the rest is dev.
dev_part, test_part = train_test_split(
    holdout_part, test_size=0.25, random_state=8888
)

train_size = len(train_part)
test_size = len(test_part)
dev_size = len(dev_part)

print(f"Train size: {train_size}, Dev size: {dev_size}, Test size: {test_size}")


Train size: 12472, Dev size: 2338, Test size: 780


<center> <h2> IMPORTANT NOTE </h2> </center>

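#### The dataset can (and does) contain overlapping tags, so the numbers of `DS` and `DE` tags are not equal. This is caused by **discourses** consisting of only one word: that word is both the start and the end of the discourse, and it receives only the `B-DS` tag.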

In [12]:
# Training file: offset 0, `train_size` texts, shuffled with an explicit seed.
# NOTE(review): this call uses seed=87655678 while the dev/test calls rely on
# create_doc's default seed (8888). If `loader.iterate` shuffles before applying
# offset/limit, the three slices come from different permutations and may
# overlap (train/dev/test leakage) - confirm against TextLoader.iterate.
create_doc("train", offset=0, limit=train_size, shuffle=True, seed=87655678)

12472 / 12472

In [13]:
# Dev file: `dev_size` texts starting at offset `train_size` (default seed).
create_doc("dev", offset=train_size, limit=dev_size, shuffle=True)

2338 / 2338

In [14]:
# Test file: `test_size` texts starting right after the train and dev ranges (default seed).
create_doc("test", offset=train_size + dev_size, limit=test_size, shuffle=True)

780 / 780