In [None]:
import sys
from pathlib import Path

In [None]:
# Put the directory two levels above the working directory on the import path
# (presumably so the `src` package imported further down resolves - confirm layout).
module_path = Path.cwd().parent.parent
# sys.path contains plain strings, so the membership test must use the string form;
# comparing the Path object itself never matches and re-inserts on every re-run.
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from sklearn.model_selection import train_test_split


In [None]:
from src.loader import TextLoader

In [None]:
# Create the output directory that create_doc writes its NER_*.txt files into.
!mkdir -p data/

In [None]:
# Shared TextLoader instance; create_doc and the inspection cells below read texts through it.
loader = TextLoader()

In [None]:
def create_doc(
    doc_type: str, offset: int = 0, limit: int = 0, shuffle: bool = True, seed: int = 8888
) -> None:
    """Write word-level discourse boundary tags to ``data/NER_{doc_type}.txt``.

    Every word of every text yielded by ``loader.iterate`` becomes one line
    ``<word> <tag>`` where the tag is

    * ``DS`` - first word of a discourse,
    * ``DE`` - last word of a discourse (a one-word discourse only gets ``DS``),
    * ``O``  - any other word.

    A blank line is written after each word containing ``"."`` and texts are
    separated by a ``<DOC>`` line. Words that follow the last discourse of a
    text are not written. Texts without any discourse are skipped.

    Args:
        doc_type: Suffix used in the output file name.
        offset: Forwarded to ``loader.iterate``.
        limit: Forwarded to ``loader.iterate``; also shown in the progress line.
        shuffle: Forwarded to ``loader.iterate``.
        seed: Forwarded to ``loader.iterate``.

    Raises:
        Exception: If more ``DE`` than ``DS`` tags have been emitted so far,
            i.e. a discourse end was reached without having seen its start.
    """
    output = []
    DS_count = 0
    DE_count = 0
    for text_no, text in enumerate(
        loader.iterate(offset=offset, limit=limit, shuffle=shuffle, seed=seed)
    ):
        print(f"\r{text_no + 1:3} / {limit}", end="")
        # (first word index, last word index) per discourse; cast to int so the
        # comparisons with word_ind below also work for string-valued entries.
        spans = iter(
            [
                (int(disc.predictionstring[0]), int(disc.predictionstring[-1]))
                for disc in text.discourses
            ]
        )

        current = next(spans, None)
        if current is None:
            # Nothing to tag: skip the text instead of failing on an empty span list.
            continue

        curr_start, curr_end = current
        for word_ind, word in enumerate(text.words):
            if word_ind > curr_end:
                # Current discourse is finished; advance to the next one, if any.
                current = next(spans, None)
                if current is None:
                    break

                curr_start, curr_end = current

            if word_ind == curr_start:
                output.append(f"{word} DS\n")
                DS_count += 1
            elif word_ind == curr_end:
                output.append(f"{word} DE\n")
                DE_count += 1
            else:
                output.append(f"{word} O\n")

            if DE_count > DS_count:
                raise Exception(f"Wut for {word_ind}")

            # Crude sentence boundary: an empty line after any word containing a period.
            if "." in word:
                output.append("\n")

        output.append("<DOC>\n")

    output = output[:-1]  # remove last <DOC>

    # Do not depend on the separate `mkdir` cell having been executed.
    out_dir = Path("data")
    out_dir.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the platform default may not be able to encode every word.
    with open(out_dir / f"NER_{doc_type}.txt", "w", encoding="utf-8") as f:
        f.writelines(output)


In [38]:
# Look at a single discourse three ways: its stored text, the character slice
# of the essay, and the word range given by its predictionstring.
text = loader.load_text_with_id("DBF7EB6A9E02")
(disc,) = (candidate for candidate in text.discourses if candidate.id == 1622489430075)
print(disc.text)
print(text[disc.ind_start : disc.ind_end].split())
pred_start = int(disc.predictionstring[0])
pred_end = int(disc.predictionstring[-1])
# TODO: Problem with predictionstring
print(text.words[pred_start : pred_end + 1])

. Drivers should not be able to use cell phones in any capacity while operating a motor vehicle. 
['.', 'Drivers', 'should', 'not', 'be', 'able', 'to', 'use', 'cell', 'phones', 'in', 'any', 'capacity', 'while', 'operating', 'a', 'motor', 'vehicle.']
['Drivers', 'should', 'not', 'be', 'able', 'to', 'use', 'cell', 'phones', 'in', 'any', 'capacity', 'while', 'operating', 'a', 'motor', 'vehicle.', 'One']


In [27]:
# For every discourse, derive its word range from the character offsets and
# compare the resulting words with the discourse's own text, word by word.
text = loader.load_text_with_id("DBF7EB6A9E02")
for disc in text.discourses:
    print(disc.id)
    char_start = disc.ind_start
    char_end = disc.ind_end
    # Word index = number of whitespace-separated tokens before the character offset.
    word_start = len(text[:char_start].split())
    word_end = word_start + len(text[char_start:char_end].split())
    word_end = min(word_end, len(text.split()))
    print((word_start, word_end))

    d_text = disc.text.split()
    e_text = text.words[word_start:word_end]
    if d_text and e_text and d_text[0] != e_text[0]:
        # Misaligned first token: drop it and the trailing extra word *by position*.
        # list.remove(value) would delete the first equal word, which is wrong
        # whenever the last word also occurs earlier in the discourse.
        del d_text[0]
        del e_text[-1]

    # zip stops at the shorter list, so a length mismatch cannot raise IndexError.
    for d, e in zip(d_text, e_text):
        print(f"{d} == {e} --> {d == e}")

1622489473108
(9, 35)
Being == Being --> True
on == on --> True
your == your --> True
device == device --> True
and == and --> True
driving == driving --> True
could == could --> True
be == be --> True
an == an --> True
overly == overly --> True
dangerous == dangerous --> True
choice == choice --> True
in == in --> True
life. == life. --> True
Many == Many --> True
people == people --> True
around == around --> True
the == the --> True
world == world --> True
are == are --> True
injured == injured --> True
by == by --> True
this == this --> True
situation == situation --> True
every == every --> True
day. == day. --> True
1622489478902
(35, 42)
It == It --> True
could == could --> True
lead == lead --> True
to == to --> True
accidents == accidents --> True
and == and --> True
altercations. == altercations. --> True
1622489485375
(42, 51)
In == In --> True
addition == addition --> True
it == it --> True
would == would --> True
even == even --> True
cost == cost --> True
you == you --> T

In [None]:
# (first, last) predictionstring entry of every discourse in the loaded text.
current_ends = list(
    (entry.predictionstring[0], entry.predictionstring[-1]) for entry in text.discourses
)
current_ends


In [24]:
# Preview the DS/DE/O tagging for the loaded text.
# Consume a copy of the spans: popping from `current_ends` itself would leave it
# drained, so re-running this cell would fail or print something different.
pending = list(current_ends)
curr_start, curr_end = pending.pop(0)
for word_ind, word in enumerate(text.words):
    if word_ind > curr_end:
        # Past the current discourse: move on to the next span or stop.
        if not pending:
            break

        curr_start, curr_end = pending.pop(0)

    if word_ind == curr_start:
        print(f"{word} ({word_ind}) DS")
    elif word_ind == curr_end:
        print(f"{word} ({word_ind}) DE")
    else:
        print(f"{word} ({word_ind}) O")


Operating (0) O
a (1) O
motor (2) O
vehicle (3) O
while (4) O
on (5) O
your (6) O
cell (7) O
phone (8) O
Being (9) DS
on (10) O
your (11) O
device (12) O
and (13) O
driving (14) O
could (15) O
be (16) O
an (17) O
overly (18) O
dangerous (19) O
choice (20) O
in (21) O
life. (22) O
Many (23) O
people (24) O
around (25) O
the (26) O
world (27) O
are (28) O
injured (29) O
by (30) O
this (31) O
situation (32) O
every (33) O
day. (34) DE
It (35) DS
could (36) O
lead (37) O
to (38) O
accidents (39) O
and (40) O
altercations. (41) DE
In (42) DS
addition (43) O
it (44) O
would (45) O
even (46) O
cost (47) O
you (48) O
your (49) O
licences. (50) DE
The (51) DS
most (52) O
detrimental (53) O
outcome (54) O
is (55) O
death. (56) DE
There (57) O
are (58) O
far (59) O
more (60) O
outcomes (61) O
to (62) O
operating (63) O
a (64) O
motor (65) O
vehicle (66) O
while (67) O
being (68) O
on (69) O
a (70) O
cell (71) O
phone. (72) O
Drivers (73) DS
should (74) O
not (75) O
be (76) O
able (77) O
to (78) O

In [None]:
# Total item count reported by the loader; the split sizes below are derived from it.
len(loader)


In [None]:
# Target proportions: train 0.8, dev 0.15, test 0.05.
# First hold out 20 % of the indices, then give a quarter of that hold-out to test.
all_indices = range(len(loader))
train_indices, holdout_indices = train_test_split(
    all_indices, test_size=0.2, random_state=8888
)
dev_indices, test_indices = train_test_split(
    holdout_indices, test_size=0.25, random_state=8888
)

# Only the sizes are used afterwards (as offset/limit for create_doc).
train_size = len(train_indices)
test_size = len(test_indices)
dev_size = len(dev_indices)

print(f"Train size: {train_size}, Dev size: {dev_size}, Test size: {test_size}")


In [None]:
# Use the same shuffle/seed as the dev and test exports: the three splits are
# consecutive offset/limit windows, which are only disjoint if they are cut from
# one common ordering (NOTE(review): assumes loader.iterate shuffles before
# applying offset/limit - confirm against TextLoader).
create_doc("train", offset=0, limit=train_size, shuffle=True, seed=8888)

In [None]:
# Dev split: the window right after the training texts (shuffle/seed spelled out, same as the defaults).
create_doc("dev", offset=train_size, limit=dev_size, shuffle=True, seed=8888)

In [None]:
# Test split: the window after the training and dev texts (shuffle/seed spelled out, same as the defaults).
test_offset = train_size + dev_size
create_doc("test", offset=test_offset, limit=test_size, shuffle=True, seed=8888)