In [1]:
import sys
from pathlib import Path

In [2]:
module_path = Path.cwd().parents[2]
if module_path not in sys.path:
    sys.path.append(str(module_path))

In [3]:
# Show the resolved directory to confirm it is the intended project root.
module_path

PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin
from spacy.training import Example

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Ask spaCy to run on the GPU when one is available; the call returns whether
# the GPU was activated.
# NOTE(review): the spaCy docs recommend calling this before any pipeline is
# loaded - keep this cell above the `spacy.load(...)` cell when re-running.
spacy.prefer_gpu()

True

In [5]:
from src.loader import TextLoader
from src.model import DatasetType, Text

In [6]:
# Data access object for the essays, plus the trained spaCy pipeline under test.
# NOTE(review): the model path is relative to the current working directory.
loader = TextLoader(dataset_type=DatasetType.V1_WITH_PREDICTIONSTRING)
nlp = spacy.load("models/spacy_resume/model-best/")

In [7]:
# Pick one random essay to inspect by hand and show its id so the sample can
# be identified later (the selection is random, so it changes between runs).
text = loader.load_random_text(purify_discourses=True, purify_text=True)
text.id

'41BEF59908B1'

In [8]:
# Print every annotated discourse of the sampled essay, one after another.
for discourse in text.discourses:
    print(discourse)

--- 1620150366219 (0 -> 58 | 0 -> 10) - Position ---
Student should be able to bring their cell phone to school
----------------------------------------------------
--- 1620150374111 (59 -> 140 | 11 -> 24) - Claim ---
because if something goes wrong they can always get in touch with their parent's.
----------------------------------------------------
--- 1620150380668 (141 -> 449 | 25 -> 86) - Evidence ---
some kids walk home after school and they might need to call their they parent to let them know that their okay. when they have their free time and lunch break it's best to use the phone they can be able to text and talk on their cell phone with out anyone taking their phone and telling them to put it away.
--------------------------------------------------------
--- 1620150391574 (450 -> 621 | 87 -> 117) - Claim ---
kids should turn off their cellular dives because while their in class the phone will not interrupt the teacher are the class and it will not throw them off their subjec

In [9]:
def create_manual_doc(text: Text):
    """Build a reference Doc whose entities mark the discourse boundaries.

    The word at the first index of each discourse's ``predictionstring`` is
    tagged ``B-DS`` (discourse start) and the word at the last index is tagged
    ``B-DE`` (discourse end); every other word is tagged ``O``.

    The tags are assigned per discourse rather than by toggling between DS and
    DE while walking the words: with toggling, a discourse consisting of a
    single word (start index == end index) flips the toggle only once and
    every later boundary receives the wrong tag. When start and end fall on
    the same word, the word is tagged ``B-DS``.

    Args:
        text: Essay providing ``words`` and ``discourses``.
            NOTE(review): assumes ``predictionstring`` is a non-empty sequence
            of integer indices into ``text.words`` - confirm against the loader.

    Returns:
        A ``Doc`` built on the vocab of the notebook-level ``nlp`` pipeline.
    """
    start_tag = "B-DS"
    end_tag = "B-DE"

    # Word index -> boundary tag. A dict gives constant-time lookups instead of
    # scanning a list of indices for every word.
    boundary_tags = {}
    for disc in text.discourses:
        first_ind = disc.predictionstring[0]
        last_ind = disc.predictionstring[-1]
        # Never let an end tag overwrite an already registered start tag ...
        boundary_tags.setdefault(last_ind, end_tag)
        # ... while a start tag always takes precedence over an end tag.
        boundary_tags[first_ind] = start_tag

    ents = [boundary_tags.get(ind, "O") for ind in range(len(text.words))]

    return Doc(nlp.vocab, text.words, ents=ents)


def display_doc(doc: Doc):
    """Render the entities of ``doc`` inline in the notebook with displaCy."""
    render_kwargs = {"style": "ent", "jupyter": True}
    spacy.displacy.render(doc, **render_kwargs)

In [10]:
# Reference annotation for the sampled essay, built from its labelled discourses.
manual_doc = create_manual_doc(text)

In [11]:
# Visualise the reference discourse boundaries.
display_doc(manual_doc)

In [12]:
# Run the trained pipeline on the raw text of the same essay.
doc = nlp(text.text)


In [13]:
# Visualise the predicted boundaries for comparison with the reference above.
display_doc(doc)

In [14]:
# Load the serialized test documents and show how many there are.
# NOTE(review): the path is relative to the current working directory.
doc_bin = DocBin().from_disk("data/NER_test.spacy")
len(doc_bin)

780

In [15]:
# Wrap every test document in an Example whose reference carries the stored
# entities. A comprehension keeps the loop variable local, so the notebook-level
# `doc` (the prediction for the sampled essay) is no longer overwritten here.
examples = [
    Example.from_dict(gold_doc, {"entities": list(gold_doc.ents)})
    for gold_doc in doc_bin.get_docs(nlp.vocab)
]


In [21]:
# Pair a fresh model prediction with the reference annotation of each example.
# `nlp.pipe` processes the texts as a batched stream, which is considerably
# faster than calling `nlp(...)` once per document, and the comprehension keeps
# the helper variables out of the notebook namespace.
predicted_docs = nlp.pipe(eg.text for eg in examples)
predicted_examples = [
    Example(predicted_doc, eg.reference)
    for predicted_doc, eg in zip(predicted_docs, examples)
]


In [35]:
# Score the predictions against the references (token- and entity-level
# precision / recall / F-score, overall and per entity type).
# NOTE(review): `Language.evaluate` appears to run the pipeline on the
# predicted docs itself - confirm whether the separate prediction pass is needed.
nlp.evaluate(predicted_examples)
    

{'token_acc': 1.0,
 'token_p': 0.9773659229601885,
 'token_r': 0.9885548875528836,
 'token_f': 0.9829285644702777,
 'ents_p': 0.6890407099591491,
 'ents_r': 0.698187268055952,
 'ents_f': 0.6935838355193193,
 'ents_per_type': {'DS': {'p': 0.670701472947353,
   'r': 0.655782118922002,
   'f': 0.6631578947368422},
  'DE': {'p': 0.7061708214139762,
   'r': 0.7406772396056579,
   'f': 0.7230125523012553}},
 'speed': 2025.427141382518}

In [39]:
# Run one example through the model.
# `example` keeps the reference annotation and `doc` the model prediction; both
# are reused by the cells below.
example = examples[0]
doc = nlp(example.text)


In [46]:
# List the predicted boundary markers: token span (start - end), text and label.
for ent in doc.ents:
    print(f"{ent.start:3} - {ent.end:3} ===> {ent.text:10}: {ent.label_}")

  0 -   1 ===> In        : DS
 73 -  74 ===> country   : DE
 75 -  76 ===> Most      : DS
106 - 107 ===> driving   : DE
126 - 127 ===> car       : DE
143 - 144 ===> insurance : DE
145 - 146 ===> Unknown   : DS
199 - 200 ===> option    : DE
201 - 202 ===> When      : DS
296 - 297 ===> life      : DE
298 - 299 ===> Those     : DS
323 - 324 ===> road      : DE
325 - 326 ===> However   : DS
418 - 419 ===> time      : DE
420 - 421 ===> No        : DS
500 - 501 ===> it        : DE


In [58]:
# Repair the predicted markers so that discourse starts (DS) and discourse ends
# (DE) strictly alternate. When the same marker occurs twice in a row, the
# missing counterpart is inserted at the closest preceding sentence boundary,
# where a sentence boundary is simply a "." token:
#   * DE followed by DE -> a DS is placed on the first token after the last "."
#   * DS followed by DS -> a DE is placed on the token right before the last "."
last_ent = None  # label of the most recent marker token
last_start_sent_ind = 0  # index of the token right after the latest "."
last_end_sent_ind = 0  # index of the token right before the latest "."

tokens_fixed = []
spaces_fixed = []
ents_fixed = []

for ind, token in enumerate(doc):
    if token.text == ".":
        last_start_sent_ind = ind + 1
        last_end_sent_ind = ind - 1

    tokens_fixed.append(token.text)
    # Keep the original spacing; without it the rebuilt Doc would put a space
    # after every token and its text would differ from the source document.
    spaces_fixed.append(bool(token.whitespace_))

    label = token.ent_type_
    if not label:
        ents_fixed.append(token.ent_iob_)
        continue

    tag = f"{token.ent_iob_}-{label}"

    if token.ent_iob_ == "I":
        # Continuation of a multi-token marker: it belongs to the previous
        # entity and must not be mistaken for a repeated marker.
        ents_fixed.append(tag)
        continue

    alternates = (label == "DS" and last_ent == "DE") or (
        label == "DE" and last_ent == "DS"
    )

    if last_ent is None:
        # The very first marker is expected to open a discourse on the first token.
        assert label != "DE" and ind == 0, (
            f"first marker is {label!r} at token {ind}, expected 'DS' at token 0"
        )
    elif not alternates:
        if label == "DE":
            slot, missing_tag = last_start_sent_ind, "B-DS"
        elif label == "DS":
            slot, missing_tag = last_end_sent_ind, "B-DE"
        else:
            slot, missing_tag = None, None

        if missing_tag is not None:
            # Sanity checks: the slot must be an already processed token ...
            assert 0 <= slot < len(ents_fixed), (
                f"no earlier token available for the missing {missing_tag} "
                f"before token {ind}"
            )
            # ... that does not carry a marker yet. The stored tags look like
            # "O" / "B-DS" / "B-DE", so the check has to look at the tag suffix.
            assert not ents_fixed[slot].endswith(("DS", "DE")), (
                f"token {slot} already carries {ents_fixed[slot]!r}; cannot "
                f"insert the missing {missing_tag} before token {ind}"
            )
            ents_fixed[slot] = missing_tag

    last_ent = label
    ents_fixed.append(tag)


doc_fixed = Doc(nlp.vocab, tokens_fixed, spaces=spaces_fixed, ents=ents_fixed)

In [50]:
# Reference annotation of the selected example.
display_doc(example.reference)


In [59]:
# Prediction after the DS/DE alternation repair.
display_doc(doc_fixed)


In [60]:
# Raw model prediction for the same example, for comparison with the repaired one.
display_doc(doc)
