In [1]:
import sys
from pathlib import Path

In [2]:
module_path = Path.cwd().parents[2]
if module_path not in sys.path:
    sys.path.append(str(module_path))

In [3]:
module_path

PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
import random
import regex as re

In [5]:
import spacy
import torch
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin, Span
from spacy.training import Example

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
torch.cuda.empty_cache()
# spacy.require_gpu()

In [7]:
from src.loader import TextLoader
from src.model import DatasetType, Text

In [8]:
loader = TextLoader(dataset_type=DatasetType.V1_WITH_PREDICTIONSTRING)
nlp = spacy.load("models/spacy_resume/model-best/")

In [9]:
text = loader.load_random_text(purify_discourses=True, purify_text=True)
text.id

'8DD09D4FCD4F'

In [10]:
for disc in text.discourses:
    print(disc)

--- 1621775968881 (0 -> 106 | 0 -> 16) - Lead ---
Is attending classes from home by way online of videoconferencing a favorable method for students to study
-------------------------------------------------
--- 1621776019174 (107 -> 213 | 17 -> 34) - Position ---
Attending to classes from home by way online or videoconferencing is not a highly recommended way to study
--------------------------------------------------------
--- 1621776029151 (214 -> 303 | 35 -> 50) - Claim ---
because students will not have the same learning that they have while they are at school.
-----------------------------------------------------
--- 1621776040861 (304 -> 467 | 51 -> 77) - Claim ---
If attending classes from home becomes a common way to study students will not have the opportunity to build strong friendships with the other students around them
-----------------------------------------------------
--- 1621776052265 (468 -> 489 | 78 -> 81) - Claim ---
nor with the teachers
--------------------------

In [11]:
def create_manual_doc(text: Text):
    word_idx = []
    for disc in text.discourses:
        word_idx.extend((disc.predictionstring[0], disc.predictionstring[-1]))

    ents = []

    DS_token = "B-DS"
    DE_token = "B-DE"
    use_DS = True
    for ind, word in enumerate(text.words):
        if use_DS:
            curr_token = DS_token
        else:
            curr_token = DE_token

        if ind in word_idx:
            ents.append(curr_token)
            use_DS = not use_DS
        else:
            ents.append("O")

    return Doc(nlp.vocab, text.words, ents=ents)


def display_doc(doc: Doc):
    spacy.displacy.render(doc, style="ent", jupyter=True)

In [12]:
manual_doc = create_manual_doc(text)

In [13]:
display_doc(manual_doc)

In [14]:
doc = nlp(text.text)


In [15]:
display_doc(doc)

In [16]:
doc_bin = DocBin().from_disk("data/NER_test.spacy")
len(doc_bin)

780

In [17]:
examples = []
for ind, doc in enumerate(doc_bin.get_docs(nlp.vocab)):
    examples.append(Example.from_dict(doc, {"entities": doc.ents}))


In [18]:
predicted_examples = []
for ind, example in enumerate(examples):
    print(f"\r{ind:3d}/{len(examples)}", end="")
    doc = nlp(example.text)
    predicted_examples.append(Example(doc, example.reference))


779/780

In [19]:
nlp.evaluate(predicted_examples)


{'token_acc': 1.0,
 'token_p': 0.9773659229601885,
 'token_r': 0.9885548875528836,
 'token_f': 0.9829285644702777,
 'ents_p': 0.6890407099591491,
 'ents_r': 0.698187268055952,
 'ents_f': 0.6935838355193193,
 'ents_per_type': {'DS': {'p': 0.670701472947353,
   'r': 0.655782118922002,
   'f': 0.6631578947368422},
  'DE': {'p': 0.7061708214139762,
   'r': 0.7406772396056579,
   'f': 0.7230125523012553}},
 'speed': 1219.6926545476797}

In [20]:
fixed_examples = []
for ind, example in enumerate(examples):
    print(f"\r{ind:3d}/{len(examples)}", end="")

    doc = nlp(example.text)

    last_ent = None
    last_start_sent_ind = 0
    last_end_sent_ind = 0

    tokens_fixed = []
    ents_fixed = []

    for ind, token in enumerate(doc):
        if token.text == ".":
            last_start_sent_ind = ind + 1
            last_end_sent_ind = ind - 1

        if not token.ent_type_:
            tokens_fixed.append(token.text)
            ents_fixed.append(token.ent_iob_)
            continue

        if token.ent_type_ == "DS" and last_ent == "DE" or token.ent_type_ == "DE" and last_ent == "DS":
            last_ent = token.ent_type_
            tokens_fixed.append(token.text)
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
            continue

        if token.ent_type_ == "DS" and last_ent == "DS" and ents_fixed[ind - 1] == "B-DS":
            tokens_fixed.append(token.text)
            ents_fixed.append(f"I-{token.ent_type_}")
            continue

        if last_ent is None:
            assert not (token.ent_type_ == "DE" and ind == 0), "First token must not be DE"
            last_ent = token.ent_type_
            tokens_fixed.append(token.text)
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
            continue

        if token.ent_type_ == "DE":
            assert ents_fixed[last_start_sent_ind] not in ("DS", "DE"), "Start token already has a tag"
            ents_fixed[last_start_sent_ind] = "B-DS"
        elif token.ent_type_ == "DS":
            assert ents_fixed[last_end_sent_ind] not in ("DS", "DE"), "End token already has a tag"
            ents_fixed[last_end_sent_ind] = "B-DE"

        last_ent = token.ent_type_
        tokens_fixed.append(token.text)
        ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")


    doc_fixed = Doc(nlp.vocab, tokens_fixed, ents=ents_fixed)
    fixed_examples.append(Example(doc_fixed, example.reference))


779/780

In [21]:
nlp.evaluate(fixed_examples)


{'token_acc': 1.0,
 'token_p': 0.3137186987240263,
 'token_r': 0.3173101759073703,
 'token_f': 0.31550421693682423,
 'ents_p': 0.6613260388172463,
 'ents_r': 0.7246645732229517,
 'ents_f': 0.6915480487638765,
 'ents_per_type': {'DS': {'p': 0.632192947961725,
   'r': 0.6877228005133323,
   'f': 0.658789782816555},
  'DE': {'p': 0.6900970873786407,
   'r': 0.7616802400342906,
   'f': 0.7241238793806031}},
 'speed': 1215.698559114586}

In [87]:
ind = random.choice(range(len(predicted_examples)))

In [88]:
ind  # TODO: USE IND 9

424

In [89]:
print("----------")
print("Reference:")
print("----------")
display_doc(predicted_examples[ind].reference)


----------
Reference:
----------


In [90]:
print("----------")
print("Predicted:")
print("----------")
display_doc(predicted_examples[ind].predicted)

----------
Predicted:
----------


In [91]:
print("----------")
print("Fixed_pred:")
print("----------")
display_doc(fixed_examples[ind].predicted)

----------
Fixed_pred:
----------


In [28]:
def extract_discourses(doc: Doc, keep_first_ds: bool = False, keep_first_de: bool = False):
    discourses = []
    tokens = [token.text for token in doc]

    last_ent = None
    ents = []
    deleted_offset = 0
    for ind, ent in enumerate(doc.ents):
        if ent.label_ == "DS" and last_ent == "DS":
            if not keep_first_ds:
                ents[ind - deleted_offset - 1] = ent

            deleted_offset += 1
            continue

        if ent.label_ == "DE" and last_ent == "DE":
            if not keep_first_de:
                ents[ind - deleted_offset - 1] = ent

            deleted_offset += 1
            continue

        ents.append(ent)
        last_ent = ent.label_

    for ent in ents:
        print(ent, ent.label_)

    last_tag = None
    for ind, ent in enumerate(ents):
        if ent.label_ == "DS":
            start_pos = ent.start
            last_tag = "DS"
            continue

        if ent.label_ == "DE":
            assert last_tag == "DS", "DE without DS"
            disc = " ".join(tokens[start_pos:ent.end])
            disc = re.sub(r" \.", ".", disc)
            discourses.append(disc)
            start_pos = None
            last_tag = "DE"
            continue
    
    return discourses


In [48]:
for token in predicted_examples[ind].reference:
    if not token.ent_type_:
        continue

    print(f"{token.text:12} {token.ent_iob_}-{token.ent_type_}")

I            B-DS
computers    B-DE
I'm          B-DS
wrong        B-DE
Someone      B-DS
instence     B-DE
They         B-DS
of           B-DE
Now          B-DS
life         B-DE
So           B-DS
badly        B-DE


In [44]:
for ent in predicted_examples[ind].predicted.ents:
    ent: Span = ent
    print(f"{ent.text:12} {ent.label_}")

I            -DS
cars         -DE
computers    -DE
I            -DS
'm           -DS
machine      -DE
If           -DS
computer     -DE
job          -DE
wrong        -DE
Someone      -DS
nowfunction  -DE
Say          -DS
of           -DE
Now          -DS
others       -DE
We           -DS
life         -DE
So           -DS
badly        -DE


In [92]:
doc = predicted_examples[ind].reference
ref = extract_discourses(doc)

Drivers DS
vehicle DE
Distracted DS
accidents DE
One DS
distracted DE
Which DS
crash DE
Nearly DS
driving DE
In DS
parked DE
People DS
distracted DE
All DS
member DE


In [93]:
doc = predicted_examples[ind].predicted
pred = extract_discourses(doc, keep_first_de=True)

Drivers DS
vehicle DE
Distracted DS
accidents DE
One DS
crash DE
In DS
crashes DE
People DS
lives DE
Even DS
parked DE
People DS
hands DE
Like DS
distracted DE
All DS
member DE


In [94]:
pred

['Drivers should not be able to use cell phones in any capacity while operating a vehicle',
 'Distracted driving is one of the most common causes of road accidents',
 "One of the main reasons why you should n't be using your phone while driving because the human mind is easy to get distracted. Which leads you in a dished. Leading to and unnecessary crash",
 'In the United States approximately 9 people are killed and over a thousand were injured in crashes because they were distracted. Cell phones are leading factors that cause accidents. Teens have been the largest age group reported driving distracted. It has been reported to be over 50 of teens crashes',
 "People should stop driving with cell phones because it end other people 's lives",
 "Even a two minute drive going to the store of you picking up your phone can cause a crash. Might not be a major crash but it could ve been avoided. Should only be on your phone if you 're parked",
 "People should have an hands free set in the car i

In [95]:
search_phrase = "In the United States approximately 9 people are killed and over a thousand were injured"
for text in loader.iterate(purify_discourses=True, purify_text=True, verbose=True):
    if search_phrase in text.text:
        print(text.id)
        break

521 / 155902F159741CBBE


In [None]:
# NOTE:
# 2F159741CBBE text is good for analyzing in the document

In [96]:
text = loader.load_text_with_id("2F159741CBBE")
for disc in text.discourses:
    print(disc)

--- 1622645414624 (0 -> 88 | 0 -> 15) - Position ---
Drivers should not be able to use cell phones in any capacity while operating a vehicle.
----------------------------------------------------
--- 1622645484351 (89 -> 159 | 16 -> 27) - Evidence ---
Distracted driving is one of the most common causes of road accidents.
-------------------------------------------------------
--- 1622645493326 (160 -> 285 | 28 -> 50) - Claim ---
One of the main reasons why you shouldn't be using your phone while driving because the human mind is easy to get distracted.
-----------------------------------------------------
--- 1622645501622 (286 -> 348 | 51 -> 61) - Evidence ---
Which leads you in a dished. Leading to and unnecessary crash.
--------------------------------------------------------
--- 1622645530702 (403 -> 496 | 71 -> 85) - Claim ---
Nearly over 300 hundred k people are injured. From accidents caused by texting while driving.
-----------------------------------------------------
--- 16226

In [32]:
for ind, ref_item in enumerate(ref):
    print(f"Ref:\n----\n{ref_item}")
    if ind < len(pred):
        print(f"----\nPred:\n----\n{pred[ind]}")
    
    print("\n####\n")

Ref:
----
Whats wrong with her My parents had found those words falling out of their mouths faster than they could stop it and louder each time that they were spoken. Years had gone by from when I had been previously hospitalized for a auto immune system disease two to be exact and I was back to where I was before. I began to have seizures and panic attacks regularly without a rhyme or reason which not only was unsettling but dangerous. My mom took me to the doctors and they wanted to send me away to some mental hospital. Both of my parents were taken aback at the lack of bedside manners and were concerned about the negative tone that this doctor was speaking in
----
Pred:
----
What s wrong with her My parents had found those words falling out of their mouths faster than they could stop it and louder each time that they were spoken. Years had gone by from when I had been previously hospitalized for a auto immune system disease two to be exact and I was back to where I was before. I beg