In [1]:
import sys
from pathlib import Path


In [2]:
# Make the directory three levels above the working directory importable
# (presumably the project root that contains the `src` package — confirm).
module_path = Path.cwd().parents[2]
# sys.path holds strings, so the membership test must use the string form.
# Comparing the Path object itself never matches, which appended a duplicate
# entry every time this cell was re-run.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))


In [3]:
# Show the resolved path as a sanity check that the expected directory was added.
module_path


PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
# Create the metrics/ directory via the shell (no error if it already exists).
!mkdir -p metrics/


In [5]:
import json
import pickle
import random
from time import perf_counter

import regex as re

In [6]:
import spacy
import torch
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin, Span
from spacy.training import Example


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Directory below the current working directory where metric JSON files are kept.
METRICS_PATH = Path.cwd().joinpath("metrics")
# Create it (and any missing parents); do nothing if it is already there.
METRICS_PATH.mkdir(exist_ok=True, parents=True)

In [8]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()
# GPU execution for spaCy is left disabled; uncomment to require a GPU.
# spacy.require_gpu()


In [9]:
from src.loader import TextLoader
from src.model import DatasetType, Text


In [10]:
# Loader for the source texts; only the exploratory cells at the end of the notebook use it.
loader = TextLoader(dataset_type=DatasetType.V1_WITH_PREDICTIONSTRING)
# Trained spaCy pipeline (path relative to the working directory). Its vocab is shared by
# every Doc constructed in this notebook.
nlp = spacy.load("models/spacy_resume/model-best/")


In [11]:
def create_manual_doc(text: Text):
    """Build a Doc that tags the first and last word of every discourse.

    The first element of each discourse's ``predictionstring`` is tagged
    ``B-DS`` (discourse start) and the last element ``B-DE`` (discourse end);
    every other word is tagged ``O``.

    Start and end positions are tracked separately instead of flipping a
    DS/DE toggle at every marked word. With the toggle, a single-word
    discourse (first index == last index) flipped the state only once and
    inverted every tag that followed it. Here such a word is tagged ``B-DS``
    (one token cannot carry both tags) and later discourses are unaffected.

    Args:
        text: Object exposing ``words`` and ``discourses``; each discourse
            exposes ``predictionstring`` (presumably a sequence of integer
            word indices into ``text.words`` — TODO confirm against
            ``src.model``).

    Returns:
        A ``Doc`` built on ``nlp.vocab`` from ``text.words`` with the IOB
        tags described above.
    """
    start_idx = set()
    end_idx = set()
    for disc in text.discourses:
        start_idx.add(disc.predictionstring[0])
        end_idx.add(disc.predictionstring[-1])

    ents = []
    for ind, _word in enumerate(text.words):
        if ind in start_idx:
            ents.append("B-DS")
        elif ind in end_idx:
            ents.append("B-DE")
        else:
            ents.append("O")

    return Doc(nlp.vocab, text.words, ents=ents)


def display_doc(doc: Doc) -> None:
    """Render the entities of `doc` inline in the notebook using displaCy's "ent" style."""
    spacy.displacy.render(doc, style="ent", jupyter=True)


In [12]:
# Load the serialized docs from data/NER_test.spacy; the bare expression shows how many there are.
doc_bin = DocBin().from_disk("data/NER_test.spacy")
len(doc_bin)


780

In [13]:
# On-disk cache of the model predictions, so inference over all reference
# docs is not repeated on every run of the notebook.
path_pred = Path("data/predicted_examples.pkl")

if not path_pred.exists():
    predicted_examples: list[Example] = []
    references = list(doc_bin.get_docs(nlp.vocab))
    for ind, reference in enumerate(references):
        # "\r" rewrites the same output line to act as a progress counter.
        print(f"\r{ind + 1:3d}/{len(references)}", end="")
        doc = nlp(reference.text)
        predicted_examples.append(Example(doc, reference))

    # Write through a context manager so the handle is flushed and closed
    # deterministically (a bare open() inside the call leaked it).
    with open(path_pred, "wb") as f:
        pickle.dump(predicted_examples, f)

else:
    # NOTE: unpickling executes arbitrary code; only load a cache file that
    # this notebook produced itself.
    with open(path_pred, "rb") as f:
        predicted_examples = pickle.load(f)


In [14]:
def get_fixed_doc(example: Example, idx: list[int]):
    """Rebuild the predicted doc with selected tokens turned into entity continuations.

    For every index ``i`` in ``idx``, the token at position ``i + 1`` is
    re-tagged ``I-<its entity type>`` so that it continues the entity of the
    token before it. All other tokens keep their original IOB tag. Token
    texts and trailing whitespace are copied unchanged.

    Unlike the earlier implementation, ``idx`` is not modified (it used to be
    drained with ``pop(0)``), and an empty ``idx`` simply yields an unchanged
    copy instead of raising ``IndexError``.

    Args:
        example: Example whose ``predicted`` doc is rebuilt.
        idx: Token indices whose immediate successor should be merged into
            the preceding entity. This assumes the entity starting at each
            index is a single token, so that ``i + 1`` is where the next
            entity begins — verify against the caller.

    Returns:
        A new ``Doc`` on ``nlp.vocab`` with the adjusted entity tags.
    """
    doc = example.predicted
    # Set lookup instead of walking/popping the list: no mutation of the
    # caller's argument and no dependence on the order of `idx`.
    continuation_positions = {i + 1 for i in idx}

    tokens_fixed = []
    tokens_spaces = []
    ents_fixed = []
    for token in doc:
        tokens_fixed.append(token.text)
        tokens_spaces.append(token.whitespace_)

        if token.i in continuation_positions:
            # Second half of a split entity: continue the previous one.
            ents_fixed.append(f"I-{token.ent_type_}")
        elif token.ent_iob_ == "O":
            ents_fixed.append(token.ent_iob_)
        else:
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

    return Doc(nlp.vocab, tokens_fixed, spaces=tokens_spaces, ents=ents_fixed)


In [15]:
def merge_invalid_examples(predicted_examples: list[Example], verbose: bool = False) -> list[Example]:
    """Merge directly adjacent same-label entities whose second part contains an apostrophe.

    For each example, consecutive predicted entities are compared pairwise.
    A pair is flagged when the second entity starts where the first ends,
    both carry the same label, and the second one's text contains ``'``.
    Examples with at least one flagged pair get a rebuilt predicted doc from
    ``get_fixed_doc``; the others are passed through unchanged.

    Args:
        predicted_examples: Examples to inspect.
        verbose: When true, print the entities before and after the fix for
            every changed example.

    Returns:
        A list of the same length: untouched examples are reused, fixed ones
        are new ``Example`` objects that keep the original reference doc.
    """
    result = []
    n_fixed = 0
    for position, example in enumerate(predicted_examples):
        spans = example.predicted.ents

        # Start index of the first entity of every flagged pair.
        split_starts = [
            left.start
            for left, right in zip(spans, spans[1:])
            if left.end == right.start and left.label_ == right.label_ and "'" in right.text
        ]

        if not split_starts:
            result.append(example)
            continue

        n_fixed += 1

        if verbose:
            print(f"ind: {position}")
            for span in spans:
                print(f"{span.start:>3} {span.end:>3} {span.label_} {span.text:12}", end=" ")
                # Mark the entities that will absorb their right-hand neighbour.
                print("<<<<<" if span.start in split_starts else "")
            print("\n----\n")

        repaired = get_fixed_doc(example, split_starts)

        if verbose:
            for span in repaired.ents:
                print(f"{span.start:>3} {span.end:>3} {span.label_} {span.text:12}")
            print("\n----\n")

        result.append(Example(repaired, example.reference))

    print(f"Fixed {n_fixed} examples.")

    return result


In [17]:
# Merge apostrophe-split entity pairs in the predictions; the function itself prints how many
# examples were changed, and the total count is printed for comparison.
merged_examples = merge_invalid_examples(predicted_examples, verbose=False)
print(f"All examples: {len(predicted_examples)}")


Fixed 77 examples.
All examples: 780


In [18]:
# Example with consecutive DS/DE entities of the same type.
# NOTE(review): this comment originally said "Predicted ind: 14", but the cell displays
# index 9 — confirm which example was meant.
display_doc(merged_examples[9].predicted)


In [19]:
def inference_missing_tags(
    examples: list[Example], use_first: bool = False, use_sentence_boundaries: bool = True
) -> list[Example]:
    """
    Insert or drop DS / DE tags so that they strictly alternate in every predicted doc.

    It may happen that consecutive tags are of the same type, e.g. ... DS DE DE ...
    which is not ideal as we cannot extract discourses from it. This function tries to add
    the missing tags in such cases based on a couple of rules:

    1. In the sequence of tags there mustn't be any consecutive tags of the same type.
    2. For missing tags (e.g. DS DE DE) we try to find the missing DS tag so that the
       sequence becomes DS DE DS DE.
    3. A missing tag is added only at the start / end of a sentence (a "." token is treated
       as the sentence end). When there are several sentences between two consecutive tags,
       either the first found sentence is used (use_first=True) or the last found sentence
       (use_first=False).
    4. If use_sentence_boundaries is True, missing tags are only added at the start / end of
       a sentence. If this is not possible, the middle consecutive tag is removed.
       If use_sentence_boundaries is False and there is no sentence boundary, the missing tag
       is added to the first / last untagged token before the next consecutive tag.

    Args:
        examples: Examples whose ``predicted`` docs carry DS / DE entities.
        use_first: Pick the first (True) or the last (False) sentence boundary between two
            same-type tags.
        use_sentence_boundaries: See rule 4.

    Returns:
        New examples (same order) with rebuilt predicted docs and the original references.

    Raises:
        AssertionError: If the first token is tagged DE, if a position chosen for an inserted
            tag is already tagged, or if the resulting number of B- tags is odd.
    """
    # IOB tags that mark a token as already belonging to a DS / DE entity.
    taken_tags = ("B-DS", "I-DS", "B-DE", "I-DE")

    fixed_examples = []
    # The example index has its own name: the token loop below uses `ind`, which previously
    # shadowed it and made the final assertion report a token index instead of the example.
    for example_ind, example in enumerate(examples):
        print(f"\r{example_ind:3d}/{len(examples) - 1}", end="")

        doc = example.predicted

        # Type ("DS" / "DE") and token index of the most recent accepted tag.
        last_ent = None
        last_ent_ind = None

        # Candidate positions for an inserted B-DS (token after a ".") and an inserted
        # B-DE (token before a "."); None means "no candidate since the last tag".
        last_start_sent_ind = 0
        last_end_sent_ind = 0

        # First untagged token seen since the last tag (used by the loose strategy).
        saved_first_token_ind = None

        tokens_fixed = []
        tokens_spaces = []
        ents_fixed = []

        for ind, token in enumerate(doc):
            tokens_fixed.append(token.text)
            tokens_spaces.append(token.whitespace_)

            if (
                last_start_sent_ind is not None
                and last_start_sent_ind < ind
                and ents_fixed
                and last_ent is not None
            ):
                # Check if last idx are set on proper tokens, if not remove them
                if last_ent == "DE":
                    if ents_fixed[last_start_sent_ind] in taken_tags:
                        last_start_sent_ind, last_end_sent_ind = None, None
                else:
                    if ents_fixed[last_end_sent_ind] in taken_tags:
                        last_start_sent_ind, last_end_sent_ind = None, None

            # Track sentence boundaries: with use_first only the first "." after a reset is
            # remembered, otherwise every "." overwrites the candidate.
            if use_first and last_start_sent_ind is None and token.text == ".":
                last_start_sent_ind = ind + 1
                last_end_sent_ind = ind - 1

            elif not use_first and token.text == ".":
                last_start_sent_ind = ind + 1
                last_end_sent_ind = ind - 1

            # Untagged token: copy its IOB value and remember the first one of the run.
            if not token.ent_type_:
                if saved_first_token_ind is None:
                    saved_first_token_ind = ind

                ents_fixed.append(token.ent_iob_)
                continue

            # Properly alternating tag: accept it as is.
            if (token.ent_type_ == "DS" and last_ent == "DE") or (
                token.ent_type_ == "DE" and last_ent == "DS"
            ):
                last_ent = token.ent_type_
                last_ent_ind = ind
                ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

                # Okay so reset them
                last_start_sent_ind, last_end_sent_ind = None, None
                saved_first_token_ind = None
                continue

            # Very first tag of the document and it is a DS: accept it as is.
            if last_ent is None and token.ent_type_ == "DS":
                last_ent = token.ent_type_
                last_ent_ind = ind

                last_start_sent_ind, last_end_sent_ind = None, None
                saved_first_token_ind = None

                ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
                continue

            assert not (token.ent_type_ == "DE" and ind == 0), "First token must not be DE"

            # From here on the tag does not alternate with the previous one.
            if last_start_sent_ind is not None:
                # A sentence boundary is available: insert the missing counterpart there.
                if token.ent_type_ == "DE":
                    assert (
                        ents_fixed[last_start_sent_ind] not in taken_tags
                    ), "The tag for the start of sentence is already set!"
                    ents_fixed[last_start_sent_ind] = "B-DS"
                elif token.ent_type_ == "DS":
                    assert (
                        ents_fixed[last_end_sent_ind] not in taken_tags
                    ), "The tag for the end of sentence is already set!"
                    ents_fixed[last_end_sent_ind] = "B-DE"
                else:
                    raise AssertionError("Should not happen")
            else:
                if token.ent_iob_ == "I" and ents_fixed[ind - 1] == f"B-{token.ent_type_}":
                    # Fixed case like:
                    # I  B-DS
                    # 'm I-DS
                    last_ent = token.ent_type_
                    last_ent_ind = ind
                    ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
                    continue
                elif ents_fixed[ind - 1] == f"B-{token.ent_type_}":
                    # Two consecutive DS / DE tags that were not fixed so we leave
                    # the first one and remove the second one (for DS) and
                    # remove the first one and leave the second one (for DE)
                    if token.ent_type_ == "DS":
                        ents_fixed.append("O")
                    else:
                        ents_fixed[ind - 1] = "O"
                        ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
                    continue

                if use_sentence_boundaries:
                    # Remove middle consecutive tag
                    ents_fixed[last_ent_ind] = "O"
                else:
                    # Add missing tag to the first / last untagged token
                    if token.ent_type_ == "DE":
                        assert (
                            ents_fixed[saved_first_token_ind] not in taken_tags
                        ), "Start token already has a tag"
                        ents_fixed[saved_first_token_ind] = "B-DS"
                    elif token.ent_type_ == "DS":
                        assert ents_fixed[ind - 1] not in taken_tags, "End token already has a tag"
                        ents_fixed[ind - 1] = "B-DE"
                    else:
                        raise AssertionError("Should not happen")

            last_start_sent_ind, last_end_sent_ind = None, None
            saved_first_token_ind = None

            last_ent = token.ent_type_
            last_ent_ind = ind
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

        # For cases when last token should be DE
        if last_ent == "DS" and last_end_sent_ind is not None:
            ents_fixed[last_end_sent_ind] = "B-DE"
        elif last_ent == "DS":
            assert ents_fixed[-1] not in taken_tags, "Last token already has a tag"
            ents_fixed[-1] = "B-DE"

        # Check if each DS tag has a DE tag and vice versa
        ents_filtered = [ent for ent in ents_fixed if ent != "O" and not ent.startswith("I-")]
        assert (
            len(ents_filtered) % 2 == 0
        ), f"Example {example_ind} has uneven number of tags: {ents_filtered}!"

        doc_fixed = Doc(nlp.vocab, tokens_fixed, spaces=tokens_spaces, ents=ents_fixed)
        fixed_examples.append(Example(doc_fixed, example.reference))

    print()

    return fixed_examples


In [20]:
# Inspect example 108 as it looks before any tag inference is applied.
display_doc(merged_examples[108].predicted)

In [None]:
# Debug run on a single example; the trailing comma unpacks the one-element result list.
# NOTE(review): this binds `inferenced_last_sents` to a single Example, while a later cell
# rebinds the same name to the full list — make sure that cell runs before the name is indexed.
inferenced_last_sents, = inference_missing_tags(
    [merged_examples[108]], use_first=False, use_sentence_boundaries=True
)

In [19]:
# Run tag inference with all four strategy combinations:
#   last / first  -> use_first=False / True  (which sentence boundary is picked)
#   loose / sents -> use_sentence_boundaries=False / True
inferenced_last_loose = inference_missing_tags(
    merged_examples, use_first=False, use_sentence_boundaries=False
)
inferenced_last_sents = inference_missing_tags(
    merged_examples, use_first=False, use_sentence_boundaries=True
)
inferenced_first_loose = inference_missing_tags(
    merged_examples, use_first=True, use_sentence_boundaries=False
)
inferenced_first_sents = inference_missing_tags(
    merged_examples, use_first=True, use_sentence_boundaries=True
)


779/779
779/779
779/779
779/779


In [20]:
# Index of the example to compare; later cells reuse `num` as well.
num = 15

# Render the merged prediction first, then the same example under each inference strategy.
display_doc(merged_examples[num].predicted)
print("\nLast loose\n")
display_doc(inferenced_last_loose[num].predicted)
print("\nLast sents\n")
display_doc(inferenced_last_sents[num].predicted)
print("\nFirst loose\n")
display_doc(inferenced_first_loose[num].predicted)
print("\nFirst sents\n")
display_doc(inferenced_first_sents[num].predicted)



Last loose




Last sents




First loose




First sents



In [21]:
def _evaluate_and_cache(file_stem: str, label: str, examples: list[Example]) -> None:
    """Evaluate `examples` with `nlp` and store the scores as JSON, unless already stored.

    Args:
        file_stem: Prefix of the output file; the scores are written to
            ``METRICS_PATH / f"{file_stem}_metrics.json"``.
        label: Human-readable name used in the timing message.
        examples: Examples passed to ``nlp.evaluate``.

    The evaluation is skipped entirely when the output file already exists.
    """
    metric_path = METRICS_PATH / f"{file_stem}_metrics.json"
    if metric_path.exists():
        return

    start = perf_counter()
    metrics = nlp.evaluate(examples, batch_size=256)
    with open(metric_path, "w") as f:
        json.dump(metrics, f, indent=4)

    # The reported time covers both the evaluation and the file write.
    print(f"{label} done in {perf_counter() - start:.2f}s")


# One entry per variant: (file name prefix, label for the timing message, examples).
# NOTE(review): Language.evaluate runs the pipeline over the examples again before scoring
# (which would explain the long runtime per variant). Confirm that the post-processed
# annotations are what actually gets scored; otherwise consider scoring the examples
# directly with the already imported `Scorer`.
for _file_stem, _label, _examples in (
    ("predicted", "Predicted", predicted_examples),
    ("merged", "Merged", merged_examples),
    ("last_loose", "Last loose", inferenced_last_loose),
    ("last_sents", "Last sents", inferenced_last_sents),
    ("first_loose", "First loose", inferenced_first_loose),
    ("first_sents", "First sents", inferenced_first_sents),
):
    _evaluate_and_cache(_file_stem, _label, _examples)

Predicted done in 221.78s
Merged done in 220.57s
Last loose done in 219.54s
Last sents done in 220.18s
First loose done in 219.92s
First sents done in 220.47s


In [22]:
# Print all metrics
# Dump every stored metrics file: its name on one line, its content on the next.
for path in METRICS_PATH.glob("*.json"):
    metrics = json.loads(path.read_text())
    print(path.stem, metrics, sep="\n", end="\n\n")

predicted_metrics
{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'ents_p': 0.6890407099591491, 'ents_r': 0.698187268055952, 'ents_f': 0.6935838355193193, 'ents_per_type': {'DS': {'p': 0.670701472947353, 'r': 0.655782118922002, 'f': 0.6631578947368422}, 'DE': {'p': 0.7061708214139762, 'r': 0.7406772396056579, 'f': 0.7230125523012553}}, 'speed': 1549.0483698610783}

merged_metrics
{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'ents_p': 0.6935346660995321, 'ents_r': 0.698187268055952, 'ents_f': 0.6958531901273205, 'ents_per_type': {'DS': {'p': 0.6787190082644629, 'r': 0.655782118922002, 'f': 0.6670534484008993}, 'DE': {'p': 0.7072305593451569, 'r': 0.7406772396056579, 'f': 0.7235675902016889}}, 'speed': 1554.7413010136536}

last_loose_metrics
{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'ents_p': 0.6614661531431556, 'ents_r': 0.7231658578361404, 'ents_f': 0.6909413248781153, 'ents_per_type': {'DS': {'p': 0.63039228

In [23]:
def extract_discourses(doc: Doc, keep_first_ds: bool = False, keep_first_de: bool = False):
    """Return the text of every discourse delimited by a DS ... DE entity pair.

    Runs of same-label DS or DE entities are collapsed to a single entity
    first: the earliest one is kept when the matching ``keep_first_*`` flag
    is true, otherwise the latest one. Each remaining DE closes the span
    opened by the preceding DS; the tokens from the DS start up to the DE
    end are joined with single spaces and every " ." is rewritten to ".".
    A trailing DS without a DE yields nothing.

    Args:
        doc: Document providing tokens (``.text``) and ``.ents`` spans
            (``.label_``, ``.start``, ``.end``).
        keep_first_ds: Keep the first DS of a run instead of the last.
        keep_first_de: Keep the first DE of a run instead of the last.

    Returns:
        List of discourse strings in document order.

    Raises:
        AssertionError: If a DE is encountered that is not preceded by a DS.
    """
    words = [tok.text for tok in doc]

    # Pass 1: collapse runs of identical DS / DE labels.
    keep_first = {"DS": keep_first_ds, "DE": keep_first_de}
    boundaries = []
    previous_label = None
    for span in doc.ents:
        label = span.label_
        if label in keep_first and label == previous_label:
            if not keep_first[label]:
                # Later entity of the run wins: overwrite the one kept so far.
                boundaries[-1] = span
            continue

        boundaries.append(span)
        previous_label = label

    # Pass 2: pair each DS with the DE that follows it.
    result = []
    opened_at = None
    prev_tag = None
    for span in boundaries:
        if span.label_ == "DS":
            opened_at = span.start
            prev_tag = "DS"
        elif span.label_ == "DE":
            assert prev_tag == "DS", "DE without DS"
            joined = " ".join(words[opened_at : span.end])
            result.append(re.sub(r" \.", ".", joined))
            opened_at = None
            prev_tag = "DE"

    return result


In [24]:
# Discourses extracted from the reference annotation of example `num` (default de-duplication).
doc = inferenced_last_loose[num].reference
ref = extract_discourses(doc)
ref


['When you put cell phones and driving together you wont get a good outcome. Statistics show that every year 1. 6 million crashes happen because of cell phone use',
 'I believe that sometimes you may really need your phone for emergencies',
 'but with the new technology there are safer ways to answer the phone while driving',
 "I truly believe cell phone use shouldn't be allowed while operating a vehicle for the safety of ourselves and others",
 'Over the past few years cell phone use has become one of the leading causes for car accidents. So why would people continue to put themselves in that situation daily',
 "Cell phones are an addiction its proven that swiping on some of the apps we have causes our brain to release dopamine. I know how it is getting that notification and automatically wanting to grab your phone and that's also a reason why so many teen car accidents happen. Teens are the majority of whats on social media and not every teen realizes the seriousness of driving",
 'E

In [25]:
# Discourses extracted from the "last loose" prediction of the same example; unlike the
# reference cell, runs of DE tags keep their first member here (keep_first_de=True).
doc = inferenced_last_loose[num].predicted
pred = extract_discourses(doc, keep_first_de=True)
pred


['When you put cell phones and driving together you wo nt get a good outcome. Statistics show that every year 1. 6 million crashes happen because of cell phone use. I believe that sometimes you may really need your phone for emergencies but with the new technology there are safer ways to answer the phone while driving',
 "I truly believe cell phone use should n't be allowed while operating a vehicle for the safety of ourselves and others",
 'Over the past few years cell phone use has become one of the leading causes for car accidents. So why would people continue to put themselves in that situation daily',
 "Cell phones are an addiction its proven that swiping on some of the apps we have causes our brain to release dopamine. I know how it is getting that notification and automatically wanting to grab your phone and that 's also a reason why so many teen car accidents happen. Teens are the majority of what s on social media and not every teen realizes the seriousness of driving",
 'Even

In [26]:
def create_discourse_doc(doc: Doc):
    """Convert DS / DE boundary tags into contiguous ``DISC`` entities.

    A DS token opens a discourse (``B-DISC``, or ``I-DISC`` when the token's
    own IOB value is ``I``), a DE token closes it and is tagged ``I-DISC``,
    tokens in between are ``I-DISC`` and tokens outside any open discourse
    are ``O``.

    Args:
        doc: Document whose tokens carry DS / DE entity types.

    Returns:
        A new ``Doc`` on ``nlp.vocab`` with the same token texts and the
        ``DISC`` tags described above (whitespace information is not copied).
    """
    label = "DISC"
    begin_tag, inside_tag = f"B-{label}", f"I-{label}"

    tags = []
    discourse_open = False
    for tok in doc:
        kind = tok.ent_type_
        if kind == "DS":
            discourse_open = True
            tags.append(inside_tag if tok.ent_iob_ == "I" else begin_tag)
        elif kind == "DE":
            discourse_open = False
            tags.append(inside_tag)
        else:
            tags.append(inside_tag if discourse_open else "O")

    return Doc(nlp.vocab, [tok.text for tok in doc], ents=tags)


In [27]:
# Show the reference annotation of example 9 as whole-discourse (DISC) entities.
# `disc_doc` first holds the source doc and is then rebound to the converted doc.
disc_doc = inferenced_first_loose[9].reference
disc_doc = create_discourse_doc(disc_doc)
display_doc(disc_doc)

In [28]:
# Same view for the "first loose" prediction of example 9, for comparison with the reference.
disc_doc = inferenced_first_loose[9].predicted
disc_doc = create_discourse_doc(disc_doc)
display_doc(disc_doc)

In [29]:
# Presumably a deliberate stop so that "Run All" does not execute the exploratory cells
# below — confirm; a finished notebook should not rely on a raised exception for this.
raise StopIteration


StopIteration: 

In [None]:
# Look up the id of the first loaded text that contains a known phrase.
search_phrase = (
    "In the United States approximately 9 people are killed and over a thousand were injured"
)
for text in loader.iterate(purify_discourses=True, purify_text=True, verbose=True):
    if search_phrase in text.text:
        print(text.id)
        # Stop at the first match; the remaining texts are not scanned.
        break


In [None]:
# NOTE:
# 2F159741CBBE text is good for analyzing in the document


In [None]:
# Load one text by its id and print each of its discourses.
text = loader.load_text_with_id("2F159741CBBE")
for disc in text.discourses:
    print(disc)
