In [1]:
import sys
from pathlib import Path


In [2]:
# Directory three levels above the notebook's working directory; it is put on
# sys.path so that the `src` package can be imported further down.
module_path = Path.cwd().parents[2]
# sys.path contains plain strings, so the membership test must use the string form.
# Comparing the Path object itself never matches, which appended a duplicate entry
# every time this cell was re-run.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))


In [3]:
module_path


PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
import random
import regex as re
import pickle


In [5]:
import spacy
import torch
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin, Span
from spacy.training import Example


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
torch.cuda.empty_cache()
# spacy.require_gpu()


In [7]:
from src.loader import TextLoader
from src.model import DatasetType, Text


In [8]:
loader = TextLoader(dataset_type=DatasetType.V1_WITH_PREDICTIONSTRING)
nlp = spacy.load("models/spacy_resume/model-best/")


In [9]:
text = loader.load_random_text(purify_discourses=True, purify_text=True)
text.id


'E506BA4555A6'

In [10]:
for disc in text.discourses:
    print(disc)


--- 1618856290442 (0 -> 88 | 0 -> 16) - Lead ---
Do you think that a summer project would be easier if the teacher created it or yourself
------------------------------------------------
--- 1619480501396 (127 -> 204 | 25 -> 39) - Claim ---
the summer project would be much more easier to do if the teacher created it.
-----------------------------------------------------
--- 1618856698424 (235 -> 371 | 46 -> 72) - Claim ---
if the teacher design the project you might have an easy topic or get a better understanding on what to do or what the project is about.
-----------------------------------------------------
--- 1619480446482 (377 -> 554 | 74 -> 102) - Claim ---
if the teacher creates the project there's a possibility more kids would do it because they understand what their suppose to do instead of over thinking and stressing themselves
------------------------------------------------------
--- 1619480486500 (556 -> 654 | 103 -> 120) - Claim ---
If the students was to created an proj

In [11]:
def create_manual_doc(text: Text):
    """Build a spaCy ``Doc`` over ``text.words`` carrying discourse-boundary tags.

    The word at the first position of each discourse's ``predictionstring`` is tagged
    ``B-DS`` (discourse start) and the word at the last position is tagged ``B-DE``
    (discourse end); every other word is tagged ``O``.

    Start and end positions are tracked separately instead of toggling between DS and
    DE while walking the words. With the toggle, a one-word discourse (first position
    == last position) flipped the state only once, so every boundary after it received
    the opposite tag. A one-word discourse is now tagged ``B-DS`` only, because a token
    can carry a single tag. Discourses with an empty ``predictionstring`` are skipped.

    Args:
        text: Loaded text exposing ``words`` and ``discourses``; assumes each
            ``predictionstring`` is a sequence of integer word positions — TODO confirm.

    Returns:
        A ``Doc`` built from ``nlp.vocab`` with one IOB tag per word.
    """
    start_positions = set()
    end_positions = set()
    for disc in text.discourses:
        if len(disc.predictionstring) == 0:
            continue
        start_positions.add(disc.predictionstring[0])
        end_positions.add(disc.predictionstring[-1])

    ents = []
    for ind, _ in enumerate(text.words):
        if ind in start_positions:
            ents.append("B-DS")
        elif ind in end_positions:
            ents.append("B-DE")
        else:
            ents.append("O")

    return Doc(nlp.vocab, text.words, ents=ents)


def display_doc(doc: Doc) -> None:
    """Render the entities of ``doc`` in the notebook using displaCy's ``"ent"`` style."""
    spacy.displacy.render(doc, style="ent", jupyter=True)


In [12]:
manual_doc = create_manual_doc(text)


In [13]:
display_doc(manual_doc)


In [14]:
doc = nlp(text.text)


In [15]:
display_doc(doc)


In [16]:
doc_bin = DocBin().from_disk("data/NER_test.spacy")
len(doc_bin)


780

In [17]:
# Wrap every doc stored in the DocBin in an Example whose reference entities are the
# entities already attached to that doc. A comprehension is used because the index
# from enumerate was never read, and so that the notebook-level `doc` / `ind`
# variables are not overwritten as a side effect of this cell.
examples = [
    Example.from_dict(stored_doc, {"entities": stored_doc.ents})
    for stored_doc in doc_bin.get_docs(nlp.vocab)
]


In [18]:
# Cache of model predictions, so the (slow) inference loop only runs once.
path_pred = Path("data/predicted_examples.pkl")

predicted_examples: list[Example]

if not path_pred.exists():
    predicted_examples = []
    for ind, example in enumerate(examples):
        print(f"\r{ind + 1:3d}/{len(examples)}", end="")
        doc = nlp(example.text)
        # Pair the fresh prediction with the reference doc of the original example.
        predicted_examples.append(Example(doc, example.reference))

    # Write through a context manager so the file handle is flushed and closed
    # deterministically (the bare open(...) passed to pickle.dump was never closed).
    with open(path_pred, "wb") as f:
        pickle.dump(predicted_examples, f)

else:
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(path_pred, "rb") as f:
        predicted_examples = pickle.load(f)


In [19]:
def get_fixed_doc(example: Example, idx: list[int]):
    """Rebuild ``example.predicted`` with selected tokens turned into entity continuations.

    For every value ``i`` in ``idx`` the token at position ``i + 1`` is re-tagged as
    ``I-<its entity type>``, which glues it to the entity on the token before it (for
    instance ``I`` / ``'m`` predicted as two separate ``DS`` entities). All other tokens
    keep their predicted IOB tag. Token texts and trailing whitespace are copied as-is.

    Compared to the previous version this does not consume ``idx`` (it used to
    ``pop`` entries from the caller's list) and an empty ``idx`` yields an unchanged
    copy instead of raising ``IndexError``.

    NOTE(review): the caller in this notebook passes entity *start* indices, so
    ``i + 1`` is the first token of the following entity only when the preceding
    entity is a single token — confirm this is the intended contract.

    Args:
        example: Example whose ``predicted`` doc is rebuilt.
        idx: Token indices whose successor token becomes an ``I-`` continuation.

    Returns:
        A new ``Doc`` built from ``nlp.vocab`` with the adjusted entity tags.
    """
    doc = example.predicted
    continuation_positions = {start + 1 for start in idx}

    tokens_fixed = []
    tokens_spaces = []
    ents_fixed = []
    for token in doc:
        tokens_fixed.append(token.text)
        tokens_spaces.append(token.whitespace_)

        if token.i in continuation_positions:
            # Continue the entity that started on the previous token.
            ents_fixed.append(f"I-{token.ent_type_}")
        elif token.ent_iob_ == "O":
            ents_fixed.append(token.ent_iob_)
        else:
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

    return Doc(nlp.vocab, tokens_fixed, spaces=tokens_spaces, ents=ents_fixed)


In [20]:
def merge_invalid_examples(predicted_examples: list[Example], verbose: bool = False) -> list[Example]:
    """Merge predicted entities that were split at an apostrophe.

    Two neighbouring predicted entities are considered a split when they touch
    (``end`` of the first equals ``start`` of the second), share a label, and the
    second one contains an apostrophe. For examples with at least one such pair the
    predicted doc is rebuilt through ``get_fixed_doc``; all other examples are passed
    through unchanged. The number of rebuilt examples is always printed at the end.

    Args:
        predicted_examples: Examples whose ``predicted`` docs are inspected.
        verbose: When true, print the entities before and after the fix.

    Returns:
        A list with one (possibly rebuilt) example per input example, in input order.
    """
    result = []
    n_fixed = 0
    for example_no, example in enumerate(predicted_examples):
        spans = example.predicted.ents

        # Start index of every entity that is directly followed by its split-off part.
        split_starts = [
            left.start
            for left, right in zip(spans, spans[1:])
            if left.end == right.start and left.label_ == right.label_ and "'" in right.text
        ]

        if split_starts:
            n_fixed += 1

            if verbose:
                print(f"ind: {example_no}")
                for span in spans:
                    print(f"{span.start:>3} {span.end:>3} {span.label_} {span.text:12}", end=" ")
                    if span.start in split_starts:
                        print("<<<<<")
                    else:
                        print()
                print("\n----\n")

            repaired_doc = get_fixed_doc(example, split_starts)

            if verbose:
                for span in repaired_doc.ents:
                    print(f"{span.start:>3} {span.end:>3} {span.label_} {span.text:12}")
                print("\n----\n")

            example = Example(repaired_doc, example.reference)

        result.append(example)

    print(f"Fixed {n_fixed} examples.")

    return result


In [21]:
merged_examples = merge_invalid_examples(predicted_examples, verbose=True)
print(f"All examples: {len(examples)}")


ind: 9
  0   1 DS I            
  8   9 DE cars         
 17  18 DE computers    
 19  20 DS I            <<<<<
 20  21 DS 'm           
 32  33 DE machine      
 34  35 DS If           
 52  53 DE computer     
 66  67 DE job          
 83  84 DE wrong        
 85  86 DS Someone      
104 105 DE nowfunction  
106 107 DS Say          
208 209 DE of           
209 210 DS Now          
237 238 DE others       
239 240 DS We           
275 276 DE life         
277 278 DS So           
293 294 DE badly        

----

  0   1 DS I           
  8   9 DE cars        
 17  18 DE computers   
 19  21 DS I'm         
 32  33 DE machine     
 34  35 DS If          
 52  53 DE computer    
 66  67 DE job         
 83  84 DE wrong       
 85  86 DS Someone     
104 105 DE nowfunction 
106 107 DS Say         
208 209 DE of          
209 210 DS Now         
237 238 DE others      
239 240 DS We          
275 276 DE life        
277 278 DS So          
293 294 DE badly       

----

ind: 11
  0   1 DS

In [22]:
# nlp.evaluate(merged_examples)


In [23]:
# Predicted ind: 14 has some issues with consecutive DS/DE entities
# NOTE(review): the line below displays ind 9, not 14 - confirm which example is meant.
display_doc(merged_examples[9].predicted)


In [56]:
def inference_missing_tags(
    examples: list[Example], use_first: bool = False, use_sentence_boundaries: bool = True
) -> list[Example]:
    """
    Repair predicted docs in which two consecutive boundary tags have the same type.

    It may happen that consecutive tags are of the same type, e.g. ... DS DE DE ...,
    which is not ideal as we cannot extract discourses from it. This function tries to add
    the missing tags in such cases based on a couple of rules:

    1. In the sequence of tags there must not be any consecutive tags of the same type.
    2. For missing tags (e.g. DS DE DE) we try to find the missing DS tag so that the
       sequence becomes DS DE DS DE.
    3. A missing tag is added only at the start / end of a sentence. When there are several
       sentences between two consecutive tags, we can either use the "first" approach
       (add the missing tag at the first sentence boundary found) or the "last" approach
       (add the missing tag at the last sentence boundary found).
    4. If use_sentence_boundaries is True, missing tags are only added at the start / end
       of a sentence. If this is not possible, the middle consecutive tag is removed.
       If use_sentence_boundaries is False and there is no sentence boundary, the missing
       tag is added to the first / last untagged token before the next consecutive tag.

    Sentence boundaries are detected purely by tokens whose text is ".".

    Args:
        examples: Examples whose ``predicted`` docs are repaired.
        use_first: Use the first "." seen after the previous tag instead of the last one.
        use_sentence_boundaries: See rule 4 above.

    Returns:
        One new Example per input example, pairing the rebuilt predicted doc with the
        original ``reference``.

    Raises:
        AssertionError: When the slot chosen for the inserted tag already holds
            ``B-DS`` / ``B-DE``, or when the very first token of a doc is tagged DE.

    NOTE(review): progress and the "Current token" / "Previous token" debug lines are
    printed unconditionally.
    """
    fixed_examples = []
    for ind, example in enumerate(examples):
        print(f"\r{ind:3d}/{len(examples) - 1}", end="")

        doc = example.predicted

        # Type ("DS" / "DE") and token index of the most recent tagged token.
        last_ent = None
        last_ent_ind = None

        # Candidate slots for an inserted tag: the token after the remembered "."
        # (sentence start) and the token before it (sentence end). None means no "."
        # has been seen since the last reset.
        last_start_sent_ind = 0
        last_end_sent_ind = 0

        # Index of the first untagged token seen since the last reset.
        saved_first_token_ind = None

        tokens_fixed = []
        tokens_spaces = []
        ents_fixed = []

        # NOTE(review): this loop rebinds `ind`, shadowing the example index of the
        # outer loop (harmless here because the outer value is not read afterwards).
        for ind, token in enumerate(doc):
            tokens_fixed.append(token.text)
            tokens_spaces.append(token.whitespace_)

            # Remember sentence boundaries: with use_first only the first "." after a
            # reset is kept, otherwise every "." overwrites the previous one.
            if use_first and last_start_sent_ind is None and token.text == ".":
                last_start_sent_ind = ind + 1
                last_end_sent_ind = ind - 1
            elif not use_first and token.text == ".":
                last_start_sent_ind = ind + 1
                last_end_sent_ind = ind - 1

            # Untagged token: copy its IOB value and move on.
            if not token.ent_type_:
                if saved_first_token_ind is None:
                    saved_first_token_ind = ind

                ents_fixed.append(token.ent_iob_)
                continue

            # Tags alternate correctly (DE -> DS or DS -> DE): keep the tag as predicted.
            if (token.ent_type_ == "DS" and last_ent == "DE") or (
                token.ent_type_ == "DE" and last_ent == "DS"
            ):
                last_ent = token.ent_type_
                last_ent_ind = ind
                ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

                # Forget the boundary candidates collected before this tag.
                last_start_sent_ind, last_end_sent_ind = None, None
                saved_first_token_ind = None
                continue

            # First tagged token of the doc: nothing to compare against yet.
            # The boundary candidates are not reset on this path.
            if last_ent is None:
                assert not (token.ent_type_ == "DE" and ind == 0), "First token must not be DE"
                last_ent = token.ent_type_
                last_ent_ind = ind
                ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
                continue

            # From here on the current tag has the same type as the previous one.
            print(f"Current token: {token.text} {token.ent_iob_}-{token.ent_type_}")
            print(f"Previous token: {tokens_fixed[ind - 1]} {ents_fixed[ind - 1]}")
            if last_start_sent_ind is not None:
                # A sentence boundary is known: DE DE gets a DS at the sentence start,
                # DS DS gets a DE at the sentence end.
                # NOTE(review): last_start_sent_ind can equal `ind` when the previous
                # token is "."; ents_fixed has only `ind` entries at this point, so the
                # lookup below appears to raise IndexError in that case - confirm.
                if token.ent_type_ == "DE":
                    assert ents_fixed[last_start_sent_ind] not in (
                        "B-DS",
                        "B-DE",
                    ), "Start token already has a tag"
                    ents_fixed[last_start_sent_ind] = "B-DS"
                elif token.ent_type_ == "DS":
                    # NOTE(review): when the previously tagged token sits directly in
                    # front of the ".", last_end_sent_ind points at that tagged token
                    # and this assert fires; this looks like the "End token already has
                    # a tag" failure recorded in the notebook output - confirm.
                    assert ents_fixed[last_end_sent_ind] not in (
                        "B-DS",
                        "B-DE",
                    ), "End token already has a tag"
                    ents_fixed[last_end_sent_ind] = "B-DE"
                else:
                    assert False, "Should not happen"
            else:
                if use_sentence_boundaries:
                    # Remove the middle consecutive tag (the previously tagged token).
                    # NOTE(review): an "I-" continuation of a multi-token entity also
                    # reaches this line, which would erase the entity's own "B-" tag;
                    # only the branch below special-cases it - confirm.
                    ents_fixed[last_ent_ind] = "O"
                else:
                    if token.ent_iob_ == "I" and ents_fixed[ind - 1] == f"B-{token.ent_type_}":
                        # Continuation of a multi-token entity, not a real conflict:
                        # I  B-DS
                        # 'm I-DS
                        last_ent = token.ent_type_
                        last_ent_ind = ind
                        ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")
                        continue
                    elif ents_fixed[ind - 1] == f"B-{token.ent_type_}":
                        # Two directly adjacent DS / DE tags that were not fixed, so we
                        # keep the first one and drop the second one.
                        ents_fixed.append("O")
                        continue

                    # Add the missing tag to the first / last untagged token.
                    # NOTE(review): saved_first_token_ind is None when no untagged
                    # token was seen since the last reset - confirm this cannot occur.
                    elif token.ent_type_ == "DE":
                        assert ents_fixed[saved_first_token_ind] not in (
                            "B-DS",
                            "B-DE",
                        ), "Start token already has a tag"
                        ents_fixed[saved_first_token_ind] = "B-DS"
                    elif token.ent_type_ == "DS":
                        assert ents_fixed[ind - 1] not in (
                            "B-DS",
                            "B-DE",
                        ), "End token already has a tag"
                        ents_fixed[ind - 1] = "B-DE"
                    else:
                        assert False, "Should not happen"

            # Conflict handled: reset the candidates and record the current tag.
            last_start_sent_ind, last_end_sent_ind = None, None
            saved_first_token_ind = None

            last_ent = token.ent_type_
            last_ent_ind = ind
            ents_fixed.append(f"{token.ent_iob_}-{token.ent_type_}")

        # Rebuild the doc with the repaired tags and keep the original reference.
        doc_fixed = Doc(nlp.vocab, tokens_fixed, spaces=tokens_spaces, ents=ents_fixed)
        fixed_examples.append(Example(doc_fixed, example.reference))

    print()

    return fixed_examples


In [49]:
# Run the tag repair once for every combination of use_first / use_sentence_boundaries.
# NOTE(review): the output recorded below this cell shows the first call stopping with
# an AssertionError at example 47, so none of these names is assigned in that run.
inferenced_last_loose = inference_missing_tags(merged_examples, use_first=False, use_sentence_boundaries=False)
inferenced_last_sents = inference_missing_tags(merged_examples, use_first=False, use_sentence_boundaries=True)
inferenced_first_loose = inference_missing_tags(merged_examples, use_first=True, use_sentence_boundaries=False)
inferenced_first_sents = inference_missing_tags(merged_examples, use_first=True, use_sentence_boundaries=True)


 47/779

AssertionError: End token already has a tag

In [57]:
inferenced_last_loose = inference_missing_tags([merged_examples[47]], use_first=False, use_sentence_boundaries=False)


  0/0Current token: there B-DS
Previous token: citizens O


AssertionError: End token already has a tag

In [52]:
display_doc(merged_examples[47].predicted)


In [35]:
display_doc(merged_examples[47].predicted)
display_doc(inferenced_last_loose[0].predicted)

In [None]:
display_doc(merged_examples[1].predicted)
print()
display_doc(inferenced_last_loose[1].predicted)
print()
display_doc(inferenced_last_sents[1].predicted)
print()
display_doc(inferenced_first_loose[1].predicted)
print()
display_doc(inferenced_first_sents[1].predicted)


In [None]:
# nlp.evaluate(fixed_examples)


In [None]:
# Pick a random example index to inspect in the following cells.
# NOTE(review): no random seed is set in the cells shown, so the pick differs per run.
ind = random.choice(range(len(predicted_examples)))


In [None]:
print("----------")
print("Reference:")
print("----------")
display_doc(predicted_examples[ind].reference)


In [None]:
print("----------")
print("Predicted:")
print("----------")
display_doc(predicted_examples[ind].predicted)


In [None]:
print("----------")
print("Fixed_pred:")
print("----------")
# `fixed_examples` is not assigned anywhere at notebook level in the cells shown (it
# only exists as a local inside inference_missing_tags), so this line raised NameError
# on a fresh kernel. merged_examples holds one post-processed example per entry of
# predicted_examples, in the same order, so `ind` addresses the same text.
# NOTE(review): switch to one of the inferenced_* lists if that was the intent.
display_doc(merged_examples[ind].predicted)


In [None]:
def extract_discourses(doc: Doc, keep_first_ds: bool = False, keep_first_de: bool = False):
    """Turn DS / DE boundary entities of ``doc`` into a list of discourse strings.

    Runs of consecutive entities with the same ``DS`` or ``DE`` label are first
    collapsed to a single entity: the first of the run when the matching
    ``keep_first_*`` flag is true, otherwise the last. The surviving entities are
    printed. Afterwards every ``DS`` ... ``DE`` pair yields the tokens from the start
    of the ``DS`` entity up to the end of the ``DE`` entity, joined with single spaces,
    with the space in front of each "." removed. A ``DS`` that is never closed by a
    ``DE`` produces nothing.

    Args:
        doc: Document providing tokens (``.text``) and ``ents`` with ``label_``,
            ``start`` and ``end``.
        keep_first_ds: Keep the first instead of the last of consecutive ``DS`` entities.
        keep_first_de: Keep the first instead of the last of consecutive ``DE`` entities.

    Returns:
        The extracted discourse strings, in document order.

    Raises:
        AssertionError: If a ``DE`` entity is not preceded by an open ``DS``.
    """
    words = [tok.text for tok in doc]

    # Pass 1: collapse runs of identical boundary labels.
    keep_first = {"DS": keep_first_ds, "DE": keep_first_de}
    boundaries = []
    previous_label = None
    for span in doc.ents:
        label = span.label_
        if label in keep_first and label == previous_label:
            if not keep_first[label]:
                # The duplicate replaces the entity kept so far (always the last one).
                boundaries[-1] = span
            continue

        boundaries.append(span)
        previous_label = label

    for span in boundaries:
        print(span, span.label_)

    # Pass 2: cut the token list at every DS ... DE pair.
    found = []
    open_tag = None
    for span in boundaries:
        if span.label_ == "DS":
            begin = span.start
            open_tag = "DS"
        elif span.label_ == "DE":
            assert open_tag == "DS", "DE without DS"
            joined = " ".join(words[begin : span.end])
            found.append(re.sub(r" \.", ".", joined))
            begin = None
            open_tag = "DE"

    return found


In [None]:
for token in predicted_examples[ind].reference:
    if not token.ent_type_:
        continue

    print(f"{token.text:12} {token.ent_iob_}-{token.ent_type_}")


In [None]:
# List every predicted entity of the selected example: text padded to 12 columns,
# followed by its label.
for span in predicted_examples[ind].predicted.ents:
    print(f"{span.text:12} {span.label_}")


In [None]:
doc = predicted_examples[ind].reference
ref = extract_discourses(doc)


In [None]:
doc = predicted_examples[ind].predicted
pred = extract_discourses(doc, keep_first_de=True)


In [None]:
pred


In [None]:
# Phrase used to locate one specific text in the dataset.
search_phrase = (
    "In the United States approximately 9 people are killed and over a thousand were injured"
)
# Scan the loader and print the id of the first text that contains the phrase.
# NOTE(review): the loop variable rebinds the notebook-level `text` assigned earlier.
for text in loader.iterate(purify_discourses=True, purify_text=True, verbose=True):
    if search_phrase in text.text:
        print(text.id)
        break


In [None]:
# NOTE:
# 2F159741CBBE text is good for analyzing in the document


In [None]:
text = loader.load_text_with_id("2F159741CBBE")
for disc in text.discourses:
    print(disc)


In [None]:
# Print reference and predicted discourses paired by position.
# The loop runs over `ref` only, so predictions beyond len(ref) are not shown.
for ind, ref_item in enumerate(ref):
    print(f"Ref:\n----\n{ref_item}")
    if ind < len(pred):
        print(f"----\nPred:\n----\n{pred[ind]}")

    print("\n####\n")
