In [1]:
!pip install errant
!pip install spacy
!pip install stanza

Collecting errant
  Downloading errant-3.0.0-py3-none-any.whl.metadata (13 kB)
Downloading errant-3.0.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.3/499.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: errant
Successfully installed errant-3.0.0
Collecting stanza
  Downloading stanza-1.8.1-py3-none-any.whl.metadata (13 kB)
Downloading stanza-1.8.1-py3-none-any.whl (970 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m970.4/970.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stanza
Successfully installed stanza-1.8.1


In [2]:
import subprocess
import sys
import spacy
import stanza
import os
import re
from pathlib import Path

#### Path definitions

In [3]:
# Root folder of the input data and the folder that receives everything this notebook writes.
input_folder_path = Path('/kaggle/input')
output_folder_path = Path.cwd()

# Reference annotations in M2 format -- change this to the location of your validation .m2 file.
validation_m2_path = input_folder_path.joinpath('valid-target', 'valid.m2')
# Folder that holds the model outputs to be evaluated.
generated_text_folder_path = input_folder_path.joinpath('generated-text')

#### Evaluator class with all helper methods

In [4]:
class EvaluateErrant:
    """Evaluate generated text against a reference M2 file with the ERRANT CLI.

    The input files are organised in blocks; a block starts with a header line
    that begins with ``#`` and contains a numeric id (``# 0298`` in the text
    file, ``S # 0298`` in the M2 file). ``evaluate`` tokenizes the generated
    text, crops the reference to the blocks that were generated, aligns both
    with ``errant_parallel`` and scores the result with ``errant_compare``.

    NOTE(review): any sentence that itself starts with ``#`` is treated as a
    block header, exactly as before -- confirm the data never contains such
    sentences.
    """

    def __init__(self,
                 generated_text_path: Path,
                 valid_m2_path: Path,
                 output_folder_path: Path,
                 last_id = None
                ):
        """Store the input/output locations and build the tokenizer.

        Args:
            generated_text_path: Text file produced by the model.
            valid_m2_path: Reference annotations in M2 format.
            output_folder_path: Folder that receives all intermediate files;
                it is created when missing.
            last_id: Optional upper bound for the last block id to evaluate
                (numeric string, e.g. the value returned by a previous
                ``evaluate()`` call).
        """
        output_folder_path = Path(output_folder_path)
        output_folder_path.mkdir(parents=True, exist_ok=True)

        self.last_id = last_id
        self.generated_text_path = generated_text_path
        self.valid_m2_path = valid_m2_path
        self.tokenized_generated_path = output_folder_path / "generated.tok"
        self.tokenized_valid_path = output_folder_path / "valid.tok"
        self.validation_m2_cropped_path = output_folder_path / "valid_cropped.m2"
        self.generated_m2_cropped_path = output_folder_path / "generated_updated.m2"
        self.generated_m2_path = output_folder_path / "generated.m2"
        self.nlp = stanza.Pipeline(lang="uk", processors="tokenize")

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _first_number(text):
        """Return the first run of digits in ``text`` or ``None`` when there is none."""
        match = re.search(r'\d+', text)
        return match.group() if match else None

    @staticmethod
    def _m2_header(line):
        """Return the header text of an M2 ``S # ...`` line, or ``None`` for any other line."""
        if line.startswith("S ") and line[2:].startswith('#'):
            return line[2:]
        return None

    @staticmethod
    def _min_id(first, second):
        """Return the numerically smaller id, keeping its original spelling.

        Plain ``min`` on strings compares lexicographically ("100" < "99").
        """
        return min(first, second, key=int)

    def _is_last_id(self, header):
        """Tell whether ``header`` carries exactly the id stored in ``self.last_id``.

        The ids are compared as numbers so that ``0298`` does not match
        ``10298`` the way a substring test would.
        """
        number = self._first_number(header)
        return number is not None and int(number) == int(self.last_id)

    def _last_id_from_headers(self, headers, source):
        """Return the id of the last header in ``headers`` that contains a number.

        Raises:
            ValueError: if no header with a numeric id exists in ``source``.
        """
        for header in reversed(headers):
            number = self._first_number(header)
            if number is not None:
                return number
        raise ValueError(f"No block header with a numeric id found in {source}")

    # ------------------------------------------------------------- tokenization

    def _tokenize(self,
                  text: str) -> str:
        """Return ``text`` as space-separated tokens produced by the stanza pipeline."""
        tokenized = " ".join([t.text for t in self.nlp(text).iter_tokens()])
        return tokenized

    def _tokenize_file(self,
                       input_file: Path,
                       output_file: Path):
        """Tokenize ``input_file`` line by line and write the result to ``output_file``."""
        with open(input_file, encoding="utf-8") as f, \
                open(output_file, "w", encoding="utf-8") as out:
            for line in f:
                line = line.rstrip("\n")
                tokenized = self._tokenize(line)
                out.write(tokenized + "\n")

    def _tokenize_generated(self):
        """Tokenize the generated text into ``self.tokenized_generated_path``."""
        print("Tokenizing submission...", file=sys.stderr)
        self._tokenize_file(self.generated_text_path, self.tokenized_generated_path)
        print(f"Tokenized: {self.tokenized_generated_path}", file=sys.stderr)

    # ------------------------------------------------------------------ id scan

    def _get_last_id_from_tokenized(self):
        """Return the id of the last block header in the tokenized generated text."""
        with open(self.tokenized_generated_path, encoding="utf-8") as f:
            headers = [line for line in f if line.startswith('#')]
        return self._last_id_from_headers(headers, self.tokenized_generated_path)

    def _generate_croped_tokenized_valid_from_m2(self,
                                                 include_last_block = True):
        """Write the source sentences of the reference M2 file up to ``self.last_id``.

        Only ``S `` lines are copied (without the ``S `` prefix).

        Args:
            include_last_block: When true the block whose id equals
                ``self.last_id`` is copied as well and copying stops at the
                next header; when false copying stops right before that block.
        """
        past_last_block = False
        with open(self.valid_m2_path, encoding="utf-8") as f, \
                open(self.tokenized_valid_path, "w", encoding="utf-8") as out:
            for line in f:
                if not line.startswith("S "):
                    continue
                sentence = line[2:]
                if sentence.startswith('#'):
                    if past_last_block:
                        # First header after the last wanted block.
                        break
                    if self._is_last_id(sentence):
                        if not include_last_block:
                            break
                        past_last_block = True
                out.write(sentence)

    def _get_last_id_from_m2(self):
        """Return the id of the last block header in the generated M2 file."""
        headers = []
        with open(self.generated_m2_path, encoding="utf-8") as f:
            for line in f:
                header = self._m2_header(line)
                if header is not None:
                    headers.append(header)
        return self._last_id_from_headers(headers, self.generated_m2_path)

    # ---------------------------------------------------------------- m2 files

    def _update_m2_file(self, input_path, output_path):
        """Copy ``input_path`` to ``output_path`` up to, but excluding, the block ``self.last_id``."""
        with open(input_path, encoding="utf-8") as f, \
                open(output_path, "w", encoding="utf-8") as out:
            for line in f:
                header = self._m2_header(line)
                if header is not None and self._is_last_id(header):
                    break
                out.write(line)

    def _print_m2_content_length(self, input_path):
        """Print the number of ``S `` lines and of block headers in an M2 file."""
        content_len = 0
        block_count = 0
        with open(input_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("S "):
                    content_len += 1
                    if line[2:].startswith('#'):
                        block_count += 1
        print(f'Content length: {content_len}')
        print(f'Blocks count: {block_count}')

    # ---------------------------------------------------------------- pipeline

    def evaluate(self):
        """Run the whole evaluation and return the id at which both M2 files were cropped.

        Raises:
            subprocess.CalledProcessError: if an ERRANT command exits with a
                non-zero status.
            ValueError: if no block header with a numeric id is found.
        """
        # tokenize generated text
        self._tokenize_generated()

        # get validation tokenized from m2 (cropped by the ids from generated text)
        tokenized_last_id = self._get_last_id_from_tokenized()
        if self.last_id:
            self.last_id = self._min_id(tokenized_last_id, self.last_id)
        else:
            self.last_id = tokenized_last_id
        self._generate_croped_tokenized_valid_from_m2()

        # generate m2 errant file for generated text
        subprocess.run(["errant_parallel",
                        "-orig", str(self.tokenized_valid_path),
                        "-cor", str(self.tokenized_generated_path),
                        "-out", str(self.generated_m2_path)],
                       check=True)
        print(f"Aligned submission: {self.generated_m2_path}", file=sys.stderr)

        # crop m2 files to have the same length
        last_id = self._get_last_id_from_m2()
        self.last_id = self._min_id(last_id, self.last_id)
        self._update_m2_file(self.valid_m2_path, self.validation_m2_cropped_path)
        self._update_m2_file(self.generated_m2_path, self.generated_m2_cropped_path)

        # check the length and blocks count
        print('--Target')
        self._print_m2_content_length(self.validation_m2_cropped_path)
        print('--Generated')
        self._print_m2_content_length(self.generated_m2_cropped_path)

        # evaluate: first the default errant_compare report, then again with the -ds flag
        compare_command = ["errant_compare",
                           "-hyp", str(self.generated_m2_cropped_path),
                           "-ref", str(self.validation_m2_cropped_path)]
        subprocess.run(compare_command, check=True)
        subprocess.run(compare_command + ["-ds"], check=True)

        return self.last_id
        

### Evaluate raw model

In [5]:
# All files produced while scoring the base model go to their own sub-folder.
output_raw = output_folder_path / 'raw_model'
output_raw.mkdir(parents=True, exist_ok=True)
generated_text_raw_path = generated_text_folder_path / 'ouput_base.txt'

evaluator = EvaluateErrant(
    generated_text_raw_path,
    validation_m2_path,
    output_raw,
)

# evaluate() returns the block id at which the M2 files were cropped.
last_id = evaluator.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.8.0/models/tokenize/iu.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.8.0/models/mwt/iu.pt:   0%|          | 0.0…

Tokenizing submission...
Tokenized: /kaggle/working/raw_model/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/raw_model/generated.m2


--Target
Content length: 1148
Blocks count: 70
--Generated
Content length: 1148
Blocks count: 70

TP	FP	FN	Prec	Rec	F0.5
20	6232	920	0.0032	0.0213	0.0039


TP	FP	FN	Prec	Rec	F0.5
229	6023	881	0.0366	0.2063	0.0438



### Evaluate fine-tuned model

In [6]:
# All files produced while scoring the fine-tuned model go to their own sub-folder.
output_tuned = output_folder_path / 'tuned_model'
output_tuned.mkdir(parents=True, exist_ok=True)
generated_text_tuned_path = generated_text_folder_path / 'ouput_tuned.txt'

# last_id from the earlier run is passed in as an upper bound for the blocks scored here.
evaluator2 = EvaluateErrant(
    generated_text_tuned_path,
    validation_m2_path,
    output_tuned,
    last_id=last_id,
)

evaluator2.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Tokenizing submission...
Tokenized: /kaggle/working/tuned_model/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/tuned_model/generated.m2


--Target
Content length: 1148
Blocks count: 70
--Generated
Content length: 1148
Blocks count: 70

TP	FP	FN	Prec	Rec	F0.5
20	6655	924	0.003	0.0212	0.0036


TP	FP	FN	Prec	Rec	F0.5
303	6372	817	0.0454	0.2705	0.0545



'0298'